Merge branch 'master' of ctdb into 'master' of samba
author Stefan Metzmacher <metze@samba.org>
Wed, 13 Nov 2013 13:17:32 +0000 (14:17 +0100)
committer Michael Adam <obnox@samba.org>
Wed, 13 Nov 2013 13:18:52 +0000 (14:18 +0100)
Signed-off-by: Stefan Metzmacher <metze@samba.org>
Signed-off-by: Michael Adam <obnox@samba.org>
786 files changed:
ctdb/.bzrignore [new file with mode: 0644]
ctdb/.gitignore [new file with mode: 0644]
ctdb/COPYING [new file with mode: 0644]
ctdb/Makefile.in [new file with mode: 0755]
ctdb/NEWS [new file with mode: 0644]
ctdb/README [new file with mode: 0644]
ctdb/README.Coding [new file with mode: 0644]
ctdb/aclocal.m4 [new file with mode: 0644]
ctdb/autogen.sh [new file with mode: 0755]
ctdb/client/ctdb_client.c [new file with mode: 0644]
ctdb/common/cmdline.c [new file with mode: 0644]
ctdb/common/ctdb_fork.c [new file with mode: 0644]
ctdb/common/ctdb_io.c [new file with mode: 0644]
ctdb/common/ctdb_logging.c [new file with mode: 0644]
ctdb/common/ctdb_ltdb.c [new file with mode: 0644]
ctdb/common/ctdb_message.c [new file with mode: 0644]
ctdb/common/ctdb_util.c [new file with mode: 0644]
ctdb/common/rb_tree.c [new file with mode: 0644]
ctdb/common/rb_tree.h [new file with mode: 0644]
ctdb/common/system_aix.c [new file with mode: 0644]
ctdb/common/system_common.c [new file with mode: 0644]
ctdb/common/system_freebsd.c [new file with mode: 0644]
ctdb/common/system_gnu.c [new file with mode: 0644]
ctdb/common/system_kfreebsd.c [new file with mode: 0644]
ctdb/common/system_linux.c [new file with mode: 0644]
ctdb/config.guess [new file with mode: 0644]
ctdb/config.mk [new file with mode: 0644]
ctdb/config.sub [new file with mode: 0644]
ctdb/config/README [new file with mode: 0644]
ctdb/config/ctdb-crash-cleanup.sh [new file with mode: 0755]
ctdb/config/ctdb.init [new file with mode: 0755]
ctdb/config/ctdb.service [new file with mode: 0644]
ctdb/config/ctdb.sudoers [new file with mode: 0644]
ctdb/config/ctdb.sysconfig [new file with mode: 0644]
ctdb/config/ctdbd_wrapper [new file with mode: 0755]
ctdb/config/debug-hung-script.sh [new file with mode: 0755]
ctdb/config/debug_locks.sh [new file with mode: 0644]
ctdb/config/events.d/00.ctdb [new file with mode: 0755]
ctdb/config/events.d/01.reclock [new file with mode: 0755]
ctdb/config/events.d/10.interface [new file with mode: 0755]
ctdb/config/events.d/11.natgw [new file with mode: 0755]
ctdb/config/events.d/11.routing [new file with mode: 0755]
ctdb/config/events.d/13.per_ip_routing [new file with mode: 0755]
ctdb/config/events.d/20.multipathd [new file with mode: 0755]
ctdb/config/events.d/31.clamd [new file with mode: 0755]
ctdb/config/events.d/40.fs_use [new file with mode: 0644]
ctdb/config/events.d/40.vsftpd [new file with mode: 0755]
ctdb/config/events.d/41.httpd [new file with mode: 0755]
ctdb/config/events.d/49.winbind [new file with mode: 0755]
ctdb/config/events.d/50.samba [new file with mode: 0755]
ctdb/config/events.d/60.ganesha [new file with mode: 0755]
ctdb/config/events.d/60.nfs [new file with mode: 0755]
ctdb/config/events.d/62.cnfs [new file with mode: 0755]
ctdb/config/events.d/70.iscsi [new file with mode: 0755]
ctdb/config/events.d/91.lvs [new file with mode: 0755]
ctdb/config/events.d/99.timeout [new file with mode: 0755]
ctdb/config/events.d/README [new file with mode: 0644]
ctdb/config/functions [new file with mode: 0755]
ctdb/config/gcore_trace.sh [new file with mode: 0755]
ctdb/config/nfs-rpc-checks.d/10.statd.check [new file with mode: 0644]
ctdb/config/nfs-rpc-checks.d/20.nfsd.check [new file with mode: 0644]
ctdb/config/nfs-rpc-checks.d/30.lockd.check [new file with mode: 0644]
ctdb/config/nfs-rpc-checks.d/40.mountd.check [new file with mode: 0644]
ctdb/config/nfs-rpc-checks.d/50.rquotad.check [new file with mode: 0644]
ctdb/config/notify.d.README [new file with mode: 0755]
ctdb/config/notify.sh [new file with mode: 0755]
ctdb/config/statd-callout [new file with mode: 0755]
ctdb/configure.ac [new file with mode: 0644]
ctdb/configure.rpm [new file with mode: 0755]
ctdb/ctdb.pc.in [new file with mode: 0644]
ctdb/doc/Makefile [new file with mode: 0644]
ctdb/doc/ctdb-tunables.7.xml [new file with mode: 0644]
ctdb/doc/ctdb.1.xml [new file with mode: 0644]
ctdb/doc/ctdb.7.xml [new file with mode: 0644]
ctdb/doc/ctdbd.1.xml [new file with mode: 0644]
ctdb/doc/ctdbd.conf.5.xml [new file with mode: 0644]
ctdb/doc/ctdbd_wrapper.1.xml [new file with mode: 0644]
ctdb/doc/examples/README [new file with mode: 0644]
ctdb/doc/examples/cluster.conf [new file with mode: 0644]
ctdb/doc/examples/natgw.conf [new file with mode: 0644]
ctdb/doc/ltdbtool.1.xml [new file with mode: 0644]
ctdb/doc/onnode.1.xml [new file with mode: 0644]
ctdb/doc/ping_pong.1.xml [new file with mode: 0644]
ctdb/doc/readonlyrecords.txt [new file with mode: 0644]
ctdb/doc/recovery-process.txt [new file with mode: 0644]
ctdb/ib/README.txt [new file with mode: 0644]
ctdb/ib/config.m4 [new file with mode: 0644]
ctdb/ib/ibw_ctdb.c [new file with mode: 0644]
ctdb/ib/ibw_ctdb.h [new file with mode: 0644]
ctdb/ib/ibw_ctdb_init.c [new file with mode: 0644]
ctdb/ib/ibwrapper.c [new file with mode: 0644]
ctdb/ib/ibwrapper.h [new file with mode: 0644]
ctdb/ib/ibwrapper_internal.h [new file with mode: 0644]
ctdb/ib/ibwrapper_test.c [new file with mode: 0644]
ctdb/include/cmdline.h [new file with mode: 0644]
ctdb/include/ctdb.h [new file with mode: 0644]
ctdb/include/ctdb_client.h [new file with mode: 0644]
ctdb/include/ctdb_private.h [new file with mode: 0644]
ctdb/include/ctdb_protocol.h [new file with mode: 0644]
ctdb/include/ctdb_typesafe_cb.h [new file with mode: 0644]
ctdb/include/idtree.h [new file with mode: 0644]
ctdb/include/includes.h [new file with mode: 0644]
ctdb/install-sh [new file with mode: 0755]
ctdb/lib/popt/CHANGES [new file with mode: 0644]
ctdb/lib/popt/COPYING [new file with mode: 0644]
ctdb/lib/popt/README [new file with mode: 0644]
ctdb/lib/popt/findme.c [new file with mode: 0644]
ctdb/lib/popt/findme.h [new file with mode: 0644]
ctdb/lib/popt/libpopt.m4 [new file with mode: 0644]
ctdb/lib/popt/popt.c [new file with mode: 0644]
ctdb/lib/popt/popt.h [new file with mode: 0644]
ctdb/lib/popt/poptconfig.c [new file with mode: 0644]
ctdb/lib/popt/popthelp.c [new file with mode: 0644]
ctdb/lib/popt/poptint.h [new file with mode: 0644]
ctdb/lib/popt/poptparse.c [new file with mode: 0644]
ctdb/lib/popt/samba.m4 [new file with mode: 0644]
ctdb/lib/popt/system.h [new file with mode: 0644]
ctdb/lib/replace/.checker_innocent [new file with mode: 0644]
ctdb/lib/replace/Makefile [new file with mode: 0644]
ctdb/lib/replace/README [new file with mode: 0644]
ctdb/lib/replace/autoconf-2.60.m4 [new file with mode: 0644]
ctdb/lib/replace/configure [new file with mode: 0755]
ctdb/lib/replace/crypt.c [new file with mode: 0644]
ctdb/lib/replace/crypt.m4 [new file with mode: 0644]
ctdb/lib/replace/dlfcn.c [new file with mode: 0644]
ctdb/lib/replace/dlfcn.m4 [new file with mode: 0644]
ctdb/lib/replace/getaddrinfo.c [new file with mode: 0644]
ctdb/lib/replace/getaddrinfo.h [new file with mode: 0644]
ctdb/lib/replace/getifaddrs.c [new file with mode: 0644]
ctdb/lib/replace/hdr_replace.h [new file with mode: 0644]
ctdb/lib/replace/inet_aton.c [new file with mode: 0644]
ctdb/lib/replace/inet_ntoa.c [new file with mode: 0644]
ctdb/lib/replace/inet_ntop.c [new file with mode: 0644]
ctdb/lib/replace/inet_pton.c [new file with mode: 0644]
ctdb/lib/replace/install-sh [new file with mode: 0755]
ctdb/lib/replace/libreplace.m4 [new file with mode: 0644]
ctdb/lib/replace/libreplace_cc.m4 [new file with mode: 0644]
ctdb/lib/replace/libreplace_ld.m4 [new file with mode: 0644]
ctdb/lib/replace/libreplace_macros.m4 [new file with mode: 0644]
ctdb/lib/replace/libreplace_network.m4 [new file with mode: 0644]
ctdb/lib/replace/poll.c [new file with mode: 0644]
ctdb/lib/replace/repdir.m4 [new file with mode: 0644]
ctdb/lib/replace/repdir_getdents.c [new file with mode: 0644]
ctdb/lib/replace/repdir_getdirentries.c [new file with mode: 0644]
ctdb/lib/replace/replace-test.h [new file with mode: 0644]
ctdb/lib/replace/replace-testsuite.h [new file with mode: 0644]
ctdb/lib/replace/replace.c [new file with mode: 0644]
ctdb/lib/replace/replace.h [new file with mode: 0644]
ctdb/lib/replace/snprintf.c [new file with mode: 0644]
ctdb/lib/replace/socket.c [new file with mode: 0644]
ctdb/lib/replace/socketpair.c [new file with mode: 0644]
ctdb/lib/replace/strptime.c [new file with mode: 0644]
ctdb/lib/replace/strptime.m4 [new file with mode: 0644]
ctdb/lib/replace/system/README [new file with mode: 0644]
ctdb/lib/replace/system/aio.h [new file with mode: 0644]
ctdb/lib/replace/system/capability.h [new file with mode: 0644]
ctdb/lib/replace/system/config.m4 [new file with mode: 0644]
ctdb/lib/replace/system/dir.h [new file with mode: 0644]
ctdb/lib/replace/system/filesys.h [new file with mode: 0644]
ctdb/lib/replace/system/glob.h [new file with mode: 0644]
ctdb/lib/replace/system/gssapi.h [new file with mode: 0644]
ctdb/lib/replace/system/iconv.h [new file with mode: 0644]
ctdb/lib/replace/system/kerberos.h [new file with mode: 0644]
ctdb/lib/replace/system/locale.h [new file with mode: 0644]
ctdb/lib/replace/system/network.h [new file with mode: 0644]
ctdb/lib/replace/system/passwd.h [new file with mode: 0644]
ctdb/lib/replace/system/readline.h [new file with mode: 0644]
ctdb/lib/replace/system/select.h [new file with mode: 0644]
ctdb/lib/replace/system/shmem.h [new file with mode: 0644]
ctdb/lib/replace/system/syslog.h [new file with mode: 0644]
ctdb/lib/replace/system/terminal.h [new file with mode: 0644]
ctdb/lib/replace/system/time.h [new file with mode: 0644]
ctdb/lib/replace/system/wait.h [new file with mode: 0644]
ctdb/lib/replace/system/wscript_configure [new file with mode: 0644]
ctdb/lib/replace/test/getifaddrs.c [new file with mode: 0644]
ctdb/lib/replace/test/incoherent_mmap.c [new file with mode: 0644]
ctdb/lib/replace/test/main.c [new file with mode: 0644]
ctdb/lib/replace/test/os2_delete.c [new file with mode: 0644]
ctdb/lib/replace/test/shared_mmap.c [new file with mode: 0644]
ctdb/lib/replace/test/shared_mremap.c [new file with mode: 0644]
ctdb/lib/replace/test/snprintf.c [new file with mode: 0644]
ctdb/lib/replace/test/strptime.c [new file with mode: 0644]
ctdb/lib/replace/test/testsuite.c [new file with mode: 0644]
ctdb/lib/replace/timegm.c [new file with mode: 0644]
ctdb/lib/replace/timegm.m4 [new file with mode: 0644]
ctdb/lib/replace/win32.m4 [new file with mode: 0644]
ctdb/lib/replace/win32_replace.h [new file with mode: 0644]
ctdb/lib/replace/wscript [new file with mode: 0644]
ctdb/lib/replace/xattr.c [new file with mode: 0644]
ctdb/lib/socket_wrapper/config.m4 [new file with mode: 0644]
ctdb/lib/socket_wrapper/socket_wrapper.c [new file with mode: 0644]
ctdb/lib/socket_wrapper/socket_wrapper.h [new file with mode: 0644]
ctdb/lib/socket_wrapper/testsuite.c [new file with mode: 0644]
ctdb/lib/socket_wrapper/wscript [new file with mode: 0644]
ctdb/lib/socket_wrapper/wscript_build [new file with mode: 0644]
ctdb/lib/talloc/ABI/pytalloc-util-2.0.6.sigs [new file with mode: 0644]
ctdb/lib/talloc/ABI/pytalloc-util-2.0.7.sigs [new file with mode: 0644]
ctdb/lib/talloc/ABI/pytalloc-util-2.0.8.sigs [new file with mode: 0644]
ctdb/lib/talloc/ABI/talloc-2.0.2.sigs [new file with mode: 0644]
ctdb/lib/talloc/ABI/talloc-2.0.3.sigs [new file with mode: 0644]
ctdb/lib/talloc/ABI/talloc-2.0.4.sigs [new file with mode: 0644]
ctdb/lib/talloc/ABI/talloc-2.0.5.sigs [new file with mode: 0644]
ctdb/lib/talloc/ABI/talloc-2.0.6.sigs [new file with mode: 0644]
ctdb/lib/talloc/ABI/talloc-2.0.7.sigs [new file with mode: 0644]
ctdb/lib/talloc/ABI/talloc-2.0.8.sigs [new file with mode: 0644]
ctdb/lib/talloc/NEWS [new file with mode: 0644]
ctdb/lib/talloc/compat/talloc_compat1.c [new file with mode: 0644]
ctdb/lib/talloc/compat/talloc_compat1.m4 [new file with mode: 0644]
ctdb/lib/talloc/compat/talloc_compat1.mk [new file with mode: 0644]
ctdb/lib/talloc/doc/context.png [new file with mode: 0644]
ctdb/lib/talloc/doc/context_tree.png [new file with mode: 0644]
ctdb/lib/talloc/doc/mainpage.dox [new file with mode: 0644]
ctdb/lib/talloc/doc/stealing.png [new file with mode: 0644]
ctdb/lib/talloc/doc/tutorial_bestpractices.dox [new file with mode: 0644]
ctdb/lib/talloc/doc/tutorial_context.dox [new file with mode: 0644]
ctdb/lib/talloc/doc/tutorial_debugging.dox [new file with mode: 0644]
ctdb/lib/talloc/doc/tutorial_destructors.dox [new file with mode: 0644]
ctdb/lib/talloc/doc/tutorial_dts.dox [new file with mode: 0644]
ctdb/lib/talloc/doc/tutorial_introduction.dox [new file with mode: 0644]
ctdb/lib/talloc/doc/tutorial_pools.dox [new file with mode: 0644]
ctdb/lib/talloc/doc/tutorial_stealing.dox [new file with mode: 0644]
ctdb/lib/talloc/doxy.config [new file with mode: 0644]
ctdb/lib/talloc/install-sh [new file with mode: 0755]
ctdb/lib/talloc/libtalloc.m4 [new file with mode: 0644]
ctdb/lib/talloc/pytalloc-util.pc.in [new file with mode: 0644]
ctdb/lib/talloc/pytalloc.c [new file with mode: 0644]
ctdb/lib/talloc/pytalloc.h [new file with mode: 0644]
ctdb/lib/talloc/pytalloc_util.c [new file with mode: 0644]
ctdb/lib/talloc/talloc.3.xml [new file with mode: 0644]
ctdb/lib/talloc/talloc.c [new file with mode: 0644]
ctdb/lib/talloc/talloc.h [new file with mode: 0644]
ctdb/lib/talloc/talloc.i [new file with mode: 0644]
ctdb/lib/talloc/talloc.pc.in [new file with mode: 0644]
ctdb/lib/talloc/talloc_guide.txt [new file with mode: 0644]
ctdb/lib/talloc/talloc_testsuite.h [new file with mode: 0644]
ctdb/lib/talloc/testsuite.c [new file with mode: 0644]
ctdb/lib/talloc/testsuite_main.c [new file with mode: 0644]
ctdb/lib/talloc/web/index.html [new file with mode: 0644]
ctdb/lib/talloc/wscript [new file with mode: 0644]
ctdb/lib/tdb/ABI/tdb-1.2.1.sigs [new file with mode: 0644]
ctdb/lib/tdb/ABI/tdb-1.2.10.sigs [new file with mode: 0644]
ctdb/lib/tdb/ABI/tdb-1.2.11.sigs [new file with mode: 0644]
ctdb/lib/tdb/ABI/tdb-1.2.2.sigs [new file with mode: 0644]
ctdb/lib/tdb/ABI/tdb-1.2.3.sigs [new file with mode: 0644]
ctdb/lib/tdb/ABI/tdb-1.2.4.sigs [new file with mode: 0644]
ctdb/lib/tdb/ABI/tdb-1.2.5.sigs [new file with mode: 0644]
ctdb/lib/tdb/ABI/tdb-1.2.6.sigs [new file with mode: 0644]
ctdb/lib/tdb/ABI/tdb-1.2.7.sigs [new file with mode: 0644]
ctdb/lib/tdb/ABI/tdb-1.2.8.sigs [new file with mode: 0644]
ctdb/lib/tdb/ABI/tdb-1.2.9.sigs [new file with mode: 0644]
ctdb/lib/tdb/common/check.c [new file with mode: 0644]
ctdb/lib/tdb/common/dump.c [new file with mode: 0644]
ctdb/lib/tdb/common/error.c [new file with mode: 0644]
ctdb/lib/tdb/common/freelist.c [new file with mode: 0644]
ctdb/lib/tdb/common/freelistcheck.c [new file with mode: 0644]
ctdb/lib/tdb/common/hash.c [new file with mode: 0644]
ctdb/lib/tdb/common/io.c [new file with mode: 0644]
ctdb/lib/tdb/common/lock.c [new file with mode: 0644]
ctdb/lib/tdb/common/open.c [new file with mode: 0644]
ctdb/lib/tdb/common/rescue.c [new file with mode: 0644]
ctdb/lib/tdb/common/summary.c [new file with mode: 0644]
ctdb/lib/tdb/common/tdb.c [new file with mode: 0644]
ctdb/lib/tdb/common/tdb_private.h [new file with mode: 0644]
ctdb/lib/tdb/common/transaction.c [new file with mode: 0644]
ctdb/lib/tdb/common/traverse.c [new file with mode: 0644]
ctdb/lib/tdb/docs/README [new file with mode: 0644]
ctdb/lib/tdb/docs/mainpage.dox [new file with mode: 0644]
ctdb/lib/tdb/docs/tdb.magic [new file with mode: 0644]
ctdb/lib/tdb/docs/tracing.txt [new file with mode: 0644]
ctdb/lib/tdb/doxy.config [new file with mode: 0644]
ctdb/lib/tdb/include/tdb.h [new file with mode: 0644]
ctdb/lib/tdb/libtdb.m4 [new file with mode: 0644]
ctdb/lib/tdb/manpages/tdbbackup.8.xml [new file with mode: 0644]
ctdb/lib/tdb/manpages/tdbdump.8.xml [new file with mode: 0644]
ctdb/lib/tdb/manpages/tdbrestore.8.xml [new file with mode: 0644]
ctdb/lib/tdb/manpages/tdbtool.8.xml [new file with mode: 0644]
ctdb/lib/tdb/pytdb.c [new file with mode: 0644]
ctdb/lib/tdb/python/tdbdump.py [new file with mode: 0644]
ctdb/lib/tdb/python/tests/simple.py [new file with mode: 0644]
ctdb/lib/tdb/tdb.pc.in [new file with mode: 0644]
ctdb/lib/tdb/test/external-agent.c [new file with mode: 0644]
ctdb/lib/tdb/test/external-agent.h [new file with mode: 0644]
ctdb/lib/tdb/test/jenkins-be-hash.tdb [new file with mode: 0644]
ctdb/lib/tdb/test/jenkins-le-hash.tdb [new file with mode: 0644]
ctdb/lib/tdb/test/lock-tracking.c [new file with mode: 0644]
ctdb/lib/tdb/test/lock-tracking.h [new file with mode: 0644]
ctdb/lib/tdb/test/logging.c [new file with mode: 0644]
ctdb/lib/tdb/test/logging.h [new file with mode: 0644]
ctdb/lib/tdb/test/old-nohash-be.tdb [new file with mode: 0644]
ctdb/lib/tdb/test/old-nohash-le.tdb [new file with mode: 0644]
ctdb/lib/tdb/test/run-3G-file.c [new file with mode: 0644]
ctdb/lib/tdb/test/run-bad-tdb-header.c [new file with mode: 0644]
ctdb/lib/tdb/test/run-check.c [new file with mode: 0644]
ctdb/lib/tdb/test/run-corrupt.c [new file with mode: 0644]
ctdb/lib/tdb/test/run-die-during-transaction.c [new file with mode: 0644]
ctdb/lib/tdb/test/run-endian.c [new file with mode: 0644]
ctdb/lib/tdb/test/run-incompatible.c [new file with mode: 0644]
ctdb/lib/tdb/test/run-nested-transactions.c [new file with mode: 0644]
ctdb/lib/tdb/test/run-nested-traverse.c [new file with mode: 0644]
ctdb/lib/tdb/test/run-no-lock-during-traverse.c [new file with mode: 0644]
ctdb/lib/tdb/test/run-oldhash.c [new file with mode: 0644]
ctdb/lib/tdb/test/run-open-during-transaction.c [new file with mode: 0644]
ctdb/lib/tdb/test/run-readonly-check.c [new file with mode: 0644]
ctdb/lib/tdb/test/run-rescue-find_entry.c [new file with mode: 0644]
ctdb/lib/tdb/test/run-rescue.c [new file with mode: 0644]
ctdb/lib/tdb/test/run-rwlock-check.c [new file with mode: 0644]
ctdb/lib/tdb/test/run-summary.c [new file with mode: 0644]
ctdb/lib/tdb/test/run-transaction-expand.c [new file with mode: 0644]
ctdb/lib/tdb/test/run-traverse-in-transaction.c [new file with mode: 0644]
ctdb/lib/tdb/test/run-wronghash-fail.c [new file with mode: 0644]
ctdb/lib/tdb/test/run-zero-append.c [new file with mode: 0644]
ctdb/lib/tdb/test/run.c [new file with mode: 0644]
ctdb/lib/tdb/test/rwlock-be.tdb [new file with mode: 0644]
ctdb/lib/tdb/test/rwlock-le.tdb [new file with mode: 0644]
ctdb/lib/tdb/test/tap-interface.h [new file with mode: 0644]
ctdb/lib/tdb/test/tap-to-subunit.h [new file with mode: 0644]
ctdb/lib/tdb/test/tdb.corrupt [new file with mode: 0644]
ctdb/lib/tdb/tools/tdbbackup.c [new file with mode: 0644]
ctdb/lib/tdb/tools/tdbdump.c [new file with mode: 0644]
ctdb/lib/tdb/tools/tdbrestore.c [new file with mode: 0644]
ctdb/lib/tdb/tools/tdbtest.c [new file with mode: 0644]
ctdb/lib/tdb/tools/tdbtool.c [new file with mode: 0644]
ctdb/lib/tdb/tools/tdbtorture.c [new file with mode: 0644]
ctdb/lib/tdb/web/index.html [new file with mode: 0644]
ctdb/lib/tdb/wscript [new file with mode: 0644]
ctdb/lib/tevent/ABI/tevent-0.9.10.sigs [new file with mode: 0644]
ctdb/lib/tevent/ABI/tevent-0.9.11.sigs [new file with mode: 0644]
ctdb/lib/tevent/ABI/tevent-0.9.12.sigs [new file with mode: 0644]
ctdb/lib/tevent/ABI/tevent-0.9.13.sigs [new file with mode: 0644]
ctdb/lib/tevent/ABI/tevent-0.9.14.sigs [new file with mode: 0644]
ctdb/lib/tevent/ABI/tevent-0.9.15.sigs [new file with mode: 0644]
ctdb/lib/tevent/ABI/tevent-0.9.16.sigs [new file with mode: 0644]
ctdb/lib/tevent/ABI/tevent-0.9.17.sigs [new file with mode: 0644]
ctdb/lib/tevent/ABI/tevent-0.9.18.sigs [new file with mode: 0644]
ctdb/lib/tevent/ABI/tevent-0.9.9.sigs [new file with mode: 0644]
ctdb/lib/tevent/bindings.py [new file with mode: 0644]
ctdb/lib/tevent/doc/mainpage.dox [new file with mode: 0644]
ctdb/lib/tevent/doc/tutorials.dox [new file with mode: 0644]
ctdb/lib/tevent/doxy.config [new file with mode: 0644]
ctdb/lib/tevent/libtevent.m4 [new file with mode: 0644]
ctdb/lib/tevent/pytevent.c [new file with mode: 0644]
ctdb/lib/tevent/release-script.sh [new file with mode: 0755]
ctdb/lib/tevent/testsuite.c [new file with mode: 0644]
ctdb/lib/tevent/tevent.c [new file with mode: 0644]
ctdb/lib/tevent/tevent.h [new file with mode: 0644]
ctdb/lib/tevent/tevent.pc.in [new file with mode: 0644]
ctdb/lib/tevent/tevent.py [new file with mode: 0644]
ctdb/lib/tevent/tevent_debug.c [new file with mode: 0644]
ctdb/lib/tevent/tevent_epoll.c [new file with mode: 0644]
ctdb/lib/tevent/tevent_fd.c [new file with mode: 0644]
ctdb/lib/tevent/tevent_immediate.c [new file with mode: 0644]
ctdb/lib/tevent/tevent_internal.h [new file with mode: 0644]
ctdb/lib/tevent/tevent_liboop.c [new file with mode: 0644]
ctdb/lib/tevent/tevent_poll.c [new file with mode: 0644]
ctdb/lib/tevent/tevent_queue.c [new file with mode: 0644]
ctdb/lib/tevent/tevent_req.c [new file with mode: 0644]
ctdb/lib/tevent/tevent_select.c [new file with mode: 0644]
ctdb/lib/tevent/tevent_signal.c [new file with mode: 0644]
ctdb/lib/tevent/tevent_standard.c [new file with mode: 0644]
ctdb/lib/tevent/tevent_timed.c [new file with mode: 0644]
ctdb/lib/tevent/tevent_util.c [new file with mode: 0644]
ctdb/lib/tevent/tevent_util.h [new file with mode: 0644]
ctdb/lib/tevent/tevent_wakeup.c [new file with mode: 0644]
ctdb/lib/tevent/wscript [new file with mode: 0755]
ctdb/lib/util/db_wrap.c [new file with mode: 0644]
ctdb/lib/util/db_wrap.h [new file with mode: 0644]
ctdb/lib/util/debug.c [new file with mode: 0644]
ctdb/lib/util/debug.h [new file with mode: 0644]
ctdb/lib/util/dlinklist.h [new file with mode: 0644]
ctdb/lib/util/fault.c [new file with mode: 0644]
ctdb/lib/util/fault.m4 [new file with mode: 0644]
ctdb/lib/util/idtree.c [new file with mode: 0644]
ctdb/lib/util/signal.c [new file with mode: 0644]
ctdb/lib/util/signal.m4 [new file with mode: 0644]
ctdb/lib/util/strlist.c [new file with mode: 0644]
ctdb/lib/util/substitute.c [new file with mode: 0644]
ctdb/lib/util/util.c [new file with mode: 0644]
ctdb/lib/util/util.h [new file with mode: 0644]
ctdb/lib/util/util_file.c [new file with mode: 0644]
ctdb/lib/util/util_time.c [new file with mode: 0644]
ctdb/packaging/RPM/ctdb.spec.in [new file with mode: 0644]
ctdb/packaging/RPM/makerpms.sh [new file with mode: 0755]
ctdb/packaging/maketarball.sh [new file with mode: 0755]
ctdb/packaging/mkversion.sh [new file with mode: 0755]
ctdb/server/ctdb_banning.c [new file with mode: 0644]
ctdb/server/ctdb_call.c [new file with mode: 0644]
ctdb/server/ctdb_control.c [new file with mode: 0644]
ctdb/server/ctdb_daemon.c [new file with mode: 0644]
ctdb/server/ctdb_freeze.c [new file with mode: 0644]
ctdb/server/ctdb_keepalive.c [new file with mode: 0644]
ctdb/server/ctdb_lock.c [new file with mode: 0644]
ctdb/server/ctdb_lock_helper.c [new file with mode: 0644]
ctdb/server/ctdb_logging.c [new file with mode: 0644]
ctdb/server/ctdb_ltdb_server.c [new file with mode: 0644]
ctdb/server/ctdb_monitor.c [new file with mode: 0644]
ctdb/server/ctdb_persistent.c [new file with mode: 0644]
ctdb/server/ctdb_recover.c [new file with mode: 0644]
ctdb/server/ctdb_recoverd.c [new file with mode: 0644]
ctdb/server/ctdb_server.c [new file with mode: 0644]
ctdb/server/ctdb_serverids.c [new file with mode: 0644]
ctdb/server/ctdb_statistics.c [new file with mode: 0644]
ctdb/server/ctdb_takeover.c [new file with mode: 0644]
ctdb/server/ctdb_traverse.c [new file with mode: 0644]
ctdb/server/ctdb_tunables.c [new file with mode: 0644]
ctdb/server/ctdb_update_record.c [new file with mode: 0644]
ctdb/server/ctdb_uptime.c [new file with mode: 0644]
ctdb/server/ctdb_vacuum.c [new file with mode: 0644]
ctdb/server/ctdbd.c [new file with mode: 0644]
ctdb/server/eventscript.c [new file with mode: 0644]
ctdb/tcp/ctdb_tcp.h [new file with mode: 0644]
ctdb/tcp/tcp_connect.c [new file with mode: 0644]
ctdb/tcp/tcp_init.c [new file with mode: 0644]
ctdb/tcp/tcp_io.c [new file with mode: 0644]
ctdb/tests/INSTALL [new file with mode: 0755]
ctdb/tests/README [new file with mode: 0644]
ctdb/tests/TODO [new file with mode: 0644]
ctdb/tests/complex/11_ctdb_delip_removes_ip.sh [new file with mode: 0755]
ctdb/tests/complex/31_nfs_tickle.sh [new file with mode: 0755]
ctdb/tests/complex/32_cifs_tickle.sh [new file with mode: 0755]
ctdb/tests/complex/33_gratuitous_arp.sh [new file with mode: 0755]
ctdb/tests/complex/41_failover_ping_discrete.sh [new file with mode: 0755]
ctdb/tests/complex/42_failover_ssh_hostname.sh [new file with mode: 0755]
ctdb/tests/complex/43_failover_nfs_basic.sh [new file with mode: 0755]
ctdb/tests/complex/44_failover_nfs_oneway.sh [new file with mode: 0755]
ctdb/tests/complex/45_failover_nfs_kill.sh [new file with mode: 0755]
ctdb/tests/complex/README [new file with mode: 0644]
ctdb/tests/complex/scripts/local.bash [new file with mode: 0644]
ctdb/tests/events.d/00.test [new file with mode: 0755]
ctdb/tests/eventscripts/00.ctdb.init.001.sh [new file with mode: 0755]
ctdb/tests/eventscripts/00.ctdb.init.002.sh [new file with mode: 0755]
ctdb/tests/eventscripts/00.ctdb.init.003.sh [new file with mode: 0755]
ctdb/tests/eventscripts/00.ctdb.init.004.sh [new file with mode: 0755]
ctdb/tests/eventscripts/00.ctdb.init.005.sh [new file with mode: 0755]
ctdb/tests/eventscripts/00.ctdb.init.006.sh [new file with mode: 0755]
ctdb/tests/eventscripts/00.ctdb.init.007.sh [new file with mode: 0755]
ctdb/tests/eventscripts/00.ctdb.init.008.sh [new file with mode: 0755]
ctdb/tests/eventscripts/00.ctdb.init.021.sh [new file with mode: 0755]
ctdb/tests/eventscripts/00.ctdb.init.022.sh [new file with mode: 0755]
ctdb/tests/eventscripts/00.ctdb.init.023.sh [new file with mode: 0755]
ctdb/tests/eventscripts/00.ctdb.monitor.001.sh [new file with mode: 0755]
ctdb/tests/eventscripts/00.ctdb.monitor.002.sh [new file with mode: 0755]
ctdb/tests/eventscripts/00.ctdb.monitor.003.sh [new file with mode: 0755]
ctdb/tests/eventscripts/00.ctdb.monitor.004.sh [new file with mode: 0755]
ctdb/tests/eventscripts/00.ctdb.monitor.005.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.init.001.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.init.002.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.monitor.001.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.monitor.002.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.monitor.003.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.monitor.004.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.monitor.005.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.monitor.006.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.monitor.007.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.monitor.008.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.monitor.009.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.monitor.010.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.monitor.011.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.monitor.012.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.monitor.013.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.monitor.014.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.monitor.015.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.monitor.016.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.multi.001.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.releaseip.001.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.releaseip.002.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.releaseip.010.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.releaseip.011.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.startup.001.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.startup.002.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.takeip.001.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.takeip.002.sh [new file with mode: 0755]
ctdb/tests/eventscripts/10.interface.takeip.003.sh [new file with mode: 0755]
ctdb/tests/eventscripts/13.per_ip_routing.001.sh [new file with mode: 0755]
ctdb/tests/eventscripts/13.per_ip_routing.002.sh [new file with mode: 0755]
ctdb/tests/eventscripts/13.per_ip_routing.003.sh [new file with mode: 0755]
ctdb/tests/eventscripts/13.per_ip_routing.004.sh [new file with mode: 0755]
ctdb/tests/eventscripts/13.per_ip_routing.005.sh [new file with mode: 0755]
ctdb/tests/eventscripts/13.per_ip_routing.006.sh [new file with mode: 0755]
ctdb/tests/eventscripts/13.per_ip_routing.007.sh [new file with mode: 0755]
ctdb/tests/eventscripts/13.per_ip_routing.008.sh [new file with mode: 0755]
ctdb/tests/eventscripts/13.per_ip_routing.009.sh [new file with mode: 0755]
ctdb/tests/eventscripts/13.per_ip_routing.010.sh [new file with mode: 0755]
ctdb/tests/eventscripts/13.per_ip_routing.011.sh [new file with mode: 0755]
ctdb/tests/eventscripts/13.per_ip_routing.012.sh [new file with mode: 0755]
ctdb/tests/eventscripts/13.per_ip_routing.013.sh [new file with mode: 0755]
ctdb/tests/eventscripts/13.per_ip_routing.014.sh [new file with mode: 0755]
ctdb/tests/eventscripts/13.per_ip_routing.015.sh [new file with mode: 0755]
ctdb/tests/eventscripts/13.per_ip_routing.016.sh [new file with mode: 0755]
ctdb/tests/eventscripts/13.per_ip_routing.017.sh [new file with mode: 0755]
ctdb/tests/eventscripts/13.per_ip_routing.018.sh [new file with mode: 0755]
ctdb/tests/eventscripts/13.per_ip_routing.019.sh [new file with mode: 0755]
ctdb/tests/eventscripts/20.multipathd.monitor.001.sh [new file with mode: 0755]
ctdb/tests/eventscripts/20.multipathd.monitor.002.sh [new file with mode: 0755]
ctdb/tests/eventscripts/20.multipathd.monitor.003.sh [new file with mode: 0755]
ctdb/tests/eventscripts/20.multipathd.monitor.004.sh [new file with mode: 0755]
ctdb/tests/eventscripts/40.vsftpd.monitor.001.sh [new file with mode: 0755]
ctdb/tests/eventscripts/41.httpd.monitor.001.sh [new file with mode: 0755]
ctdb/tests/eventscripts/49.winbind.monitor.001.sh [new file with mode: 0755]
ctdb/tests/eventscripts/49.winbind.monitor.050.sh [new file with mode: 0755]
ctdb/tests/eventscripts/49.winbind.monitor.051.sh [new file with mode: 0755]
ctdb/tests/eventscripts/49.winbind.monitor.101.sh [new file with mode: 0755]
ctdb/tests/eventscripts/49.winbind.monitor.102.sh [new file with mode: 0755]
ctdb/tests/eventscripts/50.samba.monitor.001.sh [new file with mode: 0755]
ctdb/tests/eventscripts/50.samba.monitor.050.sh [new file with mode: 0755]
ctdb/tests/eventscripts/50.samba.monitor.051.sh [new file with mode: 0755]
ctdb/tests/eventscripts/50.samba.monitor.101.sh [new file with mode: 0755]
ctdb/tests/eventscripts/50.samba.monitor.103.sh [new file with mode: 0755]
ctdb/tests/eventscripts/50.samba.monitor.104.sh [new file with mode: 0755]
ctdb/tests/eventscripts/50.samba.monitor.105.sh [new file with mode: 0755]
ctdb/tests/eventscripts/50.samba.monitor.106.sh [new file with mode: 0755]
ctdb/tests/eventscripts/50.samba.monitor.107.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.ganesha.monitor.101.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.ganesha.monitor.131.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.ganesha.monitor.141.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.monitor.001.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.monitor.101.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.monitor.102.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.monitor.103.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.monitor.104.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.monitor.111.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.monitor.112.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.monitor.113.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.monitor.114.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.monitor.121.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.monitor.122.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.monitor.131.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.monitor.132.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.monitor.141.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.monitor.142.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.monitor.151.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.monitor.152.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.monitor.153.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.monitor.161.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.monitor.162.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.multi.001.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.multi.002.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.multi.003.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.multi.004.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.multi.005.sh [new file with mode: 0755]
ctdb/tests/eventscripts/60.nfs.multi.006.sh [new file with mode: 0755]
ctdb/tests/eventscripts/README [new file with mode: 0644]
ctdb/tests/eventscripts/etc-ctdb/events.d [new symlink]
ctdb/tests/eventscripts/etc-ctdb/functions [new symlink]
ctdb/tests/eventscripts/etc-ctdb/nfs-rpc-checks.d [new symlink]
ctdb/tests/eventscripts/etc-ctdb/public_addresses [new file with mode: 0644]
ctdb/tests/eventscripts/etc-ctdb/rc.local [new file with mode: 0755]
ctdb/tests/eventscripts/etc-ctdb/statd-callout [new file with mode: 0755]
ctdb/tests/eventscripts/etc/init.d/nfs [new file with mode: 0755]
ctdb/tests/eventscripts/etc/init.d/nfslock [new file with mode: 0755]
ctdb/tests/eventscripts/etc/samba/smb.conf [new file with mode: 0644]
ctdb/tests/eventscripts/etc/sysconfig/ctdb [new file with mode: 0644]
ctdb/tests/eventscripts/etc/sysconfig/nfs [new file with mode: 0644]
ctdb/tests/eventscripts/scripts/local.sh [new file with mode: 0644]
ctdb/tests/eventscripts/stubs/ctdb [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/date [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/ethtool [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/exportfs [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/free [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/ip [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/iptables [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/kill [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/killall [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/multipath [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/net [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/netstat [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/nmap [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/pidof [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/pkill [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/ps [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/rpc.lockd [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/rpc.mountd [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/rpc.rquotad [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/rpc.statd [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/rpcinfo [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/service [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/sleep [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/tdbdump [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/tdbtool [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/testparm [new file with mode: 0755]
ctdb/tests/eventscripts/stubs/wbinfo [new file with mode: 0755]
ctdb/tests/onnode/0001.sh [new file with mode: 0755]
ctdb/tests/onnode/0002.sh [new file with mode: 0755]
ctdb/tests/onnode/0003.sh [new file with mode: 0755]
ctdb/tests/onnode/0004.sh [new file with mode: 0755]
ctdb/tests/onnode/0005.sh [new file with mode: 0755]
ctdb/tests/onnode/0006.sh [new file with mode: 0755]
ctdb/tests/onnode/0070.sh [new file with mode: 0755]
ctdb/tests/onnode/0071.sh [new file with mode: 0755]
ctdb/tests/onnode/0072.sh [new file with mode: 0755]
ctdb/tests/onnode/0075.sh [new file with mode: 0755]
ctdb/tests/onnode/0080.sh [new file with mode: 0755]
ctdb/tests/onnode/0081.sh [new file with mode: 0755]
ctdb/tests/onnode/0090.sh [new file with mode: 0755]
ctdb/tests/onnode/0091.sh [new file with mode: 0755]
ctdb/tests/onnode/README [new file with mode: 0644]
ctdb/tests/onnode/nodes [new file with mode: 0644]
ctdb/tests/onnode/scripts/local.sh [new file with mode: 0644]
ctdb/tests/onnode/stubs/ctdb [new file with mode: 0755]
ctdb/tests/onnode/stubs/onnode-buggy-001 [new file with mode: 0755]
ctdb/tests/onnode/stubs/ssh [new file with mode: 0755]
ctdb/tests/recover.sh [new file with mode: 0755]
ctdb/tests/run_cluster_tests.sh [new symlink]
ctdb/tests/run_tests.sh [new file with mode: 0755]
ctdb/tests/scripts/common.sh [new file with mode: 0644]
ctdb/tests/scripts/integration.bash [new file with mode: 0644]
ctdb/tests/scripts/run_tests [new file with mode: 0755]
ctdb/tests/scripts/test_wrap [new file with mode: 0755]
ctdb/tests/scripts/unit.sh [new file with mode: 0644]
ctdb/tests/simple/00_ctdb_init.sh [new file with mode: 0755]
ctdb/tests/simple/00_ctdb_onnode.sh [new file with mode: 0755]
ctdb/tests/simple/01_ctdb_version.sh [new file with mode: 0755]
ctdb/tests/simple/02_ctdb_listvars.sh [new file with mode: 0755]
ctdb/tests/simple/03_ctdb_getvar.sh [new file with mode: 0755]
ctdb/tests/simple/04_ctdb_setvar.sh [new file with mode: 0755]
ctdb/tests/simple/05_ctdb_listnodes.sh [new file with mode: 0755]
ctdb/tests/simple/06_ctdb_getpid.sh [new file with mode: 0755]
ctdb/tests/simple/07_ctdb_process_exists.sh [new file with mode: 0755]
ctdb/tests/simple/08_ctdb_isnotrecmaster.sh [new file with mode: 0755]
ctdb/tests/simple/09_ctdb_ping.sh [new file with mode: 0755]
ctdb/tests/simple/11_ctdb_ip.sh [new file with mode: 0755]
ctdb/tests/simple/12_ctdb_getdebug.sh [new file with mode: 0755]
ctdb/tests/simple/13_ctdb_setdebug.sh [new file with mode: 0755]
ctdb/tests/simple/14_ctdb_statistics.sh [new file with mode: 0755]
ctdb/tests/simple/15_ctdb_statisticsreset.sh [new file with mode: 0755]
ctdb/tests/simple/16_ctdb_config_add_ip.sh [new file with mode: 0755]
ctdb/tests/simple/17_ctdb_config_delete_ip.sh [new file with mode: 0755]
ctdb/tests/simple/18_ctdb_reloadips.sh [new file with mode: 0755]
ctdb/tests/simple/20_delip_iface_gc.sh [new file with mode: 0755]
ctdb/tests/simple/23_ctdb_moveip.sh [new file with mode: 0755]
ctdb/tests/simple/24_ctdb_getdbmap.sh [new file with mode: 0755]
ctdb/tests/simple/25_dumpmemory.sh [new file with mode: 0755]
ctdb/tests/simple/26_ctdb_config_check_error_on_unreachable_ctdb.sh [new file with mode: 0755]
ctdb/tests/simple/31_ctdb_disable.sh [new file with mode: 0755]
ctdb/tests/simple/32_ctdb_enable.sh [new file with mode: 0755]
ctdb/tests/simple/41_ctdb_stop.sh [new file with mode: 0755]
ctdb/tests/simple/42_ctdb_continue.sh [new file with mode: 0755]
ctdb/tests/simple/43_stop_recmaster_yield.sh [new file with mode: 0755]
ctdb/tests/simple/51_ctdb_bench.sh [new file with mode: 0755]
ctdb/tests/simple/52_ctdb_fetch.sh [new file with mode: 0755]
ctdb/tests/simple/53_ctdb_transaction.sh [new file with mode: 0755]
ctdb/tests/simple/54_ctdb_transaction_recovery.sh [new file with mode: 0755]
ctdb/tests/simple/60_recoverd_missing_ip.sh [new file with mode: 0755]
ctdb/tests/simple/70_recoverpdbbyseqnum.sh [new file with mode: 0755]
ctdb/tests/simple/71_ctdb_wipedb.sh [new file with mode: 0755]
ctdb/tests/simple/72_update_record_persistent.sh [new file with mode: 0755]
ctdb/tests/simple/73_tunable_NoIPTakeover.sh [new file with mode: 0755]
ctdb/tests/simple/75_readonly_records_basic.sh [new file with mode: 0755]
ctdb/tests/simple/76_ctdb_pdb_recovery.sh [new file with mode: 0755]
ctdb/tests/simple/77_ctdb_db_recovery.sh [new file with mode: 0755]
ctdb/tests/simple/80_ctdb_traverse.sh [new file with mode: 0755]
ctdb/tests/simple/99_daemons_shutdown.sh [new file with mode: 0755]
ctdb/tests/simple/README [new file with mode: 0644]
ctdb/tests/src/ctdb_bench.c [new file with mode: 0644]
ctdb/tests/src/ctdb_fetch.c [new file with mode: 0644]
ctdb/tests/src/ctdb_fetch_one.c [new file with mode: 0644]
ctdb/tests/src/ctdb_fetch_readonly_loop.c [new file with mode: 0644]
ctdb/tests/src/ctdb_fetch_readonly_once.c [new file with mode: 0644]
ctdb/tests/src/ctdb_functest.c [new file with mode: 0644]
ctdb/tests/src/ctdb_lock_tdb.c [new file with mode: 0644]
ctdb/tests/src/ctdb_persistent.c [new file with mode: 0644]
ctdb/tests/src/ctdb_porting_tests.c [new file with mode: 0644]
ctdb/tests/src/ctdb_randrec.c [new file with mode: 0644]
ctdb/tests/src/ctdb_store.c [new file with mode: 0644]
ctdb/tests/src/ctdb_takeover_tests.c [new file with mode: 0644]
ctdb/tests/src/ctdb_test.c [new file with mode: 0644]
ctdb/tests/src/ctdb_test_stubs.c [new file with mode: 0644]
ctdb/tests/src/ctdb_trackingdb_test.c [new file with mode: 0644]
ctdb/tests/src/ctdb_transaction.c [new file with mode: 0644]
ctdb/tests/src/ctdb_traverse.c [new file with mode: 0644]
ctdb/tests/src/ctdb_update_record.c [new file with mode: 0644]
ctdb/tests/src/ctdb_update_record_persistent.c [new file with mode: 0644]
ctdb/tests/src/ctdbd_test.c [new file with mode: 0644]
ctdb/tests/src/rb_perftest.c [new file with mode: 0644]
ctdb/tests/src/rb_test.c [new file with mode: 0644]
ctdb/tests/takeover/README [new file with mode: 0644]
ctdb/tests/takeover/det.001.sh [new file with mode: 0755]
ctdb/tests/takeover/det.002.sh [new file with mode: 0755]
ctdb/tests/takeover/det.003.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.001.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.002.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.003.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.004.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.005.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.006.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.007.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.008.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.009.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.010.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.011.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.012.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.013.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.014.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.015.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.016.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.017.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.018.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.019.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.020.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.021.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.022.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.023.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.024.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.025.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.026.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.027.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.028.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.029.sh [new file with mode: 0755]
ctdb/tests/takeover/lcp2.030.sh [new file with mode: 0755]
ctdb/tests/takeover/nondet.001.sh [new file with mode: 0755]
ctdb/tests/takeover/nondet.002.sh [new file with mode: 0755]
ctdb/tests/takeover/nondet.003.sh [new file with mode: 0755]
ctdb/tests/takeover/scripts/local.sh [new file with mode: 0644]
ctdb/tests/takeover/simulation/README [new file with mode: 0644]
ctdb/tests/takeover/simulation/ctdb_takeover.py [new file with mode: 0755]
ctdb/tests/takeover/simulation/hey_jude.py [new file with mode: 0755]
ctdb/tests/takeover/simulation/ip_groups1.py [new file with mode: 0755]
ctdb/tests/takeover/simulation/ip_groups2.py [new file with mode: 0755]
ctdb/tests/takeover/simulation/ip_groups3.py [new file with mode: 0755]
ctdb/tests/takeover/simulation/ip_groups4.py [new file with mode: 0755]
ctdb/tests/takeover/simulation/ip_groups5.py [new file with mode: 0755]
ctdb/tests/takeover/simulation/mgmt_simple.py [new file with mode: 0755]
ctdb/tests/takeover/simulation/node_group.py [new file with mode: 0755]
ctdb/tests/takeover/simulation/node_group_extra.py [new file with mode: 0755]
ctdb/tests/takeover/simulation/node_group_simple.py [new file with mode: 0755]
ctdb/tests/takeover/simulation/nondet_path_01.py [new file with mode: 0755]
ctdb/tests/test_check_tcp_ports.sh [new file with mode: 0755]
ctdb/tests/tool/README [new file with mode: 0644]
ctdb/tests/tool/func.parse_nodestring.001.sh [new file with mode: 0755]
ctdb/tests/tool/func.parse_nodestring.002.sh [new file with mode: 0755]
ctdb/tests/tool/func.parse_nodestring.003.sh [new file with mode: 0755]
ctdb/tests/tool/scripts/local.sh [new file with mode: 0644]
ctdb/tests/tool/stubby.getcapabilities.001.sh [new file with mode: 0755]
ctdb/tests/tool/stubby.getcapabilities.002.sh [new file with mode: 0755]
ctdb/tests/tool/stubby.getcapabilities.003.sh [new file with mode: 0755]
ctdb/tests/tool/stubby.lvs.001.sh [new file with mode: 0755]
ctdb/tests/tool/stubby.lvsmaster.001.sh [new file with mode: 0755]
ctdb/tests/tool/stubby.lvsmaster.002.sh [new file with mode: 0755]
ctdb/tests/tool/stubby.natgwlist.001.sh [new file with mode: 0755]
ctdb/tests/tool/stubby.natgwlist.002.sh [new file with mode: 0755]
ctdb/tests/tool/stubby.natgwlist.003.sh [new file with mode: 0755]
ctdb/tests/tool/stubby.natgwlist.004.sh [new file with mode: 0755]
ctdb/tests/tool/stubby.natgwlist.005.sh [new file with mode: 0755]
ctdb/tests/tool/stubby.nodestatus.001.sh [new file with mode: 0755]
ctdb/tests/tool/stubby.nodestatus.002.sh [new file with mode: 0755]
ctdb/tests/tool/stubby.nodestatus.003.sh [new file with mode: 0755]
ctdb/tests/tool/stubby.nodestatus.004.sh [new file with mode: 0755]
ctdb/tests/tool/stubby.nodestatus.005.sh [new file with mode: 0755]
ctdb/tests/tool/stubby.status.001.sh [new file with mode: 0755]
ctdb/tests/tool/stubby.status.002.sh [new file with mode: 0755]
ctdb/tests/tool/testcases/stubby.nodestatus.005.sh [new file with mode: 0755]
ctdb/tools/ctdb.c [new file with mode: 0644]
ctdb/tools/ctdb_diagnostics [new file with mode: 0755]
ctdb/tools/ctdb_vacuum.c [new file with mode: 0644]
ctdb/tools/ltdbtool.c [new file with mode: 0644]
ctdb/tools/onnode [new file with mode: 0755]
ctdb/utils/nagios/README [new file with mode: 0644]
ctdb/utils/nagios/check_ctdb [new file with mode: 0644]
ctdb/utils/ping_pong/ping_pong.c [new file with mode: 0644]
ctdb/utils/pmda/Install [new file with mode: 0644]
ctdb/utils/pmda/README [new file with mode: 0644]
ctdb/utils/pmda/Remove [new file with mode: 0644]
ctdb/utils/pmda/config.m4 [new file with mode: 0644]
ctdb/utils/pmda/domain.h [new file with mode: 0644]
ctdb/utils/pmda/help [new file with mode: 0644]
ctdb/utils/pmda/pmda_ctdb.c [new file with mode: 0644]
ctdb/utils/pmda/pmns [new file with mode: 0644]
ctdb/utils/pmda/root [new file with mode: 0644]
ctdb/utils/scsi_io/scsi_io.c [new file with mode: 0644]
ctdb/utils/smnotify/smnotify.c [new file with mode: 0644]
ctdb/utils/smnotify/smnotify.x [new file with mode: 0644]
ctdb/web/bar1.jpg [new file with mode: 0644]
ctdb/web/building.html [new file with mode: 0644]
ctdb/web/clamd.html [new file with mode: 0644]
ctdb/web/configuring.html [new file with mode: 0644]
ctdb/web/ctdblogo.png [new file with mode: 0644]
ctdb/web/documentation.html [new file with mode: 0644]
ctdb/web/download.html [new file with mode: 0644]
ctdb/web/footer.html [new file with mode: 0644]
ctdb/web/ftp.html [new file with mode: 0644]
ctdb/web/header.html [new file with mode: 0644]
ctdb/web/index.html [new file with mode: 0644]
ctdb/web/iscsi.html [new file with mode: 0644]
ctdb/web/nfs.html [new file with mode: 0644]
ctdb/web/prerequisites.html [new file with mode: 0644]
ctdb/web/samba.html [new file with mode: 0644]
ctdb/web/testing.html [new file with mode: 0644]

diff --git a/ctdb/.bzrignore b/ctdb/.bzrignore
new file mode 100644 (file)
index 0000000..6560aa4
--- /dev/null
@@ -0,0 +1,24 @@
+config.status
+Makefile
+bin
+config.log
+push.sh
+ctdb_test
+config.cache
+configure
+config.h
+config.h.in
+nodes-ssh.txt
+TAGS
+ctdb-0
+ctdb-1
+ctdb-2
+ctdb-3
+nodes.txt
+TAGS
+web/packages
+rec.lock
+test.db
+sock.1
+sock.3
+sock.4
diff --git a/ctdb/.gitignore b/ctdb/.gitignore
new file mode 100644 (file)
index 0000000..9e8c581
--- /dev/null
@@ -0,0 +1,34 @@
+*.[oa]
+*~
+*.swp
+config.status
+configure
+ctdb.pc
+publish*.sh
+push*.sh
+web/packages
+TAGS
+tags
+bin
+Makefile
+config.h
+config.h.in
+config.log
+utils/smnotify/gen_smnotify.c
+utils/smnotify/gen_xdr.c
+utils/smnotify/smnotify.h
+nodes.txt
+public_addresses.txt
+rec.lock
+test.db
+tests/bin
+tests/events.d/00.ctdb_test_trigger
+tests/var
+tests/takeover/ctdb_takeover.pyc
+tests/eventscripts/var
+tests/eventscripts/etc/iproute2
+tests/eventscripts/etc-ctdb/policy_routing
+include/ctdb_version.h
+packaging/RPM/ctdb.spec
+doc/*.[1-7]
+doc/*.[1-7].html
diff --git a/ctdb/COPYING b/ctdb/COPYING
new file mode 100644 (file)
index 0000000..94a9ed0
--- /dev/null
@@ -0,0 +1,674 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/ctdb/Makefile.in b/ctdb/Makefile.in
new file mode 100755 (executable)
index 0000000..55b21b7
--- /dev/null
@@ -0,0 +1,408 @@
+#!gmake
+
+
+CC = @CC@
+AR = ar
+ARFLAGS = cru
+RANLIB = ranlib
+prefix = @prefix@
+exec_prefix = @exec_prefix@
+datarootdir = @datarootdir@
+includedir = @includedir@
+docdir = @docdir@
+libdir = @libdir@
+bindir = @bindir@
+sbindir = @sbindir@
+mandir = @mandir@
+localstatedir = @localstatedir@
+logdir = @LOGDIR@
+sockpath = @SOCKPATH@
+VPATH = @srcdir@:@tdbdir@:@tallocdir@:@libreplacedir@:@poptdir@:@teventdir@:
+srcdir = @srcdir@
+etcdir = @sysconfdir@
+builddir = @builddir@
+DESTDIR = /
+EXTRA_OBJ=@EXTRA_OBJ@
+XSLTPROC = /usr/bin/xsltproc
+INSTALLCMD = @INSTALL@
+
+POPT_LIBS = @POPT_LIBS@
+POPT_CFLAGS = @POPT_CFLAGS@
+POPT_OBJ = @POPT_OBJ@
+
+TALLOC_LIBS = @TALLOC_LIBS@
+TALLOC_CFLAGS = @TALLOC_CFLAGS@
+TALLOC_OBJ = @TALLOC_OBJ@
+
+TEVENT_LIBS = @TEVENT_LIBS@
+TEVENT_CFLAGS = @TEVENT_CFLAGS@
+TEVENT_OBJ = @TEVENT_OBJ@
+
+TDB_LIBS = @TDB_LIBS@
+TDB_CFLAGS = @TDB_CFLAGS@
+TDB_OBJ = @TDB_OBJ@
+
+REPLACE_OBJ = @LIBREPLACEOBJ@
+
+SOCKET_WRAPPER_OBJ = @SOCKET_WRAPPER_OBJS@
+
+PMDA_LIBS = -lpcp -lpcp_pmda
+PMDA_INSTALL = @CTDB_PMDA_INSTALL@
+PMDA_DEST_DIR = /var/lib/pcp/pmdas/ctdb
+
+WRAPPER=@
+ifeq ($(V),1)
+WRAPPER=
+endif
+ifeq ($(VERBOSE),1)
+WRAPPER=
+endif
+
+ifeq ($(CC),gcc)
+EXTRA_CFLAGS=-Wno-format-zero-length -Wno-deprecated-declarations -fPIC
+endif
+
+CFLAGS=@CPPFLAGS@ -g -I$(srcdir)/include -Iinclude -Ilib -Ilib/util -I$(srcdir) \
+       $(TALLOC_CFLAGS) $(TEVENT_CFLAGS) $(TDB_CFLAGS) -I@libreplacedir@ \
+       -DVARDIR=\"$(localstatedir)\" -DETCDIR=\"$(etcdir)\" \
+       -DCTDB_VARDIR=\"$(localstatedir)/lib/ctdb\" \
+       -DLOGDIR=\"$(logdir)\" -DBINDIR=\"$(bindir)\" \
+       -DSOCKPATH=\"$(sockpath)\" \
+       -DUSE_MMAP=1 @CFLAGS@ $(POPT_CFLAGS) \
+       $(EXTRA_CFLAGS)
+
+LDSHFLAGS=-fPIC -shared
+#LDSHFLAGS=-fPIC -shared -Wl,-Bsymbolic -Wl,-z,relro -Wl,-Bsymbolic-functions -Wl,--as-needed -Wl,-z,defs
+SHLD=${CC} ${CFLAGS} ${LDSHFLAGS} -o $@
+
+LIB_FLAGS=@LDFLAGS@ -Llib @LIBS@ $(POPT_LIBS) $(TALLOC_LIBS) $(TEVENT_LIBS) $(TDB_LIBS) \
+                 @INFINIBAND_LIBS@ @CTDB_PCAP_LDFLAGS@
+
+CTDB_VERSION_H = include/ctdb_version.h
+
+UTIL_OBJ = lib/util/idtree.o lib/util/db_wrap.o lib/util/strlist.o lib/util/util.o \
+       lib/util/util_time.o lib/util/util_file.o lib/util/fault.o lib/util/substitute.o \
+       lib/util/signal.o
+
+CTDB_COMMON_OBJ =  common/ctdb_io.o common/ctdb_util.o \
+       common/ctdb_ltdb.o common/ctdb_message.o common/cmdline.o  \
+       lib/util/debug.o common/rb_tree.o @CTDB_SYSTEM_OBJ@ common/system_common.o \
+       common/ctdb_logging.o common/ctdb_fork.o
+
+CTDB_TCP_OBJ = tcp/tcp_connect.o tcp/tcp_io.o tcp/tcp_init.o
+
+CTDB_EXTERNAL_OBJ = $(POPT_OBJ) $(TALLOC_OBJ) $(TDB_OBJ) \
+       $(REPLACE_OBJ) $(EXTRA_OBJ) $(TEVENT_OBJ) $(SOCKET_WRAPPER_OBJ)
+
+CTDB_CLIENT_OBJ = client/ctdb_client.o \
+       $(CTDB_COMMON_OBJ) $(UTIL_OBJ) $(CTDB_EXTERNAL_OBJ)
+
+CTDB_SERVER_OBJ = server/ctdbd.o server/ctdb_daemon.o \
+       server/ctdb_recoverd.o server/ctdb_recover.o server/ctdb_freeze.o \
+       server/ctdb_tunables.o server/ctdb_monitor.o server/ctdb_server.o \
+       server/ctdb_control.o server/ctdb_call.o server/ctdb_ltdb_server.o \
+       server/ctdb_traverse.o server/eventscript.o server/ctdb_takeover.o \
+       server/ctdb_serverids.o server/ctdb_persistent.o \
+       server/ctdb_keepalive.o server/ctdb_logging.o server/ctdb_uptime.o \
+       server/ctdb_vacuum.o server/ctdb_banning.o server/ctdb_statistics.o \
+       server/ctdb_update_record.o server/ctdb_lock.o \
+       $(CTDB_CLIENT_OBJ) $(CTDB_TCP_OBJ) @INFINIBAND_WRAPPER_OBJ@
+
+TEST_BINS=tests/bin/ctdb_bench tests/bin/ctdb_fetch tests/bin/ctdb_fetch_one \
+       tests/bin/ctdb_fetch_readonly_once tests/bin/ctdb_fetch_readonly_loop \
+       tests/bin/ctdb_store tests/bin/ctdb_trackingdb_test \
+       tests/bin/ctdb_randrec tests/bin/ctdb_persistent \
+       tests/bin/ctdb_traverse tests/bin/rb_test tests/bin/ctdb_transaction \
+       tests/bin/ctdb_takeover_tests tests/bin/ctdb_update_record \
+       tests/bin/ctdb_update_record_persistent \
+       tests/bin/ctdb_functest tests/bin/ctdb_stubtest \
+       tests/bin/ctdb_porting_tests tests/bin/ctdb_lock_tdb \
+       @INFINIBAND_BINS@
+
+BINS = bin/ctdb @CTDB_SCSI_IO@ bin/smnotify bin/ping_pong bin/ltdbtool \
+       bin/ctdb_lock_helper @CTDB_PMDA@
+
+SBINS = bin/ctdbd
+
+DIRS = lib bin tests/bin
+
+.SUFFIXES: .c .o .h
+
+all: showflags dirs $(CTDB_VERSION_H) $(CTDB_SERVER_OBJ) $(CTDB_CLIENT_OBJ) $(BINS) $(SBINS) $(TEST_BINS)
+
+showflags:
+       @echo 'ctdb will be compiled with flags:'
+       @echo '  CFLAGS = $(CFLAGS)'
+       @echo '  LIBS = $(LIB_FLAGS)'
+
+showlayout::
+       @echo "ctdb will be installed into:"
+       @echo "  prefix:      $(prefix)"
+       @echo "  bindir:      $(bindir)"
+       @echo "  sbindir:     $(sbindir)"
+       @echo "  libdir:      $(libdir)"
+       @echo "  vardir:      $(localstatedir)"
+       @echo "  logdir:      $(logdir)"
+       @echo "  mandir:      $(mandir)"
+       @echo "  etcdir:      $(etcdir)"
+
+.c.o:
+       @echo Compiling $*.c
+       @mkdir -p `dirname $@`
+       $(WRAPPER) $(CC) $(CFLAGS) -c $< -o $@
+
+dirs:
+       $(WRAPPER) mkdir -p $(DIRS)
+
+$(CTDB_VERSION_H):
+       @echo Generating $@
+       $(WRAPPER) ./packaging/mkversion.sh
+
+bin/ctdbd: $(CTDB_SERVER_OBJ)
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ $(CTDB_SERVER_OBJ) $(LIB_FLAGS)
+
+bin/scsi_io: $(CTDB_CLIENT_OBJ) utils/scsi_io/scsi_io.o 
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ utils/scsi_io/scsi_io.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+
+bin/ctdb: $(CTDB_CLIENT_OBJ) tools/ctdb.o tools/ctdb_vacuum.o
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ tools/ctdb.o tools/ctdb_vacuum.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+
+bin/ltdbtool: tools/ltdbtool.o $(TDB_OBJ)
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ $+ $(TDB_LIBS) $(LIB_FLAGS)
+
+bin/ctdb_lock_helper: server/ctdb_lock_helper.o lib/util/util_file.o $(CTDB_EXTERNAL_OBJ)
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ server/ctdb_lock_helper.o lib/util/util_file.o $(CTDB_EXTERNAL_OBJ) $(TDB_LIBS) $(LIB_FLAGS)
+
+bin/smnotify: utils/smnotify/gen_xdr.o utils/smnotify/gen_smnotify.o utils/smnotify/smnotify.o $(POPT_OBJ)
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ utils/smnotify/smnotify.o utils/smnotify/gen_xdr.o utils/smnotify/gen_smnotify.o $(POPT_OBJ) $(LIB_FLAGS)
+
+utils/smnotify/smnotify.o: utils/smnotify/smnotify.c utils/smnotify/smnotify.h
+
+utils/smnotify/smnotify.h:  utils/smnotify/smnotify.x
+       @echo Generating $@
+       $(WRAPPER) rpcgen -h utils/smnotify/smnotify.x > utils/smnotify/smnotify.h
+
+utils/smnotify/gen_xdr.c: utils/smnotify/smnotify.x utils/smnotify/smnotify.h
+       @echo Generating $@
+       $(WRAPPER) rpcgen -c utils/smnotify/smnotify.x | grep -Ev '^[[:space:]]+register int32_t \*buf;' > utils/smnotify/gen_xdr.c 
+
+utils/smnotify/gen_smnotify.c: utils/smnotify/smnotify.x utils/smnotify/smnotify.h
+       @echo Generating $@
+       $(WRAPPER) rpcgen -l utils/smnotify/smnotify.x > utils/smnotify/gen_smnotify.c 
+
+bin/ping_pong: utils/ping_pong/ping_pong.o
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ utils/ping_pong/ping_pong.o $(LIB_FLAGS)
+
+bin/pmdactdb: $(CTDB_CLIENT_OBJ) utils/pmda/pmda_ctdb.o
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ utils/pmda/pmda_ctdb.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS) $(PMDA_LIBS)
+
+tests/bin/rb_test: $(CTDB_CLIENT_OBJ) tests/src/rb_test.o 
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ tests/src/rb_test.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+
+tests/bin/ctdb_bench: $(CTDB_CLIENT_OBJ) tests/src/ctdb_bench.o 
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ tests/src/ctdb_bench.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+
+tests/bin/ctdb_fetch: $(CTDB_CLIENT_OBJ) tests/src/ctdb_fetch.o 
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ tests/src/ctdb_fetch.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+
+tests/bin/ctdb_fetch_one: $(CTDB_CLIENT_OBJ) tests/src/ctdb_fetch_one.o 
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ tests/src/ctdb_fetch_one.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+
+tests/bin/ctdb_fetch_readonly_once: tests/src/ctdb_fetch_readonly_once.o $(CTDB_CLIENT_OBJ)
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ tests/src/ctdb_fetch_readonly_once.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+
+tests/bin/ctdb_fetch_readonly_loop: $(CTDB_CLIENT_OBJ) tests/src/ctdb_fetch_readonly_loop.o
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ tests/src/ctdb_fetch_readonly_loop.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+
+tests/bin/ctdb_trackingdb_test: $(CTDB_CLIENT_OBJ) tests/src/ctdb_trackingdb_test.o
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ tests/src/ctdb_trackingdb_test.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+
+tests/bin/ctdb_update_record: $(CTDB_CLIENT_OBJ) tests/src/ctdb_update_record.o
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ tests/src/ctdb_update_record.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+
+tests/bin/ctdb_update_record_persistent: $(CTDB_CLIENT_OBJ) tests/src/ctdb_update_record_persistent.o
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ tests/src/ctdb_update_record_persistent.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+
+tests/bin/ctdb_store: $(CTDB_CLIENT_OBJ) tests/src/ctdb_store.o
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ tests/src/ctdb_store.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+
+tests/bin/ctdb_traverse: $(CTDB_CLIENT_OBJ) tests/src/ctdb_traverse.o
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ tests/src/ctdb_traverse.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+
+tests/bin/ctdb_randrec: $(CTDB_CLIENT_OBJ) tests/src/ctdb_randrec.o
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ tests/src/ctdb_randrec.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+
+tests/bin/ctdb_persistent: $(CTDB_CLIENT_OBJ) tests/src/ctdb_persistent.o
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ tests/src/ctdb_persistent.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+
+tests/bin/ctdb_porting_tests: $(CTDB_CLIENT_OBJ) tests/src/ctdb_porting_tests.o
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ tests/src/ctdb_porting_tests.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+
+tests/bin/ctdb_transaction: $(CTDB_CLIENT_OBJ) tests/src/ctdb_transaction.o
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ tests/src/ctdb_transaction.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+
+CTDB_SERVER_MOST_OBJ = $(CTDB_SERVER_OBJ:server/ctdbd.o=)
+CTDBD_TEST_C = $(CTDB_SERVER_MOST_OBJ:.o=.c) tests/src/ctdbd_test.c
+
+CTDB_TEST_C =  $(CTDB_CLIENT_OBJ:.o=.c) tools/ctdb.c tools/ctdb_vacuum.c tests/src/ctdb_test_stubs.c
+
+CTDB_TEST_OBJ =  $(TALLOC_OBJ) $(TDB_OBJ) \
+       @CTDB_SYSTEM_OBJ@ $(REPLACE_OBJ) $(EXTRA_OBJ) $(TEVENT_OBJ) $(SOCKET_WRAPPER_OBJ)
+
+tests/src/ctdb_takeover_tests.o: tests/src/ctdb_takeover_tests.c $(CTDBD_TEST_C)
+
+tests/bin/ctdb_takeover_tests: $(CTDB_TEST_OBJ) tests/src/ctdb_takeover_tests.o
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ tests/src/ctdb_takeover_tests.o $(CTDB_TEST_OBJ) $(LIB_FLAGS)
+
+tests/src/ctdb_functest.o: tests/src/ctdb_functest.c tests/src/ctdb_test.c $(CTDB_TEST_C)
+
+tests/bin/ctdb_functest: tests/src/ctdb_functest.o $(CTDB_TEST_OBJ) 
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ $^ $(POPT_OBJ) $(LIB_FLAGS)
+
+tests/src/ctdb_test.o: tests/src/ctdb_test.c $(CTDB_TEST_C)
+
+tests/bin/ctdb_stubtest: tests/src/ctdb_test.o $(CTDB_TEST_OBJ)
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ $^ $(POPT_OBJ) $(LIB_FLAGS)
+
+tests/bin/ctdb_lock_tdb: tests/src/ctdb_lock_tdb.o $(CTDB_CLIENT_OBJ)
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ $^ $(LIB_FLAGS)
+
+tests/bin/ibwrapper_test: $(CTDB_CLIENT_OBJ) ib/ibwrapper_test.o
+       @echo Linking $@
+       $(WRAPPER) $(CC) $(CFLAGS) -o $@ ib/ibwrapper_test.o $(CTDB_CLIENT_OBJ) $(LIB_FLAGS)
+
+manpages:
+       $(MAKE) -C doc
+
+clean:
+       rm -f *.o */*.o */*.a */*/*.o */*~
+       rm -f utils/smnotify/gen_xdr.c
+       rm -f $(BINS) $(SBINS) $(TEST_BINS)
+
+distclean: clean
+       rm -f *~ */*~
+       rm -rf bin
+       rm -f config.log config.status config.cache config.h
+       rm -f Makefile
+
+install: all manpages $(PMDA_INSTALL)
+       mkdir -p $(DESTDIR)$(libdir)/pkgconfig
+       mkdir -p $(DESTDIR)$(bindir)
+       mkdir -p $(DESTDIR)$(sbindir)
+       mkdir -p $(DESTDIR)$(includedir)
+       mkdir -p $(DESTDIR)$(etcdir)/ctdb
+       mkdir -p $(DESTDIR)$(etcdir)/ctdb/events.d
+       mkdir -p $(DESTDIR)$(etcdir)/ctdb/nfs-rpc-checks.d
+       mkdir -p $(DESTDIR)$(etcdir)/sudoers.d/
+       mkdir -p $(DESTDIR)$(etcdir)/ctdb/notify.d
+       mkdir -p $(DESTDIR)$(localstatedir)/lib/ctdb
+       mkdir -p $(DESTDIR)$(localstatedir)/run/ctdb
+       mkdir -p $(DESTDIR)$(logdir)
+       ${INSTALLCMD} -m 644 ctdb.pc $(DESTDIR)$(libdir)/pkgconfig
+       ${INSTALLCMD} -m 755 bin/ctdb $(DESTDIR)$(bindir)
+       ${INSTALLCMD} -m 755 bin/ctdbd $(DESTDIR)$(sbindir)
+       ${INSTALLCMD} -m 755 bin/smnotify $(DESTDIR)$(bindir)
+       $(INSTALLCMD) -m 755 bin/ping_pong $(DESTDIR)$(bindir)
+       $(INSTALLCMD) -m 755 bin/ltdbtool $(DESTDIR)$(bindir)
+       $(INSTALLCMD) -m 755 bin/ctdb_lock_helper $(DESTDIR)$(bindir)
+       ${INSTALLCMD} -m 644 include/ctdb.h $(DESTDIR)$(includedir)
+       ${INSTALLCMD} -m 644 include/ctdb_client.h $(DESTDIR)$(includedir)
+       ${INSTALLCMD} -m 644 include/ctdb_protocol.h $(DESTDIR)$(includedir)
+       ${INSTALLCMD} -m 644 include/ctdb_private.h $(DESTDIR)$(includedir) # for samba3
+       ${INSTALLCMD} -m 644 include/ctdb_typesafe_cb.h $(DESTDIR)$(includedir)
+       ${INSTALLCMD} -m 440 config/ctdb.sudoers $(DESTDIR)$(etcdir)/sudoers.d/ctdb
+       ${INSTALLCMD} -m 644 config/functions $(DESTDIR)$(etcdir)/ctdb
+       ${INSTALLCMD} -m 755 config/statd-callout $(DESTDIR)$(etcdir)/ctdb
+       ${INSTALLCMD} -m 755 config/ctdbd_wrapper $(DESTDIR)$(sbindir)
+       ${INSTALLCMD} -m 755 config/events.d/00.ctdb $(DESTDIR)$(etcdir)/ctdb/events.d
+       ${INSTALLCMD} -m 755 config/events.d/01.reclock $(DESTDIR)$(etcdir)/ctdb/events.d
+       ${INSTALLCMD} -m 755 config/events.d/10.interface $(DESTDIR)$(etcdir)/ctdb/events.d
+       ${INSTALLCMD} -m 755 config/events.d/11.natgw $(DESTDIR)$(etcdir)/ctdb/events.d
+       ${INSTALLCMD} -m 755 config/events.d/11.routing $(DESTDIR)$(etcdir)/ctdb/events.d
+       ${INSTALLCMD} -m 755 config/events.d/13.per_ip_routing $(DESTDIR)$(etcdir)/ctdb/events.d
+       ${INSTALLCMD} -m 644 config/events.d/20.multipathd $(DESTDIR)$(etcdir)/ctdb/events.d
+       ${INSTALLCMD} -m 644 config/events.d/31.clamd $(DESTDIR)$(etcdir)/ctdb/events.d
+       ${INSTALLCMD} -m 755 config/events.d/40.vsftpd $(DESTDIR)$(etcdir)/ctdb/events.d
+       ${INSTALLCMD} -m 644 config/events.d/40.fs_use $(DESTDIR)$(etcdir)/ctdb/events.d
+       ${INSTALLCMD} -m 755 config/events.d/41.httpd $(DESTDIR)$(etcdir)/ctdb/events.d
+       ${INSTALLCMD} -m 755 config/events.d/49.winbind $(DESTDIR)$(etcdir)/ctdb/events.d
+       ${INSTALLCMD} -m 755 config/events.d/50.samba $(DESTDIR)$(etcdir)/ctdb/events.d
+       ${INSTALLCMD} -m 755 config/events.d/60.nfs $(DESTDIR)$(etcdir)/ctdb/events.d
+       ${INSTALLCMD} -m 755 config/events.d/60.ganesha $(DESTDIR)$(etcdir)/ctdb/events.d
+       ${INSTALLCMD} -m 755 config/events.d/62.cnfs $(DESTDIR)$(etcdir)/ctdb/events.d
+       ${INSTALLCMD} -m 755 config/events.d/70.iscsi $(DESTDIR)$(etcdir)/ctdb/events.d
+       ${INSTALLCMD} -m 755 config/events.d/91.lvs $(DESTDIR)$(etcdir)/ctdb/events.d
+       ${INSTALLCMD} -m 644 config/events.d/99.timeout $(DESTDIR)$(etcdir)/ctdb/events.d
+       ${INSTALLCMD} -m 644 config/nfs-rpc-checks.d/10.statd.check $(DESTDIR)$(etcdir)/ctdb/nfs-rpc-checks.d/
+       ${INSTALLCMD} -m 644 config/nfs-rpc-checks.d/20.nfsd.check $(DESTDIR)$(etcdir)/ctdb/nfs-rpc-checks.d/
+       ${INSTALLCMD} -m 644 config/nfs-rpc-checks.d/30.lockd.check $(DESTDIR)$(etcdir)/ctdb/nfs-rpc-checks.d/
+       ${INSTALLCMD} -m 644 config/nfs-rpc-checks.d/40.mountd.check $(DESTDIR)$(etcdir)/ctdb/nfs-rpc-checks.d/
+       ${INSTALLCMD} -m 644 config/nfs-rpc-checks.d/50.rquotad.check $(DESTDIR)$(etcdir)/ctdb/nfs-rpc-checks.d/
+       ${INSTALLCMD} -m 755 tools/ctdb_diagnostics $(DESTDIR)$(bindir)
+       ${INSTALLCMD} -m 755 tools/onnode $(DESTDIR)$(bindir)
+       if [ -f doc/ctdb.1 ];then ${INSTALLCMD} -d $(DESTDIR)$(mandir)/man1; fi
+       if [ -f doc/ctdb.1 ];then ${INSTALLCMD} -m 644 doc/ctdb.1 $(DESTDIR)$(mandir)/man1; fi
+       if [ -f doc/ctdbd.1 ];then ${INSTALLCMD} -m 644 doc/ctdbd.1 $(DESTDIR)$(mandir)/man1; fi
+       if [ -f doc/onnode.1 ];then ${INSTALLCMD} -m 644 doc/onnode.1 $(DESTDIR)$(mandir)/man1; fi
+       if [ -f doc/ltdbtool.1 ]; then ${INSTALLCMD} -m 644 doc/ltdbtool.1 $(DESTDIR)$(mandir)/man1; fi
+       if [ -f doc/ping_pong.1 ];then ${INSTALLCMD} -m 644 doc/ping_pong.1 $(DESTDIR)$(mandir)/man1; fi
+       ${INSTALLCMD} -m 755 config/notify.sh $(DESTDIR)$(etcdir)/ctdb
+       ${INSTALLCMD} -m 755 config/debug-hung-script.sh $(DESTDIR)$(etcdir)/ctdb
+       ${INSTALLCMD} -m 755 config/ctdb-crash-cleanup.sh $(DESTDIR)$(etcdir)/ctdb
+       ${INSTALLCMD} -m 755 config/gcore_trace.sh $(DESTDIR)$(etcdir)/ctdb
+       ${INSTALLCMD} -m 755 config/debug_locks.sh $(DESTDIR)$(etcdir)/ctdb
+
+install_pmda:
+       $(INSTALLCMD) -m 755 -d $(DESTDIR)$(PMDA_DEST_DIR)
+       $(INSTALLCMD) -m 755 utils/pmda/Install utils/pmda/Remove $(DESTDIR)$(PMDA_DEST_DIR)
+       $(INSTALLCMD) -m 644 utils/pmda/pmns utils/pmda/domain.h utils/pmda/help utils/pmda/README $(DESTDIR)$(PMDA_DEST_DIR)
+       $(INSTALLCMD) -m 755 bin/pmdactdb $(DESTDIR)$(PMDA_DEST_DIR)
+
+# Should use $(datarootdir) but older autoconfs don't do this.  :-(
+install_tests: all
+       tests/INSTALL --destdir=$(DESTDIR) --datarootdir=$(prefix)/share --libdir=$(libdir) --bindir=$(bindir) --etcdir=$(etcdir)
+
+test: all
+       tests/run_tests.sh -V tests/var
+
+test_cluster: all
+       tests/run_cluster_tests.sh
+
+valgrindtest: all
+       VALGRIND="valgrind -q --trace-children=yes" tests/run_tests.sh
+
+ctags:
+       find . -name "*.[ch]" | xargs ctags
+
+etags:
+       find . -name "*.[ch]" | xargs etags
+
+realdistclean: distclean
+       rm -f configure config.h.in ctdb.pc
diff --git a/ctdb/NEWS b/ctdb/NEWS
new file mode 100644 (file)
index 0000000..ae4cff6
--- /dev/null
+++ b/ctdb/NEWS
@@ -0,0 +1,316 @@
+Changes in CTDB 2.5
+===================
+
+User-visible changes
+--------------------
+
+* The default location of the ctdbd socket is now:
+
+    /var/run/ctdb/ctdbd.socket
+
+  If you currently set CTDB_SOCKET in configuration then unsetting it
+  will probably do what you want.
+
+* The default location of CTDB TDB databases is now:
+
+    /var/lib/ctdb
+
+  If you only set CTDB_DBDIR (to the old default of /var/ctdb) then
+  you probably want to move your databases to /var/lib/ctdb, drop your
+  setting of CTDB_DBDIR and just use the default.
+
+  To maintain the database files in /var/ctdb you will need to set
+  CTDB_DBDIR, CTDB_DBDIR_PERSISTENT and CTDB_DBDIR_STATE, since all of
+  these have moved.
+
+* Use of CTDB_OPTIONS to set ctdbd command-line options is no longer
+  supported.  Please use individual configuration variables instead.
+
+* Obsolete tunables VacuumDefaultInterval, VacuumMinInterval and
+  VacuumMaxInterval have been removed.  Setting them had no effect but
+  if you now try to set them in a configuration file via CTDB_SET_X=Y
+  then CTDB will not start.
+
+* Much improved manual pages.  Added new manpages ctdb(7),
+  ctdbd.conf(5), ctdb-tunables(7).  Still some work to do.
+
+* Most CTDB-specific configuration can now be set in
+  /etc/ctdb/ctdbd.conf.
+
+  This avoids cluttering distribution-specific configuration files,
+  such as /etc/sysconfig/ctdb.  It also means that we can say: see
+  ctdbd.conf(5) for more details.  :-)
+
+* Configuration variable NFS_SERVER_MODE is deprecated and has been
+  replaced by CTDB_NFS_SERVER_MODE.  See ctdbd.conf(5) for more
+  details.
+
+* "ctdb reloadips" is much improved and should be used for reloading
+  the public IP configuration.
+
+  This command attempts to yield much more predictable IP allocations
+  than using sequences of delip and addip commands.  See ctdb(1) for
+  details.
+
+* Ability to pass comma-separated string to ctdb(1) tool commands via
+  the -n option is now documented and works for most commands.  See
+  ctdb(1) for details.
+
+* "ctdb rebalancenode" is now a debugging command and should not be
+  used in normal operation.  See ctdb(1) for details.
+
+* "ctdb ban 0" is now invalid.
+
+  This was documented as causing a permanent ban.  However, this was
+  not implemented and caused an "unban" instead.  To avoid confusion,
+  0 is now an invalid ban duration.  To administratively "ban" a node
+  use "ctdb stop" instead.
+
+* The systemd configuration now puts the PID file in /run/ctdb (rather
+  than /run/ctdbd) for consistency with the initscript and other uses
+  of /var/run/ctdb.
+
+Important bug fixes
+-------------------
+
+* Traverse regression fixed.
+
+* The default recovery method for persistent databases has been
+  changed to use database sequence numbers instead of doing
+  record-by-record recovery (using record sequence numbers).  This
+  fixes issues including registry corruption.
+
+* Banned nodes are no longer told to run the "ipreallocated" event
+  during a takeover run, when in fallback mode with nodes that don't
+  support the IPREALLOCATED control.
+
+Important internal changes
+--------------------------
+
+* Persistent transactions are now compatible with Samba and work
+  reliably.
+
+* The recovery master role has been made more stable by resetting the
+  priority time each time a node becomes inactive.  This means that
+  nodes that are active for a long time are more likely to retain the
+  recovery master role.
+
+* The incomplete libctdb library has been removed.
+
+* Test suite now starts ctdbd with the --sloppy-start option to speed
+  up startup.  However, this should not be done in production.
+
+
+Changes in CTDB 2.4
+===================
+
+User-visible changes
+--------------------
+
+* A missing network interface now causes monitoring to fail and the
+  node to become unhealthy.
+
+* Changed ctdb command's default control timeout from 3s to 10s.
+
+* debug-hung-script.sh now includes the output of "ctdb scriptstatus"
+  to provide more information.
+
+Important bug fixes
+-------------------
+
+* Starting CTDB daemon by running ctdbd directly should not remove
+  existing unix socket unconditionally.
+
+* ctdbd once again successfully kills client processes on releasing
+  public IPs.  It was checking for them as tracked child processes
+  and not finding them, so wasn't killing them.
+
+* ctdbd_wrapper now exports CTDB_SOCKET so that child processes of
+  ctdbd (such as uses of ctdb in eventscripts) use the correct socket.
+
+* Always use Jenkins hash when creating volatile databases.  There
+  were a few places where TDBs would be attached with the wrong flags.
+
+* Vacuuming code fixes in CTDB 2.2 introduced bugs in the new code
+  which led to header corruption for empty records.  This resulted
+  in inconsistent headers on two nodes, so a request for such a record
+  kept bouncing between nodes indefinitely and logged "High hopcount"
+  messages in the log. This also caused performance degradation.
+
+* ctdbd was losing log messages at shutdown because they weren't being
+  given time to flush.  ctdbd now sleeps for a second during shutdown
+  to allow time to flush log messages.
+
+* Improved socket handling introduced in CTDB 2.2 caused ctdbd to
+  process a large number of packets available on a single FD before
+  polling other FDs.  Use fixed size queue buffers to allow fair
+  scheduling across multiple FDs.
+
+Important internal changes
+--------------------------
+
+* A node that fails to take/release multiple IPs will only incur a
+  single banning credit.  This makes a brief failure less likely to
+  cause a node to be banned.
+
+* ctdb killtcp has been changed to read connections from stdin and
+  10.interface now uses this feature to improve the time taken to kill
+  connections.
+
+* Improvements to hot records statistics in ctdb dbstatistics.
+
+* Recovery daemon now assembles up-to-date node flags information
+  from remote nodes before checking if any flags are inconsistent and
+  forcing a recovery.
+
+* ctdbd no longer creates multiple lock sub-processes for the same
+  key.  This reduces the number of lock sub-processes substantially.
+
+* Changed the nfsd RPC check failure policy to failover quickly
+  instead of trying to repair a node first by restarting NFS.  Such
+  restarts would often hang if the cause of the RPC check failure was
+  the cluster filesystem or storage.
+
+* Logging improvements relating to high hopcounts and sticky records.
+
+* Make sure lower level tdb messages are logged correctly.
+
+* CTDB commands disable/enable/stop/continue are now resilient to
+  individual control failures and retry in case of failures.
+
+
+Changes in CTDB 2.3
+===================
+
+User-visible changes
+--------------------
+
+* 2 new configuration variables for 60.nfs eventscript:
+
+  - CTDB_MONITOR_NFS_THREAD_COUNT
+  - CTDB_NFS_DUMP_STUCK_THREADS
+
+  See ctdb.sysconfig for details.
+
+* Removed DeadlockTimeout tunable.  To enable debug of locking issues set
+
+   CTDB_DEBUG_LOCKS=/etc/ctdb/debug_locks.sh
+
+* In overall statistics and database statistics, lock buckets have been
+  updated to use the following timings:
+
+   < 1ms, < 10ms, < 100ms, < 1s, < 2s, < 4s, < 8s, < 16s, < 32s, < 64s, >= 64s
+
+* Initscript is now simplified with most CTDB-specific functionality
+  split out to ctdbd_wrapper, which is used to start and stop ctdbd.
+
+* Add systemd support.
+
+* CTDB subprocesses are now given informative names to allow them to
+  be easily distinguished when using programs like "top" or "perf".
+
+Important bug fixes
+-------------------
+
+* ctdb tool should not exit from a retry loop if a control times out
+  (e.g. under high load).  This simple fix will stop an exit from the
+  retry loop on any error.
+
+* When updating flags on all nodes, use the correct updated flags.  This
+  should avoid wrong flag change messages in the logs.
+
+* The recovery daemon will not ban other nodes if the current node
+  is banned.
+
+* ctdb dbstatistics command now correctly outputs database statistics.
+
+* Fixed a panic with overlapping shutdowns (regression in 2.2).
+
+* Fixed 60.ganesha "monitor" event (regression in 2.2).
+
+* Fixed a buffer overflow in the "reloadips" implementation.
+
+* Fixed segmentation faults in ping_pong (called with incorrect
+  argument) and test binaries (called when ctdbd not running).
+
+Important internal changes
+--------------------------
+
+* The recovery daemon on a stopped or banned node will stop participating
+  in any cluster activity.
+
+* Improve cluster wide database traverse by sending the records directly from
+  traverse child process to requesting node.
+
+* TDB checking and dropping of all IPs moved from initscript to "init"
+  event in 00.ctdb.
+
+* To avoid "rogue IPs" the release IP callback now fails if the
+  released IP is still present on an interface.
+
+
+Changes in CTDB 2.2
+===================
+
+User-visible changes
+--------------------
+
+* The "stopped" event has been removed.
+
+  The "ipreallocated" event is now run when a node is stopped.  Use
+  this instead of "stopped".
+
+* New --pidfile option for ctdbd, used by initscript
+
+* The 60.nfs eventscript now uses configuration files in
+  /etc/ctdb/nfs-rpc-checks.d/ for timeouts and actions instead of
+  hardcoding them into the script.
+
+* Notification handler scripts can now be dropped into /etc/ctdb/notify.d/.
+
+* The NoIPTakeoverOnDisabled tunable has been renamed to
+  NoIPHostOnAllDisabled and now works properly when set on individual
+  nodes.
+
+* New ctdb subcommand "runstate" prints the current internal runstate.
+  Runstates are used for serialising startup.
+
+Important bug fixes
+-------------------
+
+* The Unix domain socket is now set to non-blocking after the
+  connection succeeds.  This avoids connections failing with EAGAIN
+  and not being retried.
+
+* Fetching from the log ringbuffer now succeeds if the buffer is full.
+
+* Fix a severe recovery bug that can lead to data corruption for SMB clients.
+
+* The statd-callout script now runs as root via sudo.
+
+* "ctdb delip" no longer fails if it is unable to move the IP.
+
+* A race in the ctdb tool's ipreallocate code was fixed.  This fixes
+  potential bugs in the "disable", "enable", "stop", "continue",
+  "ban", "unban", "ipreallocate" and "sync" commands.
+
+* The monitor cancellation code could sometimes hang indefinitely.
+  This could cause "ctdb stop" and "ctdb shutdown" to fail.
+
+Important internal changes
+--------------------------
+
+* The socket I/O handling has been optimised to improve performance.
+
+* IPs will not be assigned to nodes during CTDB initialisation.  They
+  will only be assigned to nodes that are in the "running" runstate.
+
+* Improved database locking code.  One improvement is to use a
+  standalone locking helper executable - this avoids creating many
+  forked copies of ctdbd and potentially running a node out of memory.
+
+* New control CTDB_CONTROL_IPREALLOCATED is now used to generate
+  "ipreallocated" events.
+
+* Message handlers are now indexed, providing a significant
+  performance improvement.
diff --git a/ctdb/README b/ctdb/README
new file mode 100644 (file)
index 0000000..3099a6d
--- /dev/null
@@ -0,0 +1,8 @@
+This is the release version of CTDB, a clustered implementation of the TDB
+database used by Samba and other projects to store temporary data.
+
+This software is freely distributable under the GNU General Public License,
+a copy of which you should have received with this software (in a file
+called COPYING).
+
+For documentation on CTDB, please visit the CTDB website at http://ctdb.samba.org.
diff --git a/ctdb/README.Coding b/ctdb/README.Coding
new file mode 100644 (file)
index 0000000..fd52dbe
--- /dev/null
@@ -0,0 +1,236 @@
+##
+## Coding conventions in the Samba 3.0 tree
+##
+
+===========
+Quick Start
+===========
+
+Coding style guidelines are about reducing the number of unnecessary
+reformatting patches and making things easier for developers to work together.
+You don't have to like them or even agree with them, but once put in place
+we all have to abide by them (or vote to change them).  However, coding
+style should never outweigh coding itself and so the guidelines
+described here are hopefully easy enough to follow as they are very
+common and supported by tools and editors.
+
+The basic style, also mentioned in the SAMBA_4_0/prog_guide.txt, is the
+Linux kernel coding style (See Documentation/CodingStyle in the kernel
+source tree).  This closely matches what most Samba developers use already
+anyways.
+
+But to save you the trouble of reading the Linux kernel style guide, here
+are the highlights.
+
+
+* Maximum Line Width is 80 Characters
+  The reason is not for people with low-res screens but rather sticking
+  to 80 columns prevents you from easily nesting more than one level of
+  if statements or other code blocks.  Use source/script/count_80_col.pl
+  to check your changes.
+
+* Use 8 Space Tabs to Indent
+  No whitespace filler.
+
+* No Trailing Whitespace
+  Use source/script/strip_trail_ws.pl to clean your files before committing.
+
+* Follow the K&R guidelines.  We won't go through them all here.  You have
+  a copy of "The C Programming Language" anyways, right?  You can also use
+  the format_indent.sh script found in source/script/ if all else fails.
+
+
+
+============
+Editor Hints
+============
+
+Emacs
+-----
+Add the following to your $HOME/.emacs file:
+
+  (add-hook 'c-mode-hook
+       (lambda ()
+               (c-set-style "linux")
+               (c-toggle-auto-state)))
+
+
+Vi
+--
+(Thanks to SATOH Fumiyasu <fumiyas@osstech.jp> for these hints):
+
+For the basic vi editor included with all variants of *nix, add the
+following to $HOME/.exrc:
+
+  set tabstop=8
+  set shiftwidth=8
+
+For Vim, the following settings in $HOME/.vimrc will also deal with 
+displaying trailing whitespace:
+
+  if has("syntax") && (&t_Co > 2 || has("gui_running"))
+       syntax on
+       function! ActivateInvisibleCharIndicator()
+               syntax match TrailingSpace "[ \t]\+$" display containedin=ALL
+               highlight TrailingSpace ctermbg=Red
+       endf
+       autocmd BufNewFile,BufRead * call ActivateInvisibleCharIndicator()
+  endif
+  " Show tabs, trailing whitespace, and continued lines visually
+  set list listchars=tab:»·,trail:·,extends:…
+
+  " highlight overly long lines same as TODOs.
+  set textwidth=80
+  autocmd BufNewFile,BufRead *.c,*.h exec 'match Todo /\%>' . &textwidth . 'v.\+/'
+
+
+=========================
+FAQ & Statement Reference
+=========================
+
+Comments
+--------
+
+Comments should always use the standard C syntax.  I.e. /* ... */.  C++ 
+style comments are not currently allowed.
+
+
+Indention & Whitespace & 80 columns
+-----------------------------------
+
+To avoid confusion, indentations are to be 8 characters with tab (not
+8 ' ' characters).  When wrapping parameters for function calls,
+align the parameter list with the first parameter on the previous line.
+Use tabs to get as close as possible and then fill in the final 7
+characters or less with whitespace.  For example,
+
+       var1 = foo(arg1, arg2,
+                  arg3);
+
+The previous example is intended to illustrate alignment of function
+parameters across lines and not as encouragement for gratuitous line
+splitting.  Never split a line before columns 70 - 79 unless you
+have a really good reason.  Be smart about formatting.
+
+
+If, switch, & Code blocks
+-------------------------
+
+Always follow an 'if' keyword with a space but don't include additional
+spaces following or preceding the parentheses in the conditional.
+This is good:
+
+       if (x == 1)
+
+This is bad:
+
+       if ( x == 1 )
+
+Yes we have a lot of code that uses the second form and we are trying 
+to clean it up without being overly intrusive.
+
+Note that this is a rule about parentheses following keywords and not
+functions.  Don't insert a space between the name and left parenthesis when
+invoking functions.
+
+Braces for code blocks used by for, if, switch, while, do..while, etc...
+should begin on the same line as the statement keyword and end on a line 
+of their own.  NOTE: Functions are different and the beginning left brace
+should begin on a line of its own.
+
+If the beginning statement has to be broken across lines due to length,
+the beginning brace should be on a line of its own.
+
+The exception to the ending rule is when the closing brace is followed by 
+another language keyword such as else or the closing while in a do..while 
+loop.
+
+Good examples:
+
+       if (x == 1) {
+               printf("good\n");
+       }
+
+       for (x=1;
+            x<10;
+            x++)
+       {
+               print("%d\n", x);
+       }
+
+       do {
+               printf("also good\n");
+       } while (1);
+
+Bad examples:
+
+       while (1)
+       {
+               print("I'm in a loop!\n"); }
+       
+
+Goto
+----
+
+While many people have been academically taught that goto's are fundamentally
+evil, they can greatly enhance readability and reduce memory leaks when used
+as the single exit point from a function.  But in no Samba world whatsoever
+is a goto outside of a function or block of code a good idea.
+
+Good Examples:
+
+int function foo(int y)
+{
+       int *z = NULL;
+       int ret = 0;
+
+       if ( y < 10 ) {
+               z = malloc(sizeof(int)*y);
+               if (!z) {
+                       ret = 1;
+                       goto done;
+               }
+       }
+
+       print("Allocated %d elements.\n", y);
+
+ done: 
+       if (z)
+               free(z);
+
+       return ret;
+}
+
+
+Checking Pointer Values
+-----------------------
+
+When invoking functions that return pointer values, either of the following
+is acceptable.  Use your best judgement and choose the more readable option.
+Remember that many other people will review it.
+
+       if ((x = malloc(sizeof(short)*10)) == NULL ) {
+               fprintf(stderr, "Unable to alloc memory!\n");
+       }
+
+or
+
+       x = malloc(sizeof(short)*10);
+       if (!x) {
+               fprintf(stderr, "Unable to alloc memory!\n");
+       }
+
+
+Primitive Data Types
+--------------------
+
+Samba has large amounts of historical code which makes use of data types 
+commonly supported by the C99 standard. However, at the time such types 
+as boolean and exact width integers did not exist and Samba developers 
+were forced to provide their own.  Now that these types are guaranteed to 
+be available either as part of the compiler C99 support or from lib/replace/, 
+new code should adhere to the following conventions:
+
+  * Booleans are of type "bool" (not BOOL)
+  * Boolean values are "true" and "false" (not True or False)
+  * Exact width integers are of type [u]int[8|16|32|64]_t
diff --git a/ctdb/aclocal.m4 b/ctdb/aclocal.m4
new file mode 100644 (file)
index 0000000..5605e47
--- /dev/null
@@ -0,0 +1 @@
+m4_include(libreplace.m4)
diff --git a/ctdb/autogen.sh b/ctdb/autogen.sh
new file mode 100755 (executable)
index 0000000..12603d2
--- /dev/null
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+rm -rf autom4te.cache
+rm -f configure config.h.in ctdb.pc
+
+IPATHS="-I libreplace -I lib/replace -I ../libreplace -I ../replace"
+IPATHS="$IPATHS -I lib/talloc -I talloc -I ../talloc"
+IPATHS="$IPATHS -I lib/tdb -I tdb -I ../tdb"
+IPATHS="$IPATHS -I lib/popt -I popt -I ../popt"
+IPATHS="$IPATHS -I lib/tevent"
+
+autoheader $IPATHS || exit 1
+autoconf $IPATHS || exit 1
+
+rm -rf autom4te.cache
+
+echo "Now run ./configure and then make."
+exit 0
+
diff --git a/ctdb/client/ctdb_client.c b/ctdb/client/ctdb_client.c
new file mode 100644 (file)
index 0000000..fddbd02
--- /dev/null
@@ -0,0 +1,4877 @@
+/* 
+   ctdb daemon code
+
+   Copyright (C) Andrew Tridgell  2007
+   Copyright (C) Ronnie Sahlberg  2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "db_wrap.h"
+#include "tdb.h"
+#include "lib/util/dlinklist.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/locale.h"
+#include <stdlib.h>
+#include "../include/ctdb_private.h"
+#include "lib/util/dlinklist.h"
+
+pid_t ctdbd_pid;
+
+/*
+  allocate a packet for use in client<->daemon communication
+
+  The allocation is max(length, slength) bytes rounded up to a multiple of
+  CTDB_DS_ALIGNMENT and zero filled.  hdr->length records the unrounded
+  max(length, slength), and the talloc name of the packet is set to "type".
+  The generation is only filled in when ctdb->vnn_map is set.
+
+  Returns NULL, after logging, if the allocation fails.
+ */
+struct ctdb_req_header *_ctdbd_allocate_pkt(struct ctdb_context *ctdb,
+                                           TALLOC_CTX *mem_ctx, 
+                                           enum ctdb_operation operation, 
+                                           size_t length, size_t slength,
+                                           const char *type)
+{
+       /* size_t, not int: the value is derived from size_t lengths and is
+          handed straight to the allocator, so it must not be narrowed */
+       size_t size;
+       struct ctdb_req_header *hdr;
+
+       length = MAX(length, slength);
+       size = (length+(CTDB_DS_ALIGNMENT-1)) & ~(size_t)(CTDB_DS_ALIGNMENT-1);
+
+       hdr = (struct ctdb_req_header *)talloc_zero_size(mem_ctx, size);
+       if (hdr == NULL) {
+               DEBUG(DEBUG_ERR,("Unable to allocate packet for operation %u of length %u\n",
+                        (unsigned)operation, (unsigned)length));
+               return NULL;
+       }
+       talloc_set_name_const(hdr, type);
+       hdr->length       = length;
+       hdr->operation    = operation;
+       hdr->ctdb_magic   = CTDB_MAGIC;
+       hdr->ctdb_version = CTDB_VERSION;
+       hdr->srcnode      = ctdb->pnn;
+       if (ctdb->vnn_map) {
+               hdr->generation = ctdb->vnn_map->generation;
+       }
+
+       return hdr;
+}
+
+/*
+  local version of ctdb_call
+
+  Finds the registered call matching call->call_id on ctdb_db and runs it
+  against a private copy of the record data.  When updatetdb is set, the
+  resulting record (the new_data set by the call, or the unchanged copy if
+  the call set none) is written back with ctdb_ltdb_store().  Reply data,
+  if any, is reparented onto "call"; call->status is taken from the call.
+
+  mem_ctx is not used by this function.
+
+  Returns 0 on success and -1 on failure.
+*/
+int ctdb_call_local(struct ctdb_db_context *ctdb_db, struct ctdb_call *call,
+                   struct ctdb_ltdb_header *header, TALLOC_CTX *mem_ctx,
+                   TDB_DATA *data, bool updatetdb)
+{
+       struct ctdb_call_info *c;
+       struct ctdb_registered_call *fn;
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+       
+       c = talloc(ctdb, struct ctdb_call_info);
+       CTDB_NO_MEMORY(ctdb, c);
+
+       c->key = call->key;
+       c->call_data = &call->call_data;
+       c->record_data.dptr = talloc_memdup(c, data->dptr, data->dsize);
+       c->record_data.dsize = data->dsize;
+       if (c->record_data.dptr == NULL) {
+               /* c is parented to the ctdb context, so it has to be
+                  released by hand here or it leaks on this error path */
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory copying record data\n"));
+               ctdb_set_error(ctdb, "Out of memory in ctdb_call_local\n");
+               talloc_free(c);
+               return -1;
+       }
+       c->new_data = NULL;
+       c->reply_data = NULL;
+       c->status = 0;
+       c->header = header;
+
+       for (fn=ctdb_db->calls;fn;fn=fn->next) {
+               if (fn->id == call->call_id) break;
+       }
+       if (fn == NULL) {
+               ctdb_set_error(ctdb, "Unknown call id %u\n", call->call_id);
+               talloc_free(c);
+               return -1;
+       }
+
+       if (fn->fn(c) != 0) {
+               ctdb_set_error(ctdb, "ctdb_call %u failed\n", call->call_id);
+               talloc_free(c);
+               return -1;
+       }
+
+       /* we need to force the record to be written out if this was a remote access */
+       if (c->new_data == NULL) {
+               c->new_data = &c->record_data;
+       }
+
+       if (c->new_data && updatetdb) {
+               /* XXX check that we always have the lock here? */
+               if (ctdb_ltdb_store(ctdb_db, call->key, header, *c->new_data) != 0) {
+                       ctdb_set_error(ctdb, "ctdb_call tdb_store failed\n");
+                       talloc_free(c);
+                       return -1;
+               }
+       }
+
+       if (c->reply_data) {
+               call->reply_data = *c->reply_data;
+
+               /* move the reply off c before c is freed below */
+               talloc_steal(call, call->reply_data.dptr);
+               talloc_set_name_const(call->reply_data.dptr, __location__);
+       } else {
+               call->reply_data.dptr = NULL;
+               call->reply_data.dsize = 0;
+       }
+       call->status = c->status;
+
+       talloc_free(c);
+
+       return 0;
+}
+
+
+/*
+  hand a fully built packet to the daemon queue (client -> daemon)
+
+  hdr->length bytes starting at hdr are passed to ctdb_queue_send() and
+  its return value is handed back unchanged.
+*/
+static int ctdb_client_queue_pkt(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+       uint8_t *raw = (uint8_t *)hdr;
+       int status;
+
+       status = ctdb_queue_send(ctdb->daemon.queue, raw, hdr->length);
+       return status;
+}
+
+
+/*
+  client side handler for an incoming CTDB_REPLY_CALL packet
+
+  The packet answers an earlier CTDB_REQ_CALL.  The pending call state is
+  looked up by hdr->reqid; replies with no matching state are logged and
+  dropped.  On a match the reply data and status are recorded, the packet
+  is reparented onto the state, the state is marked CTDB_CALL_DONE and any
+  registered async callback is run.
+*/
+static void ctdb_client_reply_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+       struct ctdb_reply_call *reply = (struct ctdb_reply_call *)hdr;
+       struct ctdb_client_call_state *pending;
+       struct ctdb_call *call;
+
+       pending = ctdb_reqid_find(ctdb, hdr->reqid, struct ctdb_client_call_state);
+       if (pending == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " reqid %u not found\n", hdr->reqid));
+               return;
+       }
+
+       /* the lookup produced a state, but it belongs to another request */
+       if (pending->reqid != hdr->reqid) {
+               DEBUG(DEBUG_ERR, ("Dropped client call reply with reqid:%u\n",hdr->reqid));
+               return;
+       }
+
+       call = pending->call;
+       call->reply_data.dptr  = reply->data;
+       call->reply_data.dsize = reply->datalen;
+       call->status           = reply->status;
+
+       /* reply_data points into the packet, so the state takes ownership */
+       talloc_steal(pending, reply);
+
+       pending->state = CTDB_CALL_DONE;
+
+       if (pending->async.fn != NULL) {
+               pending->async.fn(pending);
+       }
+}
+
+static void ctdb_client_reply_control(struct ctdb_context *ctdb, struct ctdb_req_header *hdr);
+
+/*
+  this is called in the client, when data comes in from the daemon
+
+  data/cnt describe one packet and args is the ctdb_context.  A cnt of 0
+  is treated as the daemon having exited and terminates the process with
+  exit(1).  Packets that are shorter than a header, whose hdr->length
+  differs from cnt, or that carry the wrong magic or version are dropped.
+  Everything else is dispatched on hdr->operation.
+ */
+void ctdb_client_read_cb(uint8_t *data, size_t cnt, void *args)
+{
+       struct ctdb_context *ctdb = talloc_get_type(args, struct ctdb_context);
+       struct ctdb_req_header *hdr = (struct ctdb_req_header *)data;
+       TALLOC_CTX *tmp_ctx;
+
+       /* place the packet as a child of a tmp_ctx. We then use
+          talloc_free() below to free it. If any of the calls want
+          to keep it, then they will steal it somewhere else, and the
+          talloc_free() will be a no-op for the packet itself.
+          NOTE(review): the result of talloc_new() is not checked here */
+       tmp_ctx = talloc_new(ctdb);
+       talloc_steal(tmp_ctx, hdr);
+
+       if (cnt == 0) {
+               DEBUG(DEBUG_CRIT,("Daemon has exited - shutting down client\n"));
+               exit(1);
+       }
+
+       if (cnt < sizeof(*hdr)) {
+               DEBUG(DEBUG_CRIT,("Bad packet length %u in client\n", (unsigned)cnt));
+               goto done;
+       }
+       if (cnt != hdr->length) {
+               ctdb_set_error(ctdb, "Bad header length %u expected %u in client\n", 
+                              (unsigned)hdr->length, (unsigned)cnt);
+               goto done;
+       }
+
+       if (hdr->ctdb_magic != CTDB_MAGIC) {
+               ctdb_set_error(ctdb, "Non CTDB packet rejected in client\n");
+               goto done;
+       }
+
+       if (hdr->ctdb_version != CTDB_VERSION) {
+               ctdb_set_error(ctdb, "Bad CTDB version 0x%x rejected in client\n", hdr->ctdb_version);
+               goto done;
+       }
+
+       switch (hdr->operation) {
+       case CTDB_REPLY_CALL:
+               ctdb_client_reply_call(ctdb, hdr);
+               break;
+
+       case CTDB_REQ_MESSAGE:
+               ctdb_request_message(ctdb, hdr);
+               break;
+
+       case CTDB_REPLY_CONTROL:
+               ctdb_client_reply_control(ctdb, hdr);
+               break;
+
+       default:
+               DEBUG(DEBUG_CRIT,("bogus operation code:%u\n",hdr->operation));
+       }
+
+done:
+       talloc_free(tmp_ctx);
+}
+
+/*
+  connect to a unix domain socket
+
+  Opens a stream socket to the path in ctdb->daemon.name, makes it
+  non-blocking and close-on-exec, and sets up ctdb->daemon.queue with
+  ctdb_client_read_cb as the read callback.
+
+  Returns 0 on success.  On failure returns -1 with ctdb->daemon.sd set
+  to -1.
+*/
+int ctdb_socket_connect(struct ctdb_context *ctdb)
+{
+       struct sockaddr_un addr;
+       int saved_errno;
+
+       memset(&addr, 0, sizeof(addr));
+       addr.sun_family = AF_UNIX;
+       /* copy at most sizeof-1 bytes: together with the zero fill above this
+          keeps sun_path NUL terminated even for an over-long socket name */
+       strncpy(addr.sun_path, ctdb->daemon.name, sizeof(addr.sun_path)-1);
+
+       ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0);
+       if (ctdb->daemon.sd == -1) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to open client socket. Errno:%s(%d)\n", strerror(errno), errno));
+               return -1;
+       }
+
+       if (connect(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
+               /* close() may overwrite errno, so capture the connect error first */
+               saved_errno = errno;
+               close(ctdb->daemon.sd);
+               ctdb->daemon.sd = -1;
+               DEBUG(DEBUG_ERR,(__location__ " Failed to connect client socket to daemon. Errno:%s(%d)\n", strerror(saved_errno), saved_errno));
+               errno = saved_errno;
+               return -1;
+       }
+
+       set_nonblocking(ctdb->daemon.sd);
+       set_close_on_exec(ctdb->daemon.sd);
+       
+       ctdb->daemon.queue = ctdb_queue_setup(ctdb, ctdb, ctdb->daemon.sd, 
+                                             CTDB_DS_ALIGNMENT, 
+                                             ctdb_client_read_cb, ctdb, "to-ctdbd");
+       if (ctdb->daemon.queue == NULL) {
+               /* without a queue the socket is unusable; do not leave a
+                  connected fd behind that later sends would trip over */
+               DEBUG(DEBUG_ERR,(__location__ " Failed to set up queue to daemon\n"));
+               close(ctdb->daemon.sd);
+               ctdb->daemon.sd = -1;
+               return -1;
+       }
+       return 0;
+}
+
+
+struct ctdb_record_handle { /* handle returned by ctdb_fetch_lock() / ctdb_fetch_readonly_lock() */
+       struct ctdb_db_context *ctdb_db; /* database the record belongs to */
+       TDB_DATA key; /* record key; dptr is a copy allocated on the handle */
+       TDB_DATA *data; /* caller supplied location that receives the record data */
+       struct ctdb_ltdb_header header; /* ltdb header filled in by the local fetch */
+};
+
+
+/*
+  make a recv call to the local ctdb daemon - called from client context
+
+  This is called when the program wants to wait for a ctdb_call to complete and get the 
+  results. This call will block unless the call has already completed.
+
+  A NULL state (for example from a failed ctdb_call_send()) yields -1.
+  On success the reply data is copied onto state->ctdb_db and handed back
+  in call->reply_data, and the call's status is returned.  state is freed
+  on every path that gets past the NULL check.
+*/
+int ctdb_call_recv(struct ctdb_client_call_state *state, struct ctdb_call *call)
+{
+       if (state == NULL) {
+               return -1;
+       }
+
+       while (state->state < CTDB_CALL_DONE) {
+               event_loop_once(state->ctdb_db->ctdb->ev);
+       }
+       if (state->state != CTDB_CALL_DONE) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_call_recv failed\n"));
+               talloc_free(state);
+               return -1;
+       }
+
+       if (state->call->reply_data.dsize) {
+               call->reply_data.dptr = talloc_memdup(state->ctdb_db,
+                                                     state->call->reply_data.dptr,
+                                                     state->call->reply_data.dsize);
+               if (call->reply_data.dptr == NULL) {
+                       /* never hand back a NULL pointer paired with a
+                          non-zero size; report the failure instead */
+                       DEBUG(DEBUG_ERR,(__location__ " failed to copy reply data\n"));
+                       call->reply_data.dsize = 0;
+                       talloc_free(state);
+                       return -1;
+               }
+               call->reply_data.dsize = state->call->reply_data.dsize;
+       } else {
+               call->reply_data.dptr = NULL;
+               call->reply_data.dsize = 0;
+       }
+       call->status = state->call->status;
+       talloc_free(state);
+
+       return call->status;
+}
+
+
+
+
+/*
+  talloc destructor for a client call state: releases the request id
+  that was registered for it
+*/
+static int ctdb_client_call_destructor(struct ctdb_client_call_state *state)   
+{
+       struct ctdb_context *ctdb = state->ctdb_db->ctdb;
+
+       ctdb_reqid_remove(ctdb, state->reqid);
+
+       return 0;
+}
+
+/*
+  construct an event driven local ctdb_call
+
+  this is used so that locally processed ctdb_call requests are processed
+  in an event driven manner
+
+  The call is executed immediately through ctdb_call_local() and the
+  returned state is already marked CTDB_CALL_DONE.  A failure of
+  ctdb_call_local() is only logged.  Returns NULL if memory for the state
+  cannot be allocated.
+*/
+static struct ctdb_client_call_state *ctdb_client_call_local_send(struct ctdb_db_context *ctdb_db, 
+                                                                 struct ctdb_call *call,
+                                                                 struct ctdb_ltdb_header *header,
+                                                                 TDB_DATA *data)
+{
+       struct ctdb_client_call_state *state;
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+       int ret;
+
+       state = talloc_zero(ctdb_db, struct ctdb_client_call_state);
+       CTDB_NO_MEMORY_NULL(ctdb, state);
+       state->call = talloc_zero(state, struct ctdb_call);
+       if (state->call == NULL) {
+               /* state is parented to ctdb_db, so free it here rather than
+                  leaking it when the second allocation fails */
+               DEBUG(DEBUG_ERR,(__location__ " failed to allocate state->call\n"));
+               ctdb_set_error(ctdb, "Out of memory in ctdb_client_call_local_send\n");
+               talloc_free(state);
+               return NULL;
+       }
+
+       talloc_steal(state, data->dptr);
+
+       state->state   = CTDB_CALL_DONE;
+       *(state->call) = *call;
+       state->ctdb_db = ctdb_db;
+
+       ret = ctdb_call_local(ctdb_db, state->call, header, state, data, true);
+       if (ret != 0) {
+               DEBUG(DEBUG_DEBUG,("ctdb_call_local() failed, ignoring return code %d\n", ret));
+       }
+
+       return state;
+}
+
+/*
+  make a ctdb call to the local daemon - async send. Called from client context.
+
+  This constructs a ctdb_call request and queues it for processing. 
+  This call never blocks.
+
+  If the local copy of the record can be fetched and this node is its
+  dmaster, the call is run locally instead of being sent to the daemon.
+  Returns the call state to pass to ctdb_call_recv(), or NULL on failure.
+*/
+struct ctdb_client_call_state *ctdb_call_send(struct ctdb_db_context *ctdb_db, 
+                                             struct ctdb_call *call)
+{
+       struct ctdb_client_call_state *state;
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+       struct ctdb_ltdb_header header;
+       TDB_DATA data;
+       int ret;
+       size_t len;
+       struct ctdb_req_call *c;
+
+       /* start from known values so that a failed fetch can never leave us
+          reading header.flags or freeing data.dptr uninitialised */
+       ZERO_STRUCT(header);
+       ZERO_STRUCT(data);
+
+       /* if the domain socket is not yet open, open it */
+       if (ctdb->daemon.sd==-1) {
+               ctdb_socket_connect(ctdb);
+       }
+
+       ret = ctdb_ltdb_lock(ctdb_db, call->key);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to get chainlock\n"));
+               return NULL;
+       }
+
+       ret = ctdb_ltdb_fetch(ctdb_db, call->key, &header, ctdb_db, &data);
+
+       /* an immediate migration of a record with delegations must take the
+          daemon path; the header is only meaningful if the fetch worked */
+       if (ret == 0 &&
+           (call->flags & CTDB_IMMEDIATE_MIGRATION) &&
+           (header.flags & CTDB_REC_RO_HAVE_DELEGATIONS)) {
+               ret = -1;
+       }
+
+       if (ret == 0 && header.dmaster == ctdb->pnn) {
+               state = ctdb_client_call_local_send(ctdb_db, call, &header, &data);
+               talloc_free(data.dptr);
+               ctdb_ltdb_unlock(ctdb_db, call->key);
+               return state;
+       }
+
+       ctdb_ltdb_unlock(ctdb_db, call->key);
+       talloc_free(data.dptr);
+
+       state = talloc_zero(ctdb_db, struct ctdb_client_call_state);
+       if (state == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " failed to allocate state\n"));
+               return NULL;
+       }
+       state->call = talloc_zero(state, struct ctdb_call);
+       if (state->call == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " failed to allocate state->call\n"));
+               /* state hangs off ctdb_db; free it instead of leaking it */
+               talloc_free(state);
+               return NULL;
+       }
+
+       len = offsetof(struct ctdb_req_call, data) + call->key.dsize + call->call_data.dsize;
+       c = ctdbd_allocate_pkt(ctdb, state, CTDB_REQ_CALL, len, struct ctdb_req_call);
+       if (c == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " failed to allocate packet\n"));
+               /* no destructor is installed yet, so this is a plain free */
+               talloc_free(state);
+               return NULL;
+       }
+
+       state->reqid     = ctdb_reqid_new(ctdb, state);
+       state->ctdb_db = ctdb_db;
+       talloc_set_destructor(state, ctdb_client_call_destructor);
+
+       c->hdr.reqid     = state->reqid;
+       c->flags         = call->flags;
+       c->db_id         = ctdb_db->db_id;
+       c->callid        = call->call_id;
+       c->hopcount      = 0;
+       c->keylen        = call->key.dsize;
+       c->calldatalen   = call->call_data.dsize;
+       memcpy(&c->data[0], call->key.dptr, call->key.dsize);
+       memcpy(&c->data[call->key.dsize], 
+              call->call_data.dptr, call->call_data.dsize);
+       /* the saved call refers to the copies inside the packet, which is
+          owned by state, rather than to the caller's buffers */
+       *(state->call)              = *call;
+       state->call->call_data.dptr = &c->data[call->key.dsize];
+       state->call->key.dptr       = &c->data[0];
+
+       state->state  = CTDB_CALL_WAIT;
+
+
+       ctdb_client_queue_pkt(ctdb, &c->hdr);
+
+       return state;
+}
+
+
+/*
+  synchronous ctdb_call: ctdb_call_send() immediately followed by
+  ctdb_call_recv() on the resulting state
+*/
+int ctdb_call(struct ctdb_db_context *ctdb_db, struct ctdb_call *call)
+{
+       /* a NULL state from a failed send is rejected by ctdb_call_recv() */
+       return ctdb_call_recv(ctdb_call_send(ctdb_db, call), call);
+}
+
+
+/*
+  register a messaging srvid with the daemon and install the matching
+  message handler in the client
+
+  Returns -1 if the CTDB_CONTROL_REGISTER_SRVID control fails, otherwise
+  the result of ctdb_register_message_handler().
+*/
+int ctdb_client_set_message_handler(struct ctdb_context *ctdb, uint64_t srvid, 
+                            ctdb_msg_fn_t handler,
+                            void *private_data)
+{
+       int32_t status;
+       int res = ctdb_control(ctdb, CTDB_CURRENT_NODE, srvid,
+                              CTDB_CONTROL_REGISTER_SRVID, 0,
+                              tdb_null, NULL, NULL, &status, NULL, NULL);
+
+       if (res == 0 && status == 0) {
+               /* the daemon accepted the srvid; record the handler locally too */
+               return ctdb_register_message_handler(ctdb, ctdb, srvid, handler, private_data);
+       }
+
+       DEBUG(DEBUG_ERR,("Failed to register srvid %llu\n", (unsigned long long)srvid));
+       return -1;
+}
+
+/*
+  tell the daemon we no longer want a srvid
+
+  Sends CTDB_CONTROL_DEREGISTER_SRVID for srvid and, if that succeeds,
+  removes the handler from the local ctdb structure.  Returns 0 on
+  success and -1 if the control fails.
+*/
+int ctdb_client_remove_message_handler(struct ctdb_context *ctdb, uint64_t srvid, void *private_data)
+{
+       int res;
+       int32_t status;
+
+       res = ctdb_control(ctdb, CTDB_CURRENT_NODE, srvid, CTDB_CONTROL_DEREGISTER_SRVID, 0, 
+                          tdb_null, NULL, NULL, &status, NULL, NULL);
+       if (res != 0 || status != 0) {
+               DEBUG(DEBUG_ERR,("Failed to deregister srvid %llu\n", (unsigned long long)srvid));
+               return -1;
+       }
+
+       /* also need to deregister the handler from our own ctdb structure */
+       ctdb_deregister_message_handler(ctdb, srvid, private_data);
+       return 0;
+}
+
+/*
+ * check server ids
+ *
+ * Sends the "num" srvids in "ids" to the daemon with
+ * CTDB_CONTROL_CHECK_SRVIDS and copies the one-byte-per-id answer into
+ * "result", which must have room for "num" bytes.
+ *
+ * Returns 0 on success, -1 if the control fails or the reply does not
+ * contain exactly one byte per id.
+ */
+int ctdb_client_check_message_handlers(struct ctdb_context *ctdb, uint64_t *ids, uint32_t num,
+                                      uint8_t *result)
+{
+       TDB_DATA indata, outdata;
+       int res;
+       int32_t status;
+       uint32_t i;     /* same type as num: no signed/unsigned comparison */
+
+       indata.dptr = (uint8_t *)ids;
+       indata.dsize = num * sizeof(*ids);
+
+       res = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_CHECK_SRVIDS, 0,
+                          indata, ctdb, &outdata, &status, NULL, NULL);
+       if (res != 0 || status != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " failed to check srvids\n"));
+               return -1;
+       }
+
+       if (outdata.dsize != num*sizeof(uint8_t)) {
+               /* both values are size_t, so print them with %zu */
+               DEBUG(DEBUG_ERR, (__location__ " expected %zu bytes, received %zu bytes\n",
+                                 (size_t)num*sizeof(uint8_t),
+                                 (size_t)outdata.dsize));
+               talloc_free(outdata.dptr);
+               return -1;
+       }
+
+       for (i=0; i<num; i++) {
+               result[i] = outdata.dptr[i];
+       }
+
+       talloc_free(outdata.dptr);
+       return 0;
+}
+
+/*
+  build a CTDB_REQ_MESSAGE packet carrying "data" for srvid on node pnn
+  and queue it to the daemon - from client context
+
+  The packet is freed again once it has been queued; the result of the
+  queueing is returned.
+ */
+int ctdb_client_send_message(struct ctdb_context *ctdb, uint32_t pnn,
+                     uint64_t srvid, TDB_DATA data)
+{
+       struct ctdb_req_message *r;
+       int pktlen;
+       int rc;
+
+       pktlen = offsetof(struct ctdb_req_message, data) + data.dsize;
+
+       r = ctdbd_allocate_pkt(ctdb, ctdb, CTDB_REQ_MESSAGE, 
+                              pktlen, struct ctdb_req_message);
+       CTDB_NO_MEMORY(ctdb, r);
+
+       /* payload first, then the addressing fields */
+       memcpy(&r->data[0], data.dptr, data.dsize);
+       r->datalen       = data.dsize;
+       r->srvid         = srvid;
+       r->hdr.destnode  = pnn;
+
+       rc = ctdb_client_queue_pkt(ctdb, &r->hdr);
+
+       talloc_free(r);
+
+       return rc;
+}
+
+
+/*
+  talloc destructor for a record handle: drops the lock on the handle's
+  key, which cancels the ctdb_fetch_lock operation
+ */
+static int fetch_lock_destructor(struct ctdb_record_handle *h)
+{
+       struct ctdb_db_context *db = h->ctdb_db;
+
+       ctdb_ltdb_unlock(db, h->key);
+
+       return 0;
+}
+
+/*
+  ask for the record with the given key to be migrated to this node by
+  issuing a CTDB_NULL_FUNC call flagged CTDB_IMMEDIATE_MIGRATION
+
+  Returns whatever ctdb_call() returns.
+ */
+static int ctdb_client_force_migration(struct ctdb_db_context *ctdb_db, TDB_DATA key)
+{
+       struct ctdb_call migrate;
+       int ret;
+
+       ZERO_STRUCT(migrate);
+       migrate.flags   = CTDB_IMMEDIATE_MIGRATION;
+       migrate.key     = key;
+       migrate.call_id = CTDB_NULL_FUNC;
+
+       ret = ctdb_call(ctdb_db, &migrate);
+       return ret;
+}
+
+/*
+  try to fetch a readonly copy of a record
+
+  Issues a CTDB_FETCH_WITH_HEADER_FUNC call flagged CTDB_WANT_READONLY.
+  The reply is expected to start with a struct ctdb_ltdb_header followed
+  by the record data.  On success *hdr and data->dptr are fresh copies
+  allocated on mem_ctx and 0 is returned.  On failure -1 is returned;
+  if the header had already been copied it is freed again and *hdr is
+  set to NULL.
+ */
+static int
+ctdb_client_fetch_readonly(struct ctdb_db_context *ctdb_db, TDB_DATA key, TALLOC_CTX *mem_ctx, struct ctdb_ltdb_header **hdr, TDB_DATA *data)
+{
+       int ret;
+       int result = -1;
+
+       struct ctdb_call call;
+       ZERO_STRUCT(call);
+
+       call.call_id = CTDB_FETCH_WITH_HEADER_FUNC;
+       call.call_data.dptr = NULL;
+       call.call_data.dsize = 0;
+       call.key = key;
+       call.flags = CTDB_WANT_READONLY;
+       ret = ctdb_call(ctdb_db, &call);
+
+       if (ret != 0) {
+               goto done;
+       }
+       if (call.reply_data.dsize < sizeof(struct ctdb_ltdb_header)) {
+               goto done;
+       }
+
+       *hdr = talloc_memdup(mem_ctx, &call.reply_data.dptr[0], sizeof(struct ctdb_ltdb_header));
+       if (*hdr == NULL) {
+               goto done;
+       }
+
+       data->dsize = call.reply_data.dsize - sizeof(struct ctdb_ltdb_header);
+       data->dptr  = talloc_memdup(mem_ctx, &call.reply_data.dptr[sizeof(struct ctdb_ltdb_header)], data->dsize);
+       if (data->dptr == NULL) {
+               /* free the header copy itself (*hdr), not the caller's
+                  pointer variable, and clear it so that the caller
+                  cannot free it a second time */
+               talloc_free(*hdr);
+               *hdr = NULL;
+               data->dsize = 0;
+               goto done;
+       }
+
+       result = 0;
+
+done:
+       /* header and data were copied, so the reply buffer is no longer
+          needed on any path; it is NULL if the call produced no reply */
+       talloc_free(call.reply_data.dptr);
+       return result;
+}
+
+/*
+  get a lock on a record, and return the records data. Blocks until it gets the lock
+
+  The returned handle is allocated on mem_ctx and carries a destructor
+  that releases the record lock, so freeing the handle unlocks the
+  record.  Returns NULL on failure.
+ */
+struct ctdb_record_handle *ctdb_fetch_lock(struct ctdb_db_context *ctdb_db, TALLOC_CTX *mem_ctx, 
+                                          TDB_DATA key, TDB_DATA *data)
+{
+       int ret;
+       struct ctdb_record_handle *h;
+
+       /*
+         procedure is as follows:
+
+         1) get the chain lock. 
+         2) check if we are dmaster
+         3) if we are the dmaster then return handle 
+         4) if not dmaster then ask ctdb daemon to make us dmaster, and wait for
+            reply from ctdbd
+         5) when we get the reply, goto (1)
+        */
+
+       h = talloc_zero(mem_ctx, struct ctdb_record_handle);
+       if (h == NULL) {
+               return NULL;
+       }
+
+       h->ctdb_db = ctdb_db;
+       h->key     = key;
+       h->key.dptr = talloc_memdup(h, key.dptr, key.dsize);
+       if (h->key.dptr == NULL) {
+               talloc_free(h);
+               return NULL;
+       }
+       h->data    = data;
+
+       DEBUG(DEBUG_DEBUG,("ctdb_fetch_lock: key=%*.*s\n", (int)key.dsize, (int)key.dsize, 
+                (const char *)key.dptr));
+
+again:
+       /* step 1 - get the chain lock */
+       ret = ctdb_ltdb_lock(ctdb_db, key);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " failed to lock ltdb record\n"));
+               talloc_free(h);
+               return NULL;
+       }
+
+       DEBUG(DEBUG_DEBUG,("ctdb_fetch_lock: got chain lock\n"));
+
+       /* from here on, freeing h releases the lock */
+       talloc_set_destructor(h, fetch_lock_destructor);
+
+       ret = ctdb_ltdb_fetch(ctdb_db, key, &h->header, h, data);
+
+       /* when torturing, ensure we test the remote path */
+       if ((ctdb_db->ctdb->flags & CTDB_FLAG_TORTURE) &&
+           random() % 5 == 0) {
+               h->header.dmaster = (uint32_t)-1;
+       }
+
+
+       DEBUG(DEBUG_DEBUG,("ctdb_fetch_lock: done local fetch\n"));
+
+       if (ret != 0 || h->header.dmaster != ctdb_db->ctdb->pnn) {
+               ctdb_ltdb_unlock(ctdb_db, key);
+               /* the lock is gone, so disarm the destructor: otherwise
+                  the talloc_free(h) error paths would unlock a second
+                  time.  It is re-armed after the next successful lock. */
+               talloc_set_destructor(h, NULL);
+               ret = ctdb_client_force_migration(ctdb_db, key);
+               if (ret != 0) {
+                       DEBUG(DEBUG_DEBUG,("ctdb_fetch_lock: force_migration failed\n"));
+                       talloc_free(h);
+                       return NULL;
+               }
+               goto again;
+       }
+
+       DEBUG(DEBUG_DEBUG,("ctdb_fetch_lock: we are dmaster - done\n"));
+       return h;
+}
+
+/*
+  get a readonly lock on a record, and return the records data. Blocks until it gets the lock
+
+  read_only == 0 requests a read/write lock: the handle is only returned
+  once this node is dmaster, and any existing delegations are revoked
+  first by forcing a migration.  read_only != 0 additionally accepts a
+  local copy whose header carries CTDB_REC_RO_HAVE_READONLY or
+  CTDB_REC_RO_HAVE_DELEGATIONS, requesting one from the daemon if needed.
+
+  The handle is allocated on mem_ctx and has fetch_lock_destructor
+  installed after the first successful lock.  Returns NULL on failure.
+
+  NOTE(review): several error paths call ctdb_ltdb_unlock() and then
+  talloc_free(h) while the destructor is still installed, which looks
+  like it unlocks the record a second time - confirm.
+ */
+struct ctdb_record_handle *
+ctdb_fetch_readonly_lock(
+       struct ctdb_db_context *ctdb_db, TALLOC_CTX *mem_ctx, 
+       TDB_DATA key, TDB_DATA *data,
+       int read_only)
+{
+       int ret;
+       struct ctdb_record_handle *h;
+       struct ctdb_ltdb_header *roheader = NULL;
+
+       h = talloc_zero(mem_ctx, struct ctdb_record_handle);
+       if (h == NULL) {
+               return NULL;
+       }
+
+       h->ctdb_db = ctdb_db;
+       h->key     = key;
+       h->key.dptr = talloc_memdup(h, key.dptr, key.dsize);
+       if (h->key.dptr == NULL) {
+               talloc_free(h);
+               return NULL;
+       }
+       h->data    = data;
+
+       data->dptr = NULL;
+       data->dsize = 0;
+
+
+again:
+       talloc_free(roheader); /* drop results left over from a previous attempt */
+       roheader = NULL;
+
+       talloc_free(data->dptr);
+       data->dptr = NULL;
+       data->dsize = 0;
+
+       /* Lock the record/chain */
+       ret = ctdb_ltdb_lock(ctdb_db, key);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " failed to lock ltdb record\n"));
+               talloc_free(h);
+               return NULL;
+       }
+
+       talloc_set_destructor(h, fetch_lock_destructor);
+
+       /* Check if record exists yet in the TDB */
+       ret = ctdb_ltdb_fetch_with_header(ctdb_db, key, &h->header, h, data);
+       if (ret != 0) {
+               ctdb_ltdb_unlock(ctdb_db, key);
+               ret = ctdb_client_force_migration(ctdb_db, key);
+               if (ret != 0) {
+                       DEBUG(DEBUG_DEBUG,("ctdb_fetch_readonly_lock: force_migration failed\n"));
+                       talloc_free(h);
+                       return NULL;
+               }
+               goto again;
+       }
+
+       /* if this is a request for read/write and we have delegations
+          we have to revoke all delegations first
+       */
+       if ((read_only == 0) 
+       &&  (h->header.dmaster == ctdb_db->ctdb->pnn)
+       &&  (h->header.flags & CTDB_REC_RO_HAVE_DELEGATIONS)) {
+               ctdb_ltdb_unlock(ctdb_db, key);
+               ret = ctdb_client_force_migration(ctdb_db, key);
+               if (ret != 0) {
+                       DEBUG(DEBUG_DEBUG,("ctdb_fetch_readonly_lock: force_migration failed\n"));
+                       talloc_free(h);
+                       return NULL;
+               }
+               goto again;
+       }
+
+       /* if we are dmaster, just return the handle */
+       if (h->header.dmaster == ctdb_db->ctdb->pnn) {
+               return h;
+       }
+
+       if (read_only != 0) {
+               TDB_DATA rodata = {NULL, 0};
+
+               if ((h->header.flags & CTDB_REC_RO_HAVE_READONLY)
+               ||  (h->header.flags & CTDB_REC_RO_HAVE_DELEGATIONS)) {
+                       return h;
+               }
+
+               ctdb_ltdb_unlock(ctdb_db, key);
+               ret = ctdb_client_fetch_readonly(ctdb_db, key, h, &roheader, &rodata); /* only roheader->flags is looked at below; rodata is not used further */
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,("ctdb_fetch_readonly_lock:  failed. force migration and try again\n"));
+                       ret = ctdb_client_force_migration(ctdb_db, key);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_DEBUG,("ctdb_fetch_readonly_lock: force_migration failed\n"));
+                               talloc_free(h);
+                               return NULL;
+                       }
+
+                       goto again;
+               }
+
+               if (!(roheader->flags&CTDB_REC_RO_HAVE_READONLY)) {
+                       ret = ctdb_client_force_migration(ctdb_db, key);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_DEBUG,("ctdb_fetch_readonly_lock: force_migration failed\n"));
+                               talloc_free(h);
+                               return NULL;
+                       }
+
+                       goto again;
+               }
+
+               ret = ctdb_ltdb_lock(ctdb_db, key); /* re-take the lock dropped before the readonly fetch */
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, (__location__ " failed to lock ltdb record\n"));
+                       talloc_free(h);
+                       return NULL;
+               }
+
+               ret = ctdb_ltdb_fetch_with_header(ctdb_db, key, &h->header, h, data);
+               if (ret != 0) {
+                       ctdb_ltdb_unlock(ctdb_db, key);
+
+                       ret = ctdb_client_force_migration(ctdb_db, key);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_DEBUG,("ctdb_fetch_readonly_lock: force_migration failed\n"));
+                               talloc_free(h);
+                               return NULL;
+                       }
+
+                       goto again;
+               }
+
+               return h;
+       }
+
+       /* we are not dmaster and this was not a request for a readonly lock
+        * so unlock the record, migrate it and try again
+        */
+       ctdb_ltdb_unlock(ctdb_db, key);
+       ret = ctdb_client_force_migration(ctdb_db, key);
+       if (ret != 0) {
+               DEBUG(DEBUG_DEBUG,("ctdb_fetch_lock: force_migration failed\n"));
+               talloc_free(h);
+               return NULL;
+       }
+       goto again;
+}
+
+/*
+  write new data for the record held by a handle from ctdb_fetch_lock()
+
+  Refused with -1 for persistent databases; otherwise returns the result
+  of ctdb_ltdb_store() using the handle's key and header.
+*/
+int ctdb_record_store(struct ctdb_record_handle *h, TDB_DATA data)
+{
+       struct ctdb_db_context *db = h->ctdb_db;
+
+       if (!db->persistent) {
+               return ctdb_ltdb_store(db, h->key, &h->header, data);
+       }
+
+       DEBUG(DEBUG_ERR, (__location__ " ctdb_record_store prohibited for persistent dbs\n"));
+       return -1;
+}
+
+/*
+  non-locking fetch of a record
+
+  Runs CTDB_FETCH_FUNC for key through ctdb_call().  On success (return
+  value 0) *data receives the reply and its buffer is reparented onto
+  mem_ctx.  Any other return value from ctdb_call() is passed through and
+  *data is left untouched.
+ */
+int ctdb_fetch(struct ctdb_db_context *ctdb_db, TALLOC_CTX *mem_ctx, 
+              TDB_DATA key, TDB_DATA *data)
+{
+       struct ctdb_call call;
+       int ret;
+
+       /* clear every field: ctdb_call_send() reads call.flags, which
+          would otherwise be uninitialised stack contents */
+       ZERO_STRUCT(call);
+
+       call.call_id = CTDB_FETCH_FUNC;
+       call.call_data.dptr = NULL;
+       call.call_data.dsize = 0;
+       call.key = key;
+
+       ret = ctdb_call(ctdb_db, &call);
+
+       if (ret == 0) {
+               *data = call.reply_data;
+               talloc_steal(mem_ctx, data->dptr);
+       }
+
+       return ret;
+}
+
+
+
+/*
+   called when a control completes or times out to invoke the callback
+   function the user provided
+
+   Runs as a timed event; private_data is the ctdb_client_control_state.
+   The state is moved under a temporary context so that it is released
+   when this function returns.  A failure of ctdb_control_recv() is only
+   logged.  NOTE(review): the user callback is presumably invoked from
+   inside ctdb_control_recv(), which is not visible here - confirm.
+*/
+static void invoke_control_callback(struct event_context *ev, struct timed_event *te, 
+       struct timeval t, void *private_data)
+{
+       struct ctdb_client_control_state *state;
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       int ret;
+
+       state = talloc_get_type(private_data, struct ctdb_client_control_state);
+       talloc_steal(tmp_ctx, state);
+
+       ret = ctdb_control_recv(state->ctdb, state, state,
+                       NULL, 
+                       NULL, 
+                       NULL);
+       if (ret != 0) {
+               DEBUG(DEBUG_DEBUG,("ctdb_control_recv() failed, ignoring return code %d\n", ret));
+       }
+
+       talloc_free(tmp_ctx);
+}
+
+/*
+  called when a CTDB_REPLY_CONTROL packet comes in in the client
+
+  This packet comes in response to a CTDB_REQ_CONTROL request packet. It
+  contains any reply data from the control
+
+  The pending control state is looked up by hdr->reqid; replies without a
+  matching state are logged and dropped.  The error message, if any, is
+  read from the bytes that follow the reply data in the packet.
+*/
+static void ctdb_client_reply_control(struct ctdb_context *ctdb, 
+                                     struct ctdb_req_header *hdr)
+{
+       struct ctdb_reply_control *c = (struct ctdb_reply_control *)hdr;
+       struct ctdb_client_control_state *state;
+
+       state = ctdb_reqid_find(ctdb, hdr->reqid, struct ctdb_client_control_state);
+       if (state == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " reqid %u not found\n", hdr->reqid));
+               return;
+       }
+
+       if (hdr->reqid != state->reqid) {
+               /* we found a record but it was the wrong one */
+               DEBUG(DEBUG_ERR, ("Dropped orphaned reply control with reqid:%u\n",hdr->reqid));
+               return;
+       }
+
+       state->outdata.dptr = c->data;
+       state->outdata.dsize = c->datalen;
+       state->status = c->status;
+       if (c->errorlen) {
+               state->errormsg = talloc_strndup(state, 
+                                                (char *)&c->data[c->datalen], 
+                                                c->errorlen);
+       }
+
+       /* state->outdata now uses resources from c so we don't want c
+          to just disappear from under us while state is still alive
+       */
+       talloc_steal(state, c);
+
+       state->state = CTDB_CONTROL_DONE;
+
+       /* if we had a callback registered for this control, schedule
+          invoke_control_callback as an immediate timed event; it pulls
+          the response and calls the callback.
+       */
+       if (state->async.fn) {
+               event_add_timed(ctdb->ev, state, timeval_zero(), invoke_control_callback, state);
+       }
+}
+
+
+/*
+  talloc destructor for a client control state: drops the request id that
+  was registered for it so a late reply can no longer be matched to it.
+  Always returns 0.
+*/
+static int ctdb_client_control_destructor(struct ctdb_client_control_state *state)
+{
+       struct ctdb_context *owner = state->ctdb;
+
+       ctdb_reqid_remove(owner, state->reqid);
+
+       return 0;
+}
+
+
+/*
+  timed-event handler fired when a control armed by ctdb_control_send()
+  reaches its timeout.  private_data is the control state.  The state is
+  marked CTDB_CONTROL_TIMEOUT and, when an async callback is registered, a
+  zero-timeout event is queued to deliver it.
+*/
+static void control_timeout_func(struct event_context *ev, struct timed_event *te, 
+       struct timeval t, void *private_data)
+{
+       struct ctdb_client_control_state *ctl;
+
+       ctl = talloc_get_type(private_data, struct ctdb_client_control_state);
+
+       DEBUG(DEBUG_ERR,(__location__ " control timed out. reqid:%u opcode:%u "
+                        "dstnode:%u\n", ctl->reqid, ctl->c->opcode,
+                        ctl->c->hdr.destnode));
+
+       ctl->state = CTDB_CONTROL_TIMEOUT;
+
+       /* nothing more to do unless somebody is waiting asynchronously */
+       if (!ctl->async.fn) {
+               return;
+       }
+
+       event_add_timed(ctl->ctdb->ev, ctl, timeval_zero(), invoke_control_callback, ctl);
+}
+
+/*
+  async version of send control request
+
+  Builds a CTDB_REQ_CONTROL packet and queues it to the daemon.
+
+  destnode/srvid/opcode/flags/data are copied into the request packet.
+  mem_ctx is the talloc parent of the returned state.
+  timeout, when non-NULL and non-zero, arms control_timeout_func().
+  errormsg, when non-NULL, is reset to NULL; it is never set to an error
+  string by this function.
+
+  Returns the control state to pass to ctdb_control_recv(), or NULL.
+  NULL is returned both on failure and, after a successful send, when
+  CTDB_CTRL_FLAG_NOREPLY is set (the state is freed since no reply will
+  arrive), so callers cannot tell these two cases apart.
+*/
+struct ctdb_client_control_state *ctdb_control_send(struct ctdb_context *ctdb, 
+               uint32_t destnode, uint64_t srvid, 
+               uint32_t opcode, uint32_t flags, TDB_DATA data, 
+               TALLOC_CTX *mem_ctx,
+               struct timeval *timeout,
+               char **errormsg)
+{
+       struct ctdb_client_control_state *state;
+       size_t len;
+       struct ctdb_req_control *c;
+       int ret;
+
+       if (errormsg) {
+               *errormsg = NULL;
+       }
+
+       /* if the domain socket is not yet open, open it */
+       if (ctdb->daemon.sd==-1) {
+               ctdb_socket_connect(ctdb);
+       }
+
+       state = talloc_zero(mem_ctx, struct ctdb_client_control_state);
+       CTDB_NO_MEMORY_NULL(ctdb, state);
+
+       state->ctdb       = ctdb;
+       state->reqid      = ctdb_reqid_new(ctdb, state);
+       state->state      = CTDB_CONTROL_WAIT;
+       state->errormsg   = NULL;
+
+       talloc_set_destructor(state, ctdb_client_control_destructor);
+
+       len = offsetof(struct ctdb_req_control, data) + data.dsize;
+       c = ctdbd_allocate_pkt(ctdb, state, CTDB_REQ_CONTROL, 
+                              len, struct ctdb_req_control);
+       if (c == NULL) {
+               /* do not leave the half-built state (and its reqid) behind
+                  on the caller's context */
+               talloc_free(state);
+               CTDB_NO_MEMORY_NULL(ctdb, c);
+       }
+       state->c            = c;
+       c->hdr.reqid        = state->reqid;
+       c->hdr.destnode     = destnode;
+       c->opcode           = opcode;
+       c->client_id        = 0;
+       c->flags            = flags;
+       c->srvid            = srvid;
+       c->datalen          = data.dsize;
+       if (data.dsize) {
+               memcpy(&c->data[0], data.dptr, data.dsize);
+       }
+
+       /* timeout */
+       if (timeout && !timeval_is_zero(timeout)) {
+               event_add_timed(ctdb->ev, state, *timeout, control_timeout_func, state);
+       }
+
+       ret = ctdb_client_queue_pkt(ctdb, &(c->hdr));
+       if (ret != 0) {
+               talloc_free(state);
+               return NULL;
+       }
+
+       /* no reply will ever arrive for this request, so there is nothing
+          for the caller to wait on */
+       if (flags & CTDB_CTRL_FLAG_NOREPLY) {
+               talloc_free(state);
+               return NULL;
+       }
+
+       return state;
+}
+
+
+/*
+  async version of receive control reply
+
+  Runs the event loop until the control leaves CTDB_CONTROL_WAIT, then
+  hands the result to the caller.
+
+  state    control returned by ctdb_control_send(); NULL yields -1.
+           The state is always freed before this function returns.
+  mem_ctx  talloc parent for the copies returned in *outdata / *errormsg.
+  outdata  optional; receives a copy of the reply data on success.
+  status   optional; preset to -1, receives the control status on success.
+  errormsg optional; preset to NULL, receives the daemon's error string
+           when one was sent.
+
+  Returns 0 on success and -1 on timeout, on an error message from the
+  daemon, or when the reply data cannot be copied.  state->async.fn, if
+  set, is invoked once on every path after the wait has finished.
+*/
+int ctdb_control_recv(struct ctdb_context *ctdb, 
+               struct ctdb_client_control_state *state, 
+               TALLOC_CTX *mem_ctx,
+               TDB_DATA *outdata, int32_t *status, char **errormsg)
+{
+       TALLOC_CTX *tmp_ctx;
+
+       if (status != NULL) {
+               *status = -1;
+       }
+       if (errormsg != NULL) {
+               *errormsg = NULL;
+       }
+
+       if (state == NULL) {
+               return -1;
+       }
+
+       /* prevent double free of state */
+       tmp_ctx = talloc_new(ctdb);
+       talloc_steal(tmp_ctx, state);
+
+       /* loop one event at a time until we either timeout or the control
+          completes.
+       */
+       while (state->state == CTDB_CONTROL_WAIT) {
+               event_loop_once(ctdb->ev);
+       }
+
+       if (state->state != CTDB_CONTROL_DONE) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control_recv failed\n"));
+               if (state->async.fn) {
+                       state->async.fn(state);
+               }
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       if (state->errormsg) {
+               DEBUG(DEBUG_ERR,("ctdb_control error: '%s'\n", state->errormsg));
+               if (errormsg) {
+                       (*errormsg) = talloc_move(mem_ctx, &state->errormsg);
+               }
+               if (state->async.fn) {
+                       state->async.fn(state);
+               }
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       if (outdata) {
+               /* the reply data lives inside the packet owned by state, so
+                  give the caller a private copy that survives the free */
+               *outdata = state->outdata;
+               outdata->dptr = talloc_memdup(mem_ctx, outdata->dptr, outdata->dsize);
+               if (outdata->dptr == NULL && outdata->dsize != 0) {
+                       /* never report success with a NULL buffer and a
+                          non-zero length: callers dereference dptr */
+                       DEBUG(DEBUG_ERR,(__location__ " out of memory copying control reply\n"));
+                       outdata->dsize = 0;
+                       if (state->async.fn) {
+                               state->async.fn(state);
+                       }
+                       talloc_free(tmp_ctx);
+                       return -1;
+               }
+       }
+
+       if (status) {
+               *status = state->status;
+       }
+
+       if (state->async.fn) {
+               state->async.fn(state);
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+
+
+/*
+  send a ctdb control message and wait for its reply
+
+  This is ctdb_control_send() followed by ctdb_control_recv().
+  timeout specifies how long we should wait for a reply.
+  if timeout is NULL we wait indefinitely
+
+  With CTDB_CTRL_FLAG_NOREPLY nothing is received: *status (if given) is
+  set to 0 and 0 is returned.
+ */
+int ctdb_control(struct ctdb_context *ctdb, uint32_t destnode, uint64_t srvid, 
+                uint32_t opcode, uint32_t flags, TDB_DATA data, 
+                TALLOC_CTX *mem_ctx, TDB_DATA *outdata, int32_t *status,
+                struct timeval *timeout,
+                char **errormsg)
+{
+       struct ctdb_client_control_state *pending;
+
+       pending = ctdb_control_send(ctdb, destnode, srvid, opcode,
+                                   flags, data, mem_ctx,
+                                   timeout, errormsg);
+
+       if (!(flags & CTDB_CTRL_FLAG_NOREPLY)) {
+               return ctdb_control_recv(ctdb, pending, mem_ctx, outdata,
+                                        status, errormsg);
+       }
+
+       /* FIXME: Error conditions in ctdb_control_send return NULL without
+        * setting errormsg.  So, there is no way to distinguish between
+        * success and failure when CTDB_CTRL_FLAG_NOREPLY is set */
+       if (status != NULL) {
+               *status = 0;
+       }
+
+       return 0;
+}
+
+
+
+
+/*
+  ask destnode whether process pid exists.
+  Returns the status reported by the remote node (0 if the process exists
+  according to the original contract), or -1 if the control failed.
+ */
+int ctdb_ctrl_process_exists(struct ctdb_context *ctdb, uint32_t destnode, pid_t pid)
+{
+       TDB_DATA request;
+       int32_t remote_status;
+       int rc;
+
+       request.dsize = sizeof(pid);
+       request.dptr = (uint8_t*)&pid;
+
+       rc = ctdb_control(ctdb, destnode, 0,
+                         CTDB_CONTROL_PROCESS_EXISTS, 0, request,
+                         NULL, NULL, &remote_status, NULL, NULL);
+       if (rc == 0) {
+               return remote_status;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for process_exists failed\n"));
+       return -1;
+}
+
+/*
+  get remote statistics
+
+  Fetches the statistics of destnode and copies them into *status.
+  Returns 0 on success, -1 if the control fails, reports a non-zero
+  status or returns a reply of unexpected size.
+ */
+int ctdb_ctrl_statistics(struct ctdb_context *ctdb, uint32_t destnode, struct ctdb_statistics *status)
+{
+       int ret;
+       TDB_DATA data;
+       int32_t res;
+
+       ret = ctdb_control(ctdb, destnode, 0, 
+                          CTDB_CONTROL_STATISTICS, 0, tdb_null, 
+                          ctdb, &data, &res, NULL, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for statistics failed\n"));
+               if (ret == 0) {
+                       /* a reply buffer was handed to us; it hangs off the
+                          long-lived ctdb context, so release it */
+                       talloc_free(data.dptr);
+               }
+               return -1;
+       }
+
+       if (data.dsize != sizeof(struct ctdb_statistics)) {
+               DEBUG(DEBUG_ERR,(__location__ " Wrong statistics size %u - expected %u\n",
+                        (unsigned)data.dsize, (unsigned)sizeof(struct ctdb_statistics)));
+               talloc_free(data.dptr);
+               return -1;
+       }
+
+       *status = *(struct ctdb_statistics *)data.dptr;
+       talloc_free(data.dptr);
+
+       return 0;
+}
+
+/*
+ * release a partially built db statistics record together with the first
+ * nkeys hot key buffers that were already allocated for it
+ */
+static void ctdb_dbstatistics_discard(struct ctdb_db_statistics *s, int nkeys)
+{
+       int k;
+
+       for (k=0; k<nkeys; k++) {
+               talloc_free(s->hot_keys[k].key.dptr);
+       }
+       talloc_free(s);
+}
+
+/*
+ * get db statistics
+ *
+ * Fetches the statistics of database dbid from destnode.  On success
+ * *dbstat is a new record allocated on mem_ctx; each hot key buffer is a
+ * separate allocation on mem_ctx as well.
+ *
+ * The reply consists of the fixed part of struct ctdb_db_statistics
+ * followed by the key bytes of every hot key, packed back to back.  The
+ * counts and lengths come from the remote node, so they are checked
+ * against the size of the reply before anything is copied.
+ *
+ * Returns 0 on success, -1 on failure (nothing is left allocated).
+ */
+int ctdb_ctrl_dbstatistics(struct ctdb_context *ctdb, uint32_t destnode, uint32_t dbid,
+                          TALLOC_CTX *mem_ctx, struct ctdb_db_statistics **dbstat)
+{
+       int ret;
+       TDB_DATA indata, outdata;
+       int32_t res;
+       struct ctdb_db_statistics *wire, *s;
+       const size_t fixed_len = offsetof(struct ctdb_db_statistics, hot_keys_wire);
+       size_t remaining;
+       uint8_t *keybuf;
+       char *ptr;
+       int i;
+
+       indata.dptr = (uint8_t *)&dbid;
+       indata.dsize = sizeof(dbid);
+
+       ret = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_GET_DB_STATISTICS,
+                          0, indata, ctdb, &outdata, &res, NULL, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for dbstatistics failed\n"));
+               if (ret == 0) {
+                       talloc_free(outdata.dptr);
+               }
+               return -1;
+       }
+
+       if (outdata.dsize < fixed_len) {
+               DEBUG(DEBUG_ERR,(__location__ " Wrong dbstatistics size %lu - expected >= %lu\n",
+                                (long unsigned int)outdata.dsize,
+                                (long unsigned int)fixed_len));
+               talloc_free(outdata.dptr);
+               return -1;
+       }
+
+       s = talloc_zero(mem_ctx, struct ctdb_db_statistics);
+       if (s == NULL) {
+               talloc_free(outdata.dptr);
+               CTDB_NO_MEMORY(ctdb, s);
+       }
+
+       wire = (struct ctdb_db_statistics *)outdata.dptr;
+       /* copy only the fixed part: the reply is only guaranteed to be
+          fixed_len bytes long, a whole-struct copy could read past it */
+       memcpy(s, wire, fixed_len);
+
+       /* hot_keys is a fixed-size array inside the struct */
+       if (wire->num_hot_keys > sizeof(s->hot_keys)/sizeof(s->hot_keys[0])) {
+               DEBUG(DEBUG_ERR,(__location__ " Invalid number of hot keys %u in dbstatistics\n",
+                                (unsigned)wire->num_hot_keys));
+               ctdb_dbstatistics_discard(s, 0);
+               talloc_free(outdata.dptr);
+               return -1;
+       }
+
+       remaining = outdata.dsize - fixed_len;
+       ptr = &wire->hot_keys_wire[0];
+       for (i=0; i<wire->num_hot_keys; i++) {
+               size_t klen = wire->hot_keys[i].key.dsize;
+
+               if (klen > remaining) {
+                       DEBUG(DEBUG_ERR,(__location__ " Truncated hot key data in dbstatistics\n"));
+                       ctdb_dbstatistics_discard(s, i);
+                       talloc_free(outdata.dptr);
+                       return -1;
+               }
+
+               keybuf = talloc_size(mem_ctx, klen);
+               if (keybuf == NULL) {
+                       ctdb_dbstatistics_discard(s, i);
+                       talloc_free(outdata.dptr);
+                       CTDB_NO_MEMORY(ctdb, keybuf);
+               }
+
+               memcpy(keybuf, ptr, klen);
+               s->hot_keys[i].key.dptr = keybuf;
+               ptr += klen;
+               remaining -= klen;
+       }
+
+       talloc_free(outdata.dptr);
+       *dbstat = s;
+       return 0;
+}
+
+/*
+  shutdown a remote ctdb node
+
+  Only sends the CTDB_CONTROL_SHUTDOWN request; the reply is not waited
+  for.  Returns 0 if the request was queued, -1 otherwise.
+  NOTE(review): the returned control state is neither received nor freed
+  here - presumably intentional fire-and-forget; confirm.
+ */
+int ctdb_ctrl_shutdown(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+{
+       struct ctdb_client_control_state *req;
+
+       req = ctdb_control_send(ctdb, destnode, 0,
+                               CTDB_CONTROL_SHUTDOWN, 0, tdb_null,
+                               NULL, &timeout, NULL);
+       if (req != NULL) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for shutdown failed\n"));
+       return -1;
+}
+
+/*
+  get vnn map from a remote node
+
+  On success *vnnmap is a new struct ctdb_vnn_map allocated on mem_ctx
+  (its map array is a child of it).  Returns 0 on success, -1 if the
+  control fails, the reply has an inconsistent size, or memory runs out.
+  The temporary reply buffer is released on every path.
+ */
+int ctdb_ctrl_getvnnmap(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, TALLOC_CTX *mem_ctx, struct ctdb_vnn_map **vnnmap)
+{
+       int ret;
+       TDB_DATA outdata;
+       int32_t res;
+       struct ctdb_vnn_map_wire *map;
+
+       ret = ctdb_control(ctdb, destnode, 0, 
+                          CTDB_CONTROL_GETVNNMAP, 0, tdb_null, 
+                          mem_ctx, &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getvnnmap failed\n"));
+               if (ret == 0) {
+                       talloc_free(outdata.dptr);
+               }
+               return -1;
+       }
+
+       /* map->size is only read once the header is known to be present */
+       map = (struct ctdb_vnn_map_wire *)outdata.dptr;
+       if (outdata.dsize < offsetof(struct ctdb_vnn_map_wire, map) ||
+           outdata.dsize != map->size*sizeof(uint32_t) + offsetof(struct ctdb_vnn_map_wire, map)) {
+               DEBUG(DEBUG_ERR,("Bad vnn map size received in ctdb_ctrl_getvnnmap\n"));
+               talloc_free(outdata.dptr);
+               return -1;
+       }
+
+       (*vnnmap) = talloc(mem_ctx, struct ctdb_vnn_map);
+       if (*vnnmap == NULL) {
+               talloc_free(outdata.dptr);
+               CTDB_NO_MEMORY(ctdb, *vnnmap);
+       }
+       (*vnnmap)->generation = map->generation;
+       (*vnnmap)->size       = map->size;
+       (*vnnmap)->map        = talloc_array(*vnnmap, uint32_t, map->size);
+       if ((*vnnmap)->map == NULL) {
+               /* do not hand a half-built map back to the caller */
+               talloc_free(outdata.dptr);
+               talloc_free(*vnnmap);
+               *vnnmap = NULL;
+               CTDB_NO_MEMORY(ctdb, *vnnmap);
+       }
+
+       memcpy((*vnnmap)->map, map->map, sizeof(uint32_t)*map->size);
+       talloc_free(outdata.dptr);
+
+       return 0;
+}
+
+
+/*
+  get the recovery mode of a remote node - async send half.
+  Returns the control state for ctdb_ctrl_getrecmode_recv(), or NULL if
+  the request could not be sent.
+ */
+struct ctdb_client_control_state *
+ctdb_ctrl_getrecmode_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode)
+{
+       struct ctdb_client_control_state *req;
+
+       req = ctdb_control_send(ctdb, destnode, 0,
+                               CTDB_CONTROL_GET_RECMODE, 0, tdb_null,
+                               mem_ctx, &timeout, NULL);
+
+       return req;
+}
+
+/*
+  get the recovery mode of a remote node - async receive half.
+  The recovery mode is carried in the status of the control reply and is
+  stored in *recmode when recmode is non-NULL.
+  Returns 0 on success, -1 if receiving the reply failed.
+ */
+int ctdb_ctrl_getrecmode_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, uint32_t *recmode)
+{
+       int32_t mode;
+
+       if (ctdb_control_recv(ctdb, state, mem_ctx, NULL, &mode, NULL) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_getrecmode_recv failed\n"));
+               return -1;
+       }
+
+       if (recmode != NULL) {
+               *recmode = (uint32_t)mode;
+       }
+
+       return 0;
+}
+
+/*
+  get the recovery mode of a remote node - synchronous wrapper around the
+  send/recv pair.  A failed send yields a NULL state, which the receive
+  half reports as -1.
+ */
+int ctdb_ctrl_getrecmode(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, uint32_t *recmode)
+{
+       return ctdb_ctrl_getrecmode_recv(ctdb, mem_ctx,
+                       ctdb_ctrl_getrecmode_send(ctdb, mem_ctx, timeout, destnode),
+                       recmode);
+}
+
+
+
+
+/*
+  set the recovery mode of a remote node
+  Returns 0 when the control succeeded with status 0, -1 otherwise.
+ */
+int ctdb_ctrl_setrecmode(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t recmode)
+{
+       TDB_DATA payload;
+       int32_t remote_res;
+       int rc;
+
+       payload.dptr = (unsigned char *)&recmode;
+       payload.dsize = sizeof(uint32_t);
+
+       rc = ctdb_control(ctdb, destnode, 0,
+                         CTDB_CONTROL_SET_RECMODE, 0, payload,
+                         NULL, NULL, &remote_res, &timeout, NULL);
+       if (rc == 0 && remote_res == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for setrecmode failed\n"));
+       return -1;
+}
+
+
+
+/*
+  get the recovery master of a remote node - async send half.
+  Returns the control state for ctdb_ctrl_getrecmaster_recv(), or NULL if
+  the request could not be sent.
+ */
+struct ctdb_client_control_state *
+ctdb_ctrl_getrecmaster_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, 
+                       struct timeval timeout, uint32_t destnode)
+{
+       struct ctdb_client_control_state *req;
+
+       req = ctdb_control_send(ctdb, destnode, 0,
+                               CTDB_CONTROL_GET_RECMASTER, 0, tdb_null,
+                               mem_ctx, &timeout, NULL);
+
+       return req;
+}
+
+/*
+  get the recovery master of a remote node - async receive half.
+  The recovery master is carried in the status of the control reply and is
+  stored in *recmaster when recmaster is non-NULL.
+  Returns 0 on success, -1 if receiving the reply failed.
+ */
+int ctdb_ctrl_getrecmaster_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, uint32_t *recmaster)
+{
+       int32_t master;
+
+       if (ctdb_control_recv(ctdb, state, mem_ctx, NULL, &master, NULL) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_getrecmaster_recv failed\n"));
+               return -1;
+       }
+
+       if (recmaster != NULL) {
+               *recmaster = (uint32_t)master;
+       }
+
+       return 0;
+}
+
+/*
+  get the recovery master of a remote node - synchronous wrapper around
+  the send/recv pair.  A failed send yields a NULL state, which the
+  receive half reports as -1.
+ */
+int ctdb_ctrl_getrecmaster(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, uint32_t *recmaster)
+{
+       return ctdb_ctrl_getrecmaster_recv(ctdb, mem_ctx,
+                       ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx, timeout, destnode),
+                       recmaster);
+}
+
+
+/*
+  set the recovery master of a remote node
+  Returns 0 when the control succeeded with status 0, -1 otherwise.
+ */
+int ctdb_ctrl_setrecmaster(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t recmaster)
+{
+       TDB_DATA payload;
+       int32_t remote_res;
+       int rc;
+
+       ZERO_STRUCT(payload);
+       payload.dptr = (unsigned char *)&recmaster;
+       payload.dsize = sizeof(uint32_t);
+
+       rc = ctdb_control(ctdb, destnode, 0,
+                         CTDB_CONTROL_SET_RECMASTER, 0, payload,
+                         NULL, NULL, &remote_res, &timeout, NULL);
+       if (rc == 0 && remote_res == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for setrecmaster failed\n"));
+       return -1;
+}
+
+
+/*
+  get a list of databases off a remote node
+
+  On success *dbmap is a copy of the reply allocated on mem_ctx.
+  Returns 0 on success, -1 if the control fails, reports a non-zero
+  status, or the copy cannot be allocated.
+ */
+int ctdb_ctrl_getdbmap(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, 
+                      TALLOC_CTX *mem_ctx, struct ctdb_dbid_map **dbmap)
+{
+       int ret;
+       TDB_DATA outdata;
+       int32_t res;
+
+       ret = ctdb_control(ctdb, destnode, 0, 
+                          CTDB_CONTROL_GET_DBMAP, 0, tdb_null, 
+                          mem_ctx, &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getdbmap failed ret:%d res:%d\n", ret, res));
+               if (ret == 0) {
+                       talloc_free(outdata.dptr);
+               }
+               return -1;
+       }
+
+       *dbmap = (struct ctdb_dbid_map *)talloc_memdup(mem_ctx, outdata.dptr, outdata.dsize);
+       talloc_free(outdata.dptr);
+       /* never report success with a NULL map */
+       CTDB_NO_MEMORY(ctdb, *dbmap);
+
+       return 0;
+}
+
+/*
+  get a list of nodes (vnn and flags ) from a remote node
+
+  If the control completes with status -1 and no data, the request is
+  retried with the old ipv4-only control.  On success *nodemap is a copy
+  of the reply allocated on mem_ctx.
+  Returns 0 on success, -1 on failure.
+ */
+int ctdb_ctrl_getnodemap(struct ctdb_context *ctdb, 
+               struct timeval timeout, uint32_t destnode, 
+               TALLOC_CTX *mem_ctx, struct ctdb_node_map **nodemap)
+{
+       int ret;
+       TDB_DATA outdata;
+       int32_t res;
+
+       ret = ctdb_control(ctdb, destnode, 0, 
+                          CTDB_CONTROL_GET_NODEMAP, 0, tdb_null, 
+                          mem_ctx, &outdata, &res, &timeout, NULL);
+       if (ret == 0 && res == -1 && outdata.dsize == 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getnodes failed, falling back to ipv4-only control\n"));
+               talloc_free(outdata.dptr);
+               return ctdb_ctrl_getnodemapv4(ctdb, timeout, destnode, mem_ctx, nodemap);
+       }
+       /* outdata is only valid (and only needs freeing) when ret == 0 */
+       if (ret != 0 || res != 0 || outdata.dsize == 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getnodes failed ret:%d res:%d\n", ret, res));
+               if (ret == 0) {
+                       talloc_free(outdata.dptr);
+               }
+               return -1;
+       }
+
+       *nodemap = (struct ctdb_node_map *)talloc_memdup(mem_ctx, outdata.dptr, outdata.dsize);
+       talloc_free(outdata.dptr);
+       /* never report success with a NULL map */
+       CTDB_NO_MEMORY(ctdb, *nodemap);
+
+       return 0;
+}
+
+/*
+  old style ipv4-only get a list of nodes (vnn and flags ) from a remote node
+
+  The ipv4 reply is converted into a struct ctdb_node_map allocated on
+  mem_ctx.  The node count comes from the remote node, so it is checked
+  against the size of the reply before the entries are read.
+  Returns 0 on success, -1 on failure.
+ */
+int ctdb_ctrl_getnodemapv4(struct ctdb_context *ctdb, 
+               struct timeval timeout, uint32_t destnode, 
+               TALLOC_CTX *mem_ctx, struct ctdb_node_map **nodemap)
+{
+       int ret, i;
+       size_t len;
+       const size_t hdr_len = offsetof(struct ctdb_node_mapv4, nodes);
+       TDB_DATA outdata;
+       struct ctdb_node_mapv4 *nodemapv4;
+       int32_t res;
+
+       ret = ctdb_control(ctdb, destnode, 0, 
+                          CTDB_CONTROL_GET_NODEMAPv4, 0, tdb_null, 
+                          mem_ctx, &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0 || outdata.dsize == 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getnodesv4 failed ret:%d res:%d\n", ret, res));
+               if (ret == 0) {
+                       talloc_free(outdata.dptr);
+               }
+               return -1;
+       }
+
+       nodemapv4 = (struct ctdb_node_mapv4 *)outdata.dptr;
+
+       /* num is only read once the header is known to be present; the
+          division form cannot overflow */
+       if (outdata.dsize < hdr_len ||
+           (outdata.dsize - hdr_len) / sizeof(nodemapv4->nodes[0]) < nodemapv4->num) {
+               DEBUG(DEBUG_ERR,(__location__ " Bad node map size received in ctdb_ctrl_getnodemapv4\n"));
+               talloc_free(outdata.dptr);
+               return -1;
+       }
+
+       len = offsetof(struct ctdb_node_map, nodes) + nodemapv4->num*sizeof(struct ctdb_node_and_flags);
+       (*nodemap) = talloc_zero_size(mem_ctx, len);
+       if ((*nodemap) == NULL) {
+               talloc_free(outdata.dptr);
+               CTDB_NO_MEMORY(ctdb, (*nodemap));
+       }
+
+       (*nodemap)->num = nodemapv4->num;
+       for (i=0; i<nodemapv4->num; i++) {
+               (*nodemap)->nodes[i].pnn     = nodemapv4->nodes[i].pnn;
+               (*nodemap)->nodes[i].flags   = nodemapv4->nodes[i].flags;
+               (*nodemap)->nodes[i].addr.ip = nodemapv4->nodes[i].sin;
+               (*nodemap)->nodes[i].addr.sa.sa_family = AF_INET;
+       }
+
+       talloc_free(outdata.dptr);
+
+       return 0;
+}
+
+/*
+  ask a node to drop the transport, reload the nodes file and restart the
+  transport
+  Returns 0 when the control succeeded with status 0, -1 otherwise.
+ */
+int ctdb_ctrl_reload_nodes_file(struct ctdb_context *ctdb, 
+                   struct timeval timeout, uint32_t destnode)
+{
+       int32_t remote_res;
+       int rc;
+
+       rc = ctdb_control(ctdb, destnode, 0,
+                         CTDB_CONTROL_RELOAD_NODES_FILE, 0, tdb_null,
+                         NULL, NULL, &remote_res, &timeout, NULL);
+       if (rc == 0 && remote_res == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for reloadnodesfile failed\n"));
+       return -1;
+}
+
+
+/*
+  set vnn map on a node
+
+  vnnmap is marshalled into a temporary wire buffer allocated on mem_ctx;
+  the buffer is released again before returning, whatever the outcome.
+  Returns 0 when the control succeeded with status 0, -1 otherwise.
+ */
+int ctdb_ctrl_setvnnmap(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, 
+                       TALLOC_CTX *mem_ctx, struct ctdb_vnn_map *vnnmap)
+{
+       int ret;
+       TDB_DATA data;
+       int32_t res;
+       struct ctdb_vnn_map_wire *map;
+       size_t len;
+
+       len = offsetof(struct ctdb_vnn_map_wire, map) + sizeof(uint32_t)*vnnmap->size;
+       map = talloc_size(mem_ctx, len);
+       CTDB_NO_MEMORY(ctdb, map);
+
+       map->generation = vnnmap->generation;
+       map->size = vnnmap->size;
+       memcpy(map->map, vnnmap->map, sizeof(uint32_t)*map->size);
+
+       data.dsize = len;
+       data.dptr  = (uint8_t *)map;
+
+       ret = ctdb_control(ctdb, destnode, 0, 
+                          CTDB_CONTROL_SETVNNMAP, 0, data, 
+                          NULL, NULL, &res, &timeout, NULL);
+       /* the request has been copied into the packet, so the wire buffer
+          is no longer needed - free it before any early return */
+       talloc_free(map);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for setvnnmap failed\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+
+/*
+  async send for pull database
+
+  Builds a ctdb_control_pulldb request for (dbid, lmaster) and sends it to
+  destnode.  The request structure is only needed while sending and is
+  freed before returning.  Returns the control state for
+  ctdb_ctrl_pulldb_recv(), or NULL on failure.
+ */
+struct ctdb_client_control_state *ctdb_ctrl_pulldb_send(
+       struct ctdb_context *ctdb, uint32_t destnode, uint32_t dbid,
+       uint32_t lmaster, TALLOC_CTX *mem_ctx, struct timeval timeout)
+{
+       struct ctdb_client_control_state *handle;
+       struct ctdb_control_pulldb *pull;
+       TDB_DATA payload;
+
+       pull = talloc(mem_ctx, struct ctdb_control_pulldb);
+       CTDB_NO_MEMORY_NULL(ctdb, pull);
+
+       pull->lmaster = lmaster;
+       pull->db_id   = dbid;
+
+       payload.dptr  = (unsigned char *)pull;
+       payload.dsize = sizeof(struct ctdb_control_pulldb);
+
+       handle = ctdb_control_send(ctdb, destnode, 0,
+                                  CTDB_CONTROL_PULL_DB, 0, payload,
+                                  mem_ctx, &timeout, NULL);
+       talloc_free(pull);
+
+       return handle;
+}
+
+/*
+  async recv for pull database
+
+  Receives the reply for a request started with ctdb_ctrl_pulldb_send();
+  the pulled data is returned in *outdata, allocated on mem_ctx.
+  Returns 0 when the control succeeded with status 0, -1 otherwise.
+ */
+int ctdb_ctrl_pulldb_recv(
+       struct ctdb_context *ctdb, 
+       TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, 
+       TDB_DATA *outdata)
+{
+       int32_t remote_res;
+       int rc;
+
+       rc = ctdb_control_recv(ctdb, state, mem_ctx, outdata, &remote_res, NULL);
+       if (rc == 0 && remote_res == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_pulldb_recv failed\n"));
+       return -1;
+}
+
+/*
+  pull all keys and records for a specific database on a node -
+  synchronous wrapper around the pulldb send/recv pair
+ */
+int ctdb_ctrl_pulldb(struct ctdb_context *ctdb, uint32_t destnode, 
+               uint32_t dbid, uint32_t lmaster, 
+               TALLOC_CTX *mem_ctx, struct timeval timeout,
+               TDB_DATA *outdata)
+{
+       return ctdb_ctrl_pulldb_recv(ctdb, mem_ctx,
+                       ctdb_ctrl_pulldb_send(ctdb, destnode, dbid, lmaster,
+                                             mem_ctx, timeout),
+                       outdata);
+}
+
+
+/*
+  change dmaster for all keys in the database to the new value
+
+  The request payload is the pair { dbid, dmaster } as two uint32_t.  It
+  is built on the stack: ctdb_control() copies it into the request packet,
+  so no heap allocation (which previously went unchecked and was never
+  freed) is needed.  mem_ctx is kept for interface compatibility and is
+  not used.
+  Returns 0 when the control succeeded with status 0, -1 otherwise.
+ */
+int ctdb_ctrl_setdmaster(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, 
+                        TALLOC_CTX *mem_ctx, uint32_t dbid, uint32_t dmaster)
+{
+       int ret;
+       TDB_DATA indata;
+       int32_t res;
+       uint32_t payload[2];
+
+       payload[0] = dbid;
+       payload[1] = dmaster;
+
+       indata.dsize = sizeof(payload);
+       indata.dptr = (unsigned char *)payload;
+
+       ret = ctdb_control(ctdb, destnode, 0, 
+                          CTDB_CONTROL_SET_DMASTER, 0, indata, 
+                          NULL, NULL, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for setdmaster failed\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  ping a node.
+  Returns the status of the control reply (the number of connected
+  clients, according to the original contract), or -1 if the control
+  failed.
+ */
+int ctdb_ctrl_ping(struct ctdb_context *ctdb, uint32_t destnode)
+{
+       int32_t clients;
+
+       if (ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_PING, 0,
+                        tdb_null, NULL, NULL, &clients, NULL, NULL) != 0) {
+               return -1;
+       }
+
+       return clients;
+}
+
+/*
+  get the run state of a node
+
+  The run state is returned in *runstate when runstate is non-NULL.
+  Returns 0 on success.  If the control fails, its return code is passed
+  on; if it completes with a non-zero status, that status is returned; a
+  reply of unexpected size yields -1.  The reply buffer is released on
+  every path.
+ */
+int ctdb_ctrl_get_runstate(struct ctdb_context *ctdb, 
+                          struct timeval timeout, 
+                          uint32_t destnode,
+                          uint32_t *runstate)
+{
+       TDB_DATA outdata;
+       int32_t res;
+       int ret;
+
+       ret = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_GET_RUNSTATE, 0,
+                          tdb_null, ctdb, &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,("ctdb_control for get_runstate failed\n"));
+               if (ret == 0) {
+                       /* a reply buffer exists and hangs off the
+                          long-lived ctdb context */
+                       talloc_free(outdata.dptr);
+               }
+               return ret != 0 ? ret : res;
+       }
+
+       if (outdata.dsize != sizeof(uint32_t)) {
+               DEBUG(DEBUG_ERR,("Invalid return data in get_runstate\n"));
+               talloc_free(outdata.dptr);
+               return -1;
+       }
+
+       if (runstate != NULL) {
+               *runstate = *(uint32_t *)outdata.dptr;
+       }
+       talloc_free(outdata.dptr);
+
+       return 0;
+}
+
+/*
+  find the real path to a ltdb 
+
+  On success *path is a NUL-terminated copy of the reply allocated on
+  mem_ctx.  Returns 0 on success, -1 on failure.  The raw reply buffer is
+  released on every path.
+ */
+int ctdb_ctrl_getdbpath(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t dbid, TALLOC_CTX *mem_ctx, 
+                  const char **path)
+{
+       int ret;
+       int32_t res;
+       TDB_DATA data, outdata;
+
+       data.dptr = (uint8_t *)&dbid;
+       data.dsize = sizeof(dbid);
+
+       /* a separate reply variable keeps the stack-based request and the
+          heap-based reply apart, so only the latter is ever freed */
+       ret = ctdb_control(ctdb, destnode, 0, 
+                          CTDB_CONTROL_GETDBPATH, 0, data, 
+                          mem_ctx, &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               if (ret == 0) {
+                       talloc_free(outdata.dptr);
+               }
+               return -1;
+       }
+
+       (*path) = talloc_strndup(mem_ctx, (const char *)outdata.dptr, outdata.dsize);
+       talloc_free(outdata.dptr);
+       if ((*path) == NULL) {
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  find the name of a db 
+
+  On success *name is a NUL-terminated copy of the reply allocated on
+  mem_ctx.  Returns 0 on success, -1 on failure.  The raw reply buffer is
+  released on every path.
+ */
+int ctdb_ctrl_getdbname(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t dbid, TALLOC_CTX *mem_ctx, 
+                  const char **name)
+{
+       int ret;
+       int32_t res;
+       TDB_DATA data, outdata;
+
+       data.dptr = (uint8_t *)&dbid;
+       data.dsize = sizeof(dbid);
+
+       /* a separate reply variable keeps the stack-based request and the
+          heap-based reply apart, so only the latter is ever freed */
+       ret = ctdb_control(ctdb, destnode, 0, 
+                          CTDB_CONTROL_GET_DBNAME, 0, data, 
+                          mem_ctx, &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               if (ret == 0) {
+                       talloc_free(outdata.dptr);
+               }
+               return -1;
+       }
+
+       (*name) = talloc_strndup(mem_ctx, (const char *)outdata.dptr, outdata.dsize);
+       talloc_free(outdata.dptr);
+       if ((*name) == NULL) {
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  get the health status of a db
+
+  An empty reply is reported as *reason == NULL.  Otherwise *reason is a
+  NUL-terminated copy of the reply allocated on mem_ctx.
+  Returns 0 on success, -1 on failure.  The raw reply buffer is released
+  on every path.
+ */
+int ctdb_ctrl_getdbhealth(struct ctdb_context *ctdb,
+                         struct timeval timeout,
+                         uint32_t destnode,
+                         uint32_t dbid, TALLOC_CTX *mem_ctx,
+                         const char **reason)
+{
+       int ret;
+       int32_t res;
+       TDB_DATA data, outdata;
+
+       data.dptr = (uint8_t *)&dbid;
+       data.dsize = sizeof(dbid);
+
+       /* a separate reply variable keeps the stack-based request and the
+          heap-based reply apart, so only the latter is ever freed */
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_DB_GET_HEALTH, 0, data,
+                          mem_ctx, &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               if (ret == 0) {
+                       talloc_free(outdata.dptr);
+               }
+               return -1;
+       }
+
+       if (outdata.dsize == 0) {
+               (*reason) = NULL;
+               talloc_free(outdata.dptr);
+               return 0;
+       }
+
+       (*reason) = talloc_strndup(mem_ctx, (const char *)outdata.dptr, outdata.dsize);
+       talloc_free(outdata.dptr);
+       if ((*reason) == NULL) {
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * get db sequence number
+ *
+ * The sequence number is returned in *seqnum when seqnum is non-NULL.
+ * Returns 0 on success, -1 if the control fails, reports a non-zero
+ * status, or returns a reply of unexpected size.
+ */
+int ctdb_ctrl_getdbseqnum(struct ctdb_context *ctdb, struct timeval timeout,
+                         uint32_t destnode, uint32_t dbid, uint64_t *seqnum)
+{
+       int ret;
+       int32_t res;
+       TDB_DATA data, outdata;
+       uint64_t wire_dbid = 0;
+
+       /* This request has always been sent as sizeof(uint64_t) bytes whose
+        * first four bytes are dbid.  Build that payload in a properly
+        * sized, zero-padded buffer instead of reading four bytes past the
+        * end of the 32-bit dbid variable.
+        * NOTE(review): presumably the daemon requires the 8-byte length;
+        * confirm before shrinking the payload to sizeof(dbid). */
+       memcpy(&wire_dbid, &dbid, sizeof(dbid));
+       data.dptr = (uint8_t *)&wire_dbid;
+       data.dsize = sizeof(wire_dbid);
+
+       ret = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_GET_DB_SEQNUM,
+                          0, data, ctdb, &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,("ctdb_control for getdbesqnum failed\n"));
+               if (ret == 0) {
+                       /* a reply buffer exists and hangs off the
+                        * long-lived ctdb context */
+                       talloc_free(outdata.dptr);
+               }
+               return -1;
+       }
+
+       if (outdata.dsize != sizeof(uint64_t)) {
+               DEBUG(DEBUG_ERR,("Invalid return data in get_dbseqnum\n"));
+               talloc_free(outdata.dptr);
+               return -1;
+       }
+
+       if (seqnum != NULL) {
+               *seqnum = *(uint64_t *)outdata.dptr;
+       }
+       talloc_free(outdata.dptr);
+
+       return 0;
+}
+
+/*
+  create a database
+
+  Sends CTDB_CONTROL_DB_ATTACH_PERSISTENT or CTDB_CONTROL_DB_ATTACH with
+  the NUL-terminated database name as payload.  The tdb flags travel in
+  the srvid argument of the control.  The reply data is not used and is
+  released here instead of being left on mem_ctx.
+  Returns 0 when the control succeeded with status 0, -1 otherwise.
+ */
+int ctdb_ctrl_createdb(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, 
+                      TALLOC_CTX *mem_ctx, const char *name, bool persistent)
+{
+       int ret;
+       int32_t res;
+       TDB_DATA data, outdata;
+       uint64_t tdb_flags = 0;
+
+       data.dptr = discard_const(name);
+       data.dsize = strlen(name)+1;
+
+       /* Make sure that volatile databases use jenkins hash */
+       if (!persistent) {
+               tdb_flags = TDB_INCOMPATIBLE_HASH;
+       }
+
+       /* a separate reply variable makes sure the caller's name string is
+          never mistaken for the heap-allocated reply */
+       ret = ctdb_control(ctdb, destnode, tdb_flags,
+                          persistent?CTDB_CONTROL_DB_ATTACH_PERSISTENT:CTDB_CONTROL_DB_ATTACH, 
+                          0, data, 
+                          mem_ctx, &outdata, &res, &timeout, NULL);
+       if (ret == 0) {
+               talloc_free(outdata.dptr);
+       }
+
+       if (ret != 0 || res != 0) {
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  get debug level on a node
+
+  The level is returned in *level.  Returns 0 on success, -1 if the
+  control fails, reports a non-zero status, or returns a reply of
+  unexpected size.  The reply buffer is released on every path.
+ */
+int ctdb_ctrl_get_debuglevel(struct ctdb_context *ctdb, uint32_t destnode, int32_t *level)
+{
+       int ret;
+       int32_t res;
+       TDB_DATA data;
+
+       ret = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_GET_DEBUG, 0, tdb_null, 
+                          ctdb, &data, &res, NULL, NULL);
+       if (ret != 0 || res != 0) {
+               if (ret == 0) {
+                       /* a reply buffer exists and hangs off the
+                          long-lived ctdb context */
+                       talloc_free(data.dptr);
+               }
+               return -1;
+       }
+       if (data.dsize != sizeof(int32_t)) {
+               DEBUG(DEBUG_ERR,("Bad control reply size in ctdb_get_debuglevel (got %u)\n",
+                        (unsigned)data.dsize));
+               talloc_free(data.dptr);
+               return -1;
+       }
+       *level = *(int32_t *)data.dptr;
+       talloc_free(data.dptr);
+       return 0;
+}
+
+/*
+  set debug level on a node
+  Returns 0 when the control succeeded with status 0, -1 otherwise.
+ */
+int ctdb_ctrl_set_debuglevel(struct ctdb_context *ctdb, uint32_t destnode, int32_t level)
+{
+       TDB_DATA payload;
+       int32_t remote_res;
+       int rc;
+
+       payload.dsize = sizeof(level);
+       payload.dptr = (uint8_t *)&level;
+
+       rc = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_SET_DEBUG, 0, payload,
+                         NULL, NULL, &remote_res, NULL, NULL);
+       if (rc == 0 && remote_res == 0) {
+               return 0;
+       }
+
+       return -1;
+}
+
+
+/*
+  get a list of connected nodes
+
+  Fetches the node map from the current node and returns the pnn of every
+  node that does not have NODE_FLAGS_DISCONNECTED set.  The result is an
+  array allocated on mem_ctx with *num_nodes valid entries, or NULL on
+  failure (with *num_nodes set to 0).  The node map itself is only needed
+  temporarily and is released before returning.
+ */
+uint32_t *ctdb_get_connected_nodes(struct ctdb_context *ctdb, 
+                               struct timeval timeout,
+                               TALLOC_CTX *mem_ctx,
+                               uint32_t *num_nodes)
+{
+       struct ctdb_node_map *map=NULL;
+       int ret, i;
+       uint32_t *nodes;
+
+       *num_nodes = 0;
+
+       ret = ctdb_ctrl_getnodemap(ctdb, timeout, CTDB_CURRENT_NODE, mem_ctx, &map);
+       if (ret != 0) {
+               return NULL;
+       }
+
+       nodes = talloc_array(mem_ctx, uint32_t, map->num);
+       if (nodes == NULL) {
+               talloc_free(map);
+               return NULL;
+       }
+
+       for (i=0;i<map->num;i++) {
+               if (!(map->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
+                       nodes[*num_nodes] = map->nodes[i].pnn;
+                       (*num_nodes)++;
+               }
+       }
+
+       /* the map was allocated on the caller's context; don't leave it
+          behind */
+       talloc_free(map);
+
+       return nodes;
+}
+
+
+/*
+  reset remote status
+
+  Issues CTDB_CONTROL_STATISTICS_RESET to destnode without any payload.
+  Returns 0 on success, -1 (after logging) otherwise.
+ */
+int ctdb_statistics_reset(struct ctdb_context *ctdb, uint32_t destnode)
+{
+       int32_t status;
+       int rc;
+
+       rc = ctdb_control(ctdb, destnode, 0,
+                         CTDB_CONTROL_STATISTICS_RESET, 0, tdb_null,
+                         NULL, NULL, &status, NULL, NULL);
+       if (rc == 0 && status == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for reset statistics failed\n"));
+       return -1;
+}
+
+/*
+  attach to a specific database - client call
+
+  Returns the existing handle when ctdb_db_handle() already knows 'name'.
+  Otherwise asks the local daemon to attach (persistent or not), fetches
+  the database path, opens the local tdb, links the new handle into
+  ctdb->db_list and registers the well known call functions.
+  Returns NULL on any failure.
+*/
+struct ctdb_db_context *ctdb_attach(struct ctdb_context *ctdb,
+                                   struct timeval timeout,
+                                   const char *name,
+                                   bool persistent,
+                                   uint32_t tdb_flags)
+{
+       struct ctdb_db_context *ctdb_db;
+       char *db_name;
+       TDB_DATA data;
+       int ret;
+       int32_t res;
+
+       ctdb_db = ctdb_db_handle(ctdb, name);
+       if (ctdb_db) {
+               return ctdb_db;
+       }
+
+       ctdb_db = talloc_zero(ctdb, struct ctdb_db_context);
+       CTDB_NO_MEMORY_NULL(ctdb, ctdb_db);
+
+       ctdb_db->ctdb = ctdb;
+
+       db_name = talloc_strdup(ctdb_db, name);
+       if (db_name == NULL) {
+               /* drop the half-built handle so it does not stay attached
+                  to ctdb; the check below then reports the failure */
+               talloc_free(ctdb_db);
+       }
+       CTDB_NO_MEMORY_NULL(ctdb, db_name);
+       ctdb_db->db_name = db_name;
+
+       data.dptr = discard_const(name);
+       data.dsize = strlen(name)+1;
+
+       /* CTDB has switched to using jenkins hash for volatile databases.
+        * Even if tdb_flags do not explicitly mention TDB_INCOMPATIBLE_HASH,
+        * always set it.
+        */
+       if (!persistent) {
+               tdb_flags |= TDB_INCOMPATIBLE_HASH;
+       }
+
+       /* tell ctdb daemon to attach; the reply is parented to ctdb_db, so
+          freeing ctdb_db on the error path releases it too */
+       ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, tdb_flags,
+                          persistent?CTDB_CONTROL_DB_ATTACH_PERSISTENT:CTDB_CONTROL_DB_ATTACH,
+                          0, data, ctdb_db, &data, &res, NULL, NULL);
+       if (ret != 0 || res != 0 || data.dsize != sizeof(uint32_t)) {
+               DEBUG(DEBUG_ERR,("Failed to attach to database '%s'\n", name));
+               talloc_free(ctdb_db);
+               return NULL;
+       }
+
+       ctdb_db->db_id = *(uint32_t *)data.dptr;
+       talloc_free(data.dptr);
+
+       ret = ctdb_ctrl_getdbpath(ctdb, timeout, CTDB_CURRENT_NODE, ctdb_db->db_id, ctdb_db, &ctdb_db->db_path);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to get dbpath for database '%s'\n", name));
+               talloc_free(ctdb_db);
+               return NULL;
+       }
+
+       /* the flags used for the local open are rebuilt from scratch here;
+          the caller supplied tdb_flags were only sent to the daemon */
+       tdb_flags = persistent?TDB_DEFAULT:TDB_NOSYNC;
+       if (ctdb->valgrinding) {
+               tdb_flags |= TDB_NOMMAP;
+       }
+       tdb_flags |= TDB_DISALLOW_NESTING;
+
+       ctdb_db->ltdb = tdb_wrap_open(ctdb, ctdb_db->db_path, 0, tdb_flags, O_RDWR, 0);
+       if (ctdb_db->ltdb == NULL) {
+               ctdb_set_error(ctdb, "Failed to open tdb '%s'\n", ctdb_db->db_path);
+               talloc_free(ctdb_db);
+               return NULL;
+       }
+
+       ctdb_db->persistent = persistent;
+
+       DLIST_ADD(ctdb->db_list, ctdb_db);
+
+       /* add well known functions */
+       ctdb_set_call(ctdb_db, ctdb_null_func, CTDB_NULL_FUNC);
+       ctdb_set_call(ctdb_db, ctdb_fetch_func, CTDB_FETCH_FUNC);
+       ctdb_set_call(ctdb_db, ctdb_fetch_with_header_func, CTDB_FETCH_WITH_HEADER_FUNC);
+
+       return ctdb_db;
+}
+
+
+/*
+  setup a call for a database
+
+  Records the (fn, id) pair in the client-side list ctdb_db->calls.
+  Returns 0 on success, -1 when the list entry cannot be allocated.
+ */
+int ctdb_set_call(struct ctdb_db_context *ctdb_db, ctdb_fn_t fn, uint32_t id)
+{
+       struct ctdb_registered_call *call;
+
+#if 0
+       TDB_DATA data;
+       int32_t status;
+       struct ctdb_control_set_call c;
+       int ret;
+
+       /* this is no longer valid with the separate daemon architecture */
+       c.db_id = ctdb_db->db_id;
+       c.fn    = fn;
+       c.id    = id;
+
+       data.dptr = (uint8_t *)&c;
+       data.dsize = sizeof(c);
+
+       ret = ctdb_control(ctdb_db->ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_SET_CALL, 0,
+                          data, NULL, NULL, &status, NULL, NULL);
+       if (ret != 0 || status != 0) {
+               DEBUG(DEBUG_ERR,("ctdb_set_call failed for call %u\n", id));
+               return -1;
+       }
+#endif
+
+       /* also register locally */
+       call = talloc(ctdb_db, struct ctdb_registered_call);
+       if (call == NULL) {
+               /* previously this fell through and wrote through a NULL pointer */
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory registering call %u\n", id));
+               return -1;
+       }
+       call->fn = fn;
+       call->id = id;
+
+       DLIST_ADD(ctdb_db->calls, call);
+       return 0;
+}
+
+
+struct traverse_state {  /* bookkeeping shared by ctdb_traverse_ext() and traverse_handler() */
+       bool done;  /* set by the handler on end marker, bad packet or non-zero callback result; ends the wait loop */
+       uint32_t count;  /* number of records that were handed to fn */
+       ctdb_traverse_func fn;  /* per-record callback supplied by the caller */
+       void *private_data;  /* opaque pointer passed through to fn */
+       bool listemptyrecords;  /* when false, records whose data is exactly one ctdb_ltdb_header are skipped */
+};
+
+/*
+  called on each key during a ctdb_traverse
+
+  'data' is one marshalled struct ctdb_rec_data.  The packet is checked
+  for consistency, split into key and value and passed to the callback
+  stored in the traverse_state 'p'.  A record with empty key and empty
+  data marks the end of the traverse.
+ */
+static void traverse_handler(struct ctdb_context *ctdb, uint64_t srvid, TDB_DATA data, void *p)
+{
+       struct traverse_state *state = (struct traverse_state *)p;
+       struct ctdb_rec_data *d = (struct ctdb_rec_data *)data.dptr;
+       const size_t hdrlen = offsetof(struct ctdb_rec_data, data);
+       TDB_DATA key;
+
+       /* the whole fixed header must be present before keylen/datalen
+          are read, and key+data must fit into what was received
+          (64 bit sum so the addition itself cannot wrap) */
+       if (data.dsize < hdrlen ||
+           d->length != data.dsize ||
+           (uint64_t)d->keylen + (uint64_t)d->datalen > (uint64_t)(data.dsize - hdrlen)) {
+               DEBUG(DEBUG_ERR,("Bad data size %u in traverse_handler\n", (unsigned)data.dsize));
+               state->done = true;
+               return;
+       }
+
+       key.dsize = d->keylen;
+       key.dptr  = &d->data[0];
+       data.dsize = d->datalen;
+       data.dptr = &d->data[d->keylen];
+
+       if (key.dsize == 0 && data.dsize == 0) {
+               /* end of traverse */
+               state->done = true;
+               return;
+       }
+
+       if (!state->listemptyrecords &&
+           data.dsize == sizeof(struct ctdb_ltdb_header))
+       {
+               /* empty records are deleted records in ctdb */
+               return;
+       }
+
+       /* a non-zero result from the callback stops the traverse; the
+          record is still counted */
+       if (state->fn(ctdb, key, data, state->private_data) != 0) {
+               state->done = true;
+       }
+
+       state->count++;
+}
+
+/**
+ * start a cluster wide traverse, calling the supplied fn on each record
+ * return the number of records traversed, or -1 on error
+ *
+ * Extended variant with a flag to signal whether empty records should
+ * be listed.
+ *
+ * A message handler is registered on a srvid derived from the pid, the
+ * traverse is started with CTDB_CONTROL_TRAVERSE_START_EXT and the event
+ * loop is run until traverse_handler() marks the state as done.
+ */
+static int ctdb_traverse_ext(struct ctdb_db_context *ctdb_db,
+                            ctdb_traverse_func fn,
+                            bool withemptyrecords,
+                            void *private_data)
+{
+       TDB_DATA data;
+       struct ctdb_traverse_start_ext t;
+       int32_t status;
+       int ret;
+       uint64_t srvid = (getpid() | 0xFLL<<60);
+       struct traverse_state state;
+
+       state.done = false;
+       state.count = 0;
+       state.private_data = private_data;
+       state.fn = fn;
+       state.listemptyrecords = withemptyrecords;
+
+       ret = ctdb_client_set_message_handler(ctdb_db->ctdb, srvid, traverse_handler, &state);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to setup traverse handler\n"));
+               return -1;
+       }
+
+       /* the struct is sent as raw bytes, so clear it first: any padding
+          would otherwise carry uninitialised stack contents */
+       memset(&t, 0, sizeof(t));
+       t.db_id = ctdb_db->db_id;
+       t.srvid = srvid;
+       t.reqid = 0;
+       t.withemptyrecords = withemptyrecords;
+
+       data.dptr = (uint8_t *)&t;
+       data.dsize = sizeof(t);
+
+       ret = ctdb_control(ctdb_db->ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_TRAVERSE_START_EXT, 0,
+                          data, NULL, NULL, &status, NULL, NULL);
+       if (ret != 0 || status != 0) {
+               DEBUG(DEBUG_ERR,("ctdb_traverse_all failed\n"));
+               ctdb_client_remove_message_handler(ctdb_db->ctdb, srvid, &state);
+               return -1;
+       }
+
+       /* records arrive as messages; keep dispatching until the handler
+          sees the end marker, an error or a stop request */
+       while (!state.done) {
+               event_loop_once(ctdb_db->ctdb->ev);
+       }
+
+       ret = ctdb_client_remove_message_handler(ctdb_db->ctdb, srvid, &state);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to remove ctdb_traverse handler\n"));
+               return -1;
+       }
+
+       return state.count;
+}
+
+/**
+ * start a cluster wide traverse, calling the supplied fn on each record
+ * return the number of records traversed, or -1 on error
+ *
+ * Standard version which does not list the empty records:
+ * These are considered deleted.
+ *
+ * Thin wrapper around ctdb_traverse_ext() with withemptyrecords set to
+ * false; private_data is passed through to fn unchanged.
+ */
+int ctdb_traverse(struct ctdb_db_context *ctdb_db, ctdb_traverse_func fn, void *private_data)
+{
+       return ctdb_traverse_ext(ctdb_db, fn, false, private_data);
+}
+
+/* printable and neither a double quote nor a backslash, i.e. safe to
+   emit verbatim inside the quoted key/data dump */
+#define ISASCII(x) (isprint(x) && !strchr("\"\\", (x)))
+/*
+  called on each key during a catdb
+
+  'p' is a struct ctdb_dump_db_context; everything is written to its
+  FILE 'f'.  'data' is expected to start with a struct ctdb_ltdb_header
+  followed by the record payload.  Always returns 0 so that a traverse
+  continues with the next record.
+ */
+int ctdb_dumpdb_record(struct ctdb_context *ctdb, TDB_DATA key, TDB_DATA data, void *p)
+{
+       size_t i;
+       struct ctdb_dump_db_context *c = (struct ctdb_dump_db_context *)p;
+       FILE *f = c->f;
+       struct ctdb_ltdb_header *h = (struct ctdb_ltdb_header *)data.dptr;
+
+       fprintf(f, "key(%u) = \"", (unsigned)key.dsize);
+       for (i=0;i<key.dsize;i++) {
+               if (ISASCII(key.dptr[i])) {
+                       fprintf(f, "%c", key.dptr[i]);
+               } else {
+                       fprintf(f, "\\%02X", key.dptr[i]);
+               }
+       }
+       fprintf(f, "\"\n");
+
+       fprintf(f, "dmaster: %u\n", h->dmaster);
+       fprintf(f, "rsn: %llu\n", (unsigned long long)h->rsn);
+
+       if (c->printlmaster && ctdb->vnn_map != NULL) {
+               fprintf(f, "lmaster: %u\n", ctdb_lmaster(ctdb, &key));
+       }
+
+       if (c->printhash) {
+               fprintf(f, "hash: 0x%08x\n", ctdb_hash(&key));
+       }
+
+       if (c->printrecordflags) {
+               /* the flag names must go to the same stream as the rest of
+                  the record; they used to be printed to stdout */
+               fprintf(f, "flags: 0x%08x", h->flags);
+               if (h->flags & CTDB_REC_FLAG_MIGRATED_WITH_DATA) fprintf(f, " MIGRATED_WITH_DATA");
+               if (h->flags & CTDB_REC_FLAG_VACUUM_MIGRATED) fprintf(f, " VACUUM_MIGRATED");
+               if (h->flags & CTDB_REC_FLAG_AUTOMATIC) fprintf(f, " AUTOMATIC");
+               if (h->flags & CTDB_REC_RO_HAVE_DELEGATIONS) fprintf(f, " RO_HAVE_DELEGATIONS");
+               if (h->flags & CTDB_REC_RO_HAVE_READONLY) fprintf(f, " RO_HAVE_READONLY");
+               if (h->flags & CTDB_REC_RO_REVOKING_READONLY) fprintf(f, " RO_REVOKING_READONLY");
+               if (h->flags & CTDB_REC_RO_REVOKE_COMPLETE) fprintf(f, " RO_REVOKE_COMPLETE");
+               fprintf(f, "\n");
+       }
+
+       if (c->printdatasize) {
+               fprintf(f, "data size: %u\n", (unsigned)data.dsize);
+       } else {
+               /* payload starts right after the ltdb header */
+               fprintf(f, "data(%u) = \"", (unsigned)(data.dsize - sizeof(*h)));
+               for (i=sizeof(*h);i<data.dsize;i++) {
+                       if (ISASCII(data.dptr[i])) {
+                               fprintf(f, "%c", data.dptr[i]);
+                       } else {
+                               fprintf(f, "\\%02X", data.dptr[i]);
+                       }
+               }
+               fprintf(f, "\"\n");
+       }
+
+       fprintf(f, "\n");
+
+       return 0;
+}
+
+/*
+  convenience function to dump all records of a database
+
+  Traverses ctdb_db with ctdb_dumpdb_record() as the callback and ctx as
+  its private data, so the output goes to ctx->f.  Empty records are
+  included only when ctx->printemptyrecords is set.  Returns whatever
+  ctdb_traverse_ext() returns (record count, or -1 on error).
+ */
+int ctdb_dump_db(struct ctdb_db_context *ctdb_db,
+                struct ctdb_dump_db_context *ctx)
+{
+       return ctdb_traverse_ext(ctdb_db, ctdb_dumpdb_record,
+                                ctx->printemptyrecords, ctx);
+}
+
+/*
+  get the pid of a ctdb daemon
+
+  The pid comes back in the status word of CTDB_CONTROL_GET_PID, which
+  is why only the transport result is checked here.
+  Returns 0 and fills *pid on success, -1 otherwise.
+ */
+int ctdb_ctrl_getpid(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t *pid)
+{
+       int32_t status;
+
+       if (ctdb_control(ctdb, destnode, 0,
+                        CTDB_CONTROL_GET_PID, 0, tdb_null,
+                        NULL, NULL, &status, &timeout, NULL) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getpid failed\n"));
+               return -1;
+       }
+
+       *pid = status;
+       return 0;
+}
+
+
+/*
+  async freeze send control
+
+  Queues CTDB_CONTROL_FREEZE for destnode; 'priority' travels in the
+  srvid argument of the control.  The returned state is what
+  ctdb_ctrl_freeze_recv() expects.
+ */
+struct ctdb_client_control_state *
+ctdb_ctrl_freeze_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, uint32_t priority)
+{
+       struct ctdb_client_control_state *pending;
+
+       pending = ctdb_control_send(ctdb, destnode, priority,
+                                   CTDB_CONTROL_FREEZE, 0, tdb_null,
+                                   mem_ctx, &timeout, NULL);
+       return pending;
+}
+
+/*
+   async freeze recv control
+
+   Collects the reply for a request started with ctdb_ctrl_freeze_send().
+   Returns 0 when both the receive and the remote status are 0, -1
+   (after logging) otherwise.
+*/
+int ctdb_ctrl_freeze_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state)
+{
+       int32_t status;
+       int rc;
+
+       rc = ctdb_control_recv(ctdb, state, mem_ctx, NULL, &status, NULL);
+       if (rc == 0 && status == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_freeze_recv failed\n"));
+       return -1;
+}
+
+/*
+  freeze databases of a certain priority
+
+  Synchronous wrapper: sends the freeze control and immediately waits
+  for its reply, using a temporary talloc context for the request.
+ */
+int ctdb_ctrl_freeze_priority(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t priority)
+{
+       TALLOC_CTX *scratch = talloc_new(ctdb);
+       struct ctdb_client_control_state *pending;
+       int result;
+
+       pending = ctdb_ctrl_freeze_send(ctdb, scratch, timeout, destnode, priority);
+       result  = ctdb_ctrl_freeze_recv(ctdb, scratch, pending);
+
+       /* request state and reply both live on the scratch context */
+       talloc_free(scratch);
+
+       return result;
+}
+
+/* Freeze all databases
+ *
+ * Walks the priorities 1..NUM_DB_PRIORITIES in ascending order and
+ * stops with -1 at the first one that cannot be frozen.
+ */
+int ctdb_ctrl_freeze(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+{
+       int prio = 1;
+
+       while (prio <= NUM_DB_PRIORITIES) {
+               int rc = ctdb_ctrl_freeze_priority(ctdb, timeout, destnode, prio);
+
+               if (rc != 0) {
+                       return -1;
+               }
+               prio++;
+       }
+
+       return 0;
+}
+
+/*
+  thaw databases of a certain priority
+
+  Sends CTDB_CONTROL_THAW with 'priority' in the srvid argument.
+  Returns 0 on success, -1 (after logging) otherwise.
+ */
+int ctdb_ctrl_thaw_priority(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t priority)
+{
+       int32_t status;
+       int rc;
+
+       rc = ctdb_control(ctdb, destnode, priority,
+                         CTDB_CONTROL_THAW, 0, tdb_null,
+                         NULL, NULL, &status, &timeout, NULL);
+       if (rc == 0 && status == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control thaw failed\n"));
+       return -1;
+}
+
+/* thaw all databases
+ *
+ * Forwards to ctdb_ctrl_thaw_priority() with priority 0 and returns its
+ * result.  NOTE(review): presumably the daemon treats priority 0 as
+ * "every priority" - confirm against the CTDB_CONTROL_THAW handler.
+ */
+int ctdb_ctrl_thaw(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+{
+       return ctdb_ctrl_thaw_priority(ctdb, timeout, destnode, 0);
+}
+
+/*
+  get pnn of a node, or -1
+
+  The pnn is delivered in the status word of CTDB_CONTROL_GET_PNN and is
+  returned as is; -1 is returned when the control itself fails.
+ */
+int ctdb_ctrl_getpnn(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+{
+       int32_t pnn;
+
+       if (ctdb_control(ctdb, destnode, 0,
+                        CTDB_CONTROL_GET_PNN, 0, tdb_null,
+                        NULL, NULL, &pnn, &timeout, NULL) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getpnn failed\n"));
+               return -1;
+       }
+
+       return pnn;
+}
+
+/*
+  get the monitoring mode of a remote node
+
+  The mode is taken from the status word of CTDB_CONTROL_GET_MONMODE.
+  Returns 0 and fills *monmode on success, -1 otherwise.
+ */
+int ctdb_ctrl_getmonmode(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t *monmode)
+{
+       int32_t mode;
+
+       if (ctdb_control(ctdb, destnode, 0,
+                        CTDB_CONTROL_GET_MONMODE, 0, tdb_null,
+                        NULL, NULL, &mode, &timeout, NULL) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getmonmode failed\n"));
+               return -1;
+       }
+
+       *monmode = mode;
+       return 0;
+}
+
+
+/*
+ set the monitoring mode of a remote node to active
+
+ Sends CTDB_CONTROL_ENABLE_MONITOR; no status word is requested, so
+ only the result of the control call decides between 0 and -1.
+ */
+int ctdb_ctrl_enable_monmode(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+{
+       if (ctdb_control(ctdb, destnode, 0,
+                        CTDB_CONTROL_ENABLE_MONITOR, 0, tdb_null,
+                        NULL, NULL, NULL, &timeout, NULL) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for enable_monitor failed\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  set the monitoring mode of a remote node to disable
+
+  Sends CTDB_CONTROL_DISABLE_MONITOR; no status word is requested, so
+  only the result of the control call decides between 0 and -1.
+ */
+int ctdb_ctrl_disable_monmode(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+{
+       if (ctdb_control(ctdb, destnode, 0,
+                        CTDB_CONTROL_DISABLE_MONITOR, 0, tdb_null,
+                        NULL, NULL, NULL, &timeout, NULL) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for disable_monitor failed\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+
+
+/*
+  sent to a node to make it take over an ip address
+
+  AF_INET addresses are repacked into a struct ctdb_public_ipv4 and sent
+  with CTDB_CONTROL_TAKEOVER_IPv4; everything else is sent unchanged
+  with CTDB_CONTROL_TAKEOVER_IP.  Returns 0 on success, -1 otherwise.
+*/
+int ctdb_ctrl_takeover_ip(struct ctdb_context *ctdb, struct timeval timeout,
+                         uint32_t destnode, struct ctdb_public_ip *ip)
+{
+       struct ctdb_public_ipv4 legacy;
+       TDB_DATA payload;
+       uint32_t opcode;
+       int32_t status;
+       int rc;
+
+       if (ip->addr.sa.sa_family == AF_INET) {
+               legacy.pnn = ip->pnn;
+               legacy.sin = ip->addr.ip;
+
+               payload.dptr  = (uint8_t *)&legacy;
+               payload.dsize = sizeof(legacy);
+               opcode = CTDB_CONTROL_TAKEOVER_IPv4;
+       } else {
+               payload.dptr  = (uint8_t *)ip;
+               payload.dsize = sizeof(*ip);
+               opcode = CTDB_CONTROL_TAKEOVER_IP;
+       }
+
+       rc = ctdb_control(ctdb, destnode, 0, opcode, 0, payload, NULL,
+                         NULL, &status, &timeout, NULL);
+       if (rc == 0 && status == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for takeover_ip failed\n"));
+       return -1;
+}
+
+
+/*
+  sent to a node to make it release an ip address
+
+  AF_INET addresses are repacked into a struct ctdb_public_ipv4 and sent
+  with CTDB_CONTROL_RELEASE_IPv4; everything else is sent unchanged
+  with CTDB_CONTROL_RELEASE_IP.  Returns 0 on success, -1 otherwise.
+*/
+int ctdb_ctrl_release_ip(struct ctdb_context *ctdb, struct timeval timeout,
+                        uint32_t destnode, struct ctdb_public_ip *ip)
+{
+       struct ctdb_public_ipv4 legacy;
+       TDB_DATA payload;
+       uint32_t opcode;
+       int32_t status;
+       int rc;
+
+       if (ip->addr.sa.sa_family == AF_INET) {
+               legacy.pnn = ip->pnn;
+               legacy.sin = ip->addr.ip;
+
+               payload.dptr  = (uint8_t *)&legacy;
+               payload.dsize = sizeof(legacy);
+               opcode = CTDB_CONTROL_RELEASE_IPv4;
+       } else {
+               payload.dptr  = (uint8_t *)ip;
+               payload.dsize = sizeof(*ip);
+               opcode = CTDB_CONTROL_RELEASE_IP;
+       }
+
+       rc = ctdb_control(ctdb, destnode, 0, opcode, 0, payload, NULL,
+                         NULL, &status, &timeout, NULL);
+       if (rc == 0 && status == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for release_ip failed\n"));
+       return -1;
+}
+
+
+/*
+  get a tunable
+
+  Builds a struct ctdb_control_get_tunable carrying the NUL terminated
+  name, sends CTDB_CONTROL_GET_TUNABLE and stores the uint32_t reply in
+  *value.  On a failed control the non-zero transport result, or else
+  the remote status, is returned; a malformed reply yields -1.
+ */
+int ctdb_ctrl_get_tunable(struct ctdb_context *ctdb,
+                         struct timeval timeout,
+                         uint32_t destnode,
+                         const char *name, uint32_t *value)
+{
+       const size_t name_len = strlen(name) + 1;
+       struct ctdb_control_get_tunable *req;
+       TDB_DATA data, reply;
+       int32_t status;
+       int rc;
+
+       data.dsize = offsetof(struct ctdb_control_get_tunable, name) + name_len;
+       data.dptr  = talloc_size(ctdb, data.dsize);
+       CTDB_NO_MEMORY(ctdb, data.dptr);
+
+       req = (struct ctdb_control_get_tunable *)data.dptr;
+       req->length = name_len;
+       memcpy(req->name, name, req->length);
+
+       rc = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_GET_TUNABLE, 0, data, ctdb,
+                         &reply, &status, &timeout, NULL);
+       /* the request buffer is not needed past this point */
+       talloc_free(data.dptr);
+
+       if (rc != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get_tunable failed\n"));
+               return rc;
+       }
+       if (status != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get_tunable failed\n"));
+               return status;
+       }
+
+       if (reply.dsize == sizeof(uint32_t)) {
+               *value = *(uint32_t *)reply.dptr;
+               talloc_free(reply.dptr);
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,("Invalid return data in get_tunable\n"));
+       talloc_free(reply.dptr);
+       return -1;
+}
+
+/*
+  set a tunable
+
+  Builds a struct ctdb_control_set_tunable with the NUL terminated name
+  and the new value and sends it with CTDB_CONTROL_SET_TUNABLE.
+  Returns 0 on success, -1 (after logging) otherwise.
+ */
+int ctdb_ctrl_set_tunable(struct ctdb_context *ctdb,
+                         struct timeval timeout,
+                         uint32_t destnode,
+                         const char *name, uint32_t value)
+{
+       const size_t name_len = strlen(name) + 1;
+       struct ctdb_control_set_tunable *req;
+       TDB_DATA data;
+       int32_t status;
+       int rc;
+
+       data.dsize = offsetof(struct ctdb_control_set_tunable, name) + name_len;
+       data.dptr  = talloc_size(ctdb, data.dsize);
+       CTDB_NO_MEMORY(ctdb, data.dptr);
+
+       req = (struct ctdb_control_set_tunable *)data.dptr;
+       req->length = name_len;
+       memcpy(req->name, name, req->length);
+       req->value = value;
+
+       rc = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_SET_TUNABLE, 0, data, NULL,
+                         NULL, &status, &timeout, NULL);
+       /* the request buffer is not needed past this point */
+       talloc_free(data.dptr);
+
+       if (rc == 0 && status == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set_tunable failed\n"));
+       return -1;
+}
+
+/*
+  list tunables
+
+  Fetches the ':' separated list of tunable names from destnode and
+  returns it as a talloc'ed array of strings in *list (child of mem_ctx,
+  strings are children of the array) with the entry count in *count.
+  Returns 0 on success, -1 on failure; when an allocation fails while
+  the list is being built, *list is reset to NULL and *count to 0 and
+  nothing is left allocated on mem_ctx.
+ */
+int ctdb_ctrl_list_tunables(struct ctdb_context *ctdb,
+                           struct timeval timeout,
+                           uint32_t destnode,
+                           TALLOC_CTX *mem_ctx,
+                           const char ***list, uint32_t *count)
+{
+       TDB_DATA outdata;
+       int32_t res;
+       int ret;
+       struct ctdb_control_list_tunable *t;
+       char *p, *s, *ptr;
+
+       ret = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_LIST_TUNABLES, 0, tdb_null,
+                          mem_ctx, &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for list_tunables failed\n"));
+               return -1;
+       }
+
+       t = (struct ctdb_control_list_tunable *)outdata.dptr;
+       if (outdata.dsize < offsetof(struct ctdb_control_list_tunable, data) ||
+           t->length > outdata.dsize-offsetof(struct ctdb_control_list_tunable, data)) {
+               DEBUG(DEBUG_ERR,("Invalid data in list_tunables reply\n"));
+               talloc_free(outdata.dptr);
+               return -1;
+       }
+
+       p = talloc_strndup(mem_ctx, (char *)t->data, t->length);
+       /* the reply is no longer needed whether or not the copy worked;
+          freeing it first keeps it from leaking on the failure path */
+       talloc_free(outdata.dptr);
+       CTDB_NO_MEMORY(ctdb, p);
+
+       (*list) = NULL;
+       (*count) = 0;
+
+       for (s=strtok_r(p, ":", &ptr); s; s=strtok_r(NULL, ":", &ptr)) {
+               const char **grown;
+               char *entry;
+
+               /* grow into a temporary so a failed realloc does not lose
+                  the array built so far */
+               grown = talloc_realloc(mem_ctx, *list, const char *, 1+(*count));
+               if (grown == NULL) {
+                       talloc_free(*list);
+                       (*list) = NULL;
+                       (*count) = 0;
+                       talloc_free(p);
+               }
+               CTDB_NO_MEMORY(ctdb, grown);
+               (*list) = grown;
+
+               entry = talloc_strdup(grown, s);
+               if (entry == NULL) {
+                       /* frees the array together with all earlier entries */
+                       talloc_free(grown);
+                       (*list) = NULL;
+                       (*count) = 0;
+                       talloc_free(p);
+               }
+               CTDB_NO_MEMORY(ctdb, entry);
+
+               grown[*count] = entry;
+               (*count)++;
+       }
+
+       talloc_free(p);
+
+       return 0;
+}
+
+
+/*
+  get the list of public ips of a node
+
+  Sends CTDB_CONTROL_GET_PUBLIC_IPS with the given control flags.  When
+  the remote side answers with status -1 the ipv4-only control is tried
+  instead.  On success *ips is a talloc'ed copy of the reply (child of
+  mem_ctx) and 0 is returned; -1 is returned on failure.
+ */
+int ctdb_ctrl_get_public_ips_flags(struct ctdb_context *ctdb,
+                                  struct timeval timeout, uint32_t destnode,
+                                  TALLOC_CTX *mem_ctx,
+                                  uint32_t flags,
+                                  struct ctdb_all_public_ips **ips)
+{
+       int ret;
+       TDB_DATA outdata;
+       int32_t res;
+       struct ctdb_all_public_ips *reply;
+       size_t hdrlen = offsetof(struct ctdb_all_public_ips, ips);
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_GET_PUBLIC_IPS, flags, tdb_null,
+                          mem_ctx, &outdata, &res, &timeout, NULL);
+       if (ret == 0 && res == -1) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control to get public ips failed, falling back to ipv4-only version\n"));
+               return ctdb_ctrl_get_public_ipsv4(ctdb, timeout, destnode, mem_ctx, ips);
+       }
+       if (ret != 0 || res != 0) {
+         DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getpublicips failed ret:%d res:%d\n", ret, res));
+               return -1;
+       }
+
+       /* callers index ips[0..num-1], so make sure the reply really
+          contains that many entries before handing it out */
+       reply = (struct ctdb_all_public_ips *)outdata.dptr;
+       if (outdata.dsize < hdrlen ||
+           reply->num > (outdata.dsize - hdrlen) / sizeof(struct ctdb_public_ip)) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getpublicips returned invalid data with size %u\n",
+                                (unsigned int)outdata.dsize));
+               talloc_free(outdata.dptr);
+               return -1;
+       }
+
+       *ips = (struct ctdb_all_public_ips *)talloc_memdup(mem_ctx, outdata.dptr, outdata.dsize);
+       talloc_free(outdata.dptr);
+       if (*ips == NULL) {
+               /* do not report success with a NULL result */
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getpublicips talloc_memdup size %u failed\n",
+                                (unsigned int)outdata.dsize));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  get the list of public ips of a node, using no control flags
+
+  Shorthand for ctdb_ctrl_get_public_ips_flags() with flags == 0.
+ */
+int ctdb_ctrl_get_public_ips(struct ctdb_context *ctdb,
+                            struct timeval timeout, uint32_t destnode,
+                            TALLOC_CTX *mem_ctx,
+                            struct ctdb_all_public_ips **ips)
+{
+       const uint32_t no_flags = 0;
+
+       return ctdb_ctrl_get_public_ips_flags(ctdb, timeout, destnode,
+                                             mem_ctx, no_flags, ips);
+}
+
+/*
+  get the list of public ips of a node using the ipv4-only control
+
+  Sends CTDB_CONTROL_GET_PUBLIC_IPSv4 and converts the reply into a
+  zero-initialised struct ctdb_all_public_ips (child of mem_ctx), copying
+  pnn and the ipv4 address of every entry.  Returns 0 on success, -1 on
+  failure.
+ */
+int ctdb_ctrl_get_public_ipsv4(struct ctdb_context *ctdb,
+                       struct timeval timeout, uint32_t destnode,
+                       TALLOC_CTX *mem_ctx, struct ctdb_all_public_ips **ips)
+{
+       int ret;
+       uint32_t i;
+       size_t len;
+       size_t hdrlen = offsetof(struct ctdb_all_public_ipsv4, ips);
+       TDB_DATA outdata;
+       int32_t res;
+       struct ctdb_all_public_ipsv4 *ipsv4;
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_GET_PUBLIC_IPSv4, 0, tdb_null,
+                          mem_ctx, &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getpublicips failed\n"));
+               return -1;
+       }
+
+       /* num comes from the wire: make sure the reply really holds that
+          many entries before they are read below */
+       ipsv4 = (struct ctdb_all_public_ipsv4 *)outdata.dptr;
+       if (outdata.dsize < hdrlen ||
+           ipsv4->num > (outdata.dsize - hdrlen) / sizeof(ipsv4->ips[0])) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getpublicips returned invalid data with size %u\n",
+                                (unsigned int)outdata.dsize));
+               talloc_free(outdata.dptr);
+               return -1;
+       }
+
+       len = offsetof(struct ctdb_all_public_ips, ips) +
+               ipsv4->num*sizeof(struct ctdb_public_ip);
+       *ips = talloc_zero_size(mem_ctx, len);
+       if (*ips == NULL) {
+               /* release the reply before the check below bails out */
+               talloc_free(outdata.dptr);
+       }
+       CTDB_NO_MEMORY(ctdb, *ips);
+       (*ips)->num = ipsv4->num;
+       for (i=0; i<ipsv4->num; i++) {
+               (*ips)->ips[i].pnn     = ipsv4->ips[i].pnn;
+               (*ips)->ips[i].addr.ip = ipsv4->ips[i].sin;
+       }
+
+       talloc_free(outdata.dptr);
+
+       return 0;
+}
+
+/*
+  get the interface information for one public ip
+
+  Sends *addr with CTDB_CONTROL_GET_PUBLIC_IP_INFO, validates that the
+  reply is large enough for the number of interfaces it announces,
+  forces NUL termination of every interface name and returns a talloc'ed
+  copy (child of mem_ctx) in *_info.  Returns 0 on success, -1 on
+  failure.
+ */
+int ctdb_ctrl_get_public_ip_info(struct ctdb_context *ctdb,
+                                struct timeval timeout, uint32_t destnode,
+                                TALLOC_CTX *mem_ctx,
+                                const ctdb_sock_addr *addr,
+                                struct ctdb_control_public_ip_info **_info)
+{
+       int ret;
+       TDB_DATA indata;
+       TDB_DATA outdata;
+       int32_t res;
+       struct ctdb_control_public_ip_info *info;
+       uint32_t len;
+       uint64_t need;
+       uint32_t i;
+
+       indata.dptr = discard_const_p(uint8_t, addr);
+       indata.dsize = sizeof(*addr);
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_GET_PUBLIC_IP_INFO, 0, indata,
+                          mem_ctx, &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get public ip info "
+                               "failed ret:%d res:%d\n",
+                               ret, res));
+               return -1;
+       }
+
+       len = offsetof(struct ctdb_control_public_ip_info, ifaces);
+       if (len > outdata.dsize) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get public ip info "
+                               "returned invalid data with size %u > %u\n",
+                               (unsigned int)outdata.dsize,
+                               (unsigned int)len));
+               dump_data(DEBUG_DEBUG, outdata.dptr, outdata.dsize);
+               /* the reply lives on the caller's mem_ctx; do not leak it */
+               talloc_free(outdata.dptr);
+               return -1;
+       }
+
+       info = (struct ctdb_control_public_ip_info *)outdata.dptr;
+       /* computed in 64 bit: a huge num must not wrap the required
+          size and slip past the bounds check */
+       need = (uint64_t)len + (uint64_t)info->num*sizeof(struct ctdb_control_iface_info);
+
+       if (need > outdata.dsize) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get public ip info "
+                               "returned invalid data with size %u > %u\n",
+                               (unsigned int)outdata.dsize,
+                               (unsigned int)need));
+               dump_data(DEBUG_DEBUG, outdata.dptr, outdata.dsize);
+               talloc_free(outdata.dptr);
+               return -1;
+       }
+
+       /* make sure we null terminate the returned strings */
+       for (i=0; i < info->num; i++) {
+               info->ifaces[i].name[CTDB_IFACE_SIZE] = '\0';
+       }
+
+       *_info = (struct ctdb_control_public_ip_info *)talloc_memdup(mem_ctx,
+                                                               outdata.dptr,
+                                                               outdata.dsize);
+       talloc_free(outdata.dptr);
+       if (*_info == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get public ip info "
+                               "talloc_memdup size %u failed\n",
+                               (unsigned int)outdata.dsize));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  get the list of interfaces of a node
+
+  Sends CTDB_CONTROL_GET_IFACES, validates that the reply is large
+  enough for the number of interfaces it announces, forces NUL
+  termination of every interface name and returns a talloc'ed copy
+  (child of mem_ctx) in *_ifaces.  Returns 0 on success, -1 on failure.
+ */
+int ctdb_ctrl_get_ifaces(struct ctdb_context *ctdb,
+                        struct timeval timeout, uint32_t destnode,
+                        TALLOC_CTX *mem_ctx,
+                        struct ctdb_control_get_ifaces **_ifaces)
+{
+       int ret;
+       TDB_DATA outdata;
+       int32_t res;
+       struct ctdb_control_get_ifaces *ifaces;
+       uint32_t len;
+       uint64_t need;
+       uint32_t i;
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_GET_IFACES, 0, tdb_null,
+                          mem_ctx, &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces "
+                               "failed ret:%d res:%d\n",
+                               ret, res));
+               return -1;
+       }
+
+       len = offsetof(struct ctdb_control_get_ifaces, ifaces);
+       if (len > outdata.dsize) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces "
+                               "returned invalid data with size %u > %u\n",
+                               (unsigned int)outdata.dsize,
+                               (unsigned int)len));
+               dump_data(DEBUG_DEBUG, outdata.dptr, outdata.dsize);
+               /* the reply lives on the caller's mem_ctx; do not leak it */
+               talloc_free(outdata.dptr);
+               return -1;
+       }
+
+       ifaces = (struct ctdb_control_get_ifaces *)outdata.dptr;
+       /* computed in 64 bit: a huge num must not wrap the required
+          size and slip past the bounds check */
+       need = (uint64_t)len + (uint64_t)ifaces->num*sizeof(struct ctdb_control_iface_info);
+
+       if (need > outdata.dsize) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces "
+                               "returned invalid data with size %u > %u\n",
+                               (unsigned int)outdata.dsize,
+                               (unsigned int)need));
+               dump_data(DEBUG_DEBUG, outdata.dptr, outdata.dsize);
+               talloc_free(outdata.dptr);
+               return -1;
+       }
+
+       /* make sure we null terminate the returned strings */
+       for (i=0; i < ifaces->num; i++) {
+               ifaces->ifaces[i].name[CTDB_IFACE_SIZE] = '\0';
+       }
+
+       *_ifaces = (struct ctdb_control_get_ifaces *)talloc_memdup(mem_ctx,
+                                                                 outdata.dptr,
+                                                                 outdata.dsize);
+       talloc_free(outdata.dptr);
+       if (*_ifaces == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ifaces "
+                               "talloc_memdup size %u failed\n",
+                               (unsigned int)outdata.dsize));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  set the link state of an interface on a node
+
+  Sends *info as the payload of CTDB_CONTROL_SET_IFACE_LINK_STATE.
+  Returns 0 on success, -1 (after logging) otherwise.
+ */
+int ctdb_ctrl_set_iface_link(struct ctdb_context *ctdb,
+                            struct timeval timeout, uint32_t destnode,
+                            TALLOC_CTX *mem_ctx,
+                            const struct ctdb_control_iface_info *info)
+{
+       TDB_DATA payload;
+       int32_t res;
+       int ret;
+
+       payload.dsize = sizeof(*info);
+       payload.dptr  = discard_const_p(uint8_t, info);
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_SET_IFACE_LINK_STATE, 0, payload,
+                          mem_ctx, NULL, &res, &timeout, NULL);
+       if (ret == 0 && res == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set iface link "
+                       "failed ret:%d res:%d\n",
+                       ret, res));
+       return -1;
+}
+
+/*
+  set and/or clear node flags for destnode
+
+  The current flags of destnode are read from the nodemap held by the
+  recovery master, the bits in 'set' are or'ed in, the bits in 'clear'
+  are masked out, and the resulting old/new flag pair is sent to every
+  connected node with CTDB_CONTROL_MODIFY_FLAGS.
+
+  returns 0 on success and non-zero on failure (the return value of
+  ctdb_ctrl_getrecmaster() if that call fails, -1 for later failures)
+ */
+int ctdb_ctrl_modflags(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode,
+                      uint32_t set, uint32_t clear)
+{
+       int ret;
+       TDB_DATA data;
+       struct ctdb_node_map *nodemap=NULL;
+       struct ctdb_node_flag_change c;
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       uint32_t recmaster;
+       uint32_t *nodes;
+
+
+       /* find the recovery master */
+       ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, timeout, CTDB_CURRENT_NODE, &recmaster);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
+               talloc_free(tmp_ctx);
+               return ret;
+       }
+
+
+       /* read the node flags from the recmaster */
+       ret = ctdb_ctrl_getnodemap(ctdb, timeout, recmaster, tmp_ctx, &nodemap);
+       if (ret != 0) {
+               /* report the node that was actually queried, not destnode */
+               DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", recmaster));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+       /* destnode is used as an index into the nodemap below */
+       if (destnode >= nodemap->num) {
+               DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %u\n", destnode));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       c.pnn       = destnode;
+       c.old_flags = nodemap->nodes[destnode].flags;
+       c.new_flags = c.old_flags;
+       c.new_flags |= set;
+       c.new_flags &= ~clear;
+
+       /* 'c' lives on the stack; the async control below completes
+          before this function returns */
+       data.dsize = sizeof(c);
+       data.dptr = (unsigned char *)&c;
+
+       /* send the flags update to all connected nodes */
+       nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
+
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
+                                       nodes, 0,
+                                       timeout, false, data,
+                                       NULL, NULL,
+                                       NULL) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
+
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+
+/*
+  get all tunables
+
+  On success the reply is copied by value into *tunables.  The reply
+  buffer is allocated on the ctdb context, so it is released on every
+  path after the control has succeeded; otherwise each rejected reply
+  would stay allocated for the lifetime of that context.
+
+  returns 0 on success, -1 if the control fails or the reply does not
+  have exactly sizeof(struct ctdb_tunable) bytes
+ */
+int ctdb_ctrl_get_all_tunables(struct ctdb_context *ctdb,
+                              struct timeval timeout,
+                              uint32_t destnode,
+                              struct ctdb_tunable *tunables)
+{
+       TDB_DATA outdata;
+       int ret;
+       int32_t res;
+
+       ret = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_GET_ALL_TUNABLES, 0, tdb_null, ctdb,
+                          &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get all tunables failed\n"));
+               return -1;
+       }
+
+       if (outdata.dsize != sizeof(*tunables)) {
+               DEBUG(DEBUG_ERR,(__location__ " bad data size %u in ctdb_ctrl_get_all_tunables should be %u\n",
+                        (unsigned)outdata.dsize, (unsigned)sizeof(*tunables)));
+               /* do not leak the rejected reply on the ctdb context */
+               talloc_free(outdata.dptr);
+               return -1;
+       }
+
+       *tunables = *(struct ctdb_tunable *)outdata.dptr;
+       talloc_free(outdata.dptr);
+       return 0;
+}
+
+/*
+  add a public address to a node
+
+  'pub' is sent as-is: everything up to its iface member followed by
+  pub->len bytes of that member.
+
+  returns 0 on success, -1 on failure
+ */
+int ctdb_ctrl_add_public_ip(struct ctdb_context *ctdb,
+                     struct timeval timeout,
+                     uint32_t destnode,
+                     struct ctdb_control_ip_iface *pub)
+{
+       TDB_DATA payload;
+       int32_t status;
+       int rc;
+
+       payload.dptr  = (unsigned char *)pub;
+       payload.dsize = offsetof(struct ctdb_control_ip_iface, iface) + pub->len;
+
+       rc = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_ADD_PUBLIC_IP, 0,
+                         payload, NULL, NULL, &status, &timeout, NULL);
+       if (rc == 0 && status == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for add_public_ip failed\n"));
+       return -1;
+}
+
+/*
+  delete a public address from a node
+
+  'pub' is sent as-is: everything up to its iface member followed by
+  pub->len bytes of that member.
+
+  returns 0 on success, -1 on failure
+ */
+int ctdb_ctrl_del_public_ip(struct ctdb_context *ctdb,
+                     struct timeval timeout,
+                     uint32_t destnode,
+                     struct ctdb_control_ip_iface *pub)
+{
+       TDB_DATA payload;
+       int32_t status;
+       int rc;
+
+       payload.dptr  = (unsigned char *)pub;
+       payload.dsize = offsetof(struct ctdb_control_ip_iface, iface) + pub->len;
+
+       rc = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_DEL_PUBLIC_IP, 0,
+                         payload, NULL, NULL, &status, &timeout, NULL);
+       if (rc == 0 && status == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for del_public_ip failed\n"));
+       return -1;
+}
+
+/*
+  kill a tcp connection
+
+  the connection is described by *killtcp, which is sent unchanged to
+  destnode
+
+  returns 0 on success, -1 on failure
+ */
+int ctdb_ctrl_killtcp(struct ctdb_context *ctdb,
+                     struct timeval timeout,
+                     uint32_t destnode,
+                     struct ctdb_control_killtcp *killtcp)
+{
+       TDB_DATA payload;
+       int32_t status;
+       int rc;
+
+       payload.dptr  = (unsigned char *)killtcp;
+       payload.dsize = sizeof(*killtcp);
+
+       rc = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_KILL_TCP, 0,
+                         payload, NULL, NULL, &status, &timeout, NULL);
+       if (rc == 0 && status == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for killtcp failed\n"));
+       return -1;
+}
+
+/*
+  send a gratuitous arp
+
+  Builds a struct ctdb_control_gratious_arp holding *addr followed by
+  the NUL-terminated interface name and sends it to destnode.
+
+  returns 0 on success, -1 on failure
+ */
+int ctdb_ctrl_gratious_arp(struct ctdb_context *ctdb,
+                     struct timeval timeout,
+                     uint32_t destnode,
+                     ctdb_sock_addr *addr,
+                     const char *ifname)
+{
+       TDB_DATA data;
+       int32_t res;
+       int ret, len;
+       struct ctdb_control_gratious_arp *gratious_arp;
+       TALLOC_CTX *tmp_ctx;
+
+       /* scratch context for the request buffer */
+       tmp_ctx = talloc_new(ctdb);
+       CTDB_NO_MEMORY(ctdb, tmp_ctx);
+
+       /* the name is sent including its terminating NUL */
+       len = strlen(ifname)+1;
+       gratious_arp = talloc_size(tmp_ctx,
+               offsetof(struct ctdb_control_gratious_arp, iface) + len);
+       if (gratious_arp == NULL) {
+               /*
+                * CTDB_NO_MEMORY (defined outside this chunk) is used
+                * below as the bail-out for a failed allocation; drop
+                * the scratch context first so it is not left hanging
+                * off ctdb.
+                */
+               talloc_free(tmp_ctx);
+       }
+       CTDB_NO_MEMORY(ctdb, gratious_arp);
+
+       gratious_arp->addr = *addr;
+       gratious_arp->len = len;
+       memcpy(&gratious_arp->iface[0], ifname, len);
+
+
+       data.dsize = offsetof(struct ctdb_control_gratious_arp, iface) + len;
+       data.dptr  = (unsigned char *)gratious_arp;
+
+       ret = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_SEND_GRATIOUS_ARP, 0, data, NULL,
+                          NULL, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for gratious_arp failed\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+/*
+  get a list of all tcp tickles that a node knows about for a particular vnn
+
+  On success *list points at the reply buffer, which ctdb_control() was
+  asked to allocate on mem_ctx.
+  NOTE(review): the size of the reply is not validated here - callers
+  should confirm it before walking the list.
+
+  returns 0 on success, -1 on failure
+ */
+int ctdb_ctrl_get_tcp_tickles(struct ctdb_context *ctdb,
+                             struct timeval timeout, uint32_t destnode,
+                             TALLOC_CTX *mem_ctx,
+                             ctdb_sock_addr *addr,
+                             struct ctdb_control_tcp_tickle_list **list)
+{
+       int ret;
+       TDB_DATA data, outdata;
+       int32_t status;
+
+       data.dptr = (uint8_t*)addr;
+       data.dsize = sizeof(ctdb_sock_addr);
+
+       /* honour the caller's timeout like the other wrappers in this
+          file do, instead of dropping it */
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_GET_TCP_TICKLE_LIST, 0, data,
+                          mem_ctx, &outdata, &status, &timeout, NULL);
+       if (ret != 0 || status != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get tcp tickles failed\n"));
+               return -1;
+       }
+
+       *list = (struct ctdb_control_tcp_tickle_list *)outdata.dptr;
+
+       /* status is known to be 0 at this point */
+       return 0;
+}
+
+/*
+  register a server id
+
+  the control is always sent to CTDB_CURRENT_NODE
+
+  returns 0 on success, -1 on failure
+ */
+int ctdb_ctrl_register_server_id(struct ctdb_context *ctdb,
+                     struct timeval timeout,
+                     struct ctdb_server_id *id)
+{
+       TDB_DATA payload;
+       int32_t status;
+       int rc;
+
+       payload.dptr  = (unsigned char *)id;
+       payload.dsize = sizeof(*id);
+
+       rc = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0,
+                       CTDB_CONTROL_REGISTER_SERVER_ID,
+                       0, payload, NULL,
+                       NULL, &status, &timeout, NULL);
+       if (rc == 0 && status == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for register server id failed\n"));
+       return -1;
+}
+
+/*
+  unregister a server id
+
+  the control is always sent to CTDB_CURRENT_NODE
+
+  returns 0 on success, -1 on failure
+ */
+int ctdb_ctrl_unregister_server_id(struct ctdb_context *ctdb,
+                     struct timeval timeout,
+                     struct ctdb_server_id *id)
+{
+       TDB_DATA payload;
+       int32_t status;
+       int rc;
+
+       payload.dptr  = (unsigned char *)id;
+       payload.dsize = sizeof(*id);
+
+       rc = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0,
+                       CTDB_CONTROL_UNREGISTER_SERVER_ID,
+                       0, payload, NULL,
+                       NULL, &status, &timeout, NULL);
+       if (rc == 0 && status == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for unregister server id failed\n"));
+       return -1;
+}
+
+
+/*
+  check if a server id exists
+
+  The answer is carried in the control's result code: a non-zero
+  result is reported as *status == 1, a zero result as *status == 0.
+
+  returns 0 when the control call itself succeeded (*status is then
+  valid), -1 otherwise
+ */
+int ctdb_ctrl_check_server_id(struct ctdb_context *ctdb,
+                     struct timeval timeout,
+                     uint32_t destnode,
+                     struct ctdb_server_id *id,
+                     uint32_t *status)
+{
+       TDB_DATA payload;
+       int32_t answer;
+       int rc;
+
+       payload.dptr  = (unsigned char *)id;
+       payload.dsize = sizeof(*id);
+
+       rc = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_CHECK_SERVER_ID,
+                       0, payload, NULL,
+                       NULL, &answer, &timeout, NULL);
+       if (rc != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for check server id failed\n"));
+               return -1;
+       }
+
+       *status = (answer != 0) ? 1 : 0;
+       return 0;
+}
+
+/*
+   get the list of server ids that are registered on a node
+
+   on success *svid_list is the reply buffer, moved onto mem_ctx
+
+   returns 0 on success, -1 on failure
+*/
+int ctdb_ctrl_get_server_id_list(struct ctdb_context *ctdb,
+               TALLOC_CTX *mem_ctx,
+               struct timeval timeout, uint32_t destnode,
+               struct ctdb_server_id_list **svid_list)
+{
+       TDB_DATA reply;
+       int32_t status;
+       int rc;
+
+       rc = ctdb_control(ctdb, destnode, 0,
+                         CTDB_CONTROL_GET_SERVER_ID_LIST, 0, tdb_null,
+                         mem_ctx, &reply, &status, &timeout, NULL);
+       if (rc == 0 && status == 0) {
+               *svid_list = (struct ctdb_server_id_list *)talloc_steal(mem_ctx, reply.dptr);
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get_server_id_list failed\n"));
+       return -1;
+}
+
+/*
+  initialise the ctdb daemon for client applications
+
+  Allocates a zeroed ctdb_context as a talloc child of 'ev', attaches
+  the event context and an idr tree, sets the socket name to CTDB_PATH
+  and records the statistics start time.
+
+  NOTE: In current code the daemon does not fork. This is for testing purposes only
+  and to simplify the code.
+
+  returns the new context, or NULL on failure
+*/
+struct ctdb_context *ctdb_init(struct event_context *ev)
+{
+       struct ctdb_context *ctdb;
+
+       ctdb = talloc_zero(ev, struct ctdb_context);
+       if (ctdb == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " talloc_zero failed.\n"));
+               return NULL;
+       }
+
+       ctdb->ev  = ev;
+       /* Wrap early to exercise code. */
+       ctdb->lastid = INT_MAX-200;
+
+       ctdb->idr = idr_init(ctdb);
+       CTDB_NO_MEMORY_NULL(ctdb, ctdb->idr);
+
+       if (ctdb_set_socketname(ctdb, CTDB_PATH) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_set_socketname failed.\n"));
+               talloc_free(ctdb);
+               return NULL;
+       }
+
+       ctdb->statistics.statistics_start_time = timeval_current();
+
+       return ctdb;
+}
+
+
+/*
+  set some ctdb flags
+
+  the given bits are or'ed into ctdb->flags; bits that are already set
+  are left alone and nothing is ever cleared here
+*/
+void ctdb_set_flags(struct ctdb_context *ctdb, unsigned flags)
+{
+       ctdb->flags = ctdb->flags | flags;
+}
+
+/*
+  setup the local socket name
+
+  stores a talloc copy of 'socketname' (owned by ctdb) in
+  ctdb->daemon.name; a previously stored name is not freed here
+
+  returns 0 on success
+  NOTE(review): CTDB_NO_MEMORY is defined outside this chunk - it
+  presumably logs and returns -1 when the copy could not be allocated
+*/
+int ctdb_set_socketname(struct ctdb_context *ctdb, const char *socketname)
+{
+       ctdb->daemon.name = talloc_strdup(ctdb, socketname);
+       CTDB_NO_MEMORY(ctdb, ctdb->daemon.name);
+
+       return 0;
+}
+
+/*
+  return the socket name stored in the context (ctdb->daemon.name);
+  the string stays owned by ctdb
+*/
+const char *ctdb_get_socketname(struct ctdb_context *ctdb)
+{
+       const char *name = ctdb->daemon.name;
+
+       return name;
+}
+
+/*
+  return the pnn of this node, as stored in ctdb->pnn
+*/
+uint32_t ctdb_get_pnn(struct ctdb_context *ctdb)
+{
+       uint32_t pnn = ctdb->pnn;
+
+       return pnn;
+}
+
+
+/*
+  get the uptime of a remote node
+
+  this is the send half: it issues CTDB_CONTROL_UPTIME to destnode and
+  returns whatever ctdb_control_send() returns, to be handed to
+  ctdb_ctrl_uptime_recv()
+ */
+struct ctdb_client_control_state *
+ctdb_ctrl_uptime_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode)
+{
+       struct ctdb_client_control_state *state;
+
+       state = ctdb_control_send(ctdb, destnode, 0,
+                                 CTDB_CONTROL_UPTIME, 0, tdb_null,
+                                 mem_ctx, &timeout, NULL);
+       return state;
+}
+
+/*
+  receive half of the uptime control
+
+  on success *uptime is the reply buffer, moved onto mem_ctx
+
+  returns 0 on success, -1 on failure
+ */
+int ctdb_ctrl_uptime_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, struct ctdb_uptime **uptime)
+{
+       TDB_DATA reply;
+       int32_t status;
+       int rc;
+
+       rc = ctdb_control_recv(ctdb, state, mem_ctx, &reply, &status, NULL);
+       if (rc == 0 && status == 0) {
+               *uptime = (struct ctdb_uptime *)talloc_steal(mem_ctx, reply.dptr);
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_uptime_recv failed\n"));
+       return -1;
+}
+
+/*
+  synchronous uptime query: the send half immediately followed by the
+  receive half; the result of the send is passed on unchecked, so a
+  failed send is left for ctdb_ctrl_uptime_recv() to report
+ */
+int ctdb_ctrl_uptime(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, struct ctdb_uptime **uptime)
+{
+       return ctdb_ctrl_uptime_recv(ctdb, mem_ctx,
+                                    ctdb_ctrl_uptime_send(ctdb, mem_ctx,
+                                                          timeout, destnode),
+                                    uptime);
+}
+
+/*
+  send a control to execute the "recovered" event script on a node
+
+  returns 0 on success, -1 on failure
+ */
+int ctdb_ctrl_end_recovery(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+{
+       int32_t result;
+       int rc;
+
+       rc = ctdb_control(ctdb, destnode, 0,
+                         CTDB_CONTROL_END_RECOVERY, 0, tdb_null,
+                         NULL, NULL, &result, &timeout, NULL);
+       if (rc == 0 && result == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for end_recovery failed\n"));
+       return -1;
+}
+
+/*
+  callback for the async helpers used when sending the same control
+  to multiple nodes in parallel.
+
+  Called once per control.  It decrements the outstanding-reply
+  counter, and then either counts a failure (invoking the optional
+  fail_callback) or collects the reply and invokes the optional success
+  callback.  Note that when ctdb_control_recv() succeeds but the
+  control's result is non-zero, BOTH fail_callback and callback are
+  invoked.
+*/
+static void async_callback(struct ctdb_client_control_state *state)
+{
+       struct client_async_data *data = talloc_get_type(state->async.private_data, struct client_async_data);
+       struct ctdb_context *ctdb = talloc_get_type(state->ctdb, struct ctdb_context);
+       int ret;
+       TDB_DATA outdata;
+       int32_t res = -1;
+       uint32_t destnode = state->c->hdr.destnode;
+
+       /*
+        * outdata is handed to the callbacks by value, including on
+        * paths where ctdb_control_recv() was never called; start with
+        * an empty buffer so they never see indeterminate contents.
+        */
+       outdata.dptr = NULL;
+       outdata.dsize = 0;
+
+       /* one more node has responded */
+       data->count--;
+
+       /* the control did not complete: count the failure and tell the
+          caller, there is no reply to collect
+       */
+       if (state->state != CTDB_CONTROL_DONE) {
+               if ( !data->dont_log_errors) {
+                       DEBUG(DEBUG_ERR,("Async operation failed with state %d, opcode:%u\n", state->state, data->opcode));
+               }
+               data->fail_count++;
+               if (state->state == CTDB_CONTROL_TIMEOUT) {
+                       res = -ETIME;
+               } else {
+                       res = -1;
+               }
+               if (data->fail_callback) {
+                       data->fail_callback(ctdb, destnode, res, outdata,
+                                       data->callback_data);
+               }
+               return;
+       }
+
+       /* unhook ourselves before collecting the reply */
+       state->async.fn = NULL;
+
+       ret = ctdb_control_recv(ctdb, state, data, &outdata, &res, NULL);
+       if ((ret != 0) || (res != 0)) {
+               if ( !data->dont_log_errors) {
+                       DEBUG(DEBUG_ERR,("Async operation failed with ret=%d res=%d opcode=%u\n", ret, (int)res, data->opcode));
+               }
+               data->fail_count++;
+               if (data->fail_callback) {
+                       data->fail_callback(ctdb, destnode, res, outdata,
+                                       data->callback_data);
+               }
+       }
+       if ((ret == 0) && (data->callback != NULL)) {
+               data->callback(ctdb, destnode, res, outdata,
+                                       data->callback_data);
+       }
+}
+
+
+/*
+  attach an in-flight control to an async batch: the batch's
+  outstanding counter is bumped and async_callback() is installed as
+  the control's completion handler with the batch as its private data
+*/
+void ctdb_client_async_add(struct client_async_data *data, struct ctdb_client_control_state *state)
+{
+       /* one more control to wait for to complete */
+       data->count++;
+
+       /* route the completion of this control back to the batch */
+       state->async.private_data = data;
+       state->async.fn = async_callback;
+}
+
+
+/* run the event loop until every control added to 'data' has been
+   accounted for (data->count has dropped to zero)
+
+   NOTE(review): there is no time limit in this loop itself; it
+   presumably relies on each control's own timeout to finish
+
+   returns 0 if no control failed, -1 otherwise
+*/
+int ctdb_client_async_wait(struct ctdb_context *ctdb, struct client_async_data *data)
+{
+       for (;;) {
+               if (data->count <= 0) {
+                       break;
+               }
+               event_loop_once(ctdb->ev);
+       }
+
+       if (data->fail_count == 0) {
+               return 0;
+       }
+
+       if (!data->dont_log_errors) {
+               DEBUG(DEBUG_ERR,("Async wait failed - fail_count=%u\n",
+                        data->fail_count));
+       }
+       return -1;
+}
+
+
+/*
+   perform a simple control on the listed nodes
+   The control cannot return data
+
+   'nodes' must be a talloc allocation: the number of entries is
+   derived from its talloc size.  The same 'data' is sent to every
+   listed node, and the optional callbacks are invoked per node with
+   callback_data.
+
+   returns 0 if every control was sent and none failed, -1 otherwise
+ */
+int ctdb_client_async_control(struct ctdb_context *ctdb,
+                               enum ctdb_controls opcode,
+                               uint32_t *nodes,
+                               uint64_t srvid,
+                               struct timeval timeout,
+                               bool dont_log_errors,
+                               TDB_DATA data,
+                               client_async_callback client_callback,
+                               client_async_callback fail_callback,
+                               void *callback_data)
+{
+       struct client_async_data *async_data;
+       struct ctdb_client_control_state *state;
+       int j, num_nodes;
+       int result = -1;
+
+       async_data = talloc_zero(ctdb, struct client_async_data);
+       CTDB_NO_MEMORY_FATAL(ctdb, async_data);
+       async_data->opcode        = opcode;
+       async_data->dont_log_errors = dont_log_errors;
+       async_data->callback_data = callback_data;
+       async_data->callback = client_callback;
+       async_data->fail_callback = fail_callback;
+
+       num_nodes = talloc_get_size(nodes) / sizeof(uint32_t);
+
+       /* loop over all nodes and send an async control to each of them */
+       for (j = 0; j < num_nodes; j++) {
+               state = ctdb_control_send(ctdb, nodes[j], srvid, opcode,
+                                         0, data, async_data, &timeout, NULL);
+               if (state == NULL) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to call async control %u\n", (unsigned)opcode));
+                       goto done;
+               }
+
+               ctdb_client_async_add(async_data, state);
+       }
+
+       if (ctdb_client_async_wait(ctdb, async_data) == 0) {
+               result = 0;
+       }
+
+done:
+       /* the batch is released on every path */
+       talloc_free(async_data);
+       return result;
+}
+
+/* Build a talloc array (on mem_ctx) of the entries in vnn_map->map.
+ * Entries equal to this node's pnn are left out unless include_self
+ * is set.  The element count is not returned separately.
+ */
+uint32_t *list_of_vnnmap_nodes(struct ctdb_context *ctdb,
+                               struct ctdb_vnn_map *vnn_map,
+                               TALLOC_CTX *mem_ctx,
+                               bool include_self)
+{
+       int i, j, num_nodes;
+       uint32_t *nodes;
+
+       /* first pass: count the entries that will be kept */
+       num_nodes = 0;
+       for (i = 0; i < vnn_map->size; i++) {
+               if (include_self || vnn_map->map[i] != ctdb->pnn) {
+                       num_nodes++;
+               }
+       }
+
+       nodes = talloc_array(mem_ctx, uint32_t, num_nodes);
+       CTDB_NO_MEMORY_FATAL(ctdb, nodes);
+
+       /* second pass: copy them */
+       j = 0;
+       for (i = 0; i < vnn_map->size; i++) {
+               if (include_self || vnn_map->map[i] != ctdb->pnn) {
+                       nodes[j] = vnn_map->map[i];
+                       j++;
+               }
+       }
+
+       return nodes;
+}
+
+/* Get list of nodes not including those with flags specified by mask.
+ * If exclude_pnn is not -1 then exclude that pnn from the list.
+ *
+ * The result is a talloc array on mem_ctx holding the pnn of every
+ * node that passes both filters.
+ */
+uint32_t *list_of_nodes(struct ctdb_context *ctdb,
+                       struct ctdb_node_map *node_map,
+                       TALLOC_CTX *mem_ctx,
+                       uint32_t mask,
+                       int exclude_pnn)
+{
+       int i, j, num_nodes;
+       uint32_t *nodes;
+
+       /* first pass: count the nodes that survive both filters */
+       num_nodes = 0;
+       for (i = 0; i < node_map->num; i++) {
+               if ((node_map->nodes[i].flags & mask) ||
+                   (node_map->nodes[i].pnn == exclude_pnn)) {
+                       continue;
+               }
+               num_nodes++;
+       }
+
+       nodes = talloc_array(mem_ctx, uint32_t, num_nodes);
+       CTDB_NO_MEMORY_FATAL(ctdb, nodes);
+
+       /* second pass: record their pnns */
+       j = 0;
+       for (i = 0; i < node_map->num; i++) {
+               if ((node_map->nodes[i].flags & mask) ||
+                   (node_map->nodes[i].pnn == exclude_pnn)) {
+                       continue;
+               }
+               nodes[j] = node_map->nodes[i].pnn;
+               j++;
+       }
+
+       return nodes;
+}
+
+/* List the nodes in node_map that have none of the NODE_FLAGS_INACTIVE
+ * bits set.  This node is left out unless include_self is set.
+ */
+uint32_t *list_of_active_nodes(struct ctdb_context *ctdb,
+                               struct ctdb_node_map *node_map,
+                               TALLOC_CTX *mem_ctx,
+                               bool include_self)
+{
+       /* -1 tells list_of_nodes() not to exclude any pnn */
+       int exclude = include_self ? -1 : ctdb->pnn;
+
+       return list_of_nodes(ctdb, node_map, mem_ctx,
+                            NODE_FLAGS_INACTIVE, exclude);
+}
+
+/* List the nodes in node_map that do not have NODE_FLAGS_DISCONNECTED
+ * set.  This node is left out unless include_self is set.
+ */
+uint32_t *list_of_connected_nodes(struct ctdb_context *ctdb,
+                               struct ctdb_node_map *node_map,
+                               TALLOC_CTX *mem_ctx,
+                               bool include_self)
+{
+       /* -1 tells list_of_nodes() not to exclude any pnn */
+       int exclude = include_self ? -1 : ctdb->pnn;
+
+       return list_of_nodes(ctdb, node_map, mem_ctx,
+                            NODE_FLAGS_DISCONNECTED, exclude);
+}
+
+/*
+  this is used to test if a pnn lock exists and if it exists will return
+  the number of connections that pnn has reported or -1 if that recovery
+  daemon is not running.
+
+  The lock is probed with F_GETLK on the single byte at offset 'pnn' of
+  fd; if some process holds a conflicting lock there, that byte is read
+  and returned.  -1 is also returned when the probe or the read fails,
+  including a read that delivers no data.
+*/
+int
+ctdb_read_pnn_lock(int fd, int32_t pnn)
+{
+       struct flock lock;
+       char c;
+       ssize_t nread;
+
+       lock.l_type = F_WRLCK;
+       lock.l_whence = SEEK_SET;
+       lock.l_start = pnn;
+       lock.l_len = 1;
+       lock.l_pid = 0;
+
+       if (fcntl(fd, F_GETLK, &lock) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " F_GETLK failed with %s\n", strerror(errno)));
+               return -1;
+       }
+
+       /* F_GETLK reports F_UNLCK when nobody holds a conflicting lock */
+       if (lock.l_type == F_UNLCK) {
+               return -1;
+       }
+
+       nread = pread(fd, &c, 1, pnn);
+       if (nread == -1) {
+               DEBUG(DEBUG_CRIT,(__location__ " failed read pnn count - %s\n", strerror(errno)));
+               return -1;
+       }
+       if (nread != 1) {
+               /* end of file: 'c' was never written, do not return it */
+               DEBUG(DEBUG_CRIT,(__location__ " failed read pnn count - short read\n"));
+               return -1;
+       }
+
+       return c;
+}
+
+/*
+  get capabilities of a remote node
+
+  this is the send half: it issues CTDB_CONTROL_GET_CAPABILITIES to
+  destnode and returns whatever ctdb_control_send() returns, to be
+  handed to ctdb_ctrl_getcapabilities_recv()
+ */
+struct ctdb_client_control_state *
+ctdb_ctrl_getcapabilities_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode)
+{
+       struct ctdb_client_control_state *state;
+
+       state = ctdb_control_send(ctdb, destnode, 0,
+                                 CTDB_CONTROL_GET_CAPABILITIES, 0, tdb_null,
+                                 mem_ctx, &timeout, NULL);
+       return state;
+}
+
+/*
+  receive half of the capabilities control
+
+  The reply is read as a single uint32_t and stored in *capabilities;
+  'capabilities' may be NULL if the caller only wants the status.
+  A reply that is too short to hold a uint32_t is rejected.
+
+  returns 0 on success, -1 on failure
+ */
+int ctdb_ctrl_getcapabilities_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, uint32_t *capabilities)
+{
+       int ret;
+       int32_t res;
+       TDB_DATA outdata;
+
+       ret = ctdb_control_recv(ctdb, state, mem_ctx, &outdata, &res, NULL);
+       if ( (ret != 0) || (res != 0) ) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_getcapabilities_recv failed\n"));
+               return -1;
+       }
+
+       /* never read past the end of what the remote node sent */
+       if (outdata.dptr == NULL || outdata.dsize < sizeof(uint32_t)) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_getcapabilities_recv returned invalid data with size %u\n",
+                                (unsigned int)outdata.dsize));
+               return -1;
+       }
+
+       if (capabilities) {
+               *capabilities = *((uint32_t *)outdata.dptr);
+       }
+
+       return 0;
+}
+
+/*
+  synchronous capabilities query: send followed by receive, using a
+  temporary talloc context that is released before returning
+ */
+int ctdb_ctrl_getcapabilities(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t *capabilities)
+{
+       TALLOC_CTX *scratch = talloc_new(NULL);
+       struct ctdb_client_control_state *pending;
+       int rc;
+
+       pending = ctdb_ctrl_getcapabilities_send(ctdb, scratch, timeout, destnode);
+       rc = ctdb_ctrl_getcapabilities_recv(ctdb, scratch, pending, capabilities);
+
+       talloc_free(scratch);
+       return rc;
+}
+
+/*
+  identity of one lock holder as embedded in struct g_lock_rec
+
+  The g_lock code in this file copies these structures to and from the
+  record as raw bytes, so the field order and sizes are part of the
+  record format.
+ */
+struct server_id {
+       uint64_t pid;           /* server_id_get() stores getpid() here */
+       uint32_t task_id;       /* server_id_get() stores the reqid here */
+       uint32_t vnn;           /* server_id_get() stores this node's pnn here */
+       uint64_t unique_id;     /* server_id_get(): (vnn << 32) | reqid */
+};
+
+/*
+  build the server_id of the calling process for request 'reqid':
+  the process id, the reqid, this node's pnn, and a 64 bit unique id
+  with the pnn in the upper and the reqid in the lower 32 bits
+ */
+static struct server_id server_id_get(struct ctdb_context *ctdb, uint32_t reqid)
+{
+       struct server_id self;
+       uint64_t unique;
+
+       self.pid = getpid();
+       self.task_id = reqid;
+       self.vnn = ctdb_get_pnn(ctdb);
+
+       /* widen before shifting so the pnn is not shifted out */
+       unique = self.vnn;
+       unique = (unique << 32) | reqid;
+       self.unique_id = unique;
+
+       return self;
+}
+
+/*
+  compare two server ids field by field; they are equal only when
+  pid, task_id, vnn and unique_id all match
+ */
+static bool server_id_equal(struct server_id *id1, struct server_id *id2)
+{
+       return (id1->pid == id2->pid) &&
+              (id1->task_id == id2->task_id) &&
+              (id1->vnn == id2->vnn) &&
+              (id1->unique_id == id2->unique_id);
+}
+
+/*
+  ask node id->vnn whether a SERVER_TYPE_SAMBA server id with
+  id->pid is registered there, waiting for the time given by
+  timeval_current_ofs(3,0)
+
+  returns false only when the node answered that the id is not
+  registered; any failure of the query is treated as "exists"
+ */
+static bool server_id_exists(struct ctdb_context *ctdb, struct server_id *id)
+{
+       struct ctdb_server_id query;
+       uint32_t registered;
+       int rc;
+
+       query.type = SERVER_TYPE_SAMBA;
+       query.pnn = id->vnn;
+       query.server_id = id->pid;
+
+       rc = ctdb_ctrl_check_server_id(ctdb, timeval_current_ofs(3,0),
+                                      id->vnn, &query, &registered);
+       if (rc != 0) {
+               /* If control times out, assume server_id exists. */
+               return true;
+       }
+
+       return registered ? true : false;
+}
+
+
+/*
+  kind of lock held by a g_lock_rec
+
+  g_lock_lock() in this file only ever creates G_LOCK_WRITE entries.
+ */
+enum g_lock_type {
+       G_LOCK_READ = 0,
+       G_LOCK_WRITE = 1,
+};
+
+/*
+  one entry of a g_lock record: who holds the lock and how
+
+  A record's data is a plain array of these; g_lock_parse() rejects any
+  record whose size is not a multiple of sizeof(struct g_lock_rec).
+ */
+struct g_lock_rec {
+       enum g_lock_type type;
+       struct server_id id;
+};
+
+/*
+  in-memory form of a g_lock record as produced by g_lock_parse()
+ */
+struct g_lock_recs {
+       unsigned int num;               /* number of entries in lock[] */
+       struct g_lock_rec *lock;        /* NULL when the record was empty */
+};
+
+/*
+  turn the raw data of a g_lock record into a struct g_lock_recs
+
+  An empty record yields num == 0 and lock == NULL.  Otherwise the
+  size must be a whole number of struct g_lock_rec and the entries are
+  duplicated into an array owned by the returned structure, so freeing
+  the structure releases the array as well.
+
+  The result is allocated on mem_ctx and stored in *locks if 'locks'
+  is not NULL.
+
+  returns true on success, false on allocation failure or bad size
+ */
+static bool g_lock_parse(TALLOC_CTX *mem_ctx, TDB_DATA data,
+                        struct g_lock_recs **locks)
+{
+       struct g_lock_recs *recs;
+
+       recs = talloc_zero(mem_ctx, struct g_lock_recs);
+       if (recs == NULL) {
+               return false;
+       }
+
+       if (data.dsize == 0) {
+               goto done;
+       }
+
+       if (data.dsize % sizeof(struct g_lock_rec) != 0) {
+               /* keep a space between the location and the message */
+               DEBUG(DEBUG_ERR, (__location__ " invalid data size %lu in g_lock record\n",
+                                 (unsigned long)data.dsize));
+               talloc_free(recs);
+               return false;
+       }
+
+       recs->num = data.dsize / sizeof(struct g_lock_rec);
+       /* parent the array on recs so both share one lifetime */
+       recs->lock = talloc_memdup(recs, data.dptr, data.dsize);
+       if (recs->lock == NULL) {
+               talloc_free(recs);
+               return false;
+       }
+
+done:
+       if (locks != NULL) {
+               *locks = recs;
+       }
+
+       return true;
+}
+
+
+/*
+  take the g_lock named 'keyname' in ctdb_db on behalf of request
+  'reqid'
+
+  The record stored under keyname (including its terminating NUL) is
+  an array of struct g_lock_rec.  Under the record lock this function:
+    - fails if our own server_id is already listed,
+    - drops entries whose owner is reported as no longer registered,
+    - starts over if a live entry remains (somebody else holds the lock),
+    - otherwise appends a G_LOCK_WRITE entry for us and stores the
+      record.
+
+  Retrying goes on for up to an hour.
+  NOTE(review): there is no delay between retries, so a contended lock
+  is polled back to back - confirm this is intended.
+
+  returns true if the lock was taken, false otherwise
+ */
+static bool g_lock_lock(TALLOC_CTX *mem_ctx,
+                       struct ctdb_db_context *ctdb_db,
+                       const char *keyname, uint32_t reqid)
+{
+       TDB_DATA key, data;
+       struct ctdb_record_handle *h;
+       struct g_lock_recs *locks;
+       struct server_id id;
+       struct timeval t_start;
+       int i;
+
+       key.dptr = (uint8_t *)discard_const(keyname);
+       key.dsize = strlen(keyname) + 1;
+
+       t_start = timeval_current();
+
+again:
+       /* Keep trying for an hour. */
+       if (timeval_elapsed(&t_start) > 3600) {
+               return false;
+       }
+
+       h = ctdb_fetch_lock(ctdb_db, mem_ctx, key, &data);
+       if (h == NULL) {
+               return false;
+       }
+
+       /* the parsed list hangs off h, so freeing h releases it too */
+       if (!g_lock_parse(h, data, &locks)) {
+               DEBUG(DEBUG_ERR, ("g_lock: error parsing locks\n"));
+               talloc_free(data.dptr);
+               talloc_free(h);
+               return false;
+       }
+
+       talloc_free(data.dptr);
+
+       id = server_id_get(ctdb_db->ctdb, reqid);
+
+       i = 0;
+       while (i < locks->num) {
+               if (server_id_equal(&locks->lock[i].id, &id)) {
+                       /* Internal error */
+                       talloc_free(h);
+                       return false;
+               }
+
+               if (!server_id_exists(ctdb_db->ctdb, &locks->lock[i].id)) {
+                       /* stale holder: overwrite it with the last entry
+                          and look at slot i again */
+                       if (i < locks->num-1) {
+                               locks->lock[i] = locks->lock[locks->num-1];
+                       }
+                       locks->num--;
+                       continue;
+               }
+
+               /* This entry is locked: report who holds it, not who
+                  is asking */
+               DEBUG(DEBUG_INFO, ("g_lock: lock already granted for "
+                                  "pid=0x%llx taskid=%x vnn=%d id=0x%llx\n",
+                                  (unsigned long long)locks->lock[i].id.pid,
+                                  locks->lock[i].id.task_id,
+                                  locks->lock[i].id.vnn,
+                                  (unsigned long long)locks->lock[i].id.unique_id));
+               talloc_free(h);
+               goto again;
+       }
+
+       /* nobody else holds it: append our own entry */
+       locks->lock = talloc_realloc(locks, locks->lock, struct g_lock_rec,
+                                    locks->num+1);
+       if (locks->lock == NULL) {
+               talloc_free(h);
+               return false;
+       }
+
+       locks->lock[locks->num].type = G_LOCK_WRITE;
+       locks->lock[locks->num].id = id;
+       locks->num++;
+
+       data.dptr = (uint8_t *)locks->lock;
+       data.dsize = locks->num * sizeof(struct g_lock_rec);
+
+       if (ctdb_record_store(h, data) != 0) {
+               DEBUG(DEBUG_ERR, ("g_lock: failed to write transaction lock for "
+                                 "pid=0x%llx taskid=%x vnn=%d id=0x%llx\n",
+                                 (unsigned long long)id.pid,
+                                 id.task_id, id.vnn,
+                                 (unsigned long long)id.unique_id));
+               talloc_free(h);
+               return false;
+       }
+
+       DEBUG(DEBUG_INFO, ("g_lock: lock granted for "
+                          "pid=0x%llx taskid=%x vnn=%d id=0x%llx\n",
+                          (unsigned long long)id.pid,
+                          id.task_id, id.vnn,
+                          (unsigned long long)id.unique_id));
+
+       talloc_free(h);
+       return true;
+}
+
+/*
+  release the g_lock named 'keyname' held on behalf of request 'reqid'
+
+  Under the record lock, the entry matching our own server_id is
+  removed from the record's list and the shortened list is stored.
+
+  returns true if our entry was found and the record was stored,
+  false otherwise
+ */
+static bool g_lock_unlock(TALLOC_CTX *mem_ctx,
+                         struct ctdb_db_context *ctdb_db,
+                         const char *keyname, uint32_t reqid)
+{
+       struct ctdb_record_handle *h;
+       struct g_lock_recs *locks;
+       struct server_id self;
+       TDB_DATA key, data;
+       bool ok = false;
+       int idx;
+
+       key.dsize = strlen(keyname) + 1;
+       key.dptr = (uint8_t *)discard_const(keyname);
+
+       h = ctdb_fetch_lock(ctdb_db, mem_ctx, key, &data);
+       if (h == NULL) {
+               return false;
+       }
+
+       if (!g_lock_parse(h, data, &locks)) {
+               DEBUG(DEBUG_ERR, ("g_lock: error parsing locks\n"));
+               talloc_free(data.dptr);
+               goto done;
+       }
+
+       talloc_free(data.dptr);
+
+       self = server_id_get(ctdb_db->ctdb, reqid);
+
+       /* look for our own entry */
+       for (idx = 0; idx < locks->num; idx++) {
+               if (server_id_equal(&locks->lock[idx].id, &self)) {
+                       break;
+               }
+       }
+
+       if (idx >= locks->num) {
+               DEBUG(DEBUG_ERR, ("g_lock: lock not found\n"));
+               goto done;
+       }
+
+       /* remove it by moving the last entry into its slot */
+       if (idx < locks->num-1) {
+               locks->lock[idx] = locks->lock[locks->num-1];
+       }
+       locks->num--;
+
+       data.dsize = locks->num * sizeof(struct g_lock_rec);
+       data.dptr = (uint8_t *)locks->lock;
+
+       if (ctdb_record_store(h, data) == 0) {
+               ok = true;
+       }
+
+done:
+       /* releases the record handle and the parsed list hanging off it */
+       talloc_free(h);
+       return ok;
+}
+
+
+/*
+ * State of one client-side transaction, as set up by
+ * ctdb_transaction_start().
+ */
+struct ctdb_transaction_handle {
+       /* database the transaction reads from and writes to */
+       struct ctdb_db_context *ctdb_db;
+       /* the attached "g_lock.tdb" database that holds the transaction lock */
+       struct ctdb_db_context *g_lock_db;
+       /* lock record name, "transaction_db_0x<db_id>" */
+       char *lock_name;
+       /* reqid allocated for this handle; used to build the lock owner id */
+       uint32_t reqid;
+       /*
+        * we store reads and writes done under a transaction:
+        * - one list stores both reads and writes (m_all)
+        * - the other just writes (m_write)
+        */
+       struct ctdb_marshall_buffer *m_all;
+       struct ctdb_marshall_buffer *m_write;
+};
+
+/*
+ * talloc destructor for a transaction handle: give back the g_lock and
+ * the reqid.  The unlock result is ignored; 0 is always returned.
+ */
+static int ctdb_transaction_destructor(struct ctdb_transaction_handle *h)
+{
+       struct ctdb_context *ctdb = h->ctdb_db->ctdb;
+
+       (void)g_lock_unlock(h, h->g_lock_db, h->lock_name, h->reqid);
+       ctdb_reqid_remove(ctdb, h->reqid);
+
+       return 0;
+}
+
+
+/**
+ * start a transaction on a database
+ *
+ * The handle is allocated on mem_ctx.  Starting a transaction attaches to
+ * g_lock.tdb, registers this process (pid) as a SERVER_TYPE_SAMBA server id
+ * and takes the g_lock record "transaction_db_0x<db_id>".  Once the lock is
+ * held a talloc destructor is installed, so freeing the handle releases the
+ * lock and the reqid.
+ *
+ * Returns the new handle, or NULL on failure.
+ */
+struct ctdb_transaction_handle *ctdb_transaction_start(struct ctdb_db_context *ctdb_db,
+                                                      TALLOC_CTX *mem_ctx)
+{
+       struct ctdb_transaction_handle *h;
+       struct ctdb_server_id id;
+
+       h = talloc_zero(mem_ctx, struct ctdb_transaction_handle);
+       if (h == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " memory allocation error\n"));
+               return NULL;
+       }
+
+       h->ctdb_db = ctdb_db;
+       h->lock_name = talloc_asprintf(h, "transaction_db_0x%08x",
+                                      (unsigned int)ctdb_db->db_id);
+       if (h->lock_name == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " talloc asprintf failed\n"));
+               talloc_free(h);
+               return NULL;
+       }
+
+       h->g_lock_db = ctdb_attach(h->ctdb_db->ctdb, timeval_current_ofs(3,0),
+                                  "g_lock.tdb", false, 0);
+       if (!h->g_lock_db) {
+               DEBUG(DEBUG_ERR, (__location__ " unable to attach to g_lock.tdb\n"));
+               talloc_free(h);
+               return NULL;
+       }
+
+       id.type = SERVER_TYPE_SAMBA;
+       id.pnn = ctdb_get_pnn(ctdb_db->ctdb);
+       id.server_id = getpid();
+
+       if (ctdb_ctrl_register_server_id(ctdb_db->ctdb, timeval_current_ofs(3,0),
+                                        &id) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " unable to register server id\n"));
+               talloc_free(h);
+               return NULL;
+       }
+
+       h->reqid = ctdb_reqid_new(h->ctdb_db->ctdb, h);
+
+       if (!g_lock_lock(h, h->g_lock_db, h->lock_name, h->reqid)) {
+               DEBUG(DEBUG_ERR, (__location__ " Error locking g_lock.tdb\n"));
+               /*
+                * The destructor is not installed yet, so the reqid has
+                * to be dropped by hand; otherwise it would stay
+                * registered against the handle we are about to free.
+                */
+               ctdb_reqid_remove(h->ctdb_db->ctdb, h->reqid);
+               talloc_free(h);
+               return NULL;
+       }
+
+       talloc_set_destructor(h, ctdb_transaction_destructor);
+       return h;
+}
+
+/**
+ * fetch a record inside a transaction
+ *
+ * A record that does not exist yet is reported as success with *data set
+ * to tdb_null.  Every successful read is appended to the handle's m_all
+ * list.  Returns 0 on success, the fetch error or -1 otherwise.
+ */
+int ctdb_transaction_fetch(struct ctdb_transaction_handle *h,
+                          TALLOC_CTX *mem_ctx,
+                          TDB_DATA key, TDB_DATA *data)
+{
+       struct ctdb_ltdb_header hdr;
+       int rc;
+
+       ZERO_STRUCT(hdr);
+
+       rc = ctdb_ltdb_fetch(h->ctdb_db, key, &hdr, mem_ctx, data);
+       if (rc == -1 && hdr.dmaster == (uint32_t)-1) {
+               /* record doesn't exist yet */
+               *data = tdb_null;
+       } else if (rc != 0) {
+               return rc;
+       }
+
+       h->m_all = ctdb_marshall_add(h, h->m_all, h->ctdb_db->db_id, 1, key, NULL, *data);
+       if (h->m_all != NULL) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " Failed to add to marshalling record\n"));
+       return -1;
+}
+
+/**
+ * stores a record inside a transaction
+ *
+ * The current copy of the record is fetched to obtain its header; a record
+ * that does not exist gets a zeroed header.  Storing data identical to an
+ * existing record (rsn != 0) is a no-op.  Otherwise dmaster is set to our
+ * pnn, the RSN is bumped and the record is appended to both m_all and
+ * m_write.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int ctdb_transaction_store(struct ctdb_transaction_handle *h,
+                          TDB_DATA key, TDB_DATA data)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(h);
+       struct ctdb_ltdb_header header;
+       TDB_DATA olddata;
+       int ret;
+
+       /*
+        * Start from a known state, as ctdb_transaction_fetch() does: the
+        * checks below read header.dmaster and olddata even when the fetch
+        * fails, and a failing fetch may not have filled them in.
+        */
+       ZERO_STRUCT(header);
+       ZERO_STRUCT(olddata);
+
+       /* we need the header so we can update the RSN */
+       ret = ctdb_ltdb_fetch(h->ctdb_db, key, &header, tmp_ctx, &olddata);
+       if (ret == -1 && header.dmaster == (uint32_t)-1) {
+               /* the record doesn't exist - create one with us as dmaster.
+                  This is only safe because we are in a transaction and this
+                  is a persistent database */
+               ZERO_STRUCT(header);
+       } else if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to fetch record\n"));
+               talloc_free(tmp_ctx);
+               return ret;
+       }
+
+       /* the dsize == 0 test keeps memcmp() away from NULL pointers */
+       if (data.dsize == olddata.dsize &&
+           (data.dsize == 0 ||
+            memcmp(data.dptr, olddata.dptr, data.dsize) == 0) &&
+           header.rsn != 0) {
+               /* save writing the same data */
+               talloc_free(tmp_ctx);
+               return 0;
+       }
+
+       header.dmaster = h->ctdb_db->ctdb->pnn;
+       header.rsn++;
+
+       h->m_all = ctdb_marshall_add(h, h->m_all, h->ctdb_db->db_id, 0, key, NULL, data);
+       if (h->m_all == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to add to marshalling record\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       h->m_write = ctdb_marshall_add(h, h->m_write, h->ctdb_db->db_id, 0, key, &header, data);
+       if (h->m_write == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to add to marshalling record\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+/*
+ * Read the database sequence number stored under CTDB_DB_SEQNUM_KEY.
+ *
+ * A failed fetch is treated as "no sequence number yet": *seqnum is set to
+ * 0 and 0 is returned.  -1 is returned only when a record was fetched but
+ * its size is not that of a uint64_t.
+ */
+static int ctdb_fetch_db_seqnum(struct ctdb_db_context *ctdb_db, uint64_t *seqnum)
+{
+       const char *keyname = CTDB_DB_SEQNUM_KEY;
+       TDB_DATA key, data;
+       struct ctdb_ltdb_header header;
+       int ret;
+
+       key.dptr = (uint8_t *)discard_const(keyname);
+       key.dsize = strlen(keyname) + 1;
+
+       ret = ctdb_ltdb_fetch(ctdb_db, key, &header, ctdb_db, &data);
+       if (ret != 0) {
+               *seqnum = 0;
+               return 0;
+       }
+
+       if (data.dsize != sizeof(*seqnum)) {
+               /* dsize is a size_t, so print it unsigned */
+               DEBUG(DEBUG_ERR, (__location__ " Invalid data received len=%zu\n",
+                                 data.dsize));
+               talloc_free(data.dptr);
+               return -1;
+       }
+
+       /* copy out of the byte buffer rather than casting the pointer */
+       memcpy(seqnum, data.dptr, sizeof(*seqnum));
+       talloc_free(data.dptr);
+
+       return 0;
+}
+
+
+/*
+ * Queue a write of seqnum under CTDB_DB_SEQNUM_KEY as part of the
+ * transaction h.  Returns whatever ctdb_transaction_store() returns.
+ */
+static int ctdb_store_db_seqnum(struct ctdb_transaction_handle *h,
+                               uint64_t seqnum)
+{
+       const char *keyname = CTDB_DB_SEQNUM_KEY;
+       TDB_DATA key = {
+               .dptr  = (uint8_t *)discard_const(keyname),
+               .dsize = strlen(keyname) + 1,
+       };
+       TDB_DATA value = {
+               .dptr  = (uint8_t *)&seqnum,
+               .dsize = sizeof(seqnum),
+       };
+
+       return ctdb_transaction_store(h, key, value);
+}
+
+
+/**
+ * commit a transaction
+ *
+ * The handle is always freed before returning, whether or not the commit
+ * succeeded.  Returns 0 on success (including the case where nothing was
+ * written), non-zero on failure.
+ */
+int ctdb_transaction_commit(struct ctdb_transaction_handle *h)
+{
+       int ret;
+       uint64_t old_seqnum, new_seqnum;
+       int32_t status;
+       struct timeval timeout;
+
+       if (h->m_write == NULL) {
+               /* no changes were made */
+               talloc_free(h);
+               return 0;
+       }
+
+       /*
+        * Remember the current sequence number so that a failed commit
+        * control can later be told apart from one that did go through.
+        */
+       ret = ctdb_fetch_db_seqnum(h->ctdb_db, &old_seqnum);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " failed to fetch db sequence number\n"));
+               ret = -1;
+               goto done;
+       }
+
+       /* the bumped sequence number is queued as one more write of this
+          transaction */
+       new_seqnum = old_seqnum + 1;
+       ret = ctdb_store_db_seqnum(h, new_seqnum);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " failed to store db sequence number\n"));
+               ret = -1;
+               goto done;
+       }
+
+       /*
+        * NOTE(review): the retry below has no attempt limit and no delay;
+        * it loops for as long as the control fails and the sequence number
+        * stays unchanged.
+        */
+again:
+       timeout = timeval_current_ofs(3,0);
+       ret = ctdb_control(h->ctdb_db->ctdb, CTDB_CURRENT_NODE,
+                          h->ctdb_db->db_id,
+                          CTDB_CONTROL_TRANS3_COMMIT, 0,
+                          ctdb_marshall_finish(h->m_write), NULL, NULL,
+                          &status, &timeout, NULL);
+       if (ret != 0 || status != 0) {
+               /*
+                * TRANS3_COMMIT control will only fail if recovery has been
+                * triggered.  Check if the database has been updated or not.
+                */
+               ret = ctdb_fetch_db_seqnum(h->ctdb_db, &new_seqnum);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, (__location__ " failed to fetch db sequence number\n"));
+                       goto done;
+               }
+
+               if (new_seqnum == old_seqnum) {
+                       /* Database not yet updated, try again */
+                       goto again;
+               }
+
+               /* any value other than old_seqnum + 1 is reported as an
+                  error */
+               if (new_seqnum != (old_seqnum + 1)) {
+                       DEBUG(DEBUG_ERR, (__location__ " new seqnum [%llu] != old seqnum [%llu] + 1\n",
+                                         (long long unsigned)new_seqnum,
+                                         (long long unsigned)old_seqnum));
+                       ret = -1;
+                       goto done;
+               }
+       }
+
+       ret = 0;
+
+done:
+       talloc_free(h);
+       return ret;
+}
+
+/**
+ * cancel a transaction
+ *
+ * Frees the handle without sending anything that was queued on it.
+ * Always returns 0.
+ */
+int ctdb_transaction_cancel(struct ctdb_transaction_handle *h)
+{
+       talloc_free(h);
+       return 0;
+}
+
+
+/*
+  recovery daemon ping to main daemon
+
+  Sends CTDB_CONTROL_RECD_PING to the current node with no timeout.
+  Returns 0 on success, -1 if the control or its status failed.
+ */
+int ctdb_ctrl_recd_ping(struct ctdb_context *ctdb)
+{
+       int32_t res;
+       int ret;
+
+       ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0, CTDB_CONTROL_RECD_PING,
+                          0, tdb_null, ctdb, NULL, &res, NULL, NULL);
+       if (ret == 0 && res == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,("Failed to send recd ping\n"));
+       return -1;
+}
+
+/* When forking the main daemon and the child process needs to connect
+ * back to the daemon as a client process, this function can be used
+ * to change the ctdb context from daemon into client mode.  The child
+ * process must be created using ctdb_fork() and not fork() -
+ * ctdb_fork() does some necessary housekeeping.
+ *
+ * fmt and the following arguments form a label that is stored, with a
+ * trailing ":", in debug_extra.  Returns 0 on success, -1 if the event
+ * context could not be created or the daemon socket could not be
+ * connected.
+ */
+int switch_from_server_to_client(struct ctdb_context *ctdb, const char *fmt, ...)
+{
+       int ret;
+       va_list ap;
+
+       /* Add extra information so we can identify this in the logs */
+       va_start(ap, fmt);
+       debug_extra = talloc_strdup_append(talloc_vasprintf(NULL, fmt, ap), ":");
+       va_end(ap);
+
+       /* get a new event context */
+       ctdb->ev = event_context_init(ctdb);
+       if (ctdb->ev == NULL) {
+               /* don't hand a NULL context to tevent_loop_allow_nesting() */
+               DEBUG(DEBUG_ALERT, (__location__ " Failed to create event context\n"));
+               return -1;
+       }
+       tevent_loop_allow_nesting(ctdb->ev);
+
+       /* Connect to main CTDB daemon */
+       ret = ctdb_socket_connect(ctdb);
+       if (ret != 0) {
+               DEBUG(DEBUG_ALERT, (__location__ " Failed to init ctdb client\n"));
+               return -1;
+       }
+
+       ctdb->can_send_controls = true;
+
+       return 0;
+}
+
+/*
+  get the status of running the monitor eventscripts: NULL means never run.
+
+  On success *scripts is either NULL (empty reply) or a copy of the reply
+  allocated on mem_ctx.  Returns 0 on success, -1 if the control failed or
+  the reply could not be copied.
+ */
+int ctdb_ctrl_getscriptstatus(struct ctdb_context *ctdb, 
+               struct timeval timeout, uint32_t destnode, 
+               TALLOC_CTX *mem_ctx, enum ctdb_eventscript_call type,
+               struct ctdb_scripts_wire **scripts)
+{
+       int ret;
+       TDB_DATA outdata, indata;
+       int32_t res;
+       uint32_t uinttype = type;
+
+       indata.dptr = (uint8_t *)&uinttype;
+       indata.dsize = sizeof(uinttype);
+
+       ret = ctdb_control(ctdb, destnode, 0, 
+                          CTDB_CONTROL_GET_EVENT_SCRIPT_STATUS, 0, indata,
+                          mem_ctx, &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getscriptstatus failed ret:%d res:%d\n", ret, res));
+               return -1;
+       }
+
+       if (outdata.dsize == 0) {
+               *scripts = NULL;
+               return 0;
+       }
+
+       *scripts = (struct ctdb_scripts_wire *)talloc_memdup(mem_ctx, outdata.dptr, outdata.dsize);
+       talloc_free(outdata.dptr);
+       if (*scripts == NULL) {
+               /* a NULL result with return 0 would read as "never run" */
+               DEBUG(DEBUG_ERR,(__location__ " out of memory copying script status\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  tell the main daemon how long it took to lock the reclock file
+
+  The latency is sent to the current node as a raw double.  The timeout
+  argument is accepted but not forwarded: the control is issued with a
+  NULL timeout.  Returns 0 on success, -1 on failure.
+ */
+int ctdb_ctrl_report_recd_lock_latency(struct ctdb_context *ctdb, struct timeval timeout, double latency)
+{
+       TDB_DATA payload = {
+               .dptr  = (uint8_t *)&latency,
+               .dsize = sizeof(latency),
+       };
+       int32_t res;
+       int ret;
+
+       ret = ctdb_control(ctdb, CTDB_CURRENT_NODE, 0,
+                          CTDB_CONTROL_RECD_RECLOCK_LATENCY, 0, payload,
+                          ctdb, NULL, &res, NULL, NULL);
+       if (ret == 0 && res == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,("Failed to send recd reclock latency\n"));
+       return -1;
+}
+
+/*
+  get the name of the reclock file
+
+  On success *name is a talloc_strdup() copy on mem_ctx, or NULL when the
+  reply carried no data.  Returns 0 on success, -1 if the control failed.
+ */
+int ctdb_ctrl_getreclock(struct ctdb_context *ctdb, struct timeval timeout,
+                        uint32_t destnode, TALLOC_CTX *mem_ctx,
+                        const char **name)
+{
+       TDB_DATA reply;
+       int32_t res;
+       int ret;
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_GET_RECLOCK_FILE, 0, tdb_null,
+                          mem_ctx, &reply, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               return -1;
+       }
+
+       *name = (reply.dsize == 0)
+               ? NULL
+               : talloc_strdup(mem_ctx, discard_const(reply.dptr));
+       talloc_free(reply.dptr);
+
+       return 0;
+}
+
+/*
+  set the reclock filename for a node
+
+  A NULL reclock is sent as an empty payload; otherwise the string is sent
+  including its terminating NUL.  Returns 0 on success, -1 on failure.
+ */
+int ctdb_ctrl_setreclock(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, const char *reclock)
+{
+       TDB_DATA payload;
+       int32_t res;
+       int ret;
+
+       payload.dptr  = (reclock == NULL) ? NULL : discard_const(reclock);
+       payload.dsize = (reclock == NULL) ? 0 : strlen(reclock) + 1;
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_SET_RECLOCK_FILE, 0, payload,
+                          NULL, NULL, &res, &timeout, NULL);
+       if (ret == 0 && res == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for setreclock failed\n"));
+       return -1;
+}
+
+/*
+  stop a node
+
+  Sends CTDB_CONTROL_STOP_NODE to destnode.  Returns 0 on success, -1 if
+  the control or its status failed.
+ */
+int ctdb_ctrl_stop_node(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+{
+       int32_t res;
+       int ret;
+
+       ret = ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_STOP_NODE, 0,
+                          tdb_null, ctdb, NULL, &res, &timeout, NULL);
+       if (ret == 0 && res == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,("Failed to stop node\n"));
+       return -1;
+}
+
+/*
+  continue a node
+
+  Sends CTDB_CONTROL_CONTINUE_NODE to destnode.  Only the result of
+  ctdb_control() is checked; no status is requested from the remote side.
+  Returns 0 on success, -1 on failure.
+ */
+int ctdb_ctrl_continue_node(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+{
+       if (ctdb_control(ctdb, destnode, 0, CTDB_CONTROL_CONTINUE_NODE, 0,
+                        tdb_null, ctdb, NULL, NULL, &timeout, NULL) != 0) {
+               DEBUG(DEBUG_ERR,("Failed to continue node\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  set the natgw state for a node
+
+  natgwstate is sent as a raw uint32_t.  Returns 0 on success, -1 on
+  failure.
+ */
+int ctdb_ctrl_setnatgwstate(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t natgwstate)
+{
+       TDB_DATA payload = {
+               .dptr  = (uint8_t *)&natgwstate,
+               .dsize = sizeof(natgwstate),
+       };
+       int32_t res;
+       int ret;
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_SET_NATGWSTATE, 0, payload,
+                          NULL, NULL, &res, &timeout, NULL);
+       if (ret == 0 && res == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for setnatgwstate failed\n"));
+       return -1;
+}
+
+/*
+  set the lmaster role for a node
+
+  lmasterrole is sent as a raw uint32_t.  Returns 0 on success, -1 on
+  failure.
+ */
+int ctdb_ctrl_setlmasterrole(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t lmasterrole)
+{
+       TDB_DATA payload = {
+               .dptr  = (uint8_t *)&lmasterrole,
+               .dsize = sizeof(lmasterrole),
+       };
+       int32_t res;
+       int ret;
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_SET_LMASTERROLE, 0, payload,
+                          NULL, NULL, &res, &timeout, NULL);
+       if (ret == 0 && res == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for setlmasterrole failed\n"));
+       return -1;
+}
+
+/*
+  set the recmaster role for a node
+
+  recmasterrole is sent as a raw uint32_t.  Returns 0 on success, -1 on
+  failure.
+ */
+int ctdb_ctrl_setrecmasterrole(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t recmasterrole)
+{
+       TDB_DATA payload = {
+               .dptr  = (uint8_t *)&recmasterrole,
+               .dsize = sizeof(recmasterrole),
+       };
+       int32_t res;
+       int ret;
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_SET_RECMASTERROLE, 0, payload,
+                          NULL, NULL, &res, &timeout, NULL);
+       if (ret == 0 && res == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for setrecmasterrole failed\n"));
+       return -1;
+}
+
+/* enable an eventscript
+ *
+ * The script name is sent including its terminating NUL.  Returns 0 on
+ * success, -1 on failure.
+ */
+int ctdb_ctrl_enablescript(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, const char *script)
+{
+       TDB_DATA payload = {
+               .dptr  = discard_const(script),
+               .dsize = strlen(script) + 1,
+       };
+       int32_t res;
+       int ret;
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_ENABLE_SCRIPT, 0, payload,
+                          NULL, NULL, &res, &timeout, NULL);
+       if (ret == 0 && res == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for enablescript failed\n"));
+       return -1;
+}
+
+/* disable an eventscript
+ *
+ * The script name is sent including its terminating NUL.  Returns 0 on
+ * success, -1 on failure.
+ */
+int ctdb_ctrl_disablescript(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, const char *script)
+{
+       TDB_DATA payload = {
+               .dptr  = discard_const(script),
+               .dsize = strlen(script) + 1,
+       };
+       int32_t res;
+       int ret;
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_DISABLE_SCRIPT, 0, payload,
+                          NULL, NULL, &res, &timeout, NULL);
+       if (ret == 0 && res == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for disablescript failed\n"));
+       return -1;
+}
+
+
+/*
+  send a ban state to a node
+
+  *bantime is sent as a raw struct via CTDB_CONTROL_SET_BAN_STATE.
+  Returns 0 on success, -1 on failure.
+ */
+int ctdb_ctrl_set_ban(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, struct ctdb_ban_time *bantime)
+{
+       TDB_DATA payload = {
+               .dptr  = (uint8_t *)bantime,
+               .dsize = sizeof(*bantime),
+       };
+       int32_t res;
+       int ret;
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_SET_BAN_STATE, 0, payload,
+                          NULL, NULL, &res, &timeout, NULL);
+       if (ret == 0 && res == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set ban state failed\n"));
+       return -1;
+}
+
+
+/*
+  fetch the ban state of a node
+
+  On success *bantime points at the reply buffer, reparented onto mem_ctx.
+  Returns 0 on success, -1 on failure.
+ */
+int ctdb_ctrl_get_ban(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, TALLOC_CTX *mem_ctx, struct ctdb_ban_time **bantime)
+{
+       int ret;
+       TDB_DATA outdata;
+       int32_t res;
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+
+       ret = ctdb_control(ctdb, destnode, 0, 
+                          CTDB_CONTROL_GET_BAN_STATE, 0, tdb_null,
+                          tmp_ctx, &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               /* this is the "get" path; the message used to say "set" */
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get ban state failed\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* move the reply out of tmp_ctx before tmp_ctx is freed */
+       *bantime = (struct ctdb_ban_time *)talloc_steal(mem_ctx, outdata.dptr);
+       talloc_free(tmp_ctx);
+
+       return 0;
+}
+
+
+/*
+  set the priority of a database on a node
+
+  *db_prio is sent as a raw struct via CTDB_CONTROL_SET_DB_PRIORITY.
+  Returns 0 on success, -1 on failure.
+ */
+int ctdb_ctrl_set_db_priority(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, struct ctdb_db_priority *db_prio)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       TDB_DATA payload;
+       int32_t res;
+       int ret;
+       int rc = -1;
+
+       payload.dptr = (uint8_t*)db_prio;
+       payload.dsize = sizeof(*db_prio);
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_SET_DB_PRIORITY, 0, payload,
+                          tmp_ctx, NULL, &res, &timeout, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set_db_priority failed\n"));
+       } else {
+               rc = 0;
+       }
+
+       talloc_free(tmp_ctx);
+       return rc;
+}
+
+/*
+  get the priority of a database on a node
+
+  The priority is carried in the control's status value; a negative status
+  is treated as failure.  *priority is written only when priority is
+  non-NULL.  Returns 0 on success, -1 on failure.
+ */
+int ctdb_ctrl_get_db_priority(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t db_id, uint32_t *priority)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       TDB_DATA payload;
+       int32_t res;
+       int ret;
+       int rc = -1;
+
+       payload.dptr = (uint8_t*)&db_id;
+       payload.dsize = sizeof(db_id);
+
+       ret = ctdb_control(ctdb, destnode, 0,
+                          CTDB_CONTROL_GET_DB_PRIORITY, 0, payload,
+                          tmp_ctx, NULL, &res, &timeout, NULL);
+       if (ret != 0 || res < 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for get_db_priority failed\n"));
+       } else {
+               if (priority) {
+                       *priority = res;
+               }
+               rc = 0;
+       }
+
+       talloc_free(tmp_ctx);
+       return rc;
+}
+
+/*
+  fetch the statistics history of a node
+
+  On success *stats is a copy of the reply allocated on mem_ctx.  An empty
+  reply is treated as failure.  Returns 0 on success, -1 on failure.
+ */
+int ctdb_ctrl_getstathistory(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, TALLOC_CTX *mem_ctx, struct ctdb_statistics_wire **stats)
+{
+       int ret;
+       TDB_DATA outdata;
+       int32_t res;
+
+       ret = ctdb_control(ctdb, destnode, 0, 
+                          CTDB_CONTROL_GET_STAT_HISTORY, 0, tdb_null, 
+                          mem_ctx, &outdata, &res, &timeout, NULL);
+       if (ret != 0 || res != 0 || outdata.dsize == 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for getstathistory failed ret:%d res:%d\n", ret, res));
+               return -1;
+       }
+
+       *stats = (struct ctdb_statistics_wire *)talloc_memdup(mem_ctx, outdata.dptr, outdata.dsize);
+       talloc_free(outdata.dptr);
+       if (*stats == NULL) {
+               /* don't report success with nothing for the caller to read */
+               DEBUG(DEBUG_ERR,(__location__ " out of memory copying stat history\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  return a pointer to the ltdb header embedded in a record handle, or NULL
+  when h is NULL
+ */
+struct ctdb_ltdb_header *ctdb_header_from_record_handle(struct ctdb_record_handle *h)
+{
+       return (h != NULL) ? &h->header : NULL;
+}
+
+
+/*
+  send an UPDATE_RECORD control carrying a single marshalled record
+
+  Returns the control state produced by ctdb_control_send(), or NULL if the
+  marshall buffer could not be built.
+ */
+struct ctdb_client_control_state *
+ctdb_ctrl_updaterecord_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data)
+{
+       struct ctdb_client_control_state *handle;
+       struct ctdb_marshall_buffer *m, *bigger;
+       struct ctdb_rec_data *rec;
+       TDB_DATA outdata;
+
+       m = talloc_zero(mem_ctx, struct ctdb_marshall_buffer);
+       if (m == NULL) {
+               DEBUG(DEBUG_ERR, ("Failed to allocate marshall buffer for update record\n"));
+               return NULL;
+       }
+
+       m->db_id = ctdb_db->db_id;
+
+       rec = ctdb_marshall_record(m, 0, key, header, data);
+       if (rec == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to marshall record for update record\n"));
+               talloc_free(m);
+               return NULL;
+       }
+
+       /*
+        * Grow through a temporary: a failed realloc leaves the original
+        * buffer allocated, so m must still be valid to be freed.
+        */
+       bigger = talloc_realloc_size(mem_ctx, m, rec->length + offsetof(struct ctdb_marshall_buffer, data));
+       if (bigger == NULL) {
+               DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata\n"));
+               talloc_free(m);
+               return NULL;
+       }
+       m = bigger;
+       m->count++;
+       memcpy((uint8_t *)m + offsetof(struct ctdb_marshall_buffer, data), rec, rec->length);
+
+
+       outdata.dptr = (uint8_t *)m;
+       outdata.dsize = talloc_get_size(m);
+
+       handle = ctdb_control_send(ctdb, destnode, 0, 
+                          CTDB_CONTROL_UPDATE_RECORD, 0, outdata,
+                          mem_ctx, &timeout, NULL);
+       talloc_free(m);
+       return handle;
+}
+
+/*
+  collect the result of ctdb_ctrl_updaterecord_send()
+
+  Returns 0 on success, -1 if the control or its status failed.
+ */
+int ctdb_ctrl_updaterecord_recv(struct ctdb_context *ctdb, struct ctdb_client_control_state *state)
+{
+       int32_t res;
+       int ret;
+
+       ret = ctdb_control_recv(ctdb, state, state, NULL, &res, NULL);
+       if (ret == 0 && res == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_update_record_recv failed\n"));
+       return -1;
+}
+
+/*
+  synchronous update of a record: send followed by recv
+
+  Returns the result of ctdb_ctrl_updaterecord_recv().
+  NOTE(review): a NULL state from a failed send is passed straight to the
+  recv function - confirm that ctdb_control_recv() tolerates a NULL state.
+ */
+int
+ctdb_ctrl_updaterecord(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data)
+{
+       struct ctdb_client_control_state *state;
+
+       state = ctdb_ctrl_updaterecord_send(ctdb, mem_ctx, timeout, destnode, ctdb_db, key, header, data);
+       return ctdb_ctrl_updaterecord_recv(ctdb, state);
+}
+
+
+
+
+
+
+/*
+  set a database to be readonly
+
+  Sends CTDB_CONTROL_SET_DB_READONLY for dbid to destnode with no timeout
+  and returns the control state from ctdb_control_send().
+ */
+struct ctdb_client_control_state *
+ctdb_ctrl_set_db_readonly_send(struct ctdb_context *ctdb, uint32_t destnode, uint32_t dbid)
+{
+       TDB_DATA payload = {
+               .dptr  = (uint8_t *)&dbid,
+               .dsize = sizeof(dbid),
+       };
+
+       return ctdb_control_send(ctdb, destnode, 0,
+                                CTDB_CONTROL_SET_DB_READONLY, 0, payload,
+                                ctdb, NULL, NULL);
+}
+
+/*
+  collect the result of ctdb_ctrl_set_db_readonly_send()
+
+  Returns 0 on success, -1 if the control or its status failed.
+ */
+int ctdb_ctrl_set_db_readonly_recv(struct ctdb_context *ctdb, struct ctdb_client_control_state *state)
+{
+       int32_t res;
+       int ret;
+
+       ret = ctdb_control_recv(ctdb, state, ctdb, NULL, &res, NULL);
+       if (ret == 0 && res == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_set_db_readonly_recv failed  ret:%d res:%d\n", ret, res));
+       return -1;
+}
+
+/*
+  synchronous variant: send the SET_DB_READONLY control and wait for the
+  reply.  Returns the result of ctdb_ctrl_set_db_readonly_recv().
+ */
+int ctdb_ctrl_set_db_readonly(struct ctdb_context *ctdb, uint32_t destnode, uint32_t dbid)
+{
+       struct ctdb_client_control_state *state;
+
+       state = ctdb_ctrl_set_db_readonly_send(ctdb, destnode, dbid);
+       return ctdb_ctrl_set_db_readonly_recv(ctdb, state);
+}
+
+/*
+  set a database to be sticky
+
+  Sends CTDB_CONTROL_SET_DB_STICKY for dbid to destnode with no timeout
+  and returns the control state from ctdb_control_send().
+ */
+struct ctdb_client_control_state *
+ctdb_ctrl_set_db_sticky_send(struct ctdb_context *ctdb, uint32_t destnode, uint32_t dbid)
+{
+       TDB_DATA payload = {
+               .dptr  = (uint8_t *)&dbid,
+               .dsize = sizeof(dbid),
+       };
+
+       return ctdb_control_send(ctdb, destnode, 0,
+                                CTDB_CONTROL_SET_DB_STICKY, 0, payload,
+                                ctdb, NULL, NULL);
+}
+
+/*
+  collect the result of ctdb_ctrl_set_db_sticky_send()
+
+  Returns 0 on success, -1 if the control or its status failed.
+ */
+int ctdb_ctrl_set_db_sticky_recv(struct ctdb_context *ctdb, struct ctdb_client_control_state *state)
+{
+       int32_t res;
+       int ret;
+
+       ret = ctdb_control_recv(ctdb, state, ctdb, NULL, &res, NULL);
+       if (ret == 0 && res == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " ctdb_ctrl_set_db_sticky_recv failed  ret:%d res:%d\n", ret, res));
+       return -1;
+}
+
+/*
+  synchronous variant: send the SET_DB_STICKY control and wait for the
+  reply.  Returns the result of ctdb_ctrl_set_db_sticky_recv().
+ */
+int ctdb_ctrl_set_db_sticky(struct ctdb_context *ctdb, uint32_t destnode, uint32_t dbid)
+{
+       struct ctdb_client_control_state *state;
+
+       state = ctdb_ctrl_set_db_sticky_send(ctdb, destnode, dbid);
+       return ctdb_ctrl_set_db_sticky_recv(ctdb, state);
+}
diff --git a/ctdb/common/cmdline.c b/ctdb/common/cmdline.c
new file mode 100644 (file)
index 0000000..cd59d84
--- /dev/null
@@ -0,0 +1,170 @@
+/* 
+   common commandline code to ctdb test tools
+
+   Copyright (C) Andrew Tridgell  2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "popt.h"
+#include "../include/ctdb_client.h"
+#include "../include/ctdb_private.h"
+#include "../common/rb_tree.h"
+#include <ctype.h>
+
+/* Handle common command line options for ctdb test progs
+ */
+
+/* values collected from the common command line options below */
+static struct {
+       /* --socket: local socket name; NULL unless given */
+       const char *socketname;
+       /* --debug / -d: level as a name or a number; defaults to "ERR" */
+       const char *debuglevel;
+       /* --torture: non-zero enables CTDB_FLAG_TORTURE in ctdb_cmdline_init() */
+       int torture;
+       /* NOTE(review): no entry in popt_ctdb_cmdline stores into this field */
+       const char *events;
+} ctdb_cmdline = {
+       .torture = 0,
+       .debuglevel = "ERR",
+};
+
+/* popt option values dispatched on in ctdb_cmdline_callback() */
+enum {OPT_EVENTSYSTEM=1};
+
+/*
+  popt callback for the common options: --events selects the default
+  event backend; every other option value is ignored here
+ */
+static void ctdb_cmdline_callback(poptContext con, 
+                                 enum poptCallbackReason reason,
+                                 const struct poptOption *opt,
+                                 const char *arg, const void *data)
+{
+       if (opt->val == OPT_EVENTSYSTEM) {
+               event_set_default_backend(arg);
+       }
+}
+
+
+/*
+  common option table for ctdb test programs; results land in ctdb_cmdline
+ */
+struct poptOption popt_ctdb_cmdline[] = {
+       /* route options with a non-zero val through ctdb_cmdline_callback */
+       { NULL, 0, POPT_ARG_CALLBACK, (void *)ctdb_cmdline_callback },  
+       { "socket", 0, POPT_ARG_STRING, &ctdb_cmdline.socketname, 0, "local socket name", "filename" },
+       { "debug", 'd', POPT_ARG_STRING, &ctdb_cmdline.debuglevel, 0, "debug level"},
+       { "torture", 0, POPT_ARG_NONE, &ctdb_cmdline.torture, 0, "enable nastiness in library", NULL },
+       /* no storage: handled in the callback via OPT_EVENTSYSTEM */
+       { "events", 0, POPT_ARG_STRING, NULL, OPT_EVENTSYSTEM, "event system", NULL },
+       { NULL }
+};
+
+
+/*
+  startup daemon side of ctdb according to command line options
+
+  Exits the process if ctdb cannot be initialised or the socket name cannot
+  be set.  A socket name given on the command line is also exported as
+  CTDB_SOCKET.  Returns the new context.
+ */
+struct ctdb_context *ctdb_cmdline_init(struct event_context *ev)
+{
+       struct ctdb_context *ctdb;
+       int ret;
+
+       /* initialise ctdb */
+       ctdb = ctdb_init(ev);
+       if (ctdb == NULL) {
+               printf("Failed to init ctdb\n");
+               exit(1);
+       }
+
+       if (ctdb_cmdline.torture) {
+               ctdb_set_flags(ctdb, CTDB_FLAG_TORTURE);
+       }
+
+       /* command line specified a socket name */
+       if (ctdb_cmdline.socketname != NULL) {
+               setenv("CTDB_SOCKET", ctdb_cmdline.socketname, 1);
+               ret = ctdb_set_socketname(ctdb, ctdb_cmdline.socketname);
+               if (ret == -1) {
+                       printf("ctdb_set_socketname failed - %s\n",
+                                                   ctdb_errstr(ctdb));
+                       exit(1);
+               }
+       }
+
+       /* Set the debug level.  isalpha() must be given an unsigned char
+          value; a plain char may be negative. */
+       if (isalpha((unsigned char)ctdb_cmdline.debuglevel[0]) || ctdb_cmdline.debuglevel[0] == '-') { 
+               LogLevel = get_debug_by_desc(ctdb_cmdline.debuglevel);
+       } else {
+               LogLevel = strtol(ctdb_cmdline.debuglevel, NULL, 0);
+       }
+
+       /* set up the tree to store server ids */
+       ctdb->server_ids = trbt_create(ctdb, 0);
+
+       return ctdb;
+}
+
+
+/*
+  startup a client only ctdb context
+
+  The socket name is taken from the CTDB_SOCKET environment variable and
+  then, if given, from the command line, which therefore wins.  Exits the
+  process if ctdb cannot be initialised or a socket name cannot be set.
+  Returns the connected context, or NULL if connecting to the daemon or
+  fetching our pnn fails.
+ */
+struct ctdb_context *ctdb_cmdline_client(struct tevent_context *ev,
+                                        struct timeval req_timeout)
+{
+       struct ctdb_context *ctdb;
+       char *socket_name;
+       int ret;
+
+       /* initialise ctdb */
+       ctdb = ctdb_init(ev);
+       if (ctdb == NULL) {
+               fprintf(stderr, "Failed to init ctdb\n");
+               exit(1);
+       }
+
+       /* tell ctdb the socket address */
+       socket_name = getenv("CTDB_SOCKET");
+       if (socket_name != NULL) {
+               ret = ctdb_set_socketname(ctdb, socket_name);
+               if (ret == -1) {
+                       /* errors go to stderr, like the other paths here */
+                       fprintf(stderr, "ctdb_set_socketname failed - %s\n",
+                                       ctdb_errstr(ctdb));
+                       exit(1);
+               }
+       }
+
+       if (ctdb_cmdline.socketname != NULL) {
+               ret = ctdb_set_socketname(ctdb, ctdb_cmdline.socketname);
+               if (ret == -1) {
+                       fprintf(stderr, "ctdb_set_socketname failed - %s\n",
+                                       ctdb_errstr(ctdb));
+                       exit(1);
+               }
+       }
+
+       /* Set the debug level.  isalpha() must be given an unsigned char
+          value; a plain char may be negative. */
+       if (isalpha((unsigned char)ctdb_cmdline.debuglevel[0]) || ctdb_cmdline.debuglevel[0] == '-') { 
+               LogLevel = get_debug_by_desc(ctdb_cmdline.debuglevel);
+       } else {
+               LogLevel = strtol(ctdb_cmdline.debuglevel, NULL, 0);
+       }
+
+       ret = ctdb_socket_connect(ctdb);
+       if (ret != 0) {
+               fprintf(stderr, __location__ " Failed to connect to daemon\n");
+               talloc_free(ctdb);
+               return NULL;
+       }
+
+       /* get our pnn */
+       ctdb->pnn = ctdb_ctrl_getpnn(ctdb, req_timeout, CTDB_CURRENT_NODE);
+       if (ctdb->pnn == (uint32_t)-1) {
+               DEBUG(DEBUG_CRIT,(__location__ " Failed to get ctdb pnn\n"));
+               talloc_free(ctdb);
+               return NULL;
+       }
+
+       return ctdb;
+}
diff --git a/ctdb/common/ctdb_fork.c b/ctdb/common/ctdb_fork.c
new file mode 100644 (file)
index 0000000..d372ae0
--- /dev/null
@@ -0,0 +1,178 @@
+/* 
+   functions to track and manage processes
+
+   Copyright (C) Ronnie Sahlberg 2012
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/wait.h"
+#include "../include/ctdb_client.h"
+#include "../include/ctdb_private.h"
+#include "../common/rb_tree.h"
+
+static bool is_child = false;
+
+/*
+ * Flag the current process as a ctdb child and, optionally, install a
+ * new debug_extra prefix of the form "<name>:".
+ *
+ * mem_ctx         talloc context the prefix string is allocated on
+ * child_name_fmt  printf-style format (plus arguments) for the child
+ *                 name, or NULL to leave debug_extra untouched
+ *
+ * If any allocation fails the previous debug_extra value is kept.
+ */
+void ctdb_set_child_info(TALLOC_CTX *mem_ctx, const char *child_name_fmt, ...)
+{
+       is_child = true;
+       if (child_name_fmt != NULL) {
+               va_list ap;
+               char *t;
+
+               va_start(ap, child_name_fmt);
+               t = talloc_vasprintf(mem_ctx, child_name_fmt, ap);
+               va_end(ap);
+
+               /* Passing NULL for "%s" is undefined behaviour, so only
+                * build the prefix when the name was actually formatted.
+                */
+               if (t != NULL) {
+                       char *prefix = talloc_asprintf(mem_ctx, "%s:", t);
+
+                       /* never replace a usable prefix with NULL */
+                       if (prefix != NULL) {
+                               debug_extra = prefix;
+                       }
+                       talloc_free(t);
+               }
+       }
+}
+
+/* Returns true once ctdb_set_child_info() has been called in this process. */
+bool ctdb_is_child_process(void)
+{
+       return is_child;
+}
+
+/*
+ * This function forks a child process and drops the realtime
+ * scheduler for the child process.
+ *
+ * Return value: -1 if fork() failed, 0 in the child, and the child's pid
+ * in the parent.
+ *
+ * In the child the event context is freed, the daemon socket is closed,
+ * the transport's shutdown method is invoked, the scheduler is restored
+ * (only when ctdb->do_setsched is set) and can_send_controls is cleared.
+ * The log ringbuffer is left alone; ctdb_fork() is the variant that
+ * releases it.
+ *
+ * In the parent the new pid is recorded in ctdb->child_processes, but
+ * only when the caller is the process whose pid is stored in
+ * ctdb->ctdbd_pid.
+ */
+pid_t ctdb_fork_no_free_ringbuffer(struct ctdb_context *ctdb)
+{
+       pid_t pid;
+       char *process;
+
+       pid = fork();
+       if (pid == -1) {
+               return -1;
+       }
+       if (pid == 0) {
+               ctdb_set_child_info(ctdb, NULL); /* NULL format: only marks us as a child */
+
+               /* Close the Unix Domain socket and the TCP socket.
+                * This ensures that none of the child processes will
+                * look like the main daemon when it is not running.
+                * tevent needs to be stopped before closing sockets.
+                */
+               if (ctdb->ev != NULL) {
+                       talloc_free(ctdb->ev);
+                       ctdb->ev = NULL;
+               }
+               if (ctdb->daemon.sd != -1) {
+                       close(ctdb->daemon.sd);
+                       ctdb->daemon.sd = -1;
+               }
+               if (ctdb->methods != NULL) {
+                       ctdb->methods->shutdown(ctdb);
+               }
+
+               /* The child does not need to be realtime */
+               if (ctdb->do_setsched) {
+                       ctdb_restore_scheduler(ctdb);
+               }
+               ctdb->can_send_controls = false;
+
+               return 0;
+       }
+
+       if (getpid() != ctdb->ctdbd_pid) { /* not the recorded daemon pid: do not track */
+               return pid;
+       }
+
+       process = talloc_asprintf(ctdb->child_processes, "process:%d", (int)pid);
+       trbt_insert32(ctdb->child_processes, pid, process); /* key: pid, value: descriptive string */
+
+       return pid;
+}
+
+/*
+ * Fork through ctdb_fork_no_free_ringbuffer() and, in the child only,
+ * release the log ringbuffer.  The return value is passed through
+ * unchanged: -1 on failure, 0 in the child, the child pid in the parent.
+ */
+pid_t ctdb_fork(struct ctdb_context *ctdb)
+{
+       pid_t child = ctdb_fork_no_free_ringbuffer(ctdb);
+
+       if (child != 0) {
+               /* parent, or fork failure */
+               return child;
+       }
+
+       ctdb_log_ringbuffer_free();
+       return 0;
+}
+
+
+/*
+ * tevent SIGCHLD handler: reap every child that has already exited.
+ *
+ * private_data is the struct ctdb_context.  For each reaped pid that was
+ * registered in ctdb->child_processes the tracking string is freed; pids
+ * that were never registered are reported at DEBUG_ERR.  Bookkeeping is
+ * skipped entirely when the caller is not the process recorded in
+ * ctdb->ctdbd_pid.
+ *
+ * NOTE(review): freeing the tracking string is presumably what removes
+ * the entry from the tree - confirm against rb_tree.c.
+ */
+static void ctdb_sigchld_handler(struct tevent_context *ev,
+       struct tevent_signal *te, int signum, int count,
+       void *dont_care,
+       void *private_data)
+{
+       struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+       int status;
+       pid_t pid = -1;
+
+       while (pid != 0) {
+               pid = waitpid(-1, &status, WNOHANG);
+               if (pid == -1) {
+                       /* ECHILD only means there is nothing left to
+                        * reap, which is the normal outcome once the
+                        * last child has been collected - not an error.
+                        */
+                       if (errno == ECHILD) {
+                               return;
+                       }
+                       DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%d\n", errno));
+                       return;
+               }
+               if (pid > 0) {
+                       char *process;
+
+                       if (getpid() != ctdb->ctdbd_pid) {
+                               continue;
+                       }
+
+                       process = trbt_lookup32(ctdb->child_processes, pid);
+                       if (process == NULL) {
+                               DEBUG(DEBUG_ERR,("Got SIGCHLD from pid:%d we didn not spawn with ctdb_fork\n", (int)pid));
+                               /* nothing to free, and NULL must not be
+                                * handed to "%s" below
+                                */
+                               continue;
+                       }
+
+                       DEBUG(DEBUG_DEBUG, ("SIGCHLD from %d %s\n", (int)pid, process));
+                       talloc_free(process);
+               }
+       }
+}
+
+
+/*
+ * Create the tree used to track forked children and register
+ * ctdb_sigchld_handler() for SIGCHLD on ctdb->ev.  Returns the value of
+ * tevent_add_signal().
+ */
+struct tevent_signal *
+ctdb_init_sigchld(struct ctdb_context *ctdb)
+{
+       ctdb->child_processes = trbt_create(ctdb, 0);
+
+       return tevent_add_signal(ctdb->ev, ctdb, SIGCHLD, 0,
+                                ctdb_sigchld_handler, ctdb);
+}
+
+/*
+ * kill() wrapper.  When called with a non-zero signal from the process
+ * recorded in ctdb->ctdbd_pid, the target must be present in
+ * ctdb->child_processes; otherwise the request is logged and 0 is
+ * returned without signalling.  In every other case the result of
+ * kill(pid, signum) is returned.
+ */
+int
+ctdb_kill(struct ctdb_context *ctdb, pid_t pid, int signum)
+{
+       bool need_tracked_child;
+
+       /* signal 0 is only a probe, and other processes keep no child list */
+       need_tracked_child = (signum != 0) && (getpid() == ctdb->ctdbd_pid);
+
+       if (need_tracked_child &&
+           trbt_lookup32(ctdb->child_processes, pid) == NULL) {
+               DEBUG(DEBUG_ERR,("ctdb_kill: trying to kill(%d, %d) a process that does not exist\n", pid, signum));
+               return 0;
+       }
+
+       return kill(pid, signum);
+}
diff --git a/ctdb/common/ctdb_io.c b/ctdb/common/ctdb_io.c
new file mode 100644 (file)
index 0000000..351006d
--- /dev/null
@@ -0,0 +1,442 @@
+/* 
+   ctdb database library
+   Utility functions to read/write blobs of data from a file descriptor
+   and handle the case where we might need multiple read/writes to get all the
+   data.
+
+   Copyright (C) Andrew Tridgell  2006
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "tdb.h"
+#include "lib/util/dlinklist.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "../include/ctdb_private.h"
+#include "../include/ctdb_client.h"
+#include <stdarg.h>
+
+#define QUEUE_BUFFER_SIZE      (16*1024)
+
+/* structures for packet queueing, private to this file.
+ * ctdb_buffer is the accumulation buffer for incoming bytes. */
+struct ctdb_buffer {
+       uint8_t *data;    /* talloc'ed storage, NULL until the first read */
+       uint32_t length;  /* bytes currently held in data */
+       uint32_t size;    /* allocated size of data */
+       uint32_t extend;  /* if non-zero, size to realloc data to before the next read */
+};
+
+struct ctdb_queue_pkt {
+       struct ctdb_queue_pkt *next, *prev; /* links for the DLIST_* macros */
+       uint8_t *data;         /* next byte to write; advanced after a partial write */
+       uint32_t length;       /* bytes still to be written */
+       uint32_t full_length;  /* length of the whole packet as handed to ctdb_queue_send() */
+};
+
+struct ctdb_queue {
+       struct ctdb_context *ctdb;
+       struct tevent_immediate *im; /* used to defer queue_process() and queue_dead() */
+       struct ctdb_buffer buffer; /* input buffer */
+       struct ctdb_queue_pkt *out_queue, *out_queue_tail; /* out_queue: packets waiting to be written */
+       uint32_t out_queue_length; /* number of packets on out_queue */
+       struct fd_event *fde; /* fd event for fd, NULL when there is none */
+       int fd; /* -1 when no descriptor is attached */
+       size_t alignment; /* if non-zero, send lengths are rounded up to a multiple of this; the mask arithmetic assumes a power of two - TODO confirm callers */
+       void *private_data; /* passed back to callback */
+       ctdb_queue_cb_fn_t callback; /* gets each complete packet, or (NULL, 0) on failure */
+       bool *destroyed; /* if non-NULL, set to true by the destructor */
+       const char *name; /* label used when naming queued packets */
+};
+
+
+
+/* Number of packets currently waiting on the queue's outgoing list. */
+int ctdb_queue_length(struct ctdb_queue *queue)
+{
+       return queue->out_queue_length;
+}
+
+static void queue_process(struct ctdb_queue *queue);
+
+/*
+ * Immediate-event callback: continue processing the input buffer of the
+ * queue passed in private_data.
+ */
+static void queue_process_event(struct tevent_context *ev, struct tevent_immediate *im,
+                               void *private_data)
+{
+       queue_process(talloc_get_type(private_data, struct ctdb_queue));
+}
+
+/*
+ * This function is used to process data in queue buffer.
+ *
+ * At most one packet is handled per call.  The first 4 bytes of the
+ * buffer are read as the total packet size; once that many bytes are
+ * buffered the packet is copied out, removed from the buffer and handed
+ * to the queue callback.
+ *
+ * Queue callback function can end up freeing the queue, there should not be a
+ * loop processing packets from queue buffer.  Instead an immediate event is
+ * scheduled to process remaining packets from buffer, and the callback is
+ * the very last thing this function does.
+ *
+ * A packet size of 0 is reported to the callback as (NULL, 0).
+ */
+static void queue_process(struct ctdb_queue *queue)
+{
+       uint32_t pkt_size;
+       uint8_t *data;
+
+       if (queue->buffer.length < sizeof(pkt_size)) { /* length prefix not complete yet */
+               return;
+       }
+
+       pkt_size = *(uint32_t *)queue->buffer.data;
+       if (pkt_size == 0) {
+               DEBUG(DEBUG_CRIT, ("Invalid packet of length 0\n"));
+               goto failed;
+       }
+
+       if (queue->buffer.length < pkt_size) { /* packet incomplete, wait for more data */
+               if (pkt_size > QUEUE_BUFFER_SIZE) {
+                       queue->buffer.extend = pkt_size; /* ask queue_io_read() to resize the buffer */
+               }
+               return;
+       }
+
+       /* Extract complete packet */
+       data = talloc_size(queue, pkt_size);
+       if (data == NULL) {
+               DEBUG(DEBUG_ERR, ("read error alloc failed for %u\n", pkt_size));
+               return;
+       }
+       memcpy(data, queue->buffer.data, pkt_size);
+
+       /* Shift packet out from buffer */
+       if (queue->buffer.length > pkt_size) {
+               memmove(queue->buffer.data,
+                       queue->buffer.data + pkt_size,
+                       queue->buffer.length - pkt_size);
+       }
+       queue->buffer.length -= pkt_size;
+
+       if (queue->buffer.length > 0) {
+               /* There is more data to be processed, schedule an event */
+               tevent_schedule_immediate(queue->im, queue->ctdb->ev,
+                                         queue_process_event, queue);
+       } else {
+               if (queue->buffer.size > QUEUE_BUFFER_SIZE) { /* drop an oversized, now empty buffer */
+                       TALLOC_FREE(queue->buffer.data);
+                       queue->buffer.size = 0;
+               }
+       }
+
+       /* It is the responsibility of the callback to free 'data' */
+       queue->callback(data, pkt_size, queue->private_data);
+       return;
+
+failed:
+       queue->callback(NULL, 0, queue->private_data);
+
+}
+
+
+/*
+  called when an incoming connection is readable
+  This function MUST be safe for reentry via the queue callback!
+
+  Reads at most the number of bytes FIONREAD reports (capped to the free
+  space in the input buffer), appends them to the buffer and then lets
+  queue_process() try to extract one packet.
+
+  The callback is invoked with (NULL, 0) when FIONREAD reports no data,
+  when a buffer allocation fails, or when read() returns <= 0.  If the
+  FIONREAD ioctl itself fails the function returns without doing
+  anything.
+*/
+static void queue_io_read(struct ctdb_queue *queue)
+{
+       int num_ready = 0;
+       ssize_t nread;
+       uint8_t *data;
+       int navail;
+
+       /* check how much data is available on the socket for immediately
+          guaranteed nonblocking access.
+          as long as we are careful never to try to read more than this
+          we know all reads will be successful and will neither block
+          nor fail with a "data not available right now" error
+       */
+       if (ioctl(queue->fd, FIONREAD, &num_ready) != 0) {
+               return;
+       }
+       if (num_ready == 0) {
+               /* the descriptor has been closed */
+               goto failed;
+       }
+
+       if (queue->buffer.data == NULL) {
+               /* starting fresh, allocate buf to read data */
+               queue->buffer.data = talloc_size(queue, QUEUE_BUFFER_SIZE);
+               if (queue->buffer.data == NULL) {
+                       DEBUG(DEBUG_ERR, ("read error alloc failed for %u\n", num_ready));
+                       goto failed;
+               }
+               queue->buffer.size = QUEUE_BUFFER_SIZE;
+       } else if (queue->buffer.extend > 0) {
+               /* extending buffer to the size requested by queue_process() */
+               data = talloc_realloc_size(queue, queue->buffer.data, queue->buffer.extend);
+               if (data == NULL) {
+                       DEBUG(DEBUG_ERR, ("read error realloc failed for %u\n", queue->buffer.extend));
+                       goto failed;
+               }
+               queue->buffer.data = data;
+               queue->buffer.size = queue->buffer.extend;
+               queue->buffer.extend = 0;
+       }
+
+       navail = queue->buffer.size - queue->buffer.length; /* free space left in the buffer */
+       if (num_ready > navail) {
+               num_ready = navail;
+       }
+
+       if (num_ready > 0) {
+               nread = read(queue->fd, queue->buffer.data + queue->buffer.length, num_ready);
+               if (nread <= 0) {
+                       DEBUG(DEBUG_ERR, ("read error nread=%d\n", (int)nread));
+                       goto failed;
+               }
+               queue->buffer.length += nread;
+       }
+
+       queue_process(queue);
+       return;
+
+failed:
+       queue->callback(NULL, 0, queue->private_data);
+}
+
+
+/*
+ * Immediate-event callback scheduled after a fatal write error: tells
+ * the queue owner about the failure by invoking its callback with
+ * (NULL, 0).
+ */
+static void queue_dead(struct event_context *ev, struct tevent_immediate *im,
+                      void *private_data)
+{
+       struct ctdb_queue *dead = talloc_get_type(private_data, struct ctdb_queue);
+
+       dead->callback(NULL, 0, dead->private_data);
+}
+
+
+/*
+  called when an incoming connection is writeable
+
+  Writes queued packets in order until the list is empty or the
+  descriptor stops accepting data.  With CTDB_FLAG_TORTURE set, only a
+  single byte is written per write() call.  A hard write error tears down
+  the fd event, marks the queue as having no descriptor and schedules
+  queue_dead().
+*/
+static void queue_io_write(struct ctdb_queue *queue)
+{
+       struct ctdb_queue_pkt *head;
+
+       for (head = queue->out_queue; head != NULL; head = queue->out_queue) {
+               ssize_t written;
+
+               if ((queue->ctdb->flags & CTDB_FLAG_TORTURE) != 0) {
+                       written = write(queue->fd, head->data, 1);
+               } else {
+                       written = write(queue->fd, head->data, head->length);
+               }
+
+               if (written == -1 && errno != EAGAIN && errno != EWOULDBLOCK) {
+                       if (head->length != head->full_length) {
+                               /* part of this packet already went out,
+                                  so it cannot be resent - drop it */
+                               DLIST_REMOVE(queue->out_queue, head);
+                               queue->out_queue_length--;
+                               talloc_free(head);
+                       }
+                       talloc_free(queue->fde);
+                       queue->fde = NULL;
+                       queue->fd = -1;
+                       tevent_schedule_immediate(queue->im, queue->ctdb->ev,
+                                                 queue_dead, queue);
+                       return;
+               }
+
+               if (written <= 0) {
+                       /* would block: wait for the next writeable event */
+                       return;
+               }
+
+               if (written != head->length) {
+                       /* short write: remember where to continue */
+                       head->length -= written;
+                       head->data += written;
+                       return;
+               }
+
+               /* whole packet sent */
+               DLIST_REMOVE(queue->out_queue, head);
+               queue->out_queue_length--;
+               talloc_free(head);
+       }
+
+       /* nothing left to send */
+       EVENT_FD_NOT_WRITEABLE(queue->fde);
+}
+
+/*
+  fd event handler for a queue: dispatches to the read path when
+  EVENT_FD_READ is set in flags, and to the write path otherwise
+*/
+static void queue_io_handler(struct event_context *ev, struct fd_event *fde,
+                            uint16_t flags, void *private_data)
+{
+       struct ctdb_queue *q = talloc_get_type(private_data, struct ctdb_queue);
+
+       if ((flags & EVENT_FD_READ) == 0) {
+               queue_io_write(q);
+               return;
+       }
+
+       queue_io_read(q);
+}
+
+
+/*
+  queue a packet for sending
+
+  queue   queue to send on
+  data    packet bytes.  When the queue has an alignment, the first 4
+          bytes are overwritten with the padded length and the padding
+          after 'length' is zeroed in place, so the caller's buffer must
+          have room for the padded length.
+  length  number of valid bytes in data
+
+  If nothing is queued yet and torture mode is off, an immediate write is
+  attempted first; whatever could not be written is copied and appended
+  to the outgoing list.
+
+  Returns 0 when the packet was written or queued, and also when the
+  socket turned out to be dead (that case is reported asynchronously via
+  queue_dead()).  On allocation failure the function returns through
+  CTDB_NO_MEMORY().
+*/
+int ctdb_queue_send(struct ctdb_queue *queue, uint8_t *data, uint32_t length)
+{
+       struct ctdb_queue_pkt *pkt;
+       uint8_t *copy;
+       uint32_t length2, full_length;
+
+       if (queue->alignment) {
+               /* enforce the length and alignment rules from the tcp packet allocator */
+               length2 = (length+(queue->alignment-1)) & ~(queue->alignment-1);
+               *(uint32_t *)data = length2;
+       } else {
+               length2 = length;
+       }
+
+       if (length2 != length) {
+               memset(data+length, 0, length2-length);
+       }
+
+       full_length = length2;
+
+       /* if the queue is empty then try an immediate write, avoiding
+          queue overhead. This relies on non-blocking sockets */
+       if (queue->out_queue == NULL && queue->fd != -1 &&
+           !(queue->ctdb->flags & CTDB_FLAG_TORTURE)) {
+               ssize_t n = write(queue->fd, data, length2);
+               if (n == -1 && errno != EAGAIN && errno != EWOULDBLOCK) {
+                       talloc_free(queue->fde);
+                       queue->fde = NULL;
+                       queue->fd = -1;
+                       tevent_schedule_immediate(queue->im, queue->ctdb->ev,
+                                                 queue_dead, queue);
+                       /* yes, we report success, as the dead node is
+                          handled via a separate event */
+                       return 0;
+               }
+               if (n > 0) {
+                       data += n;
+                       length2 -= n;
+               }
+               if (length2 == 0) return 0;
+       }
+
+       pkt = talloc(queue, struct ctdb_queue_pkt);
+       CTDB_NO_MEMORY(queue->ctdb, pkt);
+
+       /* Copy into a local first so that pkt can be released if the
+          copy fails; otherwise the half-built pkt would stay attached
+          to the queue. */
+       copy = talloc_memdup(pkt, data, length2);
+       if (copy == NULL) {
+               talloc_free(pkt);
+               CTDB_NO_MEMORY(queue->ctdb, copy);
+       }
+       pkt->data = copy;
+
+       pkt->length = length2;
+       pkt->full_length = full_length;
+
+       if (queue->out_queue == NULL && queue->fd != -1) {
+               EVENT_FD_WRITEABLE(queue->fde);
+       }
+
+       DLIST_ADD_END(queue->out_queue, pkt, NULL);
+
+       queue->out_queue_length++;
+
+       /* Only label the packet when pkt->data holds the whole packet.
+          After a partial immediate write it holds just the unsent tail,
+          which does not start with a request header and may be shorter
+          than one. */
+       if (queue->ctdb->tunable.verbose_memory_names != 0 &&
+           length2 == full_length) {
+               struct ctdb_req_header *hdr = (struct ctdb_req_header *)pkt->data;
+               switch (hdr->operation) {
+               case CTDB_REQ_CONTROL: {
+                       struct ctdb_req_control *c = (struct ctdb_req_control *)hdr;
+                       talloc_set_name(pkt, "ctdb_queue_pkt: %s control opcode=%u srvid=%llu datalen=%u",
+                                       queue->name, (unsigned)c->opcode, (unsigned long long)c->srvid, (unsigned)c->datalen);
+                       break;
+               }
+               case CTDB_REQ_MESSAGE: {
+                       struct ctdb_req_message *m = (struct ctdb_req_message *)hdr;
+                       talloc_set_name(pkt, "ctdb_queue_pkt: %s message srvid=%llu datalen=%u",
+                                       queue->name, (unsigned long long)m->srvid, (unsigned)m->datalen);
+                       break;
+               }
+               default:
+                       talloc_set_name(pkt, "ctdb_queue_pkt: %s operation=%u length=%u src=%u dest=%u",
+                                       queue->name, (unsigned)hdr->operation, (unsigned)hdr->length,
+                                       (unsigned)hdr->srcnode, (unsigned)hdr->destnode);
+                       break;
+               }
+       }
+
+       return 0;
+}
+
+
+/*
+  attach a descriptor to the queue (or detach with fd == -1)
+
+  Any existing fd event is freed first.  For a real descriptor a read
+  event with auto-close is installed, and write notification is enabled
+  when packets are already waiting.  Returns 0 on success, -1 if the fd
+  event could not be created.
+ */
+int ctdb_queue_set_fd(struct ctdb_queue *queue, int fd)
+{
+       queue->fd = fd;
+       talloc_free(queue->fde);
+       queue->fde = NULL;
+
+       if (fd == -1) {
+               /* detached: nothing to watch */
+               return 0;
+       }
+
+       queue->fde = event_add_fd(queue->ctdb->ev, queue, fd, EVENT_FD_READ,
+                                 queue_io_handler, queue);
+       if (queue->fde == NULL) {
+               return -1;
+       }
+       tevent_fd_set_auto_close(queue->fde);
+
+       if (queue->out_queue != NULL) {
+               EVENT_FD_WRITEABLE(queue->fde);
+       }
+
+       return 0;
+}
+
+/*
+ * talloc destructor for a queue: releases the input buffer and, if the
+ * owner registered a 'destroyed' flag, sets it so the owner can tell the
+ * queue has been freed.
+ */
+static int queue_destructor(struct ctdb_queue *queue)
+{
+       bool *flag = queue->destroyed;
+
+       TALLOC_FREE(queue->buffer.data);
+       queue->buffer.size = 0;
+       queue->buffer.length = 0;
+
+       if (flag != NULL) {
+               *flag = true;
+       }
+
+       return 0;
+}
+
+/*
+  setup a packet queue on a socket
+
+  ctdb          context used for the event loop and error reporting
+  mem_ctx       talloc parent of the new queue
+  fd            descriptor to attach, or -1 to attach one later with
+                ctdb_queue_set_fd()
+  alignment     if non-zero, outgoing packet lengths are padded to a
+                multiple of this value
+  callback      called with each complete incoming packet, or with
+                (NULL, 0) on failure
+  private_data  passed through to callback
+  fmt, ...      printf-style name for the queue
+
+  Returns the new queue, or NULL on failure.  The name and the immediate
+  event are allocated as children of the queue, so everything is released
+  together with it, and nothing is left behind on mem_ctx when setup
+  fails part way through.
+ */
+struct ctdb_queue *ctdb_queue_setup(struct ctdb_context *ctdb,
+                                   TALLOC_CTX *mem_ctx, int fd, int alignment,
+                                   ctdb_queue_cb_fn_t callback,
+                                   void *private_data, const char *fmt, ...)
+{
+       struct ctdb_queue *queue;
+       struct tevent_immediate *im;
+       char *name;
+       va_list ap;
+
+       queue = talloc_zero(mem_ctx, struct ctdb_queue);
+       CTDB_NO_MEMORY_NULL(ctdb, queue);
+
+       /* Locals are used for the checks below so that the queue can be
+          freed before the out-of-memory macro reports and returns. */
+       va_start(ap, fmt);
+       name = talloc_vasprintf(queue, fmt, ap);
+       va_end(ap);
+       if (name == NULL) {
+               talloc_free(queue);
+               CTDB_NO_MEMORY_NULL(ctdb, name);
+       }
+       queue->name = name;
+
+       im = tevent_create_immediate(queue);
+       if (im == NULL) {
+               talloc_free(queue);
+               CTDB_NO_MEMORY_NULL(ctdb, im);
+       }
+       queue->im = im;
+
+       queue->ctdb = ctdb;
+       queue->fd = fd;
+       queue->alignment = alignment;
+       queue->private_data = private_data;
+       queue->callback = callback;
+       if (fd != -1) {
+               if (ctdb_queue_set_fd(queue, fd) != 0) {
+                       talloc_free(queue);
+                       return NULL;
+               }
+       }
+       /* installed last, so the failure paths above never run it */
+       talloc_set_destructor(queue, queue_destructor);
+
+       return queue;
+}
diff --git a/ctdb/common/ctdb_logging.c b/ctdb/common/ctdb_logging.c
new file mode 100644 (file)
index 0000000..ba3e861
--- /dev/null
@@ -0,0 +1,200 @@
+/* 
+   ctdb logging code
+
+   Copyright (C) Ronnie Sahlberg 2009
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "tdb.h"
+#include "system/time.h"
+#include "../include/ctdb_private.h"
+#include "../include/ctdb_client.h"
+
+int log_ringbuf_size;
+
+#define MAX_LOG_SIZE 128
+
+static int first_entry = 0;
+static int ringbuf_count = 0;
+
+struct ctdb_log_entry {
+       int32_t level;              /* value of this_log_level when the entry was stored */
+       struct timeval t;           /* time the entry was stored */
+       char message[MAX_LOG_SIZE]; /* formatted text, truncated to fit */
+};
+
+
+static struct ctdb_log_entry *log_entries;
+
+/*
+ * this function logs all messages for all levels to a ringbuffer
+ *
+ * The ring holds log_ringbuf_size entries and is allocated with malloc()
+ * on first use; nothing is stored while log_ringbuf_size is 0 or the
+ * allocation failed.  Once the ring is full the oldest entry is
+ * overwritten.  Each entry records the formatted message together with
+ * this_log_level and the current time.
+ */
+static void log_ringbuffer_v(const char *format, va_list ap)
+{
+       int ret;
+       int next_entry;
+
+       if (log_entries == NULL && log_ringbuf_size != 0) {
+               /* Hope this works. We cant log anything if it doesnt anyway */
+               log_entries = malloc(sizeof(struct ctdb_log_entry) * log_ringbuf_size);
+       }
+       if (log_entries == NULL) {
+               return;
+       }
+
+       next_entry = (first_entry + ringbuf_count) % log_ringbuf_size; /* slot after the newest entry */
+
+       if (ringbuf_count > 0 && first_entry == next_entry) { /* ring is full: give up the oldest entry */
+               first_entry = (first_entry + 1) % log_ringbuf_size;
+       }
+
+       log_entries[next_entry].message[0] = '\0';
+
+       ret = vsnprintf(&log_entries[next_entry].message[0], MAX_LOG_SIZE, format, ap);
+       if (ret == -1) {
+               return;
+       }
+       /* Log messages longer than MAX_LOG_SIZE are truncated to MAX_LOG_SIZE-1
+        * bytes.  In that case, add a newline.
+        */
+       if (ret >= MAX_LOG_SIZE) {
+               log_entries[next_entry].message[MAX_LOG_SIZE-2] = '\n';
+       }
+
+       log_entries[next_entry].level = this_log_level;
+       log_entries[next_entry].t = timeval_current();
+
+       if (ringbuf_count < log_ringbuf_size) { /* count stops growing once the ring is full */
+               ringbuf_count++;
+       }
+}
+
+/* printf-style front end for log_ringbuffer_v(). */
+void log_ringbuffer(const char *format, ...)
+{
+       va_list args;
+
+       va_start(args, format);
+       log_ringbuffer_v(format, args);
+       va_end(args);
+}
+
+/*
+ * Release the ringbuffer storage and set log_ringbuf_size to 0, which
+ * stops log_ringbuffer_v() from allocating a new ring.
+ */
+void ctdb_log_ringbuffer_free(void)
+{
+       /* free(NULL) is a no-op, so no guard is required */
+       free(log_entries);
+       log_entries = NULL;
+
+       log_ringbuf_size = 0;
+}
+
+/*
+ * Format every ringbuffer entry whose level does not exceed
+ * log_addr->level into a temporary file, read the file back as a single
+ * blob and send it as a message to log_addr->pnn / log_addr->srvid.
+ */
+void ctdb_collect_log(struct ctdb_context *ctdb, struct ctdb_get_log_addr *log_addr)
+{
+       TDB_DATA data;
+       FILE *spool;
+       long nbytes;
+       char stamp[100];
+       int n;
+
+       DEBUG(DEBUG_ERR,("Marshalling %d log entries\n", ringbuf_count));
+
+       /* dump to a file, then send the file as a blob */
+       spool = tmpfile();
+       if (spool == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Unable to open tmpfile - %s\n", strerror(errno)));
+               return;
+       }
+
+       for (n = 0; n < ringbuf_count; n++) {
+               struct ctdb_log_entry *entry;
+               struct tm *when;
+
+               /* walk from the oldest entry to the newest */
+               entry = &log_entries[(first_entry + n) % log_ringbuf_size];
+
+               if (entry->level > log_addr->level) {
+                       continue;
+               }
+
+               when = localtime(&entry->t.tv_sec);
+               strftime(stamp, sizeof(stamp)-1,"%Y/%m/%d %H:%M:%S", when);
+
+               if (entry->message[0] == '\0') {
+                       continue;
+               }
+
+               fprintf(spool, "%s:%s %s", stamp,
+                       get_debug_by_level(entry->level),
+                       entry->message);
+       }
+
+       nbytes = ftell(spool);
+       if (nbytes < 0) {
+               fclose(spool);
+               DEBUG(DEBUG_ERR, ("Cannot get file size for log entries\n"));
+               return;
+       }
+       rewind(spool);
+
+       data.dptr = talloc_size(NULL, nbytes);
+       if (data.dptr == NULL) {
+               fclose(spool);
+               CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
+       }
+       data.dsize = fread(data.dptr, 1, nbytes, spool);
+       fclose(spool);
+
+       DEBUG(DEBUG_ERR,("Marshalling log entries into a blob of %d bytes\n", (int)data.dsize));
+
+       DEBUG(DEBUG_ERR,("Send log to %d:%d\n", (int)log_addr->pnn, (int)log_addr->srvid));
+       ctdb_client_send_message(ctdb, log_addr->pnn, log_addr->srvid, data);
+
+       talloc_free(data.dptr);
+}
+
+/*
+ * Control handler: fork a child that switches to client mode, collects
+ * the ringbuffer and sends it to the address described by 'addr'.
+ * Returns 0 in the parent once the child has been started, -1 if the
+ * fork failed.  The child never returns from this function.
+ */
+int32_t ctdb_control_get_log(struct ctdb_context *ctdb, TDB_DATA addr)
+{
+       struct ctdb_get_log_addr *target = (struct ctdb_get_log_addr *)addr.dptr;
+       pid_t collector;
+
+       /* The ringbuffer must survive the fork - the child is the one
+          that reads it - hence the _no_free_ringbuffer variant.
+       */
+       collector = ctdb_fork_no_free_ringbuffer(ctdb);
+       if (collector == (pid_t)-1) {
+               DEBUG(DEBUG_ERR,("Failed to fork a log collector child\n"));
+               return -1;
+       }
+
+       if (collector == 0) {
+               /* child: marshall the log blob and send it back to the
+                  ctdb tool using a MESSAGE */
+               ctdb_set_process_name("ctdb_log_collector");
+               if (switch_from_server_to_client(ctdb, "log-collector") != 0) {
+                       DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch log collector child into client mode.\n"));
+                       _exit(1);
+               }
+               ctdb_collect_log(ctdb, target);
+               _exit(0);
+       }
+
+       return 0;
+}
+
+/*
+ * Forget all stored log entries by resetting the ring indices.  The
+ * ringbuffer storage itself is kept.
+ */
+void ctdb_clear_log(struct ctdb_context *ctdb)
+{
+       ringbuf_count = 0;
+       first_entry = 0;
+}
+
+/* Control handler wrapping ctdb_clear_log(); always returns 0. */
+int32_t ctdb_control_clear_log(struct ctdb_context *ctdb)
+{
+       ctdb_clear_log(ctdb);
+       return 0;
+}
diff --git a/ctdb/common/ctdb_ltdb.c b/ctdb/common/ctdb_ltdb.c
new file mode 100644 (file)
index 0000000..4681f30
--- /dev/null
@@ -0,0 +1,336 @@
+/* 
+   ctdb ltdb code
+
+   Copyright (C) Andrew Tridgell  2006
+   Copyright (C) Ronnie sahlberg  2011
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "tdb.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "../include/ctdb_private.h"
+#include "db_wrap.h"
+#include "lib/util/dlinklist.h"
+
+/*
+  look up an attached database by name
+
+  walks ctdb->db_list and returns the first entry whose db_name is an
+  exact match, or NULL if no entry matches
+ */
+struct ctdb_db_context *ctdb_db_handle(struct ctdb_context *ctdb, const char *name)
+{
+       struct ctdb_db_context *db = ctdb->db_list;
+
+       while (db != NULL && strcmp(name, db->db_name) != 0) {
+               db = db->next;
+       }
+       return db;
+}
+
+
+/*
+  return the lmaster for a key: the key hash, taken modulo the size of
+  the vnn map, selects the entry of ctdb->vnn_map->map to return
+*/
+uint32_t ctdb_lmaster(struct ctdb_context *ctdb, const TDB_DATA *key)
+{
+       uint32_t slot = ctdb_hash(key) % ctdb->vnn_map->size;
+
+       return ctdb->vnn_map->map[slot];
+}
+
+
+/*
+  fill in a fresh header for a record that has no ltdb header yet:
+  all fields zeroed, dmaster set to the lmaster of the key and flags
+  set to CTDB_REC_FLAG_AUTOMATIC
+*/
+static void ltdb_initial_header(struct ctdb_db_context *ctdb_db,
+                               TDB_DATA key,
+                               struct ctdb_ltdb_header *header)
+{
+       uint32_t lmaster = ctdb_lmaster(ctdb_db->ctdb, &key);
+
+       ZERO_STRUCTP(header);
+       header->flags = CTDB_REC_FLAG_AUTOMATIC;
+       /* the initial dmaster is the lmaster */
+       header->dmaster = lmaster;
+}
+
+
+/*
+  fetch a record from the ltdb, separating out the header information
+  and returning the body of the record. A valid (initial) header is
+  returned if the record is not present
+
+  header  - receives the stored header, or an initial one when the
+            record is missing or too short to hold a header
+  mem_ctx - talloc parent for the returned body
+  data    - optional (may be NULL); receives a talloc'ed copy of the
+            record body, or an empty TDB_DATA if there is no record
+
+  returns 0 on success; -1 if there is no usable record and no vnn map
+  is available (called from the client), or on out of memory
+*/
+int ctdb_ltdb_fetch(struct ctdb_db_context *ctdb_db,
+                   TDB_DATA key, struct ctdb_ltdb_header *header,
+                   TALLOC_CTX *mem_ctx, TDB_DATA *data)
+{
+       TDB_DATA rec;
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+
+       rec = tdb_fetch(ctdb_db->ltdb->tdb, key);
+       if (rec.dsize < sizeof(*header)) {
+               TDB_DATA d2;
+               /* return an initial header */
+               if (rec.dptr) free(rec.dptr);
+               if (ctdb->vnn_map == NULL) {
+                       /* called from the client */
+                       /* data is optional, so it must be checked here
+                          just like on every other path */
+                       if (data) {
+                               ZERO_STRUCTP(data);
+                       }
+                       header->dmaster = (uint32_t)-1;
+                       return -1;
+               }
+               ltdb_initial_header(ctdb_db, key, header);
+               ZERO_STRUCT(d2);
+               if (data) {
+                       *data = d2;
+               }
+               /* persist the initial header if we are the dmaster or
+                  the database is persistent (best effort, the result
+                  of the store is not checked) */
+               if (ctdb_db->persistent || header->dmaster == ctdb_db->ctdb->pnn) {
+                       ctdb_ltdb_store(ctdb_db, key, header, d2);
+               }
+               return 0;
+       }
+
+       *header = *(struct ctdb_ltdb_header *)rec.dptr;
+
+       if (data) {
+               data->dsize = rec.dsize - sizeof(struct ctdb_ltdb_header);
+               data->dptr = talloc_memdup(mem_ctx,
+                                          sizeof(struct ctdb_ltdb_header)+rec.dptr,
+                                          data->dsize);
+       }
+
+       /* rec.dptr is released before the allocation check below so
+          that the out of memory return cannot leak it */
+       free(rec.dptr);
+       if (data) {
+               CTDB_NO_MEMORY(ctdb, data->dptr);
+       }
+
+       return 0;
+}
+
+/*
+  fetch a record from the ltdb, separating out the header information
+  and returning the body of the record.
+
+  header  - receives the stored header; left untouched if the record
+            does not exist or is too short to hold a header
+  mem_ctx - talloc parent for the returned body
+  data    - optional (may be NULL); receives a talloc'ed copy of the
+            record body, or {NULL, 0} if the record does not exist
+
+  returns 0 on success, -1 if the record does not exist or on out of
+  memory
+*/
+int ctdb_ltdb_fetch_with_header(struct ctdb_db_context *ctdb_db,
+                   TDB_DATA key, struct ctdb_ltdb_header *header,
+                   TALLOC_CTX *mem_ctx, TDB_DATA *data)
+{
+       TDB_DATA rec;
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+
+       rec = tdb_fetch(ctdb_db->ltdb->tdb, key);
+       if (rec.dsize < sizeof(*header)) {
+               free(rec.dptr);
+
+               /* data is optional below, so it must be optional here too */
+               if (data) {
+                       data->dsize = 0;
+                       data->dptr = NULL;
+               }
+               return -1;
+       }
+
+       *header = *(struct ctdb_ltdb_header *)rec.dptr;
+       if (data) {
+               data->dsize = rec.dsize - sizeof(struct ctdb_ltdb_header);
+               data->dptr = talloc_memdup(mem_ctx,
+                                          sizeof(struct ctdb_ltdb_header)+rec.dptr,
+                                          data->dsize);
+       }
+
+       free(rec.dptr);
+
+       /* same allocation check as ctdb_ltdb_fetch() */
+       if (data) {
+               CTDB_NO_MEMORY(ctdb, data->dptr);
+       }
+
+       return 0;
+}
+
+
+/*
+  write a record to a normal database
+
+  The stored value is the ltdb header immediately followed by data.
+  If the database has a ctdb_ltdb_store_fn hook the call is delegated
+  to it entirely.
+
+  returns 0 on success, -1 on out of memory, otherwise the result of
+  tdb_store()
+*/
+int ctdb_ltdb_store(struct ctdb_db_context *ctdb_db, TDB_DATA key,
+                   struct ctdb_ltdb_header *header, TDB_DATA data)
+{
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+       TDB_DATA rec;
+       int ret;
+       bool seqnum_suppressed = false;
+
+       if (ctdb_db->ctdb_ltdb_store_fn) {
+               return ctdb_db->ctdb_ltdb_store_fn(ctdb_db, key, header, data);
+       }
+
+       if (ctdb->flags & CTDB_FLAG_TORTURE) {
+               struct ctdb_ltdb_header *h2;
+               rec = tdb_fetch(ctdb_db->ltdb->tdb, key);
+               h2 = (struct ctdb_ltdb_header *)rec.dptr;
+               /* the record must hold a complete header (not merely a
+                  pointer's worth of bytes) before h2->rsn is read */
+               if (rec.dptr && rec.dsize >= sizeof(*h2) && h2->rsn > header->rsn) {
+                       DEBUG(DEBUG_CRIT,("RSN regression! %llu %llu\n",
+                                (unsigned long long)h2->rsn, (unsigned long long)header->rsn));
+               }
+               if (rec.dptr) free(rec.dptr);
+       }
+
+       rec.dsize = sizeof(*header) + data.dsize;
+       rec.dptr = talloc_size(ctdb, rec.dsize);
+       CTDB_NO_MEMORY(ctdb, rec.dptr);
+
+       memcpy(rec.dptr, header, sizeof(*header));
+       /* callers may pass an empty body with a NULL dptr */
+       if (data.dsize > 0) {
+               memcpy(rec.dptr + sizeof(*header), data.dptr, data.dsize);
+       }
+
+       /* Databases with seqnum updates enabled only get their seqnum
+          changes when/if we modify the data */
+       if (ctdb_db->seqnum_update != NULL) {
+               TDB_DATA old;
+               old = tdb_fetch(ctdb_db->ltdb->tdb, key);
+
+               /* equal sizes imply old holds at least a full header,
+                  so only the bodies are compared */
+               if ( (old.dsize == rec.dsize)
+               && !memcmp(old.dptr+sizeof(struct ctdb_ltdb_header),
+                         rec.dptr+sizeof(struct ctdb_ltdb_header),
+                         rec.dsize-sizeof(struct ctdb_ltdb_header)) ) {
+                       tdb_remove_flags(ctdb_db->ltdb->tdb, TDB_SEQNUM);
+                       seqnum_suppressed = true;
+               }
+               if (old.dptr) free(old.dptr);
+       }
+       ret = tdb_store(ctdb_db->ltdb->tdb, key, rec, TDB_REPLACE);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Failed to store dynamic data\n"));
+       }
+       if (seqnum_suppressed) {
+               /* restore the flag removed above */
+               tdb_add_flags(ctdb_db->ltdb->tdb, TDB_SEQNUM);
+       }
+
+       talloc_free(rec.dptr);
+
+       return ret;
+}
+
+/*
+  take the tdb chain lock for key in the local tdb; the result of
+  tdb_chainlock() is passed straight back to the caller
+ */
+int ctdb_ltdb_lock(struct ctdb_db_context *ctdb_db, TDB_DATA key)
+{
+       int rc;
+
+       rc = tdb_chainlock(ctdb_db->ltdb->tdb, key);
+       return rc;
+}
+
+/*
+  release the tdb chain lock for key in the local tdb; a failure is
+  logged with the database name and the tdb error string, and the
+  result of tdb_chainunlock() is returned either way
+ */
+int ctdb_ltdb_unlock(struct ctdb_db_context *ctdb_db, TDB_DATA key)
+{
+       int rc;
+
+       rc = tdb_chainunlock(ctdb_db->ltdb->tdb, key);
+       if (rc == 0) {
+               return rc;
+       }
+
+       DEBUG(DEBUG_ERR,("tdb_chainunlock failed on db %s [%s]\n", ctdb_db->db_name, tdb_errorstr(ctdb_db->ltdb->tdb)));
+       return rc;
+}
+
+
+/*
+  delete a record from a normal database
+
+  Records in persistent databases are never deleted here: the attempt
+  is logged and reported as success.
+
+  returns 0 on success (or for a persistent database), -1 if
+  tdb_delete() failed
+*/
+int ctdb_ltdb_delete(struct ctdb_db_context *ctdb_db, TDB_DATA key)
+{
+       if (ctdb_db->persistent != 0) {
+               DEBUG(DEBUG_ERR,("Trying to delete empty record in persistent database\n"));
+               return 0;
+       }
+       if (tdb_delete(ctdb_db->ltdb->tdb, key) != 0) {
+               /* the message now ends in a newline like every other log line */
+               DEBUG(DEBUG_ERR,("Failed to delete empty record.\n"));
+               return -1;
+       }
+       return 0;
+}
+
+/*
+  set the bit for pnn in a tracking bitmap, growing the bitmap if it
+  is too short
+
+  data - bitmap with one bit per pnn (bit pnn%8 of byte pnn/8); when
+         it has to grow, a new zero-filled malloc'ed buffer replaces
+         data->dptr and the old buffer is released with free()
+  pnn  - node number to record
+
+  returns 0 on success, -1 on out of memory (data is left unchanged)
+*/
+int ctdb_trackingdb_add_pnn(struct ctdb_context *ctdb, TDB_DATA *data, uint32_t pnn)
+{
+       int byte_pos = pnn / 8;
+       int bit_mask   = 1 << (pnn % 8);
+
+       if (byte_pos + 1 > data->dsize) {
+               char *buf;
+
+               /* calloc() hands back zeroed memory, so the buffer is
+                  never written to before the NULL check */
+               buf = calloc(1, byte_pos + 1);
+               if (buf == NULL) {
+                       DEBUG(DEBUG_ERR, ("Out of memory when allocating buffer of %d bytes for trackingdb\n", byte_pos + 1));
+                       return -1;
+               }
+               if (data->dptr != NULL) {
+                       memcpy(buf, data->dptr, data->dsize);
+                       free(data->dptr);
+               }
+               data->dptr  = (uint8_t *)buf;
+               data->dsize = byte_pos + 1;
+       }
+
+       data->dptr[byte_pos] |= bit_mask;
+       return 0;
+}
+
+/*
+  call cb once for every bit that is set in the tracking bitmap
+
+  bits are visited from byte 0 upwards and, within a byte, from bit 0
+  to bit 7; cb receives ctdb, the bit number (byte index * 8 + bit)
+  and private_data
+*/
+void ctdb_trackingdb_traverse(struct ctdb_context *ctdb, TDB_DATA data, ctdb_trackingdb_cb cb, void *private_data)
+{
+       int byte;
+
+       for (byte = 0; byte < data.dsize; byte++) {
+               int bit;
+
+               for (bit = 0; bit < 8; bit++) {
+                       if ((data.dptr[byte] >> bit) & 1) {
+                               cb(ctdb, byte * 8 + bit, private_data);
+                       }
+               }
+       }
+}
+
+/*
+  the dummy null procedure that all databases support: it ignores the
+  call and reports success
+*/
+int ctdb_null_func(struct ctdb_call_info *call)
+{
+       return 0;
+}
+
+/*
+  the plain fetch procedure that all databases support: the reply
+  simply points at the record data of the call (nothing is copied);
+  always returns 0
+*/
+int ctdb_fetch_func(struct ctdb_call_info *call)
+{
+       call->reply_data = &call->record_data;
+
+       return 0;
+}
+
+/*
+  a plain fetch procedure that all databases support, returning the
+  full record including the ltdb header
+
+  the reply is a new TDB_DATA allocated under call, holding the header
+  of the call followed by a copy of the record data
+
+  returns 0 on success, -1 if either allocation fails
+*/
+int ctdb_fetch_with_header_func(struct ctdb_call_info *call)
+{
+       const size_t hdr_len = sizeof(struct ctdb_ltdb_header);
+       TDB_DATA *reply;
+
+       reply = talloc(call, TDB_DATA);
+       call->reply_data = reply;
+       if (reply == NULL) {
+               return -1;
+       }
+
+       reply->dsize = hdr_len + call->record_data.dsize;
+       reply->dptr  = talloc_size(reply, reply->dsize);
+       if (reply->dptr == NULL) {
+               return -1;
+       }
+
+       /* header first, record body right behind it */
+       memcpy(reply->dptr, call->header, hdr_len);
+       memcpy(reply->dptr + hdr_len, call->record_data.dptr, call->record_data.dsize);
+
+       return 0;
+}
+
diff --git a/ctdb/common/ctdb_message.c b/ctdb/common/ctdb_message.c
new file mode 100644 (file)
index 0000000..0e19761
--- /dev/null
@@ -0,0 +1,286 @@
+/* 
+   ctdb_message protocol code
+
+   Copyright (C) Andrew Tridgell  2007
+   Copyright (C) Amitay Isaacs  2013
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+  see http://wiki.samba.org/index.php/Samba_%26_Clustering for
+  protocol design and packet details
+*/
+#include "includes.h"
+#include "tdb.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "../include/ctdb_private.h"
+#include "lib/util/dlinklist.h"
+
+/*
+  open the internal tdb ("messagedb") that indexes message handler
+  lists by srvid and store it in ctdb->message_list_indexdb
+
+  returns 0 on success, -1 if the tdb could not be opened
+*/
+static int message_list_db_init(struct ctdb_context *ctdb)
+{
+       const int tdb_flags = TDB_INTERNAL |
+                             TDB_INCOMPATIBLE_HASH |
+                             TDB_DISALLOW_NESTING;
+
+       ctdb->message_list_indexdb = tdb_open("messagedb", 8192, tdb_flags,
+                                             O_RDWR|O_CREAT, 0);
+       if (ctdb->message_list_indexdb != NULL) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR, ("Failed to create message list indexdb\n"));
+       return -1;
+}
+
+/*
+  record the handler list header h under srvid in the index db,
+  creating the index db first if it does not exist yet
+
+  the stored value is the pointer h itself, not the structure it
+  points to; TDB_INSERT makes the store fail for an srvid that is
+  already present
+
+  returns 0 on success, -1 on failure
+*/
+static int message_list_db_add(struct ctdb_context *ctdb, uint64_t srvid,
+                              struct ctdb_message_list_header *h)
+{
+       TDB_DATA key, data;
+
+       if (ctdb->message_list_indexdb == NULL) {
+               if (message_list_db_init(ctdb) < 0) {
+                       return -1;
+               }
+       }
+
+       key.dsize = sizeof(uint64_t);
+       key.dptr = (uint8_t *)&srvid;
+
+       data.dsize = sizeof(struct ctdb_message_list_header *);
+       data.dptr = (uint8_t *)&h;
+
+       if (tdb_store(ctdb->message_list_indexdb, key, data, TDB_INSERT) < 0) {
+               DEBUG(DEBUG_ERR, ("Failed to add message list handler (%s)\n",
+                                 tdb_errorstr(ctdb->message_list_indexdb)));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  remove the index db entry for srvid
+
+  returns 0 on success, -1 if there is no index db or the delete
+  failed
+*/
+static int message_list_db_delete(struct ctdb_context *ctdb, uint64_t srvid)
+{
+       TDB_DATA key;
+
+       if (ctdb->message_list_indexdb == NULL) {
+               return -1;
+       }
+
+       key.dsize = sizeof(uint64_t);
+       key.dptr = (uint8_t *)&srvid;
+
+       if (tdb_delete(ctdb->message_list_indexdb, key) < 0) {
+               DEBUG(DEBUG_ERR, ("Failed to delete message list handler (%s)\n",
+                                 tdb_errorstr(ctdb->message_list_indexdb)));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  tdb_parse_record() callback used by message_list_db_fetch()
+
+  private_data points at the caller's header pointer; the stored value
+  must be exactly one pointer in size and is copied into it
+
+  returns 0 on success, -1 if the stored value has the wrong size
+*/
+static int message_list_db_fetch_parser(TDB_DATA key, TDB_DATA data,
+                                       void *private_data)
+{
+       struct ctdb_message_list_header **out = private_data;
+
+       if (data.dsize != sizeof(*out)) {
+               return -1;
+       }
+
+       memcpy(out, data.dptr, sizeof(*out));
+       return 0;
+}
+
+/*
+  look up the handler list header registered for srvid
+
+  on success *h is set to the stored header pointer; returns -1 if
+  there is no index db, otherwise the result of tdb_parse_record()
+*/
+static int message_list_db_fetch(struct ctdb_context *ctdb, uint64_t srvid,
+                                struct ctdb_message_list_header **h)
+{
+       TDB_DATA key;
+       int rc;
+
+       if (ctdb->message_list_indexdb == NULL) {
+               return -1;
+       }
+
+       key.dsize = sizeof(uint64_t);
+       key.dptr = (uint8_t *)&srvid;
+
+       rc = tdb_parse_record(ctdb->message_list_indexdb, key,
+                             message_list_db_fetch_parser, h);
+       return rc;
+}
+
+/*
+  run every handler registered under lookup_srvid, handing each one
+  the srvid the message was actually addressed to
+*/
+static void ctdb_dispatch_to_handlers(struct ctdb_context *ctdb,
+                                     uint64_t lookup_srvid,
+                                     uint64_t srvid, TDB_DATA data)
+{
+       struct ctdb_message_list_header *h;
+       struct ctdb_message_list *m;
+
+       if (message_list_db_fetch(ctdb, lookup_srvid, &h) != 0) {
+               return;
+       }
+
+       for (m = h->m; m != NULL; m = m->next) {
+               m->message_handler(ctdb, srvid, data, m->message_private);
+       }
+}
+
+/*
+  this dispatches the messages to the registered ctdb message handler
+
+  handlers registered for srvid run first, followed by the handlers
+  registered for CTDB_SRVID_ALL; always returns 0, even if no handler
+  is registered
+*/
+int ctdb_dispatch_message(struct ctdb_context *ctdb, uint64_t srvid, TDB_DATA data)
+{
+       ctdb_dispatch_to_handlers(ctdb, srvid, srvid, data);
+       ctdb_dispatch_to_handlers(ctdb, CTDB_SRVID_ALL, srvid, data);
+
+       return 0;
+}
+
+/*
+  called when a CTDB_REQ_MESSAGE packet comes in
+
+  the payload is copied into a talloc child of the packet and handed
+  to the handlers registered for the srvid of the packet; if the copy
+  cannot be allocated the message is dropped
+*/
+void ctdb_request_message(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+       struct ctdb_req_message *c = (struct ctdb_req_message *)hdr;
+       TDB_DATA data;
+
+       data.dsize = c->datalen;
+       data.dptr = talloc_memdup(c, &c->data[0], c->datalen);
+       if (data.dptr == NULL && data.dsize != 0) {
+               /* never hand the handlers a NULL payload together with
+                  a non-zero length */
+               CTDB_NO_MEMORY_VOID(ctdb, data.dptr);
+       }
+
+       ctdb_dispatch_message(ctdb, c->srvid, data);
+}
+
+/*
+ * When header is freed, remove all the srvid handlers
+ *
+ * talloc destructor for a ctdb_message_list_header (installed by
+ * ctdb_register_message_handler()). It frees every handler still
+ * linked on h->m, deletes the srvid from the index db and unlinks the
+ * header from ctdb->message_list_header. Returns 0.
+ */
+static int message_header_destructor(struct ctdb_message_list_header *h)
+{
+       struct ctdb_message_list *m;
+
+       /* NOTE(review): each handler carries message_handler_destructor,
+          which removes it from h->m again and calls talloc_free(h) once
+          the list is empty - presumably harmless because h is already
+          being freed at this point; confirm against talloc's rules for
+          freeing from within a destructor */
+       while (h->m != NULL) {
+               m = h->m;
+               DLIST_REMOVE(h->m, m);
+               TALLOC_FREE(m);
+       }
+
+       /* the result of the delete is not checked */
+       message_list_db_delete(h->ctdb, h->srvid);
+       DLIST_REMOVE(h->ctdb->message_list_header, h);
+
+       return 0;
+}
+
+/*
+  talloc destructor for a single handler entry: when a client goes
+  away its srvid handler is taken off the list, and the list header
+  is freed as well once no handler is left on it; returns 0
+ */
+static int message_handler_destructor(struct ctdb_message_list *m)
+{
+       struct ctdb_message_list_header *owner = m->h;
+
+       DLIST_REMOVE(owner->m, m);
+       if (owner->m != NULL) {
+               /* other handlers still use this srvid */
+               return 0;
+       }
+
+       talloc_free(owner);
+       return 0;
+}
+
+/*
+  setup handler for receipt of ctdb messages from ctdb_send_message()
+
+  mem_ctx      - talloc parent of the new handler entry; freeing the
+                 entry runs its destructor, which takes the handler
+                 off the list again
+  srvid        - service id the handler is registered under
+  handler      - callback stored in the entry
+  private_data - opaque pointer stored alongside the callback
+
+  returns 0 on success, -1 on out of memory or if the srvid could not
+  be added to the index db
+*/
+int ctdb_register_message_handler(struct ctdb_context *ctdb,
+                                 TALLOC_CTX *mem_ctx,
+                                 uint64_t srvid,
+                                 ctdb_msg_fn_t handler,
+                                 void *private_data)
+{
+       struct ctdb_message_list_header *h;
+       struct ctdb_message_list *m;
+       int ret;
+
+       m = talloc_zero(mem_ctx, struct ctdb_message_list);
+       CTDB_NO_MEMORY(ctdb, m);
+
+       m->message_handler = handler;
+       m->message_private = private_data;
+
+       ret = message_list_db_fetch(ctdb, srvid, &h);
+       if (ret != 0) {
+               /* srvid not registered yet */
+               h = talloc_zero(ctdb, struct ctdb_message_list_header);
+               if (h == NULL) {
+                       /* do not leak the handler entry on this path */
+                       talloc_free(m);
+                       CTDB_NO_MEMORY(ctdb, h);
+               }
+
+               h->ctdb = ctdb;
+               h->srvid = srvid;
+
+               ret = message_list_db_add(ctdb, srvid, h);
+               if (ret < 0) {
+                       talloc_free(m);
+                       talloc_free(h);
+                       return -1;
+               }
+
+               DLIST_ADD(ctdb->message_list_header, h);
+               talloc_set_destructor(h, message_header_destructor);
+       }
+
+       /* the destructor is only installed once m is on the list */
+       m->h = h;
+       DLIST_ADD(h->m, m);
+       talloc_set_destructor(m, message_handler_destructor);
+       return 0;
+}
+
+
+/*
+  remove a message handler again
+
+  frees the first handler registered under srvid whose private data
+  pointer equals private_data
+
+  returns 0 if a handler was removed, -1 if the srvid is unknown or no
+  handler matches
+*/
+int ctdb_deregister_message_handler(struct ctdb_context *ctdb, uint64_t srvid, void *private_data)
+{
+       struct ctdb_message_list_header *h;
+       struct ctdb_message_list *m;
+
+       if (message_list_db_fetch(ctdb, srvid, &h) != 0) {
+               return -1;
+       }
+
+       m = h->m;
+       while (m != NULL && m->message_private != private_data) {
+               m = m->next;
+       }
+       if (m == NULL) {
+               return -1;
+       }
+
+       talloc_free(m);
+       return 0;
+}
+
+
+/*
+ * check if the given srvid exists, i.e. it is present in the index db
+ * and has at least one handler on its list
+ */
+bool ctdb_check_message_handler(struct ctdb_context *ctdb, uint64_t srvid)
+{
+       struct ctdb_message_list_header *h;
+
+       if (message_list_db_fetch(ctdb, srvid, &h) != 0) {
+               return false;
+       }
+
+       return h->m != NULL;
+}
diff --git a/ctdb/common/ctdb_util.c b/ctdb/common/ctdb_util.c
new file mode 100644 (file)
index 0000000..7a70fea
--- /dev/null
@@ -0,0 +1,827 @@
+/* 
+   ctdb utility code
+
+   Copyright (C) Andrew Tridgell  2006
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "tdb.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "system/shmem.h"
+#include "../include/ctdb_private.h"
+
+int LogLevel = DEBUG_NOTICE;
+int this_log_level = 0;
+
+/*
+  return the error string stored in ctdb->err_msg, i.e. the message
+  remembered by the last ctdb_set_error()
+*/
+const char *ctdb_errstr(struct ctdb_context *ctdb)
+{
+       const char *msg = ctdb->err_msg;
+
+       return msg;
+}
+
+
+/*
+  remember an error message
+
+  the previous message is freed, the new one is formatted printf-style
+  into ctdb->err_msg (allocated under ctdb) and also logged at
+  DEBUG_ERR
+*/
+void ctdb_set_error(struct ctdb_context *ctdb, const char *fmt, ...)
+{
+       va_list args;
+
+       talloc_free(ctdb->err_msg);
+
+       va_start(args, fmt);
+       ctdb->err_msg = talloc_vasprintf(ctdb, fmt, args);
+       va_end(args);
+
+       DEBUG(DEBUG_ERR,("ctdb error: %s\n", ctdb->err_msg));
+}
+
+/*
+  a fatal internal error occurred - no hope for recovery
+
+  logs msg at DEBUG_ALERT and terminates the process with abort()
+*/
+void ctdb_fatal(struct ctdb_context *ctdb, const char *msg)
+{
+       DEBUG(DEBUG_ALERT,("ctdb fatal error: %s\n", msg));
+
+       abort();
+}
+
+/*
+  like ctdb_fatal() but a core/backtrace would not be useful
+
+  logs msg at DEBUG_ALERT and leaves the process with exit status 1
+*/
+void ctdb_die(struct ctdb_context *ctdb, const char *msg)
+{
+       DEBUG(DEBUG_ALERT,("ctdb exiting with error: %s\n", msg));
+
+       exit(1);
+}
+
+/* Invoke an external program to do some sort of tracing on the CTDB
+ * process.  This might block for a little while.  The external
+ * program is specified by the environment variable
+ * CTDB_EXTERNAL_TRACE.  This program should take one argument: the
+ * pid of the process to trace.  Commonly, the program would be a
+ * wrapper script around gcore.
+ *
+ * Nothing happens if the variable is unset.  If the command line
+ * cannot be allocated, or system() cannot run it, the problem is
+ * logged and the function returns normally.
+ */
+void ctdb_external_trace(void)
+{
+
+       const char * t = getenv("CTDB_EXTERNAL_TRACE");
+       char * cmd;
+       int ret;
+
+       if (t == NULL) {
+               return;
+       }
+
+       cmd = talloc_asprintf(NULL, "%s %lu", t, (unsigned long) getpid());
+       if (cmd == NULL) {
+               /* a NULL command must reach neither the log nor system() */
+               DEBUG(DEBUG_ERR, (__location__ " Out of memory building external trace command\n"));
+               return;
+       }
+
+       DEBUG(DEBUG_WARNING,("begin external trace: %s\n", cmd));
+       ret = system(cmd);
+       if (ret == -1) {
+               DEBUG(DEBUG_ERR,("external trace: unable to run %s\n", cmd));
+       }
+       DEBUG(DEBUG_WARNING,("end external trace: %s\n", cmd));
+       talloc_free(cmd);
+}
+
+/*
+  fill in a ctdb_address from an address string
+
+  the string is copied as-is into address->address (allocated under
+  mem_ctx); the port is the "ctdb" tcp service port if the services
+  database has one, and CTDB_PORT otherwise
+
+  returns 0 on success, -1 on out of memory
+*/
+int ctdb_parse_address(struct ctdb_context *ctdb,
+                      TALLOC_CTX *mem_ctx, const char *str,
+                      struct ctdb_address *address)
+{
+       struct servent *svc;
+
+       setservent(0);
+       svc = getservbyname("ctdb", "tcp");
+       endservent();
+
+       address->address = talloc_strdup(mem_ctx, str);
+       CTDB_NO_MEMORY(ctdb, address->address);
+
+       address->port = (svc != NULL) ? ntohs(svc->s_port) : CTDB_PORT;
+       return 0;
+}
+
+
+/*
+  check if two addresses are the same: both the address strings and
+  the ports have to match
+*/
+bool ctdb_same_address(struct ctdb_address *a1, struct ctdb_address *a2)
+{
+       if (strcmp(a1->address, a2->address) != 0) {
+               return false;
+       }
+       return a1->port == a2->port;
+}
+
+
+/*
+  hash function for mapping data to a VNN - this is the jenkins hash
+  taken from tdb
+*/
+uint32_t ctdb_hash(const TDB_DATA *key)
+{
+       uint32_t hash = tdb_jenkins_hash(discard_const(key));
+
+       return hash;
+}
+
+/*
+  a type checking variant of idr_find
+
+  returns the pointer stored under id if its talloc name matches type;
+  returns NULL if nothing is stored, or (after logging the mismatch
+  together with location) if the stored object has another type
+ */
+static void *_idr_find_type(struct idr_context *idp, int id, const char *type, const char *location)
+{
+       void *p = idr_find(idp, id);
+
+       if (p == NULL) {
+               return NULL;
+       }
+       if (talloc_check_name(p, type) != NULL) {
+               return p;
+       }
+
+       DEBUG(DEBUG_ERR,("%s idr_find_type expected type %s  but got %s\n",
+                location, type, talloc_get_name(p)));
+       return NULL;
+}
+
+/*
+  allocate a request id for state
+
+  ids are first requested above the id handed out last; if that fails
+  the search restarts via idr_get_new(). The result is remembered in
+  ctdb->lastid and returned.
+*/
+uint32_t ctdb_reqid_new(struct ctdb_context *ctdb, void *state)
+{
+       int new_id;
+
+       new_id = idr_get_new_above(ctdb->idr, state, ctdb->lastid+1, INT_MAX);
+       if (new_id < 0) {
+               DEBUG(DEBUG_DEBUG, ("Reqid wrap!\n"));
+               new_id = idr_get_new(ctdb->idr, state, INT_MAX);
+       }
+
+       ctdb->lastid = new_id;
+       return new_id;
+}
+
+/*
+  find the state stored under reqid, checking that it has the talloc
+  type given by type; a failed lookup is logged and NULL is returned
+*/
+void *_ctdb_reqid_find(struct ctdb_context *ctdb, uint32_t reqid, const char *type, const char *location)
+{
+       void *state = _idr_find_type(ctdb->idr, reqid, type, location);
+
+       if (state != NULL) {
+               return state;
+       }
+
+       DEBUG(DEBUG_WARNING, ("Could not find idr:%u\n",reqid));
+       return NULL;
+}
+
+
+/*
+  release a request id; removing an id that is not allocated is
+  logged but otherwise ignored
+*/
+void ctdb_reqid_remove(struct ctdb_context *ctdb, uint32_t reqid)
+{
+       if (idr_remove(ctdb->idr, reqid) != 0) {
+               DEBUG(DEBUG_ERR, ("Removing idr that does not exist\n"));
+       }
+}
+
+
+/*
+  form a ctdb_rec_data record from a key/data pair
+
+  the payload is laid out as: key, then the header (only if header is
+  not NULL), then data. With a header, datalen covers header + data.
+
+  returns the new record allocated under mem_ctx, or NULL on out of
+  memory
+ */
+struct ctdb_rec_data *ctdb_marshall_record(TALLOC_CTX *mem_ctx, uint32_t reqid,
+                                          TDB_DATA key,
+                                          struct ctdb_ltdb_header *header,
+                                          TDB_DATA data)
+{
+       const size_t hdr_len = (header != NULL) ? sizeof(*header) : 0;
+       struct ctdb_rec_data *rec;
+       size_t total;
+       uint8_t *out;
+
+       total = offsetof(struct ctdb_rec_data, data) + key.dsize +
+               data.dsize + hdr_len;
+       rec = (struct ctdb_rec_data *)talloc_size(mem_ctx, total);
+       if (rec == NULL) {
+               return NULL;
+       }
+
+       rec->length  = total;
+       rec->reqid   = reqid;
+       rec->keylen  = key.dsize;
+       rec->datalen = data.dsize + hdr_len;
+
+       /* copy the pieces one after the other */
+       out = (uint8_t *)&rec->data[0];
+       memcpy(out, key.dptr, key.dsize);
+       out += key.dsize;
+       if (header != NULL) {
+               memcpy(out, header, hdr_len);
+               out += hdr_len;
+       }
+       memcpy(out, data.dptr, data.dsize);
+
+       return rec;
+}
+
+
+/* helper function for marshalling multiple records
+
+   m      - buffer to append to, or NULL to start a new buffer for
+            db_id (db_id is ignored when m is given)
+   reqid, key, header, data - the record, see ctdb_marshall_record()
+
+   returns the (possibly moved) buffer with the record appended and
+   its count incremented. On failure NULL is returned and the buffer
+   passed in has been freed; the temporary record never outlives the
+   call.
+*/
+struct ctdb_marshall_buffer *ctdb_marshall_add(TALLOC_CTX *mem_ctx,
+                                              struct ctdb_marshall_buffer *m,
+                                              uint64_t db_id,
+                                              uint32_t reqid,
+                                              TDB_DATA key,
+                                              struct ctdb_ltdb_header *header,
+                                              TDB_DATA data)
+{
+       struct ctdb_rec_data *r;
+       size_t m_size, r_size;
+       struct ctdb_marshall_buffer *m2;
+
+       r = ctdb_marshall_record(mem_ctx, reqid, key, header, data);
+       if (r == NULL) {
+               talloc_free(m);
+               return NULL;
+       }
+
+       if (m == NULL) {
+               m = talloc_zero_size(mem_ctx, offsetof(struct ctdb_marshall_buffer, data));
+               if (m == NULL) {
+                       /* do not leak the marshalled record */
+                       talloc_free(r);
+                       return NULL;
+               }
+               m->db_id = db_id;
+       }
+
+       m_size = talloc_get_size(m);
+       r_size = talloc_get_size(r);
+
+       m2 = talloc_realloc_size(mem_ctx, m,  m_size + r_size);
+       if (m2 == NULL) {
+               talloc_free(m);
+               /* do not leak the marshalled record */
+               talloc_free(r);
+               return NULL;
+       }
+
+       /* append the record behind the existing contents */
+       memcpy(m_size + (uint8_t *)m2, r, r_size);
+
+       talloc_free(r);
+
+       m2->count++;
+
+       return m2;
+}
+
+/* we've finished marshalling: wrap the buffer in a data blob. The
+   blob points at the buffer itself (nothing is copied) and its size
+   is the talloc size of the buffer */
+TDB_DATA ctdb_marshall_finish(struct ctdb_marshall_buffer *m)
+{
+       TDB_DATA blob;
+
+       blob.dsize = talloc_get_size(m);
+       blob.dptr = (uint8_t *)m;
+
+       return blob;
+}
+
+/*
+   loop over a marshalling buffer
+
+     - pass r==NULL to start
+     - loop the number of times indicated by m->count
+
+   reqid, header, key and data are optional outputs (may be NULL).
+   key and data point into the buffer, nothing is copied. If header
+   is given, the record payload must start with an ltdb header: it is
+   copied to *header and data describes only what follows it.
+
+   returns the record that was decoded, or NULL if header was asked
+   for but the record is too short to contain one (no output is
+   written in that case)
+*/
+struct ctdb_rec_data *ctdb_marshall_loop_next(struct ctdb_marshall_buffer *m, struct ctdb_rec_data *r,
+                                             uint32_t *reqid,
+                                             struct ctdb_ltdb_header *header,
+                                             TDB_DATA *key, TDB_DATA *data)
+{
+       if (r == NULL) {
+               r = (struct ctdb_rec_data *)&m->data[0];
+       } else {
+               r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r);
+       }
+
+       if (header != NULL) {
+               /* validate first, so that the data length below can
+                  never underflow */
+               if (r->datalen < sizeof(*header)) {
+                       return NULL;
+               }
+               *header = *(struct ctdb_ltdb_header *)&r->data[r->keylen];
+       }
+
+       if (reqid != NULL) {
+               *reqid = r->reqid;
+       }
+
+       if (key != NULL) {
+               key->dptr   = &r->data[0];
+               key->dsize  = r->keylen;
+       }
+       if (data != NULL) {
+               data->dptr  = &r->data[r->keylen];
+               data->dsize = r->datalen;
+               if (header != NULL) {
+                       /* skip the header that was copied out above */
+                       data->dptr += sizeof(*header);
+                       data->dsize -= sizeof(*header);
+               }
+       }
+
+       return r;
+}
+
+
+#if HAVE_SCHED_H
+#include <sched.h>
+#endif
+
+#if HAVE_PROCINFO_H
+#include <procinfo.h>
+#endif
+
+/*
+  if possible, make this task real time
+
+  The current scheduling parameters are saved in
+  ctdb->saved_scheduler_param (allocated on first use) before the
+  switch. With _AIX_ defined the thread is moved to SCHED_RR, otherwise
+  the process is moved to SCHED_FIFO with priority 1. Failures are
+  only logged. If neither HAVE_THREAD_SETSCHED nor
+  HAVE_SCHED_SETSCHEDULER applies the function does nothing.
+ */
+void ctdb_set_scheduler(struct ctdb_context *ctdb)
+{
+#ifdef _AIX_
+#if HAVE_THREAD_SETSCHED
+       struct thrdentry64 te;
+       tid64_t ti;
+
+       ti = 0ULL;
+       if (getthrds64(getpid(), &te, sizeof(te), &ti, 1) != 1) {
+               DEBUG(DEBUG_ERR, ("Unable to get thread information\n"));
+               return;
+       }
+
+       /* NOTE(review): the talloc_size() result is not checked before
+          it is written through below */
+       if (ctdb->saved_scheduler_param == NULL) {
+               ctdb->saved_scheduler_param = talloc_size(ctdb, sizeof(te));
+       }
+       *(struct thrdentry64 *)ctdb->saved_scheduler_param = te;
+
+       if (thread_setsched(te.ti_tid, 0, SCHED_RR) == -1) {
+               DEBUG(DEBUG_ERR, ("Unable to set scheduler to SCHED_RR (%s)\n",
+                                 strerror(errno)));
+       } else {
+               DEBUG(DEBUG_NOTICE, ("Set scheduler to SCHED_RR\n"));
+       }
+#endif
+#else /* no AIX */
+#if HAVE_SCHED_SETSCHEDULER
+       struct sched_param p;
+       /* NOTE(review): the talloc_size() result is not checked; a NULL
+          pointer would be handed to sched_getparam() below */
+       if (ctdb->saved_scheduler_param == NULL) {
+               ctdb->saved_scheduler_param = talloc_size(ctdb, sizeof(p));
+       }
+
+       /* remember the current parameters for ctdb_restore_scheduler() */
+       if (sched_getparam(0, (struct sched_param *)ctdb->saved_scheduler_param) == -1) {
+               DEBUG(DEBUG_ERR,("Unable to get old scheduler params\n"));
+               return;
+       }
+
+       /* start from the saved parameters and only raise the priority */
+       p = *(struct sched_param *)ctdb->saved_scheduler_param;
+       p.sched_priority = 1;
+
+       if (sched_setscheduler(0, SCHED_FIFO, &p) == -1) {
+               DEBUG(DEBUG_CRIT,("Unable to set scheduler to SCHED_FIFO (%s)\n", 
+                        strerror(errno)));
+       } else {
+               DEBUG(DEBUG_NOTICE,("Set scheduler to SCHED_FIFO\n"));
+       }
+#endif
+#endif
+}
+
+/*
+  restore previous scheduler parameters
+
+  Uses the parameters kept in ctdb->saved_scheduler_param. Every
+  failure here, including missing saved parameters, ends in
+  ctdb_fatal().
+ */
+void ctdb_restore_scheduler(struct ctdb_context *ctdb)
+{
+#ifdef _AIX_
+#if HAVE_THREAD_SETSCHED
+       struct thrdentry64 te, *saved;
+       tid64_t ti;
+
+       ti = 0ULL;
+       if (getthrds64(getpid(), &te, sizeof(te), &ti, 1) != 1) {
+               ctdb_fatal(ctdb, "Unable to get thread information\n");
+       }
+       if (ctdb->saved_scheduler_param == NULL) {
+               ctdb_fatal(ctdb, "No saved scheduler parameters\n");
+       }
+       /* put back the saved priority and policy for the current thread */
+       saved = (struct thrdentry64 *)ctdb->saved_scheduler_param;
+       if (thread_setsched(te.ti_tid, saved->ti_pri, saved->ti_policy) == -1) {
+               ctdb_fatal(ctdb, "Unable to restore old scheduler parameters\n");
+       }
+#endif
+#else /* no AIX */
+#if HAVE_SCHED_SETSCHEDULER
+       if (ctdb->saved_scheduler_param == NULL) {
+               ctdb_fatal(ctdb, "No saved scheduler parameters\n");
+       }
+       /* the policy is always set to SCHED_OTHER here; only the saved
+          sched_param is reused */
+       if (sched_setscheduler(0, SCHED_OTHER, (struct sched_param *)ctdb->saved_scheduler_param) == -1) {
+               ctdb_fatal(ctdb, "Unable to restore old scheduler parameters\n");
+       }
+#endif
+#endif
+}
+
+/*
+  put a file descriptor into non-blocking mode by adding O_NONBLOCK to
+  its file status flags; failures are logged and otherwise ignored
+*/
+void set_nonblocking(int fd)
+{
+       int v;
+
+       v = fcntl(fd, F_GETFL, 0);
+       if (v == -1) {
+               /* do not feed the error value back in as a flag mask */
+               DEBUG(DEBUG_WARNING, ("Failed to get file status flags - %s\n",
+                                     strerror(errno)));
+               return;
+       }
+       if (fcntl(fd, F_SETFL, v | O_NONBLOCK) == -1) {
+               DEBUG(DEBUG_WARNING, ("Failed to set non_blocking on fd - %s\n",
+                                     strerror(errno)));
+       }
+}
+
+/*
+  mark a file descriptor close-on-exec
+
+  fd - descriptor to modify
+
+  The descriptor flags are read with F_GETFD and written back with
+  FD_CLOEXEC added.  If the flags cannot be read (e.g. fd is invalid)
+  the descriptor is left untouched instead of writing back a bogus
+  flag word derived from the -1 error return.
+ */
+void set_close_on_exec(int fd)
+{
+       int v;
+
+       v = fcntl(fd, F_GETFD, 0);
+       if (v == -1) {
+               /* nothing sensible to OR FD_CLOEXEC into */
+               return;
+       }
+       fcntl(fd, F_SETFD, v | FD_CLOEXEC);
+}
+
+
+/*
+  fill in an IPv4 sockaddr from a textual address and a port
+
+  s    - address string handed to inet_pton(AF_INET, ...)
+  port - port in host byte order, stored via htons()
+  sin  - output; family and port are written even when the address
+         fails to parse
+
+  Returns true if inet_pton() accepted the string, false otherwise.
+ */
+bool parse_ipv4(const char *s, unsigned port, struct sockaddr_in *sin)
+{
+       int rc;
+
+       sin->sin_port   = htons(port);
+       sin->sin_family = AF_INET;
+
+       rc = inet_pton(AF_INET, s, &sin->sin_addr);
+       if (rc == 1) {
+               return true;
+       }
+
+       DEBUG(DEBUG_ERR, (__location__ " Failed to translate %s into sin_addr\n", s));
+       return false;
+}
+
+/*
+  fill in the ip6 member of a ctdb_sock_addr from a textual address
+
+  s      - address string handed to inet_pton(AF_INET6, ...)
+  ifaces - optional interface name; only consulted for link-local
+           addresses, where it must name exactly one interface
+  port   - port in host byte order, stored via htons()
+  saddr  - output
+
+  Returns false if the address does not parse or a link-local address
+  is given together with a comma separated interface list.
+ */
+static bool parse_ipv6(const char *s, const char *ifaces, unsigned port, ctdb_sock_addr *saddr)
+{
+       saddr->ip6.sin6_scope_id = 0;
+       saddr->ip6.sin6_flowinfo = 0;
+       saddr->ip6.sin6_port     = htons(port);
+       saddr->ip6.sin6_family   = AF_INET6;
+
+       if (inet_pton(AF_INET6, s, &saddr->ip6.sin6_addr) != 1) {
+               DEBUG(DEBUG_ERR, (__location__ " Failed to translate %s into sin6_addr\n", s));
+               return false;
+       }
+
+       /* a scope id is only filled in for link-local addresses that
+          come with an interface hint */
+       if (ifaces == NULL || !IN6_IS_ADDR_LINKLOCAL(&saddr->ip6.sin6_addr)) {
+               return true;
+       }
+
+       if (strchr(ifaces, ',') != NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " Link local address %s "
+                                 "is specified for multiple ifaces %s\n",
+                                 s, ifaces));
+               return false;
+       }
+
+       saddr->ip6.sin6_scope_id = if_nametoindex(ifaces);
+       return true;
+}
+/*
+  parse an ip:port pair
+
+  addr  - string of the form "<address>:<port>"; the port is whatever
+          follows the last ':' in the string
+  saddr - output, filled in by parse_ip()
+
+  Returns true on success.  Returns false if the string cannot be
+  duplicated, contains no ':', has an empty, non-numeric or
+  out-of-range (> 65535) port, or if the address part does not parse.
+ */
+bool parse_ip_port(const char *addr, ctdb_sock_addr *saddr)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       char *s, *p;
+       unsigned long port;
+       char *endp = NULL;
+       bool ret;
+
+       s = talloc_strdup(tmp_ctx, addr);
+       if (s == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " Failed strdup()\n"));
+               talloc_free(tmp_ctx);
+               return false;
+       }
+
+       p = rindex(s, ':');
+       if (p == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " This addr: %s does not contain a port number\n", s));
+               talloc_free(tmp_ctx);
+               return false;
+       }
+
+       port = strtoul(p+1, &endp, 10);
+       if (endp == p+1) {
+               /* strtoul() consumed nothing: "addr:" or "addr:xyz" */
+               DEBUG(DEBUG_ERR, (__location__ " Missing port number in %s\n", s));
+               talloc_free(tmp_ctx);
+               return false;
+       }
+       if (endp == NULL || *endp != 0) {
+               /* trailing garbage */
+               DEBUG(DEBUG_ERR, (__location__ " Trailing garbage after the port in %s\n", s));
+               talloc_free(tmp_ctx);
+               return false;
+       }
+       if (port > 65535) {
+               /* also catches negative input and strtoul() overflow,
+                  both of which yield very large values */
+               DEBUG(DEBUG_ERR, (__location__ " Port number out of range in %s\n", s));
+               talloc_free(tmp_ctx);
+               return false;
+       }
+       /* cut the string at the ':' so only the address remains */
+       *p = 0;
+
+
+       /* now is this a ipv4 or ipv6 address ?*/
+       ret = parse_ip(s, NULL, (unsigned)port, saddr);
+
+       talloc_free(tmp_ctx);
+       return ret;
+}
+
+/*
+  parse an ip
+
+  addr   - textual IPv4 or IPv6 address
+  ifaces - passed through to the IPv6 parser, unused for IPv4
+  port   - port in host byte order
+  saddr  - output; zeroed before anything else happens
+
+  The string is treated as IPv6 if it contains a ':' and as IPv4
+  otherwise.  Returns the result of the family specific parser.
+ */
+bool parse_ip(const char *addr, const char *ifaces, unsigned port, ctdb_sock_addr *saddr)
+{
+       ZERO_STRUCTP(saddr); /* valgrind :-) */
+
+       if (index(addr, ':') != NULL) {
+               return parse_ipv6(addr, ifaces, port, saddr);
+       }
+
+       return parse_ipv4(addr, port, &saddr->ip);
+}
+
+/*
+  parse an ip/mask pair
+
+  str    - string of the form "<address>/<prefix length>"
+  ifaces - passed through to parse_ip()
+  addr   - output address; zeroed on entry
+  mask   - output prefix length
+
+  Returns true on success.  Returns false if the string cannot be
+  duplicated, contains no '/', has an empty or non-numeric mask, if the
+  address part does not parse, or if the mask is larger than the
+  address family allows (32 for AF_INET, 128 otherwise).
+ */
+bool parse_ip_mask(const char *str, const char *ifaces, ctdb_sock_addr *addr, unsigned *mask)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       char *s, *p;
+       char *endp = NULL;
+       unsigned long m;
+       bool ret;
+
+       ZERO_STRUCT(*addr);
+       s = talloc_strdup(tmp_ctx, str);
+       if (s == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " Failed strdup()\n"));
+               talloc_free(tmp_ctx);
+               return false;
+       }
+
+       p = rindex(s, '/');
+       if (p == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " This addr: %s does not contain a mask\n", s));
+               talloc_free(tmp_ctx);
+               return false;
+       }
+
+       m = strtoul(p+1, &endp, 10);
+       if (endp == p+1) {
+               /* strtoul() consumed nothing: "addr/" or "addr/xyz" */
+               DEBUG(DEBUG_ERR, (__location__ " Missing mask in %s\n", s));
+               talloc_free(tmp_ctx);
+               return false;
+       }
+       if (endp == NULL || *endp != 0) {
+               /* trailing garbage */
+               DEBUG(DEBUG_ERR, (__location__ " Trailing garbage after the mask in %s\n", s));
+               talloc_free(tmp_ctx);
+               return false;
+       }
+       if (m > 128) {
+               /* larger than any supported family allows; also catches
+                  negative input and strtoul() overflow */
+               DEBUG(DEBUG_ERR, (__location__ " Mask out of range in %s\n", s));
+               talloc_free(tmp_ctx);
+               return false;
+       }
+       *mask = (unsigned)m;
+       /* cut the string at the '/' so only the address remains */
+       *p = 0;
+
+
+       /* now is this a ipv4 or ipv6 address ?*/
+       ret = parse_ip(s, ifaces, 0, addr);
+
+       /* the family is only known once the address has been parsed */
+       if (ret && addr->sa.sa_family == AF_INET && m > 32) {
+               DEBUG(DEBUG_ERR, (__location__ " Mask out of range in %s\n", str));
+               ret = false;
+       }
+
+       talloc_free(tmp_ctx);
+       return ret;
+}
+
+/*
+   Canonicalize a ctdb_sock_addr structure.
+
+   ip  - input address
+   cip - output; starts out as a byte-for-byte copy of *ip
+
+   If *ip is an AF_INET6 address whose first 12 address bytes are
+   00 .. 00 ff ff, *cip is rewritten as an AF_INET address built from
+   the last 4 address bytes, keeping the port.  Anything else is left
+   as the plain copy.
+*/
+void ctdb_canonicalize_ip(const ctdb_sock_addr *ip, ctdb_sock_addr *cip)
+{
+       char prefix[12] = { 0,0,0,0,0,0,0,0,0,0,0xff,0xff };
+
+       memcpy(cip, ip, sizeof (*cip));
+
+       if (ip->sa.sa_family != AF_INET6) {
+               return;
+       }
+       if (memcmp(&ip->ip6.sin6_addr, prefix, 12) != 0) {
+               return;
+       }
+
+       memset(cip, 0, sizeof(*cip));
+#ifdef HAVE_SOCK_SIN_LEN
+       /* NOTE(review): this is the size of the whole ctdb_sock_addr,
+          not of its ip member - confirm that is intended */
+       cip->ip.sin_len = sizeof(*cip);
+#endif
+       cip->ip.sin_family = AF_INET;
+       cip->ip.sin_port   = ip->ip6.sin6_port;
+       memcpy(&cip->ip.sin_addr, &ip->ip6.sin6_addr.s6_addr[12], 4);
+}
+
+/*
+  compare the address part of two ctdb_sock_addr structures
+
+  Both inputs are canonicalized first, then compared by family and
+  address bytes; ports are not looked at.  Families other than AF_INET
+  and AF_INET6 are logged and compare as different.
+ */
+bool ctdb_same_ip(const ctdb_sock_addr *tip1, const ctdb_sock_addr *tip2)
+{
+       ctdb_sock_addr ip1, ip2;
+
+       ctdb_canonicalize_ip(tip1, &ip1);
+       ctdb_canonicalize_ip(tip2, &ip2);
+
+       if (ip1.sa.sa_family != ip2.sa.sa_family) {
+               return false;
+       }
+
+       if (ip1.sa.sa_family == AF_INET) {
+               return ip1.ip.sin_addr.s_addr == ip2.ip.sin_addr.s_addr;
+       }
+
+       if (ip1.sa.sa_family == AF_INET6) {
+               return memcmp(&ip1.ip6.sin6_addr.s6_addr[0],
+                             &ip2.ip6.sin6_addr.s6_addr[0],
+                             16) == 0;
+       }
+
+       DEBUG(DEBUG_ERR, (__location__ " CRITICAL Can not compare sockaddr structures of type %u\n", ip1.sa.sa_family));
+       return false;
+}
+
+/*
+  compare two ctdb_sock_addr structures
+
+  True only if ctdb_same_ip() considers the addresses equal and the
+  ip.sin_port fields of the two inputs match.
+ */
+bool ctdb_same_sockaddr(const ctdb_sock_addr *ip1, const ctdb_sock_addr *ip2)
+{
+       if (!ctdb_same_ip(ip1, ip2)) {
+               return false;
+       }
+
+       return ip1->ip.sin_port == ip2->ip.sin_port;
+}
+
+/*
+  format the address part of a ctdb_sock_addr as text
+
+  Returns a pointer to a function-local static buffer, so the result
+  is overwritten by the next call.  For an unknown address family an
+  error is logged, ctdb_external_trace() is called and the buffer is
+  returned with whatever it held before.
+ */
+char *ctdb_addr_to_str(ctdb_sock_addr *addr)
+{
+       static char cip[128] = "";
+
+       if (addr->sa.sa_family == AF_INET) {
+               inet_ntop(addr->ip.sin_family, &addr->ip.sin_addr, cip, sizeof(cip));
+       } else if (addr->sa.sa_family == AF_INET6) {
+               inet_ntop(addr->ip6.sin6_family, &addr->ip6.sin6_addr, cip, sizeof(cip));
+       } else {
+               DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family %u\n", addr->sa.sa_family));
+               ctdb_external_trace();
+       }
+
+       return cip;
+}
+
+/*
+  return the port of a ctdb_sock_addr in host byte order
+
+  An unknown address family is logged and yields 0.
+ */
+unsigned ctdb_addr_to_port(ctdb_sock_addr *addr)
+{
+       if (addr->sa.sa_family == AF_INET) {
+               return ntohs(addr->ip.sin_port);
+       }
+
+       if (addr->sa.sa_family == AF_INET6) {
+               return ntohs(addr->ip6.sin6_port);
+       }
+
+       DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family %u\n", addr->sa.sa_family));
+       return 0;
+}
+
+/*
+  add one signal to the set of blocked signals (sigprocmask SIG_BLOCK)
+ */
+void ctdb_block_signal(int signum)
+{
+       sigset_t only_this;
+
+       sigemptyset(&only_this);
+       sigaddset(&only_this, signum);
+
+       sigprocmask(SIG_BLOCK, &only_this, NULL);
+}
+
+/*
+  remove one signal from the set of blocked signals
+  (sigprocmask SIG_UNBLOCK)
+ */
+void ctdb_unblock_signal(int signum)
+{
+       sigset_t only_this;
+
+       sigemptyset(&only_this);
+       sigaddset(&only_this, signum);
+
+       sigprocmask(SIG_UNBLOCK, &only_this, NULL);
+}
+
+/*
+  Table pairing each DEBUG_* level constant with its textual name.
+  The last entry has a NULL description and acts as the terminator the
+  lookup loops stop at.
+ */
+struct debug_levels debug_levels[] = {
+       {DEBUG_EMERG,   "EMERG"},
+       {DEBUG_ALERT,   "ALERT"},
+       {DEBUG_CRIT,    "CRIT"},
+       {DEBUG_ERR,     "ERR"},
+       {DEBUG_WARNING, "WARNING"},
+       {DEBUG_NOTICE,  "NOTICE"},
+       {DEBUG_INFO,    "INFO"},
+       {DEBUG_DEBUG,   "DEBUG"},
+       {0, NULL}       /* terminator */
+};
+
+/*
+  map a numeric debug level to its name
+
+  Returns the description from debug_levels[] whose level equals the
+  argument, or "Unknown" if there is none.
+ */
+const char *get_debug_by_level(int32_t level)
+{
+       int idx = 0;
+
+       while (debug_levels[idx].description != NULL) {
+               if (debug_levels[idx].level == level) {
+                       return debug_levels[idx].description;
+               }
+               idx++;
+       }
+
+       return "Unknown";
+}
+
+/*
+  map a debug level name to its numeric value
+
+  The comparison against the names in debug_levels[] is case
+  insensitive.  An unrecognised name yields DEBUG_ERR.
+ */
+int32_t get_debug_by_desc(const char *desc)
+{
+       int idx = 0;
+
+       while (debug_levels[idx].description != NULL) {
+               if (strcasecmp(debug_levels[idx].description, desc) == 0) {
+                       return debug_levels[idx].level;
+               }
+               idx++;
+       }
+
+       return DEBUG_ERR;
+}
+
+/*
+  lock the currently mapped memory of the process into RAM
+
+  Does nothing unless HAVE_MLOCKALL is defined.  Also does nothing when
+  ctdb->valgrinding is set or when the process is not running as root.
+  A failing mlockall() is only logged as a warning.
+
+  we don't lock future pages here; it would increase the chance that
+  we'd fail to mmap later on.
+ */
+void ctdb_lockdown_memory(struct ctdb_context *ctdb)
+{
+#ifdef HAVE_MLOCKALL
+       /* Extra stack, please! */
+       char dummy[10000];
+       memset(dummy, 0, sizeof(dummy));
+
+       if (ctdb->valgrinding) {
+               return;
+       }
+
+       /* TODO: Add a command line option to disable memory lockdown.
+        *       This can be a performance issue on AIX since fork() copies
+        *       all locked memory pages.
+        */
+
+       /* Ignore when running in local daemons mode */
+       if (getuid() != 0) {
+               return;
+       }
+
+       /* Avoid compiler optimizing out dummy. */
+       mlock(dummy, sizeof(dummy));
+       if (mlockall(MCL_CURRENT) != 0) {
+               DEBUG(DEBUG_WARNING,("Failed to lockdown memory: %s\n",
+                                    strerror(errno)));
+       }
+#endif
+}
+
+/*
+  Names of the event script calls.
+  NOTE(review): presumably indexed by the event call enum declared
+  elsewhere, so the order here would have to match it - confirm.
+ */
+const char *ctdb_eventscript_call_names[] = {
+       "init",
+       "setup",
+       "startup",
+       "startrecovery",
+       "recovered",
+       "takeip",
+       "releaseip",
+       "stopped",
+       "monitor",
+       "status",
+       "shutdown",
+       "reload",
+       "updateip",
+       "ipreallocated"
+};
+
+/* Runstate handling */
+
+/*
+  Table pairing each runstate with its label.  The UNKNOWN entry comes
+  first because runstate_to_string() falls back to entry 0; the entry
+  with a NULL label terminates the table.
+ */
+static struct {
+       enum ctdb_runstate runstate;
+       const char * label;
+} runstate_map[] = {
+       { CTDB_RUNSTATE_UNKNOWN, "UNKNOWN" },
+       { CTDB_RUNSTATE_INIT, "INIT" },
+       { CTDB_RUNSTATE_SETUP, "SETUP" },
+       { CTDB_RUNSTATE_FIRST_RECOVERY, "FIRST_RECOVERY" },
+       { CTDB_RUNSTATE_STARTUP, "STARTUP" },
+       { CTDB_RUNSTATE_RUNNING, "RUNNING" },
+       { CTDB_RUNSTATE_SHUTDOWN, "SHUTDOWN" },
+       { -1, NULL },   /* terminator */
+};
+
+/*
+  map a runstate to its label
+
+  A value that is not in runstate_map[] yields the label of the first
+  table entry.
+ */
+const char *runstate_to_string(enum ctdb_runstate runstate)
+{
+       int idx = 0;
+
+       while (runstate_map[idx].label != NULL) {
+               if (runstate_map[idx].runstate == runstate) {
+                       return runstate_map[idx].label;
+               }
+               idx++;
+       }
+
+       return runstate_map[0].label;
+}
+
+/*
+  map a label to its runstate
+
+  The comparison is case insensitive.  An unrecognised label yields
+  CTDB_RUNSTATE_UNKNOWN.
+ */
+enum ctdb_runstate runstate_from_string(const char *label)
+{
+       int idx = 0;
+
+       while (runstate_map[idx].label != NULL) {
+               if (!strcasecmp(runstate_map[idx].label, label)) {
+                       return runstate_map[idx].runstate;
+               }
+               idx++;
+       }
+
+       return CTDB_RUNSTATE_UNKNOWN;
+}
+
+/*
+  move the daemon to a new runstate
+
+  The new state has to be strictly greater than the current one;
+  otherwise ctdb_fatal() is called.  The change is logged at NOTICE
+  level before it is stored.
+ */
+void ctdb_set_runstate(struct ctdb_context *ctdb, enum ctdb_runstate runstate)
+{
+       enum ctdb_runstate current = ctdb->runstate;
+
+       if (current >= runstate) {
+               ctdb_fatal(ctdb, "runstate must always increase");
+       }
+
+       DEBUG(DEBUG_NOTICE,("Set runstate to %s (%d)\n",
+                           runstate_to_string(runstate), runstate));
+
+       ctdb->runstate = runstate;
+}
+
+/*
+  create a directory via mkdir_p() or terminate the process
+
+  ctdb - not used in this function
+  dir  - directory to create
+  mode - passed through to mkdir_p()
+
+  If mkdir_p() returns non-zero an ALERT is logged and the process
+  exits with status 1.
+ */
+void ctdb_mkdir_p_or_die(struct ctdb_context *ctdb, const char *dir, int mode)
+{
+       int ret;
+
+       ret = mkdir_p(dir, mode);
+       if (ret != 0) {
+               /* NOTE(review): strerror(ret) is only meaningful if
+                  mkdir_p() returns an errno value rather than -1 with
+                  errno set - mkdir_p() is not visible here, confirm */
+               DEBUG(DEBUG_ALERT,
+                     ("ctdb exiting with error: "
+                      "failed to create directory \"%s\" (%s)\n",
+                      dir, strerror(ret)));
+               exit(1);
+       }
+}
diff --git a/ctdb/common/rb_tree.c b/ctdb/common/rb_tree.c
new file mode 100644 (file)
index 0000000..6b131bc
--- /dev/null
@@ -0,0 +1,1095 @@
+/* 
+   a talloc based red-black tree
+
+   Copyright (C) Ronnie Sahlberg  2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "rb_tree.h"
+
+/*
+  Abort on allocation failure: if p evaluates to false, log the
+  stringified expression together with the source location at CRIT
+  level and exit with status 10.  Because the expression text ends up
+  in the log message, the name of the variable passed in is visible to
+  whoever reads the log.
+ */
+#define NO_MEMORY_FATAL(p) do { if (!(p)) { \
+          DEBUG(DEBUG_CRIT,("Out of memory for %s at %s\n", #p, __location__)); \
+         exit(10); \
+         }} while (0)
+
+
+/*
+  Recursively walk the subtree rooted at node: clear each node's
+  talloc destructor and re-parent the node onto mem_ctx.  Children are
+  handled before the node itself is moved.
+ */
+static void
+tree_destructor_traverse_node(TALLOC_CTX *mem_ctx, trbt_node_t *node)
+{
+       /* the per-node destructor must not run for these nodes */
+       talloc_set_destructor(node, NULL);
+
+       if (node->left != NULL) {
+               tree_destructor_traverse_node(mem_ctx, node->left);
+       }
+
+       if (node->right != NULL) {
+               tree_destructor_traverse_node(mem_ctx, node->right);
+       }
+
+       talloc_steal(mem_ctx, node);
+}
+
+/*
+  destroy a tree and remove all its nodes
+
+  Installed as the talloc destructor of the tree.  Always returns 0.
+ */
+static int tree_destructor(trbt_tree_t *tree)
+{
+       TALLOC_CTX *tmp_ctx;
+
+       /* nothing to do without a tree or for an empty tree */
+       if (tree == NULL || tree->root == NULL) {
+               return 0;
+       }
+
+       /* Move every node onto a throw-away context with its destructor
+          cleared, then free that context in one go.  The regular node
+          destructor is avoided on purpose: it would unlink the nodes
+          one at a time, which is pointless when the whole tree goes
+          away and nobody cares whether it stays consistent or
+          balanced in the meantime.
+       */
+       tmp_ctx = talloc_new(NULL);
+       tree_destructor_traverse_node(tmp_ctx, tree->root);
+       talloc_free(tmp_ctx);
+
+       return 0;
+}
+
+
+/* create a red black tree
+
+   memctx - talloc parent of the new tree
+   flags  - stored in the tree unchanged
+
+   Returns the new, empty tree.  Allocation failure is fatal.
+ */
+trbt_tree_t *
+trbt_create(TALLOC_CTX *memctx, uint32_t flags)
+{
+       trbt_tree_t *tree;
+
+       tree = talloc_zero(memctx, trbt_tree_t);
+       NO_MEMORY_FATAL(tree);
+
+       tree->flags = flags;
+
+       /* Freeing the tree has to release all of its nodes, which is
+          what the destructor takes care of.  Only the nodes are
+          released that way, never the data stored in the tree.
+       */
+       talloc_set_destructor(tree, tree_destructor);
+
+       return tree;
+}
+
+/* return the parent of node (NULL for a node without parent);
+   node itself must not be NULL */
+static inline trbt_node_t *
+trbt_parent(trbt_node_t *node)
+{
+       return node->parent;
+}
+
+/* return the parent of node's parent, or NULL if node has no parent */
+static inline trbt_node_t *
+trbt_grandparent(trbt_node_t *node)
+{
+       trbt_node_t *up = trbt_parent(node);
+
+       return (up != NULL) ? up->parent : NULL;
+}
+
+/* return the other child of node's grandparent, i.e. the sibling of
+   node's parent; NULL if node has no parent or no grandparent */
+static inline trbt_node_t *
+trbt_uncle(trbt_node_t *node)
+{
+       trbt_node_t *up, *upup;
+
+       up = trbt_parent(node);
+       if (up == NULL) {
+               return NULL;
+       }
+
+       upup = trbt_parent(up);
+       if (upup == NULL) {
+               return NULL;
+       }
+
+       return (upup->left == up) ? upup->right : upup->left;
+}
+
+
+static inline void trbt_insert_case1(trbt_tree_t *tree, trbt_node_t *node);
+static inline void trbt_insert_case2(trbt_tree_t *tree, trbt_node_t *node);
+
+/*
+  Rotate left around node: node's right child takes node's place
+  (under node's parent, or as tree root) and node becomes the left
+  child of that former right child.  The former right child's left
+  subtree becomes node's new right subtree.
+
+  node->right is dereferenced unconditionally, so it must be non-NULL.
+ */
+static inline void
+trbt_rotate_left(trbt_node_t *node)
+{
+       trbt_tree_t *tree = node->tree;
+
+       /* hook node->right into whatever pointed at node */
+       if(node->parent){
+               if(node->parent->left==node){
+                       node->parent->left=node->right;
+               } else {
+                       node->parent->right=node->right;
+               }
+       } else {
+               tree->root=node->right;
+       }
+       node->right->parent=node->parent;
+       /* from here on node->parent is the former right child */
+       node->parent=node->right;
+       node->right=node->right->left;
+       if(node->right){
+               node->right->parent=node;
+       }
+       node->parent->left=node;
+}
+
+/*
+  Rotate right around node: node's left child takes node's place
+  (under node's parent, or as tree root) and node becomes the right
+  child of that former left child.  The former left child's right
+  subtree becomes node's new left subtree.
+
+  node->left is dereferenced unconditionally, so it must be non-NULL.
+ */
+static inline void
+trbt_rotate_right(trbt_node_t *node)
+{
+       trbt_tree_t *tree = node->tree;
+
+       /* hook node->left into whatever pointed at node */
+       if(node->parent){
+               if(node->parent->left==node){
+                       node->parent->left=node->left;
+               } else {
+                       node->parent->right=node->left;
+               }
+       } else {
+               tree->root=node->left;
+       }
+       node->left->parent=node->parent;
+       /* from here on node->parent is the former left child */
+       node->parent=node->left;
+       node->left=node->left->right;
+       if(node->left){
+               node->left->parent=node;
+       }
+       node->parent->right=node;
+}
+
+/* colour of a node; a NULL node counts as black */
+static inline int trbt_get_color(trbt_node_t *node)
+{
+       return (node != NULL) ? node->rb_color : TRBT_BLACK;
+}
+/* colour of node's left child; black if node or that child is NULL */
+static inline int trbt_get_color_left(trbt_node_t *node)
+{
+       if (node==NULL || node->left==NULL) {
+               return TRBT_BLACK;
+       }
+
+       return node->left->rb_color;
+}
+/* colour of node's right child; black if node or that child is NULL */
+static inline int trbt_get_color_right(trbt_node_t *node)
+{
+       if (node==NULL || node->right==NULL) {
+               return TRBT_BLACK;
+       }
+
+       return node->right->rb_color;
+}
+/* set the colour of a node; setting a NULL node to black is a nop.
+   Any other colour on a NULL node still dereferences node. */
+static inline void trbt_set_color(trbt_node_t *node, int color)
+{
+       if ( (node!=NULL) || (color!=TRBT_BLACK) ) {
+               node->rb_color = color;
+       }
+}
+/* set the colour of node's left child; a nop when that child does not
+   exist and the colour is black.  Any other colour on a missing child
+   still dereferences it. */
+static inline void trbt_set_color_left(trbt_node_t *node, int color)
+{
+       int missing = (node==NULL) || (node->left==NULL);
+
+       if (missing && (color==TRBT_BLACK)) {
+               return;
+       }
+
+       node->left->rb_color = color;
+}
+/* set the colour of node's right child; a nop when that child does
+   not exist and the colour is black.  Any other colour on a missing
+   child still dereferences it. */
+static inline void trbt_set_color_right(trbt_node_t *node, int color)
+{
+       int missing = (node==NULL) || (node->right==NULL);
+
+       if (missing && (color==TRBT_BLACK)) {
+               return;
+       }
+
+       node->right->rb_color = color;
+}
+
+/*
+  Insert rebalancing, last step: colour the parent black and the
+  grandparent red, then rotate the grandparent - to the right when
+  node and parent are both left children, to the left otherwise.
+
+  Parent and grandparent are dereferenced unconditionally.  The tree
+  argument is not used here.
+ */
+static inline void
+trbt_insert_case5(trbt_tree_t *tree, trbt_node_t *node)
+{
+       trbt_node_t *grandparent;
+       trbt_node_t *parent;
+
+       parent=trbt_parent(node);
+       grandparent=trbt_parent(parent);
+       parent->rb_color=TRBT_BLACK;
+       grandparent->rb_color=TRBT_RED;
+       if( (node==parent->left) && (parent==grandparent->left) ){
+               trbt_rotate_right(grandparent);
+       } else {
+               trbt_rotate_left(grandparent);
+       }
+}
+
+/*
+  Insert rebalancing: if node and its parent hang on opposite sides
+  (right child of a left child, or left child of a right child),
+  rotate the parent so both end up on the same side and continue with
+  the former parent, which is now the lower of the two.  Then hand
+  over to case 5.
+
+  Returns without doing anything if there is no grandparent.
+ */
+static inline void
+trbt_insert_case4(trbt_tree_t *tree, trbt_node_t *node)
+{
+       trbt_node_t *grandparent;
+       trbt_node_t *parent;
+
+       parent=trbt_parent(node);
+       grandparent=trbt_parent(parent);
+       if(!grandparent){
+               return;
+       }
+       if( (node==parent->right) && (parent==grandparent->left) ){
+               trbt_rotate_left(parent);
+               /* after the rotation the old parent is node's left child */
+               node=node->left;
+       } else if( (node==parent->left) && (parent==grandparent->right) ){
+               trbt_rotate_right(parent);
+               /* after the rotation the old parent is node's right child */
+               node=node->right;
+       }
+       trbt_insert_case5(tree, node);
+}
+
+/*
+  Insert rebalancing: if the uncle exists and is red, colour parent
+  and uncle black and the grandparent red, then restart the checks at
+  case 1 with the grandparent as the node.  Otherwise continue with
+  case 4.
+ */
+static inline void
+trbt_insert_case3(trbt_tree_t *tree, trbt_node_t *node)
+{
+       trbt_node_t *grandparent;
+       trbt_node_t *parent;
+       trbt_node_t *uncle;
+
+       uncle=trbt_uncle(node);
+       if(uncle && (uncle->rb_color==TRBT_RED)){
+               parent=trbt_parent(node);
+               parent->rb_color=TRBT_BLACK;
+               uncle->rb_color=TRBT_BLACK;
+               /* a non-NULL uncle implies a grandparent exists */
+               grandparent=trbt_grandparent(node);
+               grandparent->rb_color=TRBT_RED;
+               trbt_insert_case1(tree, grandparent);
+       } else {
+               trbt_insert_case4(tree, node);
+       }
+}
+
+/*
+  Insert rebalancing: nothing to do below a black parent, otherwise
+  continue with case 3.  The parent is dereferenced without a check,
+  so the caller has to make sure it exists.
+ */
+static inline void
+trbt_insert_case2(trbt_tree_t *tree, trbt_node_t *node)
+{
+       /* parent is always non-NULL here */
+       if (trbt_parent(node)->rb_color != TRBT_BLACK) {
+               trbt_insert_case3(tree, node);
+       }
+}
+
+/*
+  Insert rebalancing entry point: a node without a parent is simply
+  coloured black; any other node goes on to case 2.
+ */
+static inline void
+trbt_insert_case1(trbt_tree_t *tree, trbt_node_t *node)
+{
+       if (trbt_parent(node) != NULL) {
+               trbt_insert_case2(tree, node);
+               return;
+       }
+
+       node->rb_color=TRBT_BLACK;
+}
+
+/* return the other child of node's parent (possibly NULL), or NULL if
+   node has no parent */
+static inline trbt_node_t *
+trbt_sibling(trbt_node_t *node)
+{
+       trbt_node_t *up = trbt_parent(node);
+
+       if (up == NULL) {
+               return NULL;
+       }
+
+       return (up->left == node) ? up->right : up->left;
+}
+
+/*
+  Delete rebalancing, last step: the sibling takes over the parent's
+  colour, the parent becomes black, the sibling's child on the side
+  away from node becomes black, and the parent is rotated towards
+  node.
+
+  The parent is dereferenced unconditionally.
+ */
+static inline void
+trbt_delete_case6(trbt_node_t *node)
+{
+       trbt_node_t *sibling, *parent;
+
+       sibling = trbt_sibling(node);
+       parent  = trbt_parent(node);
+
+       trbt_set_color(sibling, parent->rb_color);
+       trbt_set_color(parent, TRBT_BLACK);
+       if (node == parent->left) {
+               trbt_set_color_right(sibling, TRBT_BLACK);
+               trbt_rotate_left(parent);
+       } else {
+               trbt_set_color_left(sibling, TRBT_BLACK);
+               trbt_rotate_right(parent);
+       }
+}
+
+
+/*
+  Delete rebalancing: when the sibling is black and only its child on
+  the side nearer to node is red, colour the sibling red and that
+  child black and rotate the sibling away from node.  In every case
+  processing then continues with case 6.
+ */
+static inline void
+trbt_delete_case5(trbt_node_t *node)
+{
+       trbt_node_t *parent, *sibling;
+
+       parent = trbt_parent(node);
+       sibling = trbt_sibling(node);
+       /* node is a left child: the near child of the sibling is its left one */
+       if ( (node == parent->left)
+          &&(trbt_get_color(sibling)        == TRBT_BLACK)
+          &&(trbt_get_color_left(sibling)   == TRBT_RED)
+          &&(trbt_get_color_right(sibling)  == TRBT_BLACK) ){
+               trbt_set_color(sibling, TRBT_RED);
+               trbt_set_color_left(sibling, TRBT_BLACK);
+               trbt_rotate_right(sibling);
+               trbt_delete_case6(node);
+               return;
+       } 
+       /* mirror image: node is a right child */
+       if ( (node == parent->right)
+          &&(trbt_get_color(sibling)        == TRBT_BLACK)
+          &&(trbt_get_color_right(sibling)  == TRBT_RED)
+          &&(trbt_get_color_left(sibling)   == TRBT_BLACK) ){
+               trbt_set_color(sibling, TRBT_RED);
+               trbt_set_color_right(sibling, TRBT_BLACK);
+               trbt_rotate_left(sibling);
+               trbt_delete_case6(node);
+               return;
+       }
+
+       trbt_delete_case6(node);
+}
+
+/*
+  Delete rebalancing: with a red parent and a black sibling whose
+  children are both black, swapping the colours of parent and sibling
+  (sibling red, parent black) finishes the job.  Otherwise continue
+  with case 5.
+ */
+static inline void
+trbt_delete_case4(trbt_node_t *node)
+{
+       trbt_node_t *sibling;
+
+       sibling = trbt_sibling(node);
+       if ( (trbt_get_color(node->parent)   == TRBT_RED)
+          &&(trbt_get_color(sibling)        == TRBT_BLACK)
+          &&(trbt_get_color_left(sibling)   == TRBT_BLACK)
+          &&(trbt_get_color_right(sibling)  == TRBT_BLACK) ){
+               trbt_set_color(sibling, TRBT_RED);
+               trbt_set_color(node->parent, TRBT_BLACK);
+       } else {
+               trbt_delete_case5(node);
+       }
+}
+
+static void trbt_delete_case1(trbt_node_t *node);
+
+/*
+  Delete rebalancing: when parent, sibling and both of the sibling's
+  children are black, colour the sibling red and restart the checks at
+  case 1 with the parent as the node.  Otherwise continue with case 4.
+ */
+static inline void
+trbt_delete_case3(trbt_node_t *node)
+{
+       trbt_node_t *sibling;
+
+       sibling = trbt_sibling(node);
+       if ( (trbt_get_color(node->parent)   == TRBT_BLACK)
+          &&(trbt_get_color(sibling)        == TRBT_BLACK)
+          &&(trbt_get_color_left(sibling)   == TRBT_BLACK)
+          &&(trbt_get_color_right(sibling)  == TRBT_BLACK) ){
+               trbt_set_color(sibling, TRBT_RED);
+               trbt_delete_case1(node->parent);
+       } else {
+               trbt_delete_case4(node);
+       }
+}
+       
+/*
+  Delete rebalancing: a red sibling is turned black, the parent red,
+  and the parent is rotated towards node.  In every case processing
+  then continues with case 3.
+ */
+static inline void
+trbt_delete_case2(trbt_node_t *node)
+{
+       trbt_node_t *sibling;
+
+       sibling = trbt_sibling(node);
+       if (trbt_get_color(sibling) == TRBT_RED) {
+               trbt_set_color(node->parent, TRBT_RED);
+               trbt_set_color(sibling, TRBT_BLACK);
+               if (node == node->parent->left) {
+                       trbt_rotate_left(node->parent);
+               } else {
+                       trbt_rotate_right(node->parent);
+               }
+       }
+       trbt_delete_case3(node);
+}      
+
+/*
+  Delete rebalancing entry point: a node without a parent needs no
+  further work; any other node goes on to case 2.
+ */
+static void
+trbt_delete_case1(trbt_node_t *node)
+{
+       if (node->parent != NULL) {
+               trbt_delete_case2(node);
+       }
+}
+
+/*
+  Unlink node from its tree and rebalance.
+
+  node            - the node to remove
+  from_destructor - true when called from node_destructor(); in that
+                    case the node is not passed to talloc_free() here
+
+  If the tree has the TRBT_AUTOFREE flag and ends up without a root,
+  the tree itself is freed as well.
+ */
+static void
+delete_node(trbt_node_t *node, bool from_destructor)
+{
+       trbt_node_t *parent, *child, dc;
+       trbt_node_t *temp = NULL;
+
+       /* If this node has two child nodes, just copy the content
+          of the next smaller node into this node and delete that
+          predecessor instead.
+          The predecessor is guaranteed to have at most one child
+          node since its right arm must be NULL
+          (It must be NULL since we are its successor and we are above
+           it in the tree)
+        */
+       if (node->left != NULL && node->right != NULL) {
+               /* This node has two children, just copy the data */
+               /* find the predecessor: rightmost node of the left subtree */
+               temp = node->left;
+
+               while (temp->right != NULL) {
+                       temp = temp->right;
+               }
+
+               /* move the predecessor's data and key into the node to
+                  be deleted.
+                */
+               node->key32 = temp->key32;
+               node->data  = temp->data;
+               /* now we let node hang off the new data */
+               talloc_steal(node->data, node);
+
+               temp->data  = NULL;
+               temp->key32 = -1;
+               /* then delete the temp node.
+                  this node is guaranteed to have at least one leaf
+                  child */
+               delete_node(temp, from_destructor);
+               goto finished;
+       }
+
+
+       /* There is at most one child to this node to be deleted */
+       child = node->left;
+       if (node->right) {
+               child = node->right;
+       }
+
+       /* If the node to be deleted did not have any child at all we
+          create a temporary dummy node for the child and mark it black.
+          Once the delete of the node is finished, we remove this dummy
+          node, which is simple to do since it is guaranteed that it will
+          still not have any children after the delete operation.
+          This is because we don't represent the leaf-nodes as actual nodes
+          in this implementation.
+        */
+       if (!child) {
+               /* dc lives on the stack of this call only */
+               child = &dc;
+               child->tree = node->tree;
+               child->left=NULL;
+               child->right=NULL;
+               child->rb_color=TRBT_BLACK;
+               child->data=NULL;
+       }
+
+       /* replace node with child */
+       parent = trbt_parent(node);
+       if (parent) {
+               if (parent->left == node) {
+                       parent->left = child;
+               } else {
+                       parent->right = child;
+               }
+       } else {
+               node->tree->root = child;
+       }
+       child->parent = node->parent;
+
+
+       /* removing a red node never needs rebalancing; for a black
+          node either recolour a red child or run the delete cases */
+       if (node->rb_color == TRBT_BLACK) {
+               if (trbt_get_color(child) == TRBT_RED) {
+                       child->rb_color = TRBT_BLACK;
+               } else {
+                       trbt_delete_case1(child);
+               }
+       }
+
+       /* If we had to create a temporary dummy node to represent a black
+          leaf child we now have to delete it.
+          This is simple since this dummy node originally had no children
+          and we are guaranteed that it will also not have any children
+          after the node has been deleted and any possible rotations
+          have occurred.
+
+          The only special case is if this was the last node of the tree
+          in which case we have to reset the root to NULL as well.
+          Otherwise it is enough to just unlink the child from its new
+          parent.
+        */
+       if (child == &dc) {
+               if (child->parent == NULL) {
+                       node->tree->root = NULL;
+               } else if (child == child->parent->left) {
+                       child->parent->left = NULL;
+               } else {
+                       child->parent->right = NULL;
+               }
+       }
+
+finished:
+       if (!from_destructor) {
+               talloc_free(node);
+       }
+
+       /* if we came from a destructor and temp!=NULL  this means we
+          did the node-swap but now the tree still contains the old
+          node  which was freed in the destructor. Not good.
+          So put temp into the tree in the place of node, taking over
+          key, colour, data and all links.
+       */
+       if (from_destructor && temp) {
+               temp->key32    = node->key32;
+               temp->rb_color = node->rb_color;
+
+               temp->data = node->data;
+               talloc_steal(temp->data, temp);
+
+               temp->parent = node->parent;
+               if (temp->parent) {
+                       if (temp->parent->left == node) {
+                               temp->parent->left = temp;
+                       } else {
+                               temp->parent->right = temp;
+                       }
+               }
+
+               temp->left = node->left;
+               if (temp->left) {
+                       temp->left->parent = temp;
+               }
+               temp->right = node->right;
+               if (temp->right) {
+                       temp->right->parent = temp;
+               }
+
+               if (temp->tree->root == node) {
+                       temp->tree->root = temp;
+               }
+       }
+
+       /* NOTE(review): when from_destructor is false, node has been
+          passed to talloc_free() above, yet node->tree is still read
+          here - confirm this is safe */
+       if ( (node->tree->flags & TRBT_AUTOFREE)
+       &&   (node->tree->root == NULL) ) {
+               talloc_free(node->tree);
+       }
+
+       return;
+}
+
+/*
+  destroy a node and remove it from its tree
+
+  Installed as the talloc destructor of every node created by
+  trbt_create_node().  Unlinks the node via delete_node() with
+  from_destructor set, so delete_node() does not free it again.
+  Always returns 0.
+ */
+static int node_destructor(trbt_node_t *node)
+{
+       delete_node(node, true);
+
+       return 0;
+}
+
+/*
+  allocate and initialise a tree node
+
+  tree   - tree the node belongs to
+  parent - parent node, or NULL
+  key    - 32 bit key
+  data   - payload pointer; also becomes the talloc parent of the node
+
+  The node starts out black and without children.  It is not linked
+  into the tree here; that is left to the caller.  Allocation failure
+  is fatal.
+ */
+static inline trbt_node_t *
+trbt_create_node(trbt_tree_t *tree, trbt_node_t *parent, uint32_t key, void *data)
+{
+       trbt_node_t *node;
+
+       node=talloc_zero(tree, trbt_node_t);
+       NO_MEMORY_FATAL(node);
+
+       /* payload */
+       node->key32 = key;
+       node->data  = data;
+
+       /* position and colour */
+       node->tree     = tree;
+       node->parent   = parent;
+       node->left     = NULL;
+       node->right    = NULL;
+       node->rb_color = TRBT_BLACK;
+
+       /* hang the node off data so that freeing data also frees the
+          node, which in turn runs the destructor that unlinks it
+        */
+       talloc_steal(data, node);
+       talloc_set_destructor(node, node_destructor);
+
+       return node;
+}
+
+/* insert a new node in the tree.
+
+   tree - tree to insert into
+   key  - 32 bit key
+   data - payload; the node is made a talloc child of it
+
+   If the key is already present, the existing node is pointed at the
+   new data and the old data pointer is returned so that the caller
+   can take any special action.  Otherwise NULL is returned.
+ */
+void *
+trbt_insert32(trbt_tree_t *tree, uint32_t key, void *data)
+{
+       trbt_node_t *node = tree->root;
+
+       /* empty tree: the new node simply becomes the root */
+       if (node == NULL) {
+               tree->root = trbt_create_node(tree, NULL, key, data);
+               return NULL;
+       }
+
+       /* walk down until the key is found or a free slot turns up */
+       for (;;) {
+               trbt_node_t **slot;
+
+               if (key == node->key32) {
+                       void *old_data = node->data;
+
+                       node->data = data;
+                       /* the node is now owned by the new data, so it
+                          goes away when the new data is released
+                       */
+                       talloc_steal(node->data, node);
+
+                       return old_data;
+               }
+
+               slot = (key < node->key32) ? &node->left : &node->right;
+
+               if (*slot == NULL) {
+                       /* found the place for the new leaf */
+                       *slot = trbt_create_node(tree, node, key, data);
+                       node  = *slot;
+                       break;
+               }
+
+               node = *slot;
+       }
+
+       /* node is the freshly linked leaf: colour it red and rebalance */
+       node->rb_color=TRBT_RED;
+       trbt_insert_case1(tree, node);
+       return NULL;
+}
+
+/* Return the data pointer stored under key, or NULL if the key is not
+   present in the tree.
+ */
+void *
+trbt_lookup32(trbt_tree_t *tree, uint32_t key)
+{
+       trbt_node_t *cur = tree->root;
+
+       while (cur != NULL) {
+               if (key == cur->key32) {
+                       return cur->data;
+               }
+               /* plain binary search: smaller keys left, larger right */
+               cur = (key < cur->key32) ? cur->left : cur->right;
+       }
+
+       return NULL;
+}
+
+
+/* Remove the node stored under key, if there is one.
+   Note that this does not release the data that the node points to.
+   Nothing happens when the key is not in the tree.
+*/
+void 
+trbt_delete32(trbt_tree_t *tree, uint32_t key)
+{
+       trbt_node_t *cur;
+
+       for (cur = tree->root; cur != NULL; ) {
+               if (key == cur->key32) {
+                       delete_node(cur, false);
+                       return;
+               }
+               cur = (key < cur->key32) ? cur->left : cur->right;
+       }
+}
+
+
+/* Insert under key, letting callback produce the data to store.
+   For a new key, callback is invoked with data==NULL and its return
+   value becomes the data of the new node.
+   For an existing key, callback is invoked with the current data and
+   its return value replaces it.
+ */
+void 
+trbt_insert32_callback(trbt_tree_t *tree, uint32_t key, void *(*callback)(void *param, void *data), void *param)
+{
+       trbt_node_t *cur = tree->root;
+
+       /* empty tree: the new node becomes the root */
+       if (cur == NULL) {
+               tree->root = trbt_create_node(tree, NULL, key,
+                               callback(param, NULL));
+               return;
+       }
+
+       /* walk down until we hit the key or an empty child slot */
+       for (;;) {
+               trbt_node_t **slot;
+
+               if (key == cur->key32) {
+                       /* existing node: let the callback replace the data
+                          and re-parent the node onto the new data
+                        */
+                       cur->data = callback(param, cur->data);
+                       talloc_steal(cur->data, cur);
+                       return;
+               }
+
+               slot = (key < cur->key32) ? &cur->left : &cur->right;
+               if (*slot == NULL) {
+                       trbt_node_t *leaf;
+
+                       leaf = trbt_create_node(tree, cur, key,
+                                       callback(param, NULL));
+                       *slot = leaf;
+                       cur = leaf;
+                       break;
+               }
+               cur = *slot;
+       }
+
+       /* cur is the newly linked leaf: colour it red and hand it to
+          trbt_insert_case1 (defined earlier in this file)
+        */
+       cur->rb_color = TRBT_RED;
+       trbt_insert_case1(tree, cur);
+}
+
+
+/* State carried through trbt_insert32_callback() while inserting with an
+   array key. Filled in by trbt_insertarray32_callback() and consumed by
+   array_insert_callback().
+ */
+struct trbt_array_param {
+       /* the user's callback and its opaque argument */
+       void *(*callback)(void *param, void *data);
+       void *param;
+       /* number of key elements still to be consumed, and where they start */
+       uint32_t keylen;
+       uint32_t *key;
+       /* tree that the current key element is being inserted into */
+       trbt_tree_t *tree;
+};
+/* Callback handed to trbt_insert32_callback() by
+   trbt_insertarray32_callback(). p is a struct trbt_array_param; data is
+   whatever is currently stored under the key element just processed
+   (NULL when nothing is stored there yet).
+ */
+static void *array_insert_callback(void *p, void *data)
+{
+       struct trbt_array_param *ctx = (struct trbt_array_param *)p;
+       trbt_tree_t *subtree;
+
+       /* the whole key has been consumed: this is the final level, so
+          run the user's callback with the user's parameter
+       */
+       if (ctx->keylen == 0) {
+               return ctx->callback(ctx->param, data);
+       }
+
+       /* more key elements remain, so this slot must hold a subtree.
+          Reuse the one already stored here, or create a new one hanging
+          off the current tree with TRBT_AUTOFREE set.
+       */
+       subtree = (data != NULL)
+               ? (trbt_tree_t *)data
+               : trbt_create(ctx->tree, TRBT_AUTOFREE);
+
+       trbt_insertarray32_callback(subtree, ctx->keylen, ctx->key, ctx->callback, ctx->param);
+
+       /* hand the subtree (old or new) back so the caller stores it in
+          its node
+       */
+       return subtree;
+}
+
+
+
+/* insert into the tree using an array of keylen uint32 values as the key */
+void 
+trbt_insertarray32_callback(trbt_tree_t *tree, uint32_t keylen, uint32_t *key, void *(*cb)(void *param, void *data), void *pm)
+{
+       /* the insert32 call below consumes key[0], so the callback state
+          describes what is left: keylen-1 elements starting at key[1]
+       */
+       struct trbt_array_param tap = {
+               .callback = cb,
+               .param    = pm,
+               .keylen   = keylen - 1,
+               .key      = &key[1],
+               .tree     = tree,
+       };
+
+       trbt_insert32_callback(tree, key[0], array_insert_callback, &tap);
+}
+
+/* lookup the tree using an array of uint32 as a key.
+   Returns the stored data pointer, or NULL when any element of the key
+   is missing.
+ */
+void *
+trbt_lookuparray32(trbt_tree_t *tree, uint32_t keylen, uint32_t *key)
+{
+       /* every key element except the last selects a subtree */
+       while (keylen != 1) {
+               tree = trbt_lookup32(tree, key[0]);
+               if (tree == NULL) {
+                       /* the key does not exist */
+                       return NULL;
+               }
+               keylen--;
+               key++;
+       }
+
+       /* the last element is a regular lookup whose result goes to the
+          user
+       */
+       return trbt_lookup32(tree, key[0]);
+}
+
+
+/* In-order traversal of the subtree rooted at node.
+   keylen is the number of key levels still below this one: at 0 the
+   callback is run on node->data, otherwise node->data is treated as a
+   subtree and traversed in turn.
+   Returns 0 when the traversal completed, or the first non-zero value
+   returned by the callback.
+ */
+static int
+trbt_traversearray32_node(trbt_node_t *node, uint32_t keylen, 
+       int (*callback)(void *param, void *data), 
+       void *param)
+{
+       /* both child pointers are read up front, before the callback is
+          run for this node
+       */
+       trbt_node_t *lhs = node->left;
+       trbt_node_t *rhs = node->right;
+       int rc;
+
+       if (lhs != NULL) {
+               rc = trbt_traversearray32_node(lhs, keylen, callback, param);
+               if (rc != 0) {
+                       return rc;
+               }
+       }
+
+       /* everything smaller has been visited, now handle this node */
+       rc = (keylen == 0)
+               ? callback(param, node->data)
+               : trbt_traversearray32(node->data, keylen, callback, param);
+       if (rc != 0) {
+               return rc;
+       }
+
+       if (rhs == NULL) {
+               return 0;
+       }
+       return trbt_traversearray32_node(rhs, keylen, callback, param);
+}
+       
+
+/* traverse the tree using an array of uint32 as a key.
+   A NULL or empty tree counts as a completed traversal (returns 0).
+ */
+int 
+trbt_traversearray32(trbt_tree_t *tree, uint32_t keylen, 
+       int (*callback)(void *param, void *data), 
+       void *param)
+{
+       if (tree == NULL || tree->root == NULL) {
+               return 0;
+       }
+
+       /* this level consumes one key element */
+       return trbt_traversearray32_node(tree->root, keylen-1, callback, param);
+}
+
+
+/* Return the data of the first (leftmost) node in a tree where the key
+   is an array of keylen uint32_t values, or NULL when keylen is 0 or a
+   tree along the way is NULL or empty.
+*/
+void *
+trbt_findfirstarray32(trbt_tree_t *tree, uint32_t keylen)
+{
+       trbt_node_t *node;
+
+       while (keylen >= 1) {
+               if (tree == NULL) {
+                       return NULL;
+               }
+
+               node = tree->root;
+               if (node == NULL) {
+                       return NULL;
+               }
+
+               /* the leftmost node holds the smallest key at this level */
+               while (node->left) {
+                       node = node->left;
+               }
+
+               /* last level: this is the node we were looking for */
+               if (keylen == 1) {
+                       return node->data;
+               }
+
+               /* otherwise the data is the subtree for the next level */
+               tree = node->data;
+               keylen--;
+       }
+
+       return NULL;
+}
+
+
+#if TEST_RB_TREE
+/* Debug helper: print the subtree rooted at node in order, indenting
+   each line by four spaces per level.
+ */
+static void printtree(trbt_node_t *node, int levels)
+{
+       int pad;
+
+       if (node == NULL) {
+               return;
+       }
+
+       printtree(node->left, levels+1);
+
+       for (pad = levels; pad > 0; pad--) {
+               printf("    ");
+       }
+       printf("key:%d COLOR:%s (node:%p parent:%p left:%p right:%p)\n",node->key32,node->rb_color==TRBT_BLACK?"BLACK":"RED", node, node->parent, node->left, node->right);
+
+       printtree(node->right, levels+1);
+       printf("\n");
+}
+
+/* Debug helper: dump the whole tree to stdout, left subtree first, then
+   the root, then the right subtree.
+ */
+void print_tree(trbt_tree_t *tree)
+{
+       trbt_node_t *top = tree->root;
+
+       if (top == NULL) {
+               printf("tree is empty\n");
+               return;
+       }
+
+       printf("---\n");
+       printtree(top->left, 1);
+       printf("root node key:%d COLOR:%s (node:%p left:%p right:%p)\n",top->key32,top->rb_color==TRBT_BLACK?"BLACK":"RED", top, top->left, top->right);
+       printtree(top->right, 1);
+       printf("===\n");
+}
+
+/* Manual stress test, only built when TEST_RB_TREE is set.
+   Creates a tree and then keeps inserting and deleting random keys in
+   the range 0..19, printing the tree after every operation.
+   NOTE(review): the while(++cnt) loop has no exit condition other than
+   cnt wrapping to 0, so this is meant to be interrupted by hand.
+ */
+void 
+test_tree(void)
+{
+       trbt_tree_t *tree;
+       /* NOTE(review): str, ret and NUM are not used in this function */
+       char *str;
+       int i, ret;
+       int NUM=15;
+       int cnt=0;
+
+       tree=trbt_create(talloc_new(NULL), 0);
+       /* disabled deterministic test: insert 0..9, delete 3, delete 0..9 */
+#if 0
+       for(i=0;i<10;i++){
+               printf("adding node %i\n",i);
+               trbt_insert32(tree, i, NULL);
+               print_tree(tree);
+       }
+       printf("deleting node %i\n",3);
+       trbt_delete32(tree, 3);
+       print_tree(tree);
+       for(i=0;i<10;i++){
+               printf("deleting node %i\n",i);
+               trbt_delete32(tree, i);
+               print_tree(tree);
+       }
+exit(0);
+#endif
+       while(++cnt){
+               /* this i shadows the function-level i */
+               int i;
+               printf("iteration : %d\n",cnt);
+               i=random()%20;
+               printf("adding node %i\n",i);
+               trbt_insert32(tree, i, NULL);
+               print_tree(tree);
+
+               i=random()%20;
+               printf("deleting node %i\n",i);
+               trbt_delete32(tree, i);
+               print_tree(tree);
+       }
+
+}
+
+#endif
diff --git a/ctdb/common/rb_tree.h b/ctdb/common/rb_tree.h
new file mode 100644 (file)
index 0000000..b5ddbb2
--- /dev/null
@@ -0,0 +1,90 @@
+/* 
+   a talloc based red-black tree
+
+   Copyright (C) Ronnie Sahlberg  2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _RB_TREE_H
+#define _RB_TREE_H
+#define TRBT_RED               0x00
+#define TRBT_BLACK             0x01
+/* One node of the red-black tree */
+typedef struct trbt_node {
+       /* the tree this node belongs to */
+       struct trbt_tree *tree;
+       /* parent node, NULL for the root */
+       struct trbt_node *parent;
+       /* children holding smaller (left) and larger (right) keys */
+       struct trbt_node *left;
+       struct trbt_node *right;
+       /* TRBT_RED or TRBT_BLACK */
+       uint32_t rb_color;
+       /* the key this node is stored under */
+       uint32_t key32;
+       /* the caller's data pointer */
+       void *data;
+} trbt_node_t;
+
+/* Handle for a red-black tree */
+typedef struct trbt_tree {
+       /* top node of the tree, NULL while the tree is empty */
+       trbt_node_t *root;
+/* automatically free the tree when the last node has been deleted */
+#define TRBT_AUTOFREE          0x00000001
+       /* TRBT_* flag bits as passed to trbt_create() */
+       uint32_t flags;
+} trbt_tree_t;
+
+
+
+/* Create a RB tree */
+trbt_tree_t *trbt_create(TALLOC_CTX *memctx, uint32_t flags);
+
+/* Lookup a node in the tree and return a pointer to data or NULL */
+void *trbt_lookup32(trbt_tree_t *tree, uint32_t key);
+
+/* Insert a new node into the tree. If there was already a node with this
+   key the pointer to the previous data is returned.
+   The tree will talloc_steal() the data inserted into the tree .
+*/
+void *trbt_insert32(trbt_tree_t *tree, uint32_t key, void *data);
+
+/* Insert a new node into the tree.
+   If this is a new node:
+     callback is called with data==NULL and param=param
+     the returned value from the callback is talloc_stolen and inserted in the
+     tree.
+   If a node already exists for this key then:
+     callback is called with data==existing data and param=param
+     the returned value is talloc_stolen and inserted in the tree
+*/
+void trbt_insert32_callback(trbt_tree_t *tree, uint32_t key, void *(*callback)(void *param, void *data), void *param);
+
+/* Delete a node from the tree and free all data associated with it */
+void trbt_delete32(trbt_tree_t *tree, uint32_t key);
+
+
+/* insert into the tree with a key based on an array of uint32 */
+void trbt_insertarray32_callback(trbt_tree_t *tree, uint32_t keylen, uint32_t *key, void *(*callback)(void *param, void *data), void *param);
+
+/* Lookup a node in the tree with a key based on an array of uint32 
+   and return a pointer to data or NULL */
+void *trbt_lookuparray32(trbt_tree_t *tree, uint32_t keylen, uint32_t *key);
+
+/* Traverse a tree with a key based on an array of uint32
+   returns 0 if traverse completed
+   !0 if the traverse was aborted
+
+   If the callback returns !0  the traverse will be aborted
+*/
+int trbt_traversearray32(trbt_tree_t *tree, uint32_t keylen, int (*callback)(void *param, void *data), void *param);
+
+/* Lookup the first node in the tree with a key based on an array of uint32 
+   and return a pointer to data or NULL */
+void *trbt_findfirstarray32(trbt_tree_t *tree, uint32_t keylen);
+
+#endif /* _RB_TREE_H */
diff --git a/ctdb/common/system_aix.c b/ctdb/common/system_aix.c
new file mode 100644 (file)
index 0000000..41f61ae
--- /dev/null
@@ -0,0 +1,399 @@
+/* 
+   ctdb system specific code to manage raw sockets on aix
+
+   Copyright (C) Ronnie Sahlberg  2007
+   Copyright (C) Andrew Tridgell  2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include "includes.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "../include/ctdb_private.h"
+#include <netinet/if_ether.h>
+#include <netinet/ip6.h>
+#include <net/if_arp.h>
+#include <sys/ndd_var.h>
+#include <sys/kinfo.h>
+#include <pcap.h>
+
+
+
+#if 0
+This function is no longer used and its code should be moved into
+send tcp packet   after that function has been enhanced to do ipv6 as well.
+
+/* This function is used to open a raw socket to send tickles from.
+   It is compiled out (#if 0); ctdb_sys_send_tcp() opens its own socket.
+   Returns the socket descriptor, set non-blocking and close-on-exec,
+   or -1 on failure.
+ */
+int ctdb_sys_open_sending_socket(void)
+{
+       int s, ret;
+       uint32_t one = 1;
+
+       s = socket(AF_INET, SOCK_RAW, htons(IPPROTO_RAW));
+       if (s == -1) {
+               DEBUG(DEBUG_CRIT,(" failed to open raw socket (%s)\n",
+                        strerror(errno)));
+               return -1;
+       }
+
+       /* we build the IP header ourselves */
+       ret = setsockopt(s, IPPROTO_IP, IP_HDRINCL, &one, sizeof(one));
+       if (ret != 0) {
+               DEBUG(DEBUG_CRIT, (" failed to setup IP headers (%s)\n",
+                        strerror(errno)));
+               close(s);
+               return -1;
+       }
+
+       set_nonblocking(s);
+       set_close_on_exec(s);
+
+       return s;
+}
+#endif
+
+/*
+  simple TCP checksum over the n bytes at data plus the IPv4 source and
+  destination addresses, the protocol and the length taken from ip.
+  Assumes data is a multiple of 2 bytes long.
+  A result of 0 is returned as 0xFFFF.
+ */
+static uint16_t tcp_checksum(uint16_t *data, size_t n, struct ip *ip)
+{
+       uint32_t acc;
+       uint16_t result;
+       int round;
+
+       acc  = uint16_checksum(data, n);
+       acc += uint16_checksum((uint16_t *)&ip->ip_src, sizeof(ip->ip_src));
+       acc += uint16_checksum((uint16_t *)&ip->ip_dst, sizeof(ip->ip_dst));
+       acc += ip->ip_p + n;
+
+       /* fold the carries back into the low 16 bits, twice */
+       for (round = 0; round < 2; round++) {
+               acc = (acc & 0xFFFF) + (acc >> 16);
+       }
+
+       result = htons(acc);
+       result = ~result;
+
+       return (result == 0) ? 0xFFFF : result;
+}
+
+/*
+  Send tcp segment from the specified IP/port to the specified
+  destination IP/port. 
+
+  This is used to trigger the receiving host into sending its own ACK,
+  which should trigger early detection of TCP reset by the client
+  after IP takeover
+
+  This can also be used to send RST segments (if rst is true) and also
+  if correct seq and ack numbers are provided.
+
+  Only AF_INET addresses are handled. Returns 0 on success, -1 on error.
+  The raw socket opened for the send is always closed before returning.
+ */
+int ctdb_sys_send_tcp(const ctdb_sock_addr *dest,
+                     const ctdb_sock_addr *src,
+                     uint32_t seq, uint32_t ack, int rst)
+{
+       int s;
+       int ret;
+       uint32_t one = 1;
+       
+       struct {
+               struct ip ip;
+               struct tcphdr tcp;
+       } ip4pkt;
+
+
+       /* for now, we only handle AF_INET addresses */
+       if (src->ip.sin_family != AF_INET || dest->ip.sin_family != AF_INET) {
+               DEBUG(DEBUG_CRIT,(__location__ " not an ipv4 address\n"));
+               return -1;
+       }
+
+
+
+       s = socket(AF_INET, SOCK_RAW, htons(IPPROTO_RAW));
+       if (s == -1) {
+               DEBUG(DEBUG_CRIT,(" failed to open raw socket (%s)\n",
+                        strerror(errno)));
+               return -1;
+       }
+
+       /* we supply the IP header ourselves */
+       ret = setsockopt(s, IPPROTO_IP, IP_HDRINCL, &one, sizeof(one));
+       if (ret != 0) {
+               DEBUG(DEBUG_CRIT, (" failed to setup IP headers (%s)\n",
+                        strerror(errno)));
+               close(s);
+               return -1;
+       }
+
+       set_nonblocking(s);
+       set_close_on_exec(s);
+
+       memset(&ip4pkt, 0, sizeof(ip4pkt));
+       ip4pkt.ip.ip_v     = 4;
+       ip4pkt.ip.ip_hl    = sizeof(ip4pkt.ip)/4;
+       ip4pkt.ip.ip_len   = htons(sizeof(ip4pkt));
+       ip4pkt.ip.ip_ttl   = 255;
+       ip4pkt.ip.ip_p     = IPPROTO_TCP;
+       ip4pkt.ip.ip_src.s_addr   = src->ip.sin_addr.s_addr;
+       ip4pkt.ip.ip_dst.s_addr   = dest->ip.sin_addr.s_addr;
+       ip4pkt.ip.ip_sum   = 0;
+
+       ip4pkt.tcp.th_sport   = src->ip.sin_port;
+       ip4pkt.tcp.th_dport     = dest->ip.sin_port;
+       ip4pkt.tcp.th_seq      = seq;
+       ip4pkt.tcp.th_ack    = ack;
+       ip4pkt.tcp.th_flags  = TH_ACK;
+       if (rst) {
+               ip4pkt.tcp.th_flags      = TH_RST;
+       }
+       ip4pkt.tcp.th_off    = sizeof(ip4pkt.tcp)/4;
+       ip4pkt.tcp.th_win   = htons(1234);
+       ip4pkt.tcp.th_sum    = tcp_checksum((uint16_t *)&ip4pkt.tcp, sizeof(ip4pkt.tcp), &ip4pkt.ip);
+
+       ret = sendto(s, &ip4pkt, sizeof(ip4pkt), 0, (struct sockaddr *)dest, sizeof(*dest));
+       if (ret != sizeof(ip4pkt)) {
+               /* log first: close() may overwrite errno */
+               DEBUG(DEBUG_CRIT,(__location__ " failed sendto (%s)\n", strerror(errno)));
+               close(s);
+               return -1;
+       }
+
+       /* the socket is only needed for this one packet */
+       close(s);
+       return 0;
+}
+
+/* This function is used to open a raw socket to capture from.
+   Opens a pcap live capture on iface (snapshot length 100 bytes, not
+   promiscuous) and stores the pcap handle in *private_data.
+   Returns the file descriptor of the capture handle, or -1 on failure.
+ */
+int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
+{
+       /* pcap_open_live() writes its error text here, so it must be a
+          real buffer and not NULL
+        */
+       char errbuf[PCAP_ERRBUF_SIZE];
+       pcap_t *pt;
+
+       errbuf[0] = '\0';
+       pt=pcap_open_live(iface, 100, 0, 0, errbuf);
+       if (pt == NULL) {
+               DEBUG(DEBUG_CRIT,("Failed to open capture device %s (%s)\n", iface, errbuf));
+               return -1;
+       }
+       *((pcap_t **)private_data) = pt;
+
+       return pcap_fileno(pt);
+}
+
+
+/* Close the capture handle that ctdb_sys_open_capture_socket() stored in
+   private_data. Always returns 0.
+ */
+int ctdb_sys_close_capture_socket(void *private_data)
+{
+       pcap_close((pcap_t *)private_data);
+
+       return 0;
+}
+
+
+
+/*
+  send gratuitous arp reply after we have taken over an ip address
+
+  addr is the address we are trying to claim
+  iface is the interface name we will be using to claim the address
+
+  Not implemented on AIX: both arguments are ignored and -1 is always
+  returned.
+ */
+int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
+{
+       /* FIXME AIX: We dont do gratuitous arp yet */
+       return -1;
+}
+
+
+
+/*
+  get ethernet MAC address on AIX
+
+  Looks through the kernel's NDD table for an interface that is up, is
+  of type NDD_ETHER or NDD_ISO88023, has a 6 byte address and whose name
+  or alias equals device_name, and copies its address into mac.
+
+  Returns 0 on success. Returns -1 with errno set to ENOSYS (kernel
+  query failed), ENOMEM (allocation failed) or ENOENT (no matching
+  device).
+ */
+static int aix_get_mac_addr(const char *device_name, uint8_t mac[6])
+{
+       size_t ksize;
+       struct kinfo_ndd *ndd;
+       int count, i;
+
+       /* first call: ask how much space the table needs */
+       ksize = getkerninfo(KINFO_NDD, 0, 0, 0);
+       if (ksize == 0) {
+               errno = ENOSYS;
+               return -1;
+       }
+
+       ndd = (struct kinfo_ndd *)malloc(ksize);
+       if (ndd == NULL) {
+               errno = ENOMEM;
+               return -1;
+       }
+
+       /* second call: fetch the table itself */
+       if (getkerninfo(KINFO_NDD, ndd, &ksize, 0) == -1) {
+               /* do not leak the table buffer on failure */
+               free(ndd);
+               errno = ENOSYS;
+               return -1;
+       }
+
+       count= ksize/sizeof(struct kinfo_ndd);
+       for (i=0;i<count;i++) {
+               if ( (ndd[i].ndd_type != NDD_ETHER) 
+               &&   (ndd[i].ndd_type != NDD_ISO88023) ) {
+                       continue;
+               }
+               if (ndd[i].ndd_addrlen != 6) {
+                       continue;
+               }
+               if (!(ndd[i].ndd_flags&NDD_UP)) {
+                       continue;
+               }
+               /* skip entries where neither the name nor the alias matches */
+               if ( strcmp(device_name, ndd[i].ndd_name)
+               &&   strcmp(device_name, ndd[i].ndd_alias) ) {
+                       continue;
+               }
+               memcpy(mac, ndd[i].ndd_addr, 6);
+               free(ndd);
+               return 0;
+       }
+       free(ndd);
+       errno = ENOENT;
+       return -1;
+}
+
+int ctdb_sys_read_tcp_packet(int s, void *private_data, 
+                       ctdb_sock_addr *src, ctdb_sock_addr *dst,
+                       uint32_t *ack_seq, uint32_t *seq)
+{
+       int ret;
+       struct ether_header *eth;
+       struct ip *ip;
+       struct ip6_hdr *ip6;
+       struct tcphdr *tcp;
+       struct ctdb_killtcp_connection *conn;
+       struct pcap_pkthdr pkthdr;
+       const u_char *buffer;
+       pcap_t *pt = (pcap_t *)private_data;
+
+       buffer=pcap_next(pt, &pkthdr);
+       if (buffer==NULL) {
+               return -1;
+       }
+
+       /* Ethernet */
+       eth = (struct ether_header *)buffer;
+
+       /* we want either IPv4 or IPv6 */
+       if (eth->ether_type == htons(ETHERTYPE_IP)) {
+               /* IP */
+               ip = (struct ip *)(eth+1);
+
+               /* We only want IPv4 packets */
+               if (ip->ip_v != 4) {
+                       return -1;
+               }
+               /* Dont look at fragments */
+               if ((ntohs(ip->ip_off)&0x1fff) != 0) {
+                       return -1;
+               }
+               /* we only want TCP */
+               if (ip->ip_p != IPPROTO_TCP) {
+                       return -1;
+               }
+
+               /* make sure its not a short packet */
+               if (offsetof(struct tcphdr, th_ack) + 4 + 
+                   (ip->ip_hl*4) > ret) {
+                       return -1;
+               }
+               /* TCP */
+               tcp = (struct tcphdr *)((ip->ip_hl*4) + (char *)ip);
+       
+               /* tell the caller which one we've found */
+               src->ip.sin_family      = AF_INET;
+               src->ip.sin_addr.s_addr = ip->ip_src.s_addr;
+               src->ip.sin_port        = tcp->th_sport;
+               dst->ip.sin_family      = AF_INET;
+               dst->ip.sin_addr.s_addr = ip->ip_dst.s_addr;
+               dst->ip.sin_port        = tcp->th_dport;
+               *ack_seq                = tcp->th_ack;
+               *seq                    = tcp->th_seq;
+
+
+               return 0;
+#ifndef ETHERTYPE_IP6
+#define ETHERTYPE_IP6 0x86dd
+#endif
+       } else if (eth->ether_type == htons(ETHERTYPE_IP6)) {
+                       /* IP6 */
+               ip6 = (struct ip6_hdr *)(eth+1);
+
+               /* we only want TCP */
+               if (ip6->ip6_nxt != IPPROTO_TCP) {
+                       return -1;
+               }
+
+               /* TCP */
+               tcp = (struct tcphdr *)(ip6+1);
+
+               /* tell the caller which one we've found */
+               src->ip6.sin6_family = AF_INET6;
+               src->ip6.sin6_port   = tcp->th_sport;
+               src->ip6.sin6_addr   = ip6->ip6_src;
+
+               dst->ip6.sin6_family = AF_INET6;
+               dst->ip6.sin6_port   = tcp->th_dport;
+               dst->ip6.sin6_addr   = ip6->ip6_dst;
+
+               *ack_seq             = tcp->th_ack;
+               *seq                 = tcp->th_seq;
+
+               return 0;
+       }
+
+       return -1;
+}
+
+
+/* Report whether the interface iface exists.
+   Not implemented on AIX: iface is ignored and true is always returned.
+ */
+bool ctdb_sys_check_iface_exists(const char *iface)
+{
+       /* FIXME AIX: Interface always considered present */
+       return true;
+}
+
+/* Fetch the pid of the process at the other end of socket fd.
+   Returns 0 and stores the pid in *peer_pid on success; returns the
+   getsockopt() error result (-1) and leaves *peer_pid untouched on
+   failure.
+ */
+int ctdb_get_peer_pid(const int fd, pid_t *peer_pid)
+{
+       struct peercred_struct cr;
+       socklen_t crl = sizeof(struct peercred_struct);
+       int ret;
+
+       /* keep the call and the comparison separate so that ret holds
+          the getsockopt() result, not the result of the comparison
+        */
+       ret = getsockopt(fd, SOL_SOCKET, SO_PEERID, &cr, &crl);
+       if (ret == 0) {
+               *peer_pid = cr.pid;
+       }
+       return ret;
+}
+
+/* Return the name of process pid.
+   Not implemented on AIX: pid is ignored and NULL is always returned.
+ */
+char *ctdb_get_process_name(pid_t pid)
+{
+       /* FIXME AIX: get_process_name not implemented */
+       return NULL;
+}
+
+/* Set the name of the current process.
+   Not implemented on AIX: name is ignored and -ENOSYS is always returned.
+ */
+int ctdb_set_process_name(const char *name)
+{
+       /* FIXME AIX: set_process_name not implemented */
+       return -ENOSYS;
+}
+
+/* Look up lock information for process req_pid.
+   Not implemented on AIX: lock_info is not filled in and false is always
+   returned.
+ */
+bool ctdb_get_lock_info(pid_t req_pid, struct ctdb_lock_info *lock_info)
+{
+       /* FIXME AIX: get_lock_info not implemented */
+       return false;
+}
+
+/* Find the pid of the process blocking the lock described by reqlock.
+   Not implemented on AIX: blocker_pid is not written and false is always
+   returned.
+ */
+bool ctdb_get_blocker_pid(struct ctdb_lock_info *reqlock, pid_t *blocker_pid)
+{
+       /* FIXME AIX: get_blocker_pid not implemented */
+       return false;
+}
diff --git a/ctdb/common/system_common.c b/ctdb/common/system_common.c
new file mode 100644 (file)
index 0000000..7563ff3
--- /dev/null
@@ -0,0 +1,187 @@
+/* 
+   ctdb system specific code to manage raw sockets on linux
+
+   Copyright (C) Ronnie Sahlberg  2007
+   Copyright (C) Andrew Tridgell  2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include <libgen.h>
+
+/*
+  uint16 checksum for n bytes
+
+  Sums the buffer as a sequence of 16-bit words in network byte order
+  and returns the unfolded 32-bit sum. When n is odd, the last byte is
+  treated as the high-order byte of a final word padded with a zero
+  byte.
+ */
+uint32_t uint16_checksum(uint16_t *data, size_t n)
+{
+       uint32_t sum=0;
+       while (n>=2) {
+               sum += (uint32_t)ntohs(*data);
+               data++;
+               n -= 2;
+       }
+       if (n == 1) {
+               /* shift explicitly instead of using ntohs() on a single
+                  byte: ntohs() only moves the byte into the high half
+                  on little-endian hosts
+                */
+               sum += ((uint32_t)*(uint8_t *)data) << 8;
+       }
+       return sum;
+}
+
+/*
+  see if we currently have an interface with the given IP
+
+  works on a private copy of the address with the port cleared: we try
+  to bind a TCP socket to it, and if that fails then we don't have that
+  IP on an interface
+ */
+bool ctdb_sys_have_ip(ctdb_sock_addr *_addr)
+{
+       ctdb_sock_addr probe = *_addr;
+       socklen_t len = 0;
+       int fd;
+       int rc;
+
+       /* clear the port and pick the sockaddr size for the family */
+       if (probe.sa.sa_family == AF_INET) {
+               probe.ip.sin_port = 0;
+               len = sizeof(struct sockaddr_in);
+       } else if (probe.sa.sa_family == AF_INET6) {
+               probe.ip6.sin6_port = 0;
+               len = sizeof(struct sockaddr_in6);
+       }
+
+       fd = socket(probe.sa.sa_family, SOCK_STREAM, IPPROTO_TCP);
+       if (fd == -1) {
+               return false;
+       }
+
+       rc = bind(fd, (struct sockaddr *)&probe, len);
+       close(fd);
+
+       return rc == 0;
+}
+
+
+/* find which interface an ip address is currently assigned to */
+char *ctdb_sys_find_ifname(ctdb_sock_addr *addr)
+{
+       int s;
+       int size;
+       struct ifconf ifc;
+       char *ptr;
+
+       s = socket(AF_INET, SOCK_RAW, htons(IPPROTO_RAW));
+       if (s == -1) {
+               DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket (%s)\n",
+                        strerror(errno)));
+               return NULL;
+       }
+
+
+       size = sizeof(struct ifreq);
+       ifc.ifc_buf = NULL;
+       ifc.ifc_len = size;
+
+       while(ifc.ifc_len > (size - sizeof(struct ifreq))) {
+               size *= 2;
+
+               free(ifc.ifc_buf);      
+               ifc.ifc_len = size;
+               ifc.ifc_buf = malloc(size);
+               memset(ifc.ifc_buf, 0, size);
+               if (ioctl(s, SIOCGIFCONF, (caddr_t)&ifc) < 0) {
+                       DEBUG(DEBUG_CRIT,("Failed to read ifc buffer from socket\n"));
+                       free(ifc.ifc_buf);      
+                       close(s);
+                       return NULL;
+               }
+       }
+
+       for (ptr =(char *)ifc.ifc_buf; ptr < ((char *)ifc.ifc_buf) + ifc.ifc_len; ) {
+               char *ifname;
+               struct ifreq *ifr;
+
+               ifr = (struct ifreq *)ptr;
+
+#ifdef HAVE_SOCKADDR_LEN
+               if (ifr->ifr_addr.sa_len > sizeof(struct sockaddr)) {
+                       ptr += sizeof(ifr->ifr_name) + ifr->ifr_addr.sa_len;
+               } else {
+                       ptr += sizeof(ifr->ifr_name) + sizeof(struct sockaddr);
+               }
+#else
+               ptr += sizeof(struct ifreq);
+#endif
+
+               if (ifr->ifr_addr.sa_family != addr->sa.sa_family) {
+                       continue;
+               }
+
+               switch (addr->sa.sa_family) {
+               case AF_INET:
+
+
+                       if (memcmp(&addr->ip.sin_addr, &((struct sockaddr_in *)&ifr->ifr_addr)->sin_addr, sizeof(addr->ip.sin_addr))) {
+                               continue;
+                       }
+                       break;
+               case AF_INET6:
+                       if (memcmp(&addr->ip6.sin6_addr, &((struct sockaddr_in6 *)&ifr->ifr_addr)->sin6_addr, sizeof(addr->ip6.sin6_addr))) {
+                               continue;
+                       }
+                       break;
+               }
+
+               ifname = strdup(ifr->ifr_name);
+               free(ifc.ifc_buf);      
+               close(s);
+               return ifname;
+       }
+
+
+       free(ifc.ifc_buf);      
+       close(s);
+
+       return NULL;
+}
+
+/* Create dir and any missing parent directories with the given mode.
+   A directory that already exists is not an error.
+   Returns 0 on success, ENOMEM if the path copy could not be allocated,
+   or the failing mkdir() result otherwise.
+ */
+int mkdir_p(const char *dir, int mode)
+{
+       char *copy;
+       int rc;
+
+       /* the root always exists: end of the recursion */
+       if (strcmp(dir, "/") == 0) {
+               return 0;
+       }
+
+       /* run dirname() on a scratch copy of the path and create the
+          parents first
+        */
+       copy = talloc_strdup(NULL, dir);
+       if (copy == NULL) {
+               return ENOMEM;
+       }
+       rc = mkdir_p(dirname(copy), mode);
+       talloc_free(copy);
+
+       if (rc != 0) {
+               return rc;
+       }
+
+       rc = mkdir(dir, mode);
+       if (rc == -1 && errno == EEXIST) {
+               return 0;
+       }
+
+       return rc;
+}
diff --git a/ctdb/common/system_freebsd.c b/ctdb/common/system_freebsd.c
new file mode 100644 (file)
index 0000000..9597a7a
--- /dev/null
@@ -0,0 +1,409 @@
+/* 
+   ctdb system specific code to manage raw sockets on freebsd
+
+   Copyright (C) Ronnie Sahlberg  2007
+   Copyright (C) Andrew Tridgell  2007
+   Copyright (C) Marc Dequènes (Duck) 2009
+   Copyright (C) Volker Lendecke 2012
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+
+  This file is a copy of 'common/system_linux.c' adapted for Hurd^W kFreeBSD
+  needs, and inspired by 'common/system_aix.c' for the pcap usage.
+*/
+
+#include "includes.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "../include/ctdb_private.h"
+#include <net/ethernet.h>
+#include <netinet/ip6.h>
+#include <net/if_arp.h>
+#include <pcap.h>
+
+
+#ifndef ETHERTYPE_IP6
+#define ETHERTYPE_IP6 0x86dd
+#endif
+
+/*
+  TCP checksum for a segment carried over IPv6.
+
+  The sum covers the IPv6 source and destination addresses, a
+  pseudo-header made of the segment length 'n' and the next-header
+  value, and the 'n' bytes at 'data'.  A computed value of 0 is
+  returned as 0xFFFF.
+*/
+static uint16_t tcp_checksum6(uint16_t *data, size_t n, struct ip6_hdr *ip6)
+{
+       uint32_t pseudo[2];
+       uint32_t acc;
+       uint16_t folded;
+       int pass;
+
+       /* length and next-header words of the pseudo-header */
+       pseudo[0] = htonl(n);
+       pseudo[1] = htonl(ip6->ip6_nxt);
+
+       acc  = uint16_checksum((uint16_t *)(void *)&ip6->ip6_src, 16);
+       acc += uint16_checksum((uint16_t *)(void *)&ip6->ip6_dst, 16);
+       acc += uint16_checksum((uint16_t *)pseudo, 8);
+       acc += uint16_checksum(data, n);
+
+       /* fold the carries back into the low 16 bits, twice */
+       for (pass = 0; pass < 2; pass++) {
+               acc = (acc & 0xFFFF) + (acc >> 16);
+       }
+
+       folded = htons(acc);
+       folded = ~folded;
+
+       return (folded == 0) ? 0xFFFF : folded;
+}
+
+/*
+  send gratuitous arp reply after we have taken over an ip address
+
+  addr is the address we are trying to claim
+  iface is the interface name we will be using to claim the address
+
+  Not implemented in this file: neither argument is used and the
+  function always returns -1.
+ */
+int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
+{
+       /* FIXME FreeBSD: We don't do gratuitous arp yet */
+       return -1;
+}
+
+
+/*
+  TCP checksum for a segment carried over IPv4 - assumes the data is a
+  multiple of 2 bytes long.
+
+  The sum covers the 'n' bytes at 'data', the IPv4 source and
+  destination addresses, the protocol number and the length 'n'.  A
+  computed value of 0 is returned as 0xFFFF.
+ */
+static uint16_t tcp_checksum(uint16_t *data, size_t n, struct ip *ip)
+{
+       uint32_t acc;
+       uint16_t folded;
+       int pass;
+
+       acc  = uint16_checksum(data, n);
+       acc += uint16_checksum((uint16_t *)(void *)&ip->ip_src,
+                              sizeof(ip->ip_src));
+       acc += uint16_checksum((uint16_t *)(void *)&ip->ip_dst,
+                              sizeof(ip->ip_dst));
+       /* remaining pseudo-header fields: protocol and TCP length */
+       acc += ip->ip_p + n;
+
+       /* fold the carries back into the low 16 bits, twice */
+       for (pass = 0; pass < 2; pass++) {
+               acc = (acc & 0xFFFF) + (acc >> 16);
+       }
+
+       folded = htons(acc);
+       folded = ~folded;
+
+       return (folded == 0) ? 0xFFFF : folded;
+}
+
+/*
+  Send tcp segment from the specified IP/port to the specified
+  destination IP/port. 
+
+  This is used to trigger the receiving host into sending its own ACK,
+  which should trigger early detection of TCP reset by the client
+  after IP takeover
+
+  This can also be used to send RST segments (if rst is true) and also
+  if correct seq and ack numbers are provided.
+
+  dest, src  destination and source address/port; the address family
+             is taken from src
+  seq, ack   copied unchanged into the TCP header (no byte order
+             conversion is done here)
+  rst        non-zero to set RST in addition to ACK
+
+  Returns 0 if the complete packet was accepted by sendto(), -1 on any
+  failure or if src is neither AF_INET nor AF_INET6.
+ */
+int ctdb_sys_send_tcp(const ctdb_sock_addr *dest, 
+                     const ctdb_sock_addr *src,
+                     uint32_t seq, uint32_t ack, int rst)
+{
+       int s;
+       int ret;
+       uint32_t one = 1;
+       struct sockaddr_in6 dest6;
+       struct {
+               struct ip ip;
+               struct tcphdr tcp;
+       } ip4pkt;
+       struct {
+               struct ip6_hdr ip6;
+               struct tcphdr tcp;
+       } ip6pkt;
+
+       switch (src->ip.sin_family) {
+       case AF_INET:
+               ZERO_STRUCT(ip4pkt);
+               ip4pkt.ip.ip_v  = 4;
+               ip4pkt.ip.ip_hl    = sizeof(ip4pkt.ip)/4;
+               ip4pkt.ip.ip_len   = htons(sizeof(ip4pkt));
+               ip4pkt.ip.ip_ttl   = 255;
+               ip4pkt.ip.ip_p     = IPPROTO_TCP;
+               ip4pkt.ip.ip_src.s_addr = src->ip.sin_addr.s_addr;
+               ip4pkt.ip.ip_dst.s_addr = dest->ip.sin_addr.s_addr;
+               ip4pkt.ip.ip_sum   = 0;
+
+               ip4pkt.tcp.th_sport = src->ip.sin_port;
+               ip4pkt.tcp.th_dport = dest->ip.sin_port;
+               ip4pkt.tcp.th_seq   = seq;
+               ip4pkt.tcp.th_ack   = ack;
+               ip4pkt.tcp.th_flags = 0;
+               ip4pkt.tcp.th_flags |= TH_ACK;
+               if (rst) {
+                       ip4pkt.tcp.th_flags |= TH_RST;
+               }
+               ip4pkt.tcp.th_off   = sizeof(ip4pkt.tcp)/4;
+               /* this makes it easier to spot in a sniffer */
+               ip4pkt.tcp.th_win   = htons(1234);
+               ip4pkt.tcp.th_sum   = tcp_checksum((uint16_t *)&ip4pkt.tcp, sizeof(ip4pkt.tcp), &ip4pkt.ip);
+
+               /* open a raw socket to send this segment from.  The
+                  protocol argument of socket() is a plain host-order
+                  integer, so it must not go through htons() */
+               s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
+               if (s == -1) {
+                       DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket (%s)\n",
+                                strerror(errno)));
+                       return -1;
+               }
+
+               /* we supply the IP header ourselves */
+               ret = setsockopt(s, IPPROTO_IP, IP_HDRINCL, &one, sizeof(one));
+               if (ret != 0) {
+                       DEBUG(DEBUG_CRIT,(__location__ " failed to setup IP headers (%s)\n",
+                                strerror(errno)));
+                       close(s);
+                       return -1;
+               }
+
+               set_nonblocking(s);
+               set_close_on_exec(s);
+
+               ret = sendto(s, &ip4pkt, sizeof(ip4pkt), 0,
+                            (const struct sockaddr *)&dest->ip,
+                            sizeof(dest->ip));
+               close(s);
+               if (ret != (int)sizeof(ip4pkt)) {
+                       DEBUG(DEBUG_CRIT,(__location__ " failed sendto (%s)\n", strerror(errno)));
+                       return -1;
+               }
+               break;
+       case AF_INET6:
+               ZERO_STRUCT(ip6pkt);
+               ip6pkt.ip6.ip6_vfc  = 0x60;
+               /* payload is exactly one option-less TCP header */
+               ip6pkt.ip6.ip6_plen = htons(sizeof(ip6pkt.tcp));
+               ip6pkt.ip6.ip6_nxt  = IPPROTO_TCP;
+               ip6pkt.ip6.ip6_hlim = 64;
+               ip6pkt.ip6.ip6_src  = src->ip6.sin6_addr;
+               ip6pkt.ip6.ip6_dst  = dest->ip6.sin6_addr;
+
+               ip6pkt.tcp.th_sport = src->ip6.sin6_port;
+               ip6pkt.tcp.th_dport = dest->ip6.sin6_port;
+               ip6pkt.tcp.th_seq   = seq;
+               ip6pkt.tcp.th_ack   = ack;
+               ip6pkt.tcp.th_flags = 0;
+               ip6pkt.tcp.th_flags |= TH_ACK;
+               if (rst) {
+                       ip6pkt.tcp.th_flags |= TH_RST;
+               }
+               ip6pkt.tcp.th_off   = sizeof(ip6pkt.tcp)/4;
+               /* this makes it easier to spot in a sniffer */
+               ip6pkt.tcp.th_win   = htons(1234);
+               ip6pkt.tcp.th_sum   = tcp_checksum6((uint16_t *)&ip6pkt.tcp, sizeof(ip6pkt.tcp), &ip6pkt.ip6);
+
+               s = socket(PF_INET6, SOCK_RAW, IPPROTO_RAW);
+               if (s == -1) {
+                       DEBUG(DEBUG_CRIT, (__location__ " Failed to open sending socket\n"));
+                       return -1;
+
+               }
+               /* sendto() doesn't like it if the port is set and the
+                  socket is in raw mode.  Send to a local copy of the
+                  destination with the port cleared rather than writing
+                  through the caller's const pointer.
+               */
+               dest6 = dest->ip6;
+               dest6.sin6_port = 0;
+               ret = sendto(s, &ip6pkt, sizeof(ip6pkt), 0,
+                            (const struct sockaddr *)&dest6,
+                            sizeof(dest6));
+               close(s);
+
+               if (ret != (int)sizeof(ip6pkt)) {
+                       DEBUG(DEBUG_CRIT,(__location__ " failed sendto (%s)\n", strerror(errno)));
+                       return -1;
+               }
+               break;
+
+       default:
+               DEBUG(DEBUG_CRIT,(__location__ " not an ipv4/v6 address\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+/* 
+   This function is used to open a capture handle (via pcap) to capture
+   from.
+
+   iface         name of the device to capture on
+   private_data  treated as a pcap_t ** : on success the pcap handle is
+                 stored through it so that it can later be handed to
+                 ctdb_sys_close_capture_socket()
+
+   Returns the file descriptor reported by pcap_fileno() on success,
+   -1 if the device could not be opened.
+ */
+int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
+{
+       pcap_t *pt;
+       /* pcap_open_live() writes its error text here, so it must be a
+          real buffer of at least PCAP_ERRBUF_SIZE bytes, never NULL */
+       char errbuf[PCAP_ERRBUF_SIZE];
+
+       errbuf[0] = '\0';
+
+       /* snaplen 100, not promiscuous, read timeout 0 */
+       pt=pcap_open_live(iface, 100, 0, 0, errbuf);
+       if (pt == NULL) {
+               DEBUG(DEBUG_CRIT,("Failed to open capture device %s (%s)\n",
+                                 iface, errbuf));
+               return -1;
+       }
+       *((pcap_t **)private_data) = pt;
+
+       return pcap_fileno(pt);
+}
+
+/*
+   Close a capture handle.
+
+   private_data is passed to pcap_close() as a pcap_t *.
+   Always returns 0.
+ */
+int ctdb_sys_close_capture_socket(void *private_data)
+{
+       pcap_close((pcap_t *)private_data);
+
+       return 0;
+}
+
+
+/*
+  called when the raw socket becomes readable
+
+  Reads one frame from 's' and, if it is an unfragmented IPv4 TCP
+  segment or an IPv6 packet whose next header is TCP, reports its
+  endpoints and sequence numbers.
+
+  s             descriptor to recv() the frame from
+  private_data  not used by this implementation
+  src, dst      receive the source/destination address and port, copied
+                unchanged from the IP and TCP headers
+  ack_seq, seq  receive the th_ack / th_seq header fields, copied
+                unchanged
+
+  Returns 0 if the outputs were filled in, -1 if the read failed or the
+  frame is too short or not of interest.
+ */
+int ctdb_sys_read_tcp_packet(int s, void *private_data, 
+                       ctdb_sock_addr *src, ctdb_sock_addr *dst,
+                       uint32_t *ack_seq, uint32_t *seq)
+{
+       int ret;
+       size_t len;
+#define RCVPKTSIZE 100
+       char pkt[RCVPKTSIZE];
+       struct ether_header *eth;
+       struct ip *ip;
+       struct ip6_hdr *ip6;
+       struct tcphdr *tcp;
+
+       ret = recv(s, pkt, RCVPKTSIZE, MSG_TRUNC);
+       if (ret < 0) {
+               /* test the sign before any comparison with a size_t:
+                  converted to unsigned, -1 looks like a huge length */
+               return -1;
+       }
+
+       /* Only RCVPKTSIZE bytes can be in pkt, even if recv() reports a
+          larger (truncated) frame length */
+       len = (size_t)ret;
+       if (len > RCVPKTSIZE) {
+               len = RCVPKTSIZE;
+       }
+       if (len < sizeof(*eth)+sizeof(*ip)) {
+               return -1;
+       }
+
+       /* Ethernet */
+       eth = (struct ether_header *)pkt;
+
+       /* we want either IPv4 or IPv6 */
+       if (ntohs(eth->ether_type) == ETHERTYPE_IP) {
+               /* IP */
+               ip = (struct ip *)(eth+1);
+
+               /* We only want IPv4 packets */
+               if (ip->ip_v != 4) {
+                       return -1;
+               }
+               /* a header length below the fixed header is malformed */
+               if ((size_t)(ip->ip_hl*4) < sizeof(*ip)) {
+                       return -1;
+               }
+               /* Don't look at fragments */
+               if ((ntohs(ip->ip_off)&0x1fff) != 0) {
+                       return -1;
+               }
+               /* we only want TCP */
+               if (ip->ip_p != IPPROTO_TCP) {
+                       return -1;
+               }
+
+               /* make sure it's not a short packet: everything up to
+                  and including th_ack must be inside the buffer */
+               if (offsetof(struct tcphdr, th_ack) + 4 + 
+                   (ip->ip_hl*4) + sizeof(*eth) > len) {
+                       return -1;
+               }
+               /* TCP */
+               tcp = (struct tcphdr *)((ip->ip_hl*4) + (char *)ip);
+
+               /* tell the caller which one we've found */
+               src->ip.sin_family      = AF_INET;
+               src->ip.sin_addr.s_addr = ip->ip_src.s_addr;
+               src->ip.sin_port        = tcp->th_sport;
+               dst->ip.sin_family      = AF_INET;
+               dst->ip.sin_addr.s_addr = ip->ip_dst.s_addr;
+               dst->ip.sin_port        = tcp->th_dport;
+               *ack_seq                = tcp->th_ack;
+               *seq                    = tcp->th_seq;
+
+               return 0;
+       } else if (ntohs(eth->ether_type) == ETHERTYPE_IP6) {
+               /* IP6 */
+               ip6 = (struct ip6_hdr *)(eth+1);
+
+               /* make sure it's not a short packet: the fixed IPv6
+                  header plus the TCP header up to and including th_ack
+                  must be inside the buffer */
+               if (sizeof(*eth) + sizeof(*ip6) +
+                   offsetof(struct tcphdr, th_ack) + 4 > len) {
+                       return -1;
+               }
+
+               /* we only want TCP */
+               if (ip6->ip6_nxt != IPPROTO_TCP) {
+                       return -1;
+               }
+
+               /* TCP */
+               tcp = (struct tcphdr *)(ip6+1);
+
+               /* tell the caller which one we've found */
+               src->ip6.sin6_family = AF_INET6;
+               src->ip6.sin6_port   = tcp->th_sport;
+               src->ip6.sin6_addr   = ip6->ip6_src;
+
+               dst->ip6.sin6_family = AF_INET6;
+               dst->ip6.sin6_port   = tcp->th_dport;
+               dst->ip6.sin6_addr   = ip6->ip6_dst;
+
+               *ack_seq             = tcp->th_ack;
+               *seq                 = tcp->th_seq;
+
+               return 0;
+       }
+
+       return -1;
+}
+
+/*
+  Check whether interface 'iface' exists.
+
+  Not implemented in this file: 'iface' is not examined and the function
+  always returns true.
+ */
+bool ctdb_sys_check_iface_exists(const char *iface)
+{
+       /* FIXME FreeBSD: Interface always considered present */
+       return true;
+}
+
+/*
+  Look up the pid of the process at the other end of 'fd'.
+
+  Not implemented in this file: always returns 1 and never writes to
+  *peer_pid.
+ */
+int ctdb_get_peer_pid(const int fd, pid_t *peer_pid)
+{
+       /* FIXME FreeBSD: get_peer_pid not implemented */
+       return 1;
+}
+
+/*
+  Return the name of the executable of process 'pid', taken from the
+  target of the /proc/<pid>/exe symbolic link and cut at the first
+  space.
+
+  Returns a string allocated with strdup() that the caller must free(),
+  or NULL if the link cannot be read, yields no name, or memory is
+  exhausted.
+
+  NOTE(review): this relies on a Linux-style procfs; confirm that
+  /proc/<pid>/exe is what the target system provides.
+ */
+char *ctdb_get_process_name(pid_t pid)
+{
+       char path[32];
+       char buf[PATH_MAX];
+       char *ptr;
+       ssize_t n;
+
+       snprintf(path, sizeof(path), "/proc/%d/exe", (int)pid);
+
+       /* readlink() does not NUL-terminate; keep one byte free for the
+          terminator so that buf[n] is always inside the buffer */
+       n = readlink(path, buf, sizeof(buf) - 1);
+       if (n < 0) {
+               return NULL;
+       }
+
+       /* Remove any extra fields */
+       buf[n] = '\0';
+       ptr = strtok(buf, " ");
+       if (ptr == NULL) {
+               /* empty or all-blank link target: nothing to duplicate */
+               return NULL;
+       }
+       return strdup(ptr);
+}
+
+/*
+  Set the name of the calling process to 'name'.
+
+  Not implemented in this file: 'name' is ignored and the function
+  always returns -ENOSYS.
+ */
+int ctdb_set_process_name(const char *name)
+{
+       /* FIXME FreeBSD: set_process_name not implemented */
+       return -ENOSYS;
+}
+
+/*
+  Retrieve lock information for process 'req_pid' into *lock_info.
+
+  Not implemented in this file: always returns false and never writes
+  to *lock_info.
+ */
+bool ctdb_get_lock_info(pid_t req_pid, struct ctdb_lock_info *lock_info)
+{
+       /* FIXME FreeBSD: get_lock_info not implemented */
+       return false;
+}
+
+/*
+  Find the pid of the process blocking the lock described by 'reqlock'.
+
+  Not implemented in this file: always returns false and never writes
+  to *blocker_pid.
+ */
+bool ctdb_get_blocker_pid(struct ctdb_lock_info *reqlock, pid_t *blocker_pid)
+{
+       /* FIXME FreeBSD: get_blocker_pid not implemented */
+       return false;
+}
diff --git a/ctdb/common/system_gnu.c b/ctdb/common/system_gnu.c
new file mode 100644 (file)
index 0000000..2ab1399
--- /dev/null
@@ -0,0 +1,388 @@
+/* 
+   ctdb system specific code to manage raw sockets on GNU/Hurd
+
+   Copyright (C) Ronnie Sahlberg  2007
+   Copyright (C) Andrew Tridgell  2007
+   Copyright (C) Marc Dequènes (Duck) 2009
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+
+  This file is a copy of 'common/system_linux.c' adapted for Hurd needs,
+  and inspired by 'common/system_aix.c' for the pcap usage.
+*/
+
+#include "includes.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "../include/ctdb_private.h"
+#include <net/ethernet.h>
+#include <netinet/ip6.h>
+#include <net/if_arp.h>
+#include <pcap.h>
+
+
+#ifndef ETHERTYPE_IP6
+#define ETHERTYPE_IP6 0x86dd
+#endif
+
+/*
+  TCP checksum for a segment carried over IPv6.
+
+  The sum covers the IPv6 source and destination addresses, a
+  pseudo-header made of the segment length 'n' and the next-header
+  value, and the 'n' bytes at 'data'.  A computed value of 0 is
+  returned as 0xFFFF.
+*/
+static uint16_t tcp_checksum6(uint16_t *data, size_t n, struct ip6_hdr *ip6)
+{
+       uint32_t pseudo[2];
+       uint32_t acc;
+       uint16_t folded;
+       int pass;
+
+       /* length and next-header words of the pseudo-header */
+       pseudo[0] = htonl(n);
+       pseudo[1] = htonl(ip6->ip6_nxt);
+
+       acc  = uint16_checksum((uint16_t *)(void *)&ip6->ip6_src, 16);
+       acc += uint16_checksum((uint16_t *)(void *)&ip6->ip6_dst, 16);
+       acc += uint16_checksum((uint16_t *)pseudo, 8);
+       acc += uint16_checksum(data, n);
+
+       /* fold the carries back into the low 16 bits, twice */
+       for (pass = 0; pass < 2; pass++) {
+               acc = (acc & 0xFFFF) + (acc >> 16);
+       }
+
+       folded = htons(acc);
+       folded = ~folded;
+
+       return (folded == 0) ? 0xFFFF : folded;
+}
+
+/*
+  send gratuitous arp reply after we have taken over an ip address
+
+  addr is the address we are trying to claim
+  iface is the interface name we will be using to claim the address
+
+  Not implemented in this file: neither argument is used and the
+  function always returns -1.
+ */
+int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
+{
+       /* FIXME GNU/Hurd: We don't do gratuitous arp yet */
+       return -1;
+}
+
+
+/*
+  TCP checksum for a segment carried over IPv4 - assumes the data is a
+  multiple of 2 bytes long.
+
+  The sum covers the 'n' bytes at 'data', the IPv4 source and
+  destination addresses, the protocol number and the length 'n'.  A
+  computed value of 0 is returned as 0xFFFF.
+ */
+static uint16_t tcp_checksum(uint16_t *data, size_t n, struct iphdr *ip)
+{
+       uint32_t acc;
+       uint16_t folded;
+       int pass;
+
+       acc  = uint16_checksum(data, n);
+       acc += uint16_checksum((uint16_t *)(void *)&ip->saddr,
+                              sizeof(ip->saddr));
+       acc += uint16_checksum((uint16_t *)(void *)&ip->daddr,
+                              sizeof(ip->daddr));
+       /* remaining pseudo-header fields: protocol and TCP length */
+       acc += ip->protocol + n;
+
+       /* fold the carries back into the low 16 bits, twice */
+       for (pass = 0; pass < 2; pass++) {
+               acc = (acc & 0xFFFF) + (acc >> 16);
+       }
+
+       folded = htons(acc);
+       folded = ~folded;
+
+       return (folded == 0) ? 0xFFFF : folded;
+}
+
+/*
+  Send tcp segment from the specified IP/port to the specified
+  destination IP/port. 
+
+  This is used to trigger the receiving host into sending its own ACK,
+  which should trigger early detection of TCP reset by the client
+  after IP takeover
+
+  This can also be used to send RST segments (if rst is true) and also
+  if correct seq and ack numbers are provided.
+
+  dest, src  destination and source address/port; the address family
+             is taken from src
+  seq, ack   copied unchanged into the TCP header (no byte order
+             conversion is done here)
+  rst        non-zero to set RST in addition to ACK
+
+  Returns 0 if the complete packet was accepted by sendto(), -1 on any
+  failure or if src is neither AF_INET nor AF_INET6.
+ */
+int ctdb_sys_send_tcp(const ctdb_sock_addr *dest, 
+                     const ctdb_sock_addr *src,
+                     uint32_t seq, uint32_t ack, int rst)
+{
+       int s;
+       int ret;
+       uint32_t one = 1;
+       struct sockaddr_in6 dest6;
+       struct {
+               struct iphdr ip;
+               struct tcphdr tcp;
+       } ip4pkt;
+       struct {
+               struct ip6_hdr ip6;
+               struct tcphdr tcp;
+       } ip6pkt;
+
+       switch (src->ip.sin_family) {
+       case AF_INET:
+               ZERO_STRUCT(ip4pkt);
+               ip4pkt.ip.version  = 4;
+               ip4pkt.ip.ihl      = sizeof(ip4pkt.ip)/4;
+               ip4pkt.ip.tot_len  = htons(sizeof(ip4pkt));
+               ip4pkt.ip.ttl      = 255;
+               ip4pkt.ip.protocol = IPPROTO_TCP;
+               ip4pkt.ip.saddr    = src->ip.sin_addr.s_addr;
+               ip4pkt.ip.daddr    = dest->ip.sin_addr.s_addr;
+               ip4pkt.ip.check    = 0;
+
+               ip4pkt.tcp.source   = src->ip.sin_port;
+               ip4pkt.tcp.dest     = dest->ip.sin_port;
+               ip4pkt.tcp.seq      = seq;
+               ip4pkt.tcp.ack_seq  = ack;
+               ip4pkt.tcp.ack      = 1;
+               if (rst) {
+                       ip4pkt.tcp.rst      = 1;
+               }
+               ip4pkt.tcp.doff     = sizeof(ip4pkt.tcp)/4;
+               /* this makes it easier to spot in a sniffer */
+               ip4pkt.tcp.window   = htons(1234);
+               ip4pkt.tcp.check    = tcp_checksum((uint16_t *)&ip4pkt.tcp, sizeof(ip4pkt.tcp), &ip4pkt.ip);
+
+               /* open a raw socket to send this segment from.  The
+                  protocol argument of socket() is a plain host-order
+                  integer, so it must not go through htons() */
+               s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
+               if (s == -1) {
+                       DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket (%s)\n",
+                                strerror(errno)));
+                       return -1;
+               }
+
+               /* we supply the IP header ourselves */
+               ret = setsockopt(s, IPPROTO_IP, IP_HDRINCL, &one, sizeof(one));
+               if (ret != 0) {
+                       DEBUG(DEBUG_CRIT,(__location__ " failed to setup IP headers (%s)\n",
+                                strerror(errno)));
+                       close(s);
+                       return -1;
+               }
+
+               set_nonblocking(s);
+               set_close_on_exec(s);
+
+               ret = sendto(s, &ip4pkt, sizeof(ip4pkt), 0,
+                            (const struct sockaddr *)&dest->ip,
+                            sizeof(dest->ip));
+               close(s);
+               if (ret != (int)sizeof(ip4pkt)) {
+                       DEBUG(DEBUG_CRIT,(__location__ " failed sendto (%s)\n", strerror(errno)));
+                       return -1;
+               }
+               break;
+       case AF_INET6:
+               ZERO_STRUCT(ip6pkt);
+               ip6pkt.ip6.ip6_vfc  = 0x60;
+               /* payload is exactly one option-less TCP header */
+               ip6pkt.ip6.ip6_plen = htons(sizeof(ip6pkt.tcp));
+               ip6pkt.ip6.ip6_nxt  = IPPROTO_TCP;
+               ip6pkt.ip6.ip6_hlim = 64;
+               ip6pkt.ip6.ip6_src  = src->ip6.sin6_addr;
+               ip6pkt.ip6.ip6_dst  = dest->ip6.sin6_addr;
+
+               ip6pkt.tcp.source   = src->ip6.sin6_port;
+               ip6pkt.tcp.dest     = dest->ip6.sin6_port;
+               ip6pkt.tcp.seq      = seq;
+               ip6pkt.tcp.ack_seq  = ack;
+               ip6pkt.tcp.ack      = 1;
+               if (rst) {
+                       ip6pkt.tcp.rst      = 1;
+               }
+               ip6pkt.tcp.doff     = sizeof(ip6pkt.tcp)/4;
+               /* this makes it easier to spot in a sniffer */
+               ip6pkt.tcp.window   = htons(1234);
+               ip6pkt.tcp.check    = tcp_checksum6((uint16_t *)&ip6pkt.tcp, sizeof(ip6pkt.tcp), &ip6pkt.ip6);
+
+               s = socket(PF_INET6, SOCK_RAW, IPPROTO_RAW);
+               if (s == -1) {
+                       DEBUG(DEBUG_CRIT, (__location__ " Failed to open sending socket\n"));
+                       return -1;
+
+               }
+               /* sendto() doesn't like it if the port is set and the
+                  socket is in raw mode.  Send to a local copy of the
+                  destination with the port cleared rather than writing
+                  through the caller's const pointer.
+               */
+               dest6 = dest->ip6;
+               dest6.sin6_port = 0;
+               ret = sendto(s, &ip6pkt, sizeof(ip6pkt), 0,
+                            (const struct sockaddr *)&dest6,
+                            sizeof(dest6));
+               close(s);
+
+               if (ret != (int)sizeof(ip6pkt)) {
+                       DEBUG(DEBUG_CRIT,(__location__ " failed sendto (%s)\n", strerror(errno)));
+                       return -1;
+               }
+               break;
+
+       default:
+               DEBUG(DEBUG_CRIT,(__location__ " not an ipv4/v6 address\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+/* 
+   This function is used to open a capture handle (via pcap) to capture
+   from.
+
+   iface         name of the device to capture on
+   private_data  treated as a pcap_t ** : on success the pcap handle is
+                 stored through it so that it can later be handed to
+                 ctdb_sys_close_capture_socket()
+
+   Returns the file descriptor reported by pcap_fileno() on success,
+   -1 if the device could not be opened.
+ */
+int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
+{
+       pcap_t *pt;
+       /* pcap_open_live() writes its error text here, so it must be a
+          real buffer of at least PCAP_ERRBUF_SIZE bytes, never NULL */
+       char errbuf[PCAP_ERRBUF_SIZE];
+
+       errbuf[0] = '\0';
+
+       /* snaplen 100, not promiscuous, read timeout 0 */
+       pt=pcap_open_live(iface, 100, 0, 0, errbuf);
+       if (pt == NULL) {
+               DEBUG(DEBUG_CRIT,("Failed to open capture device %s (%s)\n",
+                                 iface, errbuf));
+               return -1;
+       }
+       *((pcap_t **)private_data) = pt;
+
+       return pcap_fileno(pt);
+}
+
+/*
+   Close a capture handle.
+
+   private_data is passed to pcap_close() as a pcap_t *.
+   Always returns 0.
+ */
+int ctdb_sys_close_capture_socket(void *private_data)
+{
+       pcap_close((pcap_t *)private_data);
+
+       return 0;
+}
+
+
+/*
+  called when the raw socket becomes readable
+
+  Reads one frame from 's' and, if it is an unfragmented IPv4 TCP
+  segment or an IPv6 packet whose next header is TCP, reports its
+  endpoints and sequence numbers.
+
+  s             descriptor to recv() the frame from
+  private_data  not used by this implementation
+  src, dst      receive the source/destination address and port, copied
+                unchanged from the IP and TCP headers
+  ack_seq, seq  receive the ack_seq / seq header fields, copied
+                unchanged
+
+  Returns 0 if the outputs were filled in, -1 if the read failed or the
+  frame is too short or not of interest.
+ */
+int ctdb_sys_read_tcp_packet(int s, void *private_data, 
+                       ctdb_sock_addr *src, ctdb_sock_addr *dst,
+                       uint32_t *ack_seq, uint32_t *seq)
+{
+       int ret;
+       size_t len;
+#define RCVPKTSIZE 100
+       char pkt[RCVPKTSIZE];
+       struct ether_header *eth;
+       struct iphdr *ip;
+       struct ip6_hdr *ip6;
+       struct tcphdr *tcp;
+
+       ret = recv(s, pkt, RCVPKTSIZE, MSG_TRUNC);
+       if (ret < 0) {
+               /* test the sign before any comparison with a size_t:
+                  converted to unsigned, -1 looks like a huge length */
+               return -1;
+       }
+
+       /* Only RCVPKTSIZE bytes can be in pkt, even if recv() reports a
+          larger (truncated) frame length */
+       len = (size_t)ret;
+       if (len > RCVPKTSIZE) {
+               len = RCVPKTSIZE;
+       }
+       if (len < sizeof(*eth)+sizeof(*ip)) {
+               return -1;
+       }
+
+       /* Ethernet */
+       eth = (struct ether_header *)pkt;
+
+       /* we want either IPv4 or IPv6 */
+       if (ntohs(eth->ether_type) == ETHERTYPE_IP) {
+               /* IP */
+               ip = (struct iphdr *)(eth+1);
+
+               /* We only want IPv4 packets */
+               if (ip->version != 4) {
+                       return -1;
+               }
+               /* a header length below the fixed header is malformed */
+               if ((size_t)(ip->ihl*4) < sizeof(*ip)) {
+                       return -1;
+               }
+               /* Don't look at fragments */
+               if ((ntohs(ip->frag_off)&0x1fff) != 0) {
+                       return -1;
+               }
+               /* we only want TCP */
+               if (ip->protocol != IPPROTO_TCP) {
+                       return -1;
+               }
+
+               /* make sure it's not a short packet: everything up to
+                  and including ack_seq must be inside the buffer */
+               if (offsetof(struct tcphdr, ack_seq) + 4 + 
+                   (ip->ihl*4) + sizeof(*eth) > len) {
+                       return -1;
+               }
+               /* TCP */
+               tcp = (struct tcphdr *)((ip->ihl*4) + (char *)ip);
+
+               /* tell the caller which one we've found */
+               src->ip.sin_family      = AF_INET;
+               src->ip.sin_addr.s_addr = ip->saddr;
+               src->ip.sin_port        = tcp->source;
+               dst->ip.sin_family      = AF_INET;
+               dst->ip.sin_addr.s_addr = ip->daddr;
+               dst->ip.sin_port        = tcp->dest;
+               *ack_seq                = tcp->ack_seq;
+               *seq                    = tcp->seq;
+
+               return 0;
+       } else if (ntohs(eth->ether_type) == ETHERTYPE_IP6) {
+               /* IP6 */
+               ip6 = (struct ip6_hdr *)(eth+1);
+
+               /* make sure it's not a short packet: the fixed IPv6
+                  header plus the TCP header up to and including ack_seq
+                  must be inside the buffer */
+               if (sizeof(*eth) + sizeof(*ip6) +
+                   offsetof(struct tcphdr, ack_seq) + 4 > len) {
+                       return -1;
+               }
+
+               /* we only want TCP */
+               if (ip6->ip6_nxt != IPPROTO_TCP) {
+                       return -1;
+               }
+
+               /* TCP */
+               tcp = (struct tcphdr *)(ip6+1);
+
+               /* tell the caller which one we've found */
+               src->ip6.sin6_family = AF_INET6;
+               src->ip6.sin6_port   = tcp->source;
+               src->ip6.sin6_addr   = ip6->ip6_src;
+
+               dst->ip6.sin6_family = AF_INET6;
+               dst->ip6.sin6_port   = tcp->dest;
+               dst->ip6.sin6_addr   = ip6->ip6_dst;
+
+               *ack_seq             = tcp->ack_seq;
+               *seq                 = tcp->seq;
+
+               return 0;
+       }
+
+       return -1;
+}
+
+/*
+  Check whether interface 'iface' exists.
+
+  Not implemented in this file: 'iface' is not examined and the function
+  always returns true.
+ */
+bool ctdb_sys_check_iface_exists(const char *iface)
+{
+       /* FIXME GNU/Hurd: Interface always considered present */
+       return true;
+}
+
+/*
+  Look up the pid of the process at the other end of 'fd'.
+
+  Not implemented in this file: always returns 1 and never writes to
+  *peer_pid.
+ */
+int ctdb_get_peer_pid(const int fd, pid_t *peer_pid)
+{
+       /* FIXME GNU/Hurd: get_peer_pid not implemented */
+       return 1;
+}
+
+/*
+  Return the name of process 'pid'.
+
+  Not implemented in this file: 'pid' is ignored and the function always
+  returns NULL.
+ */
+char *ctdb_get_process_name(pid_t pid)
+{
+       /* FIXME GNU/Hurd: get_process_name not implemented */
+       return NULL;
+}
+
+/*
+  Set the name of the calling process to 'name'.
+
+  Not implemented in this file: 'name' is ignored and the function
+  always returns -ENOSYS.
+ */
+int ctdb_set_process_name(const char *name)
+{
+       /* FIXME GNU/Hurd: set_process_name not implemented */
+       return -ENOSYS;
+}
+
+/*
+  Retrieve lock information for process 'req_pid' into *lock_info.
+
+  Not implemented in this file: always returns false and never writes
+  to *lock_info.
+ */
+bool ctdb_get_lock_info(pid_t req_pid, struct ctdb_lock_info *lock_info)
+{
+       /* FIXME GNU/Hurd: get_lock_info not implemented */
+       return false;
+}
+
+/*
+  Find the pid of the process blocking the lock described by 'reqlock'.
+
+  Not implemented in this file: always returns false and never writes
+  to *blocker_pid.
+ */
+bool ctdb_get_blocker_pid(struct ctdb_lock_info *reqlock, pid_t *blocker_pid)
+{
+       /* FIXME GNU/Hurd: get_blocker_pid not implemented */
+       return false;
+}
diff --git a/ctdb/common/system_kfreebsd.c b/ctdb/common/system_kfreebsd.c
new file mode 100644 (file)
index 0000000..41aa4d6
--- /dev/null
@@ -0,0 +1,401 @@
+/* 
+   ctdb system specific code to manage raw sockets on kFreeBSD
+
+   Copyright (C) Ronnie Sahlberg  2007
+   Copyright (C) Andrew Tridgell  2007
+   Copyright (C) Marc Dequènes (Duck) 2009
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+
+  This file is a copy of 'common/system_linux.c' adapted for Hurd^W kFreeBSD
+  needs, and inspired by 'common/system_aix.c' for the pcap usage.
+*/
+
+#include "includes.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "../include/ctdb_private.h"
+#include <net/ethernet.h>
+#include <netinet/ip6.h>
+#include <net/if_arp.h>
+#include <pcap.h>
+
+
+#ifndef ETHERTYPE_IP6
+#define ETHERTYPE_IP6 0x86dd
+#endif
+
+/*
+  Compute the TCP checksum for a segment carried over IPv6.
+
+  data/n describe the TCP segment; ip6 supplies the source address,
+  destination address and next-header value that make up the pseudo
+  header. A computed value of 0 is returned as 0xFFFF.
+*/
+static uint16_t tcp_checksum6(uint16_t *data, size_t n, struct ip6_hdr *ip6)
+{
+       uint32_t pseudo[2];
+       uint32_t acc;
+       uint16_t folded;
+
+       /* length and next-header words of the pseudo header */
+       pseudo[0] = htonl(n);
+       pseudo[1] = htonl(ip6->ip6_nxt);
+
+       acc  = uint16_checksum((uint16_t *)(void *)&ip6->ip6_src, 16);
+       acc += uint16_checksum((uint16_t *)(void *)&ip6->ip6_dst, 16);
+       acc += uint16_checksum((uint16_t *)pseudo, 8);
+       acc += uint16_checksum(data, n);
+
+       /* fold the carries back into the low 16 bits, twice */
+       acc = (acc >> 16) + (acc & 0xFFFF);
+       acc = (acc >> 16) + (acc & 0xFFFF);
+
+       folded = htons(acc);
+       folded = ~folded;
+
+       return (folded == 0) ? 0xFFFF : folded;
+}
+
+/*
+  Send a gratuitous arp reply after we have taken over an ip address.
+
+  addr is the address we are trying to claim
+  iface is the interface name we would be using to claim the address
+
+  FIXME kFreeBSD: We dont do gratuitous arp yet, so nothing is sent
+  and -1 is always returned.
+ */
+int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
+{
+       const int not_supported = -1;
+
+       (void)addr;
+       (void)iface;
+       return not_supported;
+}
+
+
+/*
+  Simple TCP checksum for a segment carried over IPv4 - assumes data
+  is a multiple of 2 bytes long.
+
+  data/n describe the TCP segment; ip supplies the source address,
+  destination address and protocol number that are added to the sum.
+  A computed value of 0 is returned as 0xFFFF.
+ */
+static uint16_t tcp_checksum(uint16_t *data, size_t n, struct iphdr *ip)
+{
+       uint32_t acc;
+       uint16_t folded;
+
+       acc  = uint16_checksum(data, n);
+       acc += uint16_checksum((uint16_t *)(void *)&ip->saddr,
+                              sizeof(ip->saddr));
+       acc += uint16_checksum((uint16_t *)(void *)&ip->daddr,
+                              sizeof(ip->daddr));
+       acc += ip->protocol + n;
+
+       /* fold the carries back into the low 16 bits, twice */
+       acc = (acc >> 16) + (acc & 0xFFFF);
+       acc = (acc >> 16) + (acc & 0xFFFF);
+
+       folded = htons(acc);
+       folded = ~folded;
+
+       return (folded == 0) ? 0xFFFF : folded;
+}
+
+/*
+  Send a bare TCP segment from the specified IP/port to the specified
+  destination IP/port.
+
+  This is used to trigger the receiving host into sending its own ACK,
+  which should trigger early detection of TCP reset by the client
+  after IP takeover.
+
+  This can also be used to send RST segments (if rst is true) and also
+  if correct seq and ack numbers are provided.
+
+  dest, src: destination and source address/port; the address family
+             of src selects the IPv4 or the IPv6 path
+  seq, ack:  stored unchanged in the TCP header (no byte swapping is
+             done here)
+  rst:       non-zero to set the RST flag in addition to ACK
+
+  Returns 0 when the whole segment was accepted by sendto(), -1 on any
+  failure (including an address family that is neither IPv4 nor IPv6).
+ */
+int ctdb_sys_send_tcp(const ctdb_sock_addr *dest, 
+                     const ctdb_sock_addr *src,
+                     uint32_t seq, uint32_t ack, int rst)
+{
+       int s;
+       int ret;
+       uint32_t one = 1;
+       struct sockaddr_in6 dest6;
+       struct {
+               struct iphdr ip;
+               struct tcphdr tcp;
+       } ip4pkt;
+       struct {
+               struct ip6_hdr ip6;
+               struct tcphdr tcp;
+       } ip6pkt;
+
+       switch (src->ip.sin_family) {
+       case AF_INET:
+               ZERO_STRUCT(ip4pkt);
+               ip4pkt.ip.version  = 4;
+               ip4pkt.ip.ihl      = sizeof(ip4pkt.ip)/4;
+               ip4pkt.ip.tot_len  = htons(sizeof(ip4pkt));
+               ip4pkt.ip.ttl      = 255;
+               ip4pkt.ip.protocol = IPPROTO_TCP;
+               ip4pkt.ip.saddr    = src->ip.sin_addr.s_addr;
+               ip4pkt.ip.daddr    = dest->ip.sin_addr.s_addr;
+               ip4pkt.ip.check    = 0;
+
+               ip4pkt.tcp.source   = src->ip.sin_port;
+               ip4pkt.tcp.dest     = dest->ip.sin_port;
+               ip4pkt.tcp.seq      = seq;
+               ip4pkt.tcp.ack_seq  = ack;
+               ip4pkt.tcp.ack      = 1;
+               if (rst) {
+                       ip4pkt.tcp.rst      = 1;
+               }
+               ip4pkt.tcp.doff     = sizeof(ip4pkt.tcp)/4;
+               /* this makes it easier to spot in a sniffer */
+               ip4pkt.tcp.window   = htons(1234);
+               ip4pkt.tcp.check    = tcp_checksum((uint16_t *)&ip4pkt.tcp, sizeof(ip4pkt.tcp), &ip4pkt.ip);
+
+               /* open a raw socket to send this segment from.
+                  The protocol argument of socket() is a plain host
+                  order integer, so it must not go through htons().
+               */
+               s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
+               if (s == -1) {
+                       DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket (%s)\n",
+                                strerror(errno)));
+                       return -1;
+               }
+
+               /* we supply the IP header ourselves */
+               ret = setsockopt(s, IPPROTO_IP, IP_HDRINCL, &one, sizeof(one));
+               if (ret != 0) {
+                       DEBUG(DEBUG_CRIT,(__location__ " failed to setup IP headers (%s)\n",
+                                strerror(errno)));
+                       close(s);
+                       return -1;
+               }
+
+               set_nonblocking(s);
+               set_close_on_exec(s);
+
+               ret = sendto(s, &ip4pkt, sizeof(ip4pkt), 0,
+                            (const struct sockaddr *)&dest->ip,
+                            sizeof(dest->ip));
+               close(s);
+               if (ret != sizeof(ip4pkt)) {
+                       DEBUG(DEBUG_CRIT,(__location__ " failed sendto (%s)\n", strerror(errno)));
+                       return -1;
+               }
+               break;
+       case AF_INET6:
+               ZERO_STRUCT(ip6pkt);
+               ip6pkt.ip6.ip6_vfc  = 0x60;
+               ip6pkt.ip6.ip6_plen = htons(20);
+               ip6pkt.ip6.ip6_nxt  = IPPROTO_TCP;
+               ip6pkt.ip6.ip6_hlim = 64;
+               ip6pkt.ip6.ip6_src  = src->ip6.sin6_addr;
+               ip6pkt.ip6.ip6_dst  = dest->ip6.sin6_addr;
+
+               ip6pkt.tcp.source   = src->ip6.sin6_port;
+               ip6pkt.tcp.dest     = dest->ip6.sin6_port;
+               ip6pkt.tcp.seq      = seq;
+               ip6pkt.tcp.ack_seq  = ack;
+               ip6pkt.tcp.ack      = 1;
+               if (rst) {
+                       ip6pkt.tcp.rst      = 1;
+               }
+               ip6pkt.tcp.doff     = sizeof(ip6pkt.tcp)/4;
+               /* this makes it easier to spot in a sniffer */
+               ip6pkt.tcp.window   = htons(1234);
+               ip6pkt.tcp.check    = tcp_checksum6((uint16_t *)&ip6pkt.tcp, sizeof(ip6pkt.tcp), &ip6pkt.ip6);
+
+               s = socket(PF_INET6, SOCK_RAW, IPPROTO_RAW);
+               if (s == -1) {
+                       DEBUG(DEBUG_CRIT, (__location__ " Failed to open sending socket\n"));
+                       return -1;
+
+               }
+               /* sendto() dont like if the port is set and the socket is
+                  in raw mode. Clear the port in a private copy of the
+                  destination so the caller's const address is never
+                  written to.
+               */
+               dest6 = dest->ip6;
+               dest6.sin6_port = 0;
+
+               ret = sendto(s, &ip6pkt, sizeof(ip6pkt), 0,
+                            (const struct sockaddr *)&dest6,
+                            sizeof(dest6));
+               close(s);
+
+               if (ret != sizeof(ip6pkt)) {
+                       DEBUG(DEBUG_CRIT,(__location__ " failed sendto (%s)\n", strerror(errno)));
+                       return -1;
+               }
+               break;
+
+       default:
+               DEBUG(DEBUG_CRIT,(__location__ " not an ipv4/v6 address\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+/* 
+   This function is used to open a capture handle on iface.
+
+   A pcap handle is opened on the interface with a snapshot length of
+   100 bytes and stored through private_data (which must point at
+   storage for a pcap_t pointer).
+
+   Returns the value of pcap_fileno() for the new handle, or -1 when
+   the device could not be opened.
+ */
+int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
+{
+       /* pcap_open_live() writes its error text here; it must not be NULL */
+       char errbuf[PCAP_ERRBUF_SIZE];
+       pcap_t *pt;
+
+       errbuf[0] = '\0';
+       pt=pcap_open_live(iface, 100, 0, 0, errbuf);
+       if (pt == NULL) {
+               DEBUG(DEBUG_CRIT,("Failed to open capture device %s (%s)\n", iface, errbuf));
+               return -1;
+       }
+       *((pcap_t **)private_data) = pt;
+
+       return pcap_fileno(pt);
+}
+
+/* This function is used to close the capture socket.
+
+   private_data is the pcap handle stored by
+   ctdb_sys_open_capture_socket(). A NULL handle (nothing was opened)
+   is ignored instead of being passed on to pcap_close().
+
+   Always returns 0.
+ */
+int ctdb_sys_close_capture_socket(void *private_data)
+{
+       pcap_t *pt = (pcap_t *)private_data;
+
+       if (pt != NULL) {
+               pcap_close(pt);
+       }
+       return 0;
+}
+
+
+/*
+  called when the raw socket becomes readable
+ */
+int ctdb_sys_read_tcp_packet(int s, void *private_data, 
+                       ctdb_sock_addr *src, ctdb_sock_addr *dst,
+                       uint32_t *ack_seq, uint32_t *seq)
+{
+       int ret;
+#define RCVPKTSIZE 100
+       char pkt[RCVPKTSIZE];
+       struct ether_header *eth;
+       struct iphdr *ip;
+       struct ip6_hdr *ip6;
+       struct tcphdr *tcp;
+
+       ret = recv(s, pkt, RCVPKTSIZE, MSG_TRUNC);
+       if (ret < sizeof(*eth)+sizeof(*ip)) {
+               return -1;
+       }
+
+       /* Ethernet */
+       eth = (struct ether_header *)pkt;
+
+       /* we want either IPv4 or IPv6 */
+       if (ntohs(eth->ether_type) == ETHERTYPE_IP) {
+               /* IP */
+               ip = (struct iphdr *)(eth+1);
+
+               /* We only want IPv4 packets */
+               if (ip->version != 4) {
+                       return -1;
+               }
+               /* Dont look at fragments */
+               if ((ntohs(ip->frag_off)&0x1fff) != 0) {
+                       return -1;
+               }
+               /* we only want TCP */
+               if (ip->protocol != IPPROTO_TCP) {
+                       return -1;
+               }
+
+               /* make sure its not a short packet */
+               if (offsetof(struct tcphdr, ack_seq) + 4 + 
+                   (ip->ihl*4) + sizeof(*eth) > ret) {
+                       return -1;
+               }
+               /* TCP */
+               tcp = (struct tcphdr *)((ip->ihl*4) + (char *)ip);
+
+               /* tell the caller which one we've found */
+               src->ip.sin_family      = AF_INET;
+               src->ip.sin_addr.s_addr = ip->saddr;
+               src->ip.sin_port        = tcp->source;
+               dst->ip.sin_family      = AF_INET;
+               dst->ip.sin_addr.s_addr = ip->daddr;
+               dst->ip.sin_port        = tcp->dest;
+               *ack_seq                = tcp->ack_seq;
+               *seq                    = tcp->seq;
+
+               return 0;
+       } else if (ntohs(eth->ether_type) == ETHERTYPE_IP6) {
+               /* IP6 */
+               ip6 = (struct ip6_hdr *)(eth+1);
+
+               /* we only want TCP */
+               if (ip6->ip6_nxt != IPPROTO_TCP) {
+                       return -1;
+               }
+
+               /* TCP */
+               tcp = (struct tcphdr *)(ip6+1);
+
+               /* tell the caller which one we've found */
+               src->ip6.sin6_family = AF_INET6;
+               src->ip6.sin6_port   = tcp->source;
+               src->ip6.sin6_addr   = ip6->ip6_src;
+
+               dst->ip6.sin6_family = AF_INET6;
+               dst->ip6.sin6_port   = tcp->dest;
+               dst->ip6.sin6_addr   = ip6->ip6_dst;
+
+               *ack_seq             = tcp->ack_seq;
+               *seq                 = tcp->seq;
+
+               return 0;
+       }
+
+       return -1;
+}
+
+/*
+  Report whether the named interface exists.
+
+  FIXME kFreeBSD: no lookup is performed, the interface is always
+  considered present.
+ */
+bool ctdb_sys_check_iface_exists(const char *iface)
+{
+       const bool present = true;
+
+       (void)iface;
+       return present;
+}
+
+/*
+  Look up the pid of the process at the other end of fd.
+
+  FIXME kFreeBSD: get_peer_pid not implemented. *peer_pid is left
+  untouched and the constant 1 is returned.
+ */
+int ctdb_get_peer_pid(const int fd, pid_t *peer_pid)
+{
+       const int not_implemented = 1;
+
+       (void)fd;
+       (void)peer_pid;
+       return not_implemented;
+}
+
+/*
+  Return the name of the executable of process pid, as given by the
+  /proc/<pid>/exe symlink, cut at the first space.
+
+  The result is allocated with strdup() and must be freed by the
+  caller. NULL is returned when the link cannot be read or when it
+  yields no name at all.
+ */
+char *ctdb_get_process_name(pid_t pid)
+{
+       char path[32];
+       char buf[PATH_MAX];
+       char *ptr;
+       ssize_t n;
+
+       snprintf(path, sizeof(path), "/proc/%d/exe", (int)pid);
+
+       /* readlink() does not terminate the result and may fill the
+          whole buffer, so keep one byte back for the '\0' */
+       n = readlink(path, buf, sizeof(buf)-1);
+       if (n < 0) {
+               return NULL;
+       }
+       buf[n] = '\0';
+
+       /* Remove any extra fields */
+       ptr = strtok(buf, " ");
+       if (ptr == NULL) {
+               /* empty or all-space target: strdup(NULL) would crash */
+               return NULL;
+       }
+       return strdup(ptr);
+}
+
+/*
+  Change the name of the calling process.
+
+  FIXME kFreeBSD: set_process_name not implemented, -ENOSYS is always
+  returned.
+ */
+int ctdb_set_process_name(const char *name)
+{
+       (void)name;
+
+       return -(ENOSYS);
+}
+
+/*
+  Fetch information about the lock that req_pid is waiting for.
+
+  FIXME kFreeBSD: get_lock_info not implemented. lock_info is left
+  untouched and false is always returned.
+ */
+bool ctdb_get_lock_info(pid_t req_pid, struct ctdb_lock_info *lock_info)
+{
+       const bool found = false;
+
+       (void)req_pid;
+       (void)lock_info;
+       return found;
+}
+
+/*
+  Find the pid of the process holding the lock described by reqlock.
+
+  FIXME kFreeBSD: get_blocker_pid not implemented. *blocker_pid is
+  left untouched and false is always returned.
+ */
+bool ctdb_get_blocker_pid(struct ctdb_lock_info *reqlock, pid_t *blocker_pid)
+{
+       const bool found = false;
+
+       (void)reqlock;
+       (void)blocker_pid;
+       return found;
+}
diff --git a/ctdb/common/system_linux.c b/ctdb/common/system_linux.c
new file mode 100644 (file)
index 0000000..84daba4
--- /dev/null
@@ -0,0 +1,768 @@
+/* 
+   ctdb system specific code to manage raw sockets on linux
+
+   Copyright (C) Ronnie Sahlberg  2007
+   Copyright (C) Andrew Tridgell  2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "../include/ctdb_private.h"
+#include <netinet/if_ether.h>
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <net/if_arp.h>
+#include <netpacket/packet.h>
+#include <sys/prctl.h>
+
+#ifndef ETHERTYPE_IP6
+#define ETHERTYPE_IP6 0x86dd
+#endif
+
+/*
+  Compute the TCP checksum for a segment carried over IPv6.
+
+  The sum covers the IPv6 source and destination addresses, a pseudo
+  header built from the length n and the next-header value of ip6,
+  and the n bytes at data. A computed value of 0 is returned as
+  0xFFFF.
+*/
+static uint16_t tcp_checksum6(uint16_t *data, size_t n, struct ip6_hdr *ip6)
+{
+       uint32_t len_and_proto[2];
+       uint32_t total;
+       uint16_t cksum;
+       int pass;
+
+       len_and_proto[0] = htonl(n);
+       len_and_proto[1] = htonl(ip6->ip6_nxt);
+
+       total  = uint16_checksum((uint16_t *)(void *)&ip6->ip6_src, 16);
+       total += uint16_checksum((uint16_t *)(void *)&ip6->ip6_dst, 16);
+       total += uint16_checksum((uint16_t *)len_and_proto, 8);
+       total += uint16_checksum(data, n);
+
+       /* two rounds of end-around carry */
+       for (pass = 0; pass < 2; pass++) {
+               total = (total & 0xFFFF) + (total >> 16);
+       }
+
+       cksum = htons(total);
+       cksum = ~cksum;
+       if (cksum != 0) {
+               return cksum;
+       }
+       return 0xFFFF;
+}
+
+/*
+  send gratuitous arp reply after we have taken over an ip address
+
+  saddr is the address we are trying to claim
+  iface is the interface name we will be using to claim the address
+ */
+int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
+{
+       int s, ret;
+       struct sockaddr_ll sall;
+       struct ether_header *eh;
+       struct arphdr *ah;
+       struct ip6_hdr *ip6;
+       struct nd_neighbor_solicit *nd_ns;
+       struct ifreq if_hwaddr;
+       unsigned char buffer[78]; /* ipv6 neigh solicitation size */
+       char *ptr;
+       char bdcast[] = {0xff,0xff,0xff,0xff,0xff,0xff};
+       struct ifreq ifr;
+
+       ZERO_STRUCT(sall);
+
+       switch (addr->ip.sin_family) {
+       case AF_INET:
+               s = socket(PF_PACKET, SOCK_RAW, htons(ETHERTYPE_ARP));
+               if (s == -1){
+                       DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
+                       return -1;
+               }
+
+               DEBUG(DEBUG_DEBUG, (__location__ " Created SOCKET FD:%d for sending arp\n", s));
+               strncpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
+               if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
+                       DEBUG(DEBUG_CRIT,(__location__ " interface '%s' not found\n", iface));
+                       close(s);
+                       return -1;
+               }
+
+               /* get the mac address */
+               strcpy(if_hwaddr.ifr_name, iface);
+               ret = ioctl(s, SIOCGIFHWADDR, &if_hwaddr);
+               if ( ret < 0 ) {
+                       close(s);
+                       DEBUG(DEBUG_CRIT,(__location__ " ioctl failed\n"));
+                       return -1;
+               }
+               if (ARPHRD_LOOPBACK == if_hwaddr.ifr_hwaddr.sa_family) {
+                       DEBUG(DEBUG_DEBUG,("Ignoring loopback arp request\n"));
+                       close(s);
+                       return 0;
+               }
+               if (if_hwaddr.ifr_hwaddr.sa_family != AF_LOCAL) {
+                       close(s);
+                       errno = EINVAL;
+                       DEBUG(DEBUG_CRIT,(__location__ " not an ethernet address family (0x%x)\n",
+                                if_hwaddr.ifr_hwaddr.sa_family));
+                       return -1;
+               }
+
+
+               memset(buffer, 0 , 64);
+               eh = (struct ether_header *)buffer;
+               memset(eh->ether_dhost, 0xff, ETH_ALEN);
+               memcpy(eh->ether_shost, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
+               eh->ether_type = htons(ETHERTYPE_ARP);
+       
+               ah = (struct arphdr *)&buffer[sizeof(struct ether_header)];
+               ah->ar_hrd = htons(ARPHRD_ETHER);
+               ah->ar_pro = htons(ETH_P_IP);
+               ah->ar_hln = ETH_ALEN;
+               ah->ar_pln = 4;
+
+               /* send a gratious arp */
+               ah->ar_op  = htons(ARPOP_REQUEST);
+               ptr = (char *)&ah[1];
+               memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
+               ptr+=ETH_ALEN;
+               memcpy(ptr, &addr->ip.sin_addr, 4);       
+               ptr+=4;
+               memset(ptr, 0, ETH_ALEN); 
+               ptr+=ETH_ALEN;
+               memcpy(ptr, &addr->ip.sin_addr, 4);       
+               ptr+=4;
+       
+               sall.sll_family = AF_PACKET;
+               sall.sll_halen = 6;
+               memcpy(&sall.sll_addr[0], bdcast, sall.sll_halen);
+               sall.sll_protocol = htons(ETH_P_ALL);
+               sall.sll_ifindex = ifr.ifr_ifindex;
+               ret = sendto(s, buffer, 64, 0, (struct sockaddr *)&sall, sizeof(sall));
+               if (ret < 0 ){
+                       close(s);
+                       DEBUG(DEBUG_CRIT,(__location__ " failed sendto\n"));
+                       return -1;
+               }       
+
+               /* send unsolicited arp reply broadcast */
+               ah->ar_op  = htons(ARPOP_REPLY);
+               ptr = (char *)&ah[1];
+               memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
+               ptr+=ETH_ALEN;
+               memcpy(ptr, &addr->ip.sin_addr, 4);       
+               ptr+=4;
+               memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
+               ptr+=ETH_ALEN;
+               memcpy(ptr, &addr->ip.sin_addr, 4);       
+               ptr+=4;
+
+               ret = sendto(s, buffer, 64, 0, (struct sockaddr *)&sall, sizeof(sall));
+               if (ret < 0 ){
+                       DEBUG(DEBUG_CRIT,(__location__ " failed sendto\n"));
+                       close(s);
+                       return -1;
+               }
+
+               close(s);
+               break;
+       case AF_INET6:
+               s = socket(PF_PACKET, SOCK_RAW, htons(ETHERTYPE_ARP));
+               if (s == -1){
+                       DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
+                       return -1;
+               }
+
+               DEBUG(DEBUG_DEBUG, (__location__ " Created SOCKET FD:%d for sending arp\n", s));
+               strncpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
+               if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
+                       DEBUG(DEBUG_CRIT,(__location__ " interface '%s' not found\n", iface));
+                       close(s);
+                       return -1;
+               }
+
+               /* get the mac address */
+               strcpy(if_hwaddr.ifr_name, iface);
+               ret = ioctl(s, SIOCGIFHWADDR, &if_hwaddr);
+               if ( ret < 0 ) {
+                       close(s);
+                       DEBUG(DEBUG_CRIT,(__location__ " ioctl failed\n"));
+                       return -1;
+               }
+               if (ARPHRD_LOOPBACK == if_hwaddr.ifr_hwaddr.sa_family) {
+                       DEBUG(DEBUG_DEBUG,("Ignoring loopback arp request\n"));
+                       close(s);
+                       return 0;
+               }
+               if (if_hwaddr.ifr_hwaddr.sa_family != AF_LOCAL) {
+                       close(s);
+                       errno = EINVAL;
+                       DEBUG(DEBUG_CRIT,(__location__ " not an ethernet address family (0x%x)\n",
+                                if_hwaddr.ifr_hwaddr.sa_family));
+                       return -1;
+               }
+
+               memset(buffer, 0 , sizeof(buffer));
+               eh = (struct ether_header *)buffer;
+               memset(eh->ether_dhost, 0xff, ETH_ALEN);
+               memcpy(eh->ether_shost, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
+               eh->ether_type = htons(ETHERTYPE_IP6);
+
+               ip6 = (struct ip6_hdr *)(eh+1);
+               ip6->ip6_vfc  = 0x60;
+               ip6->ip6_plen = htons(sizeof(*nd_ns));
+               ip6->ip6_nxt  = IPPROTO_ICMPV6;
+               ip6->ip6_hlim = 255;
+               ip6->ip6_dst  = addr->ip6.sin6_addr;
+
+               nd_ns = (struct nd_neighbor_solicit *)(ip6+1);
+               nd_ns->nd_ns_type = ND_NEIGHBOR_SOLICIT;
+               nd_ns->nd_ns_code = 0;
+               nd_ns->nd_ns_reserved = 0;
+               nd_ns->nd_ns_target = addr->ip6.sin6_addr;
+
+               nd_ns->nd_ns_cksum = tcp_checksum6((uint16_t *)nd_ns, ntohs(ip6->ip6_plen), ip6);
+
+               sall.sll_family = AF_PACKET;
+               sall.sll_halen = 6;
+               memcpy(&sall.sll_addr[0], bdcast, sall.sll_halen);
+               sall.sll_protocol = htons(ETH_P_ALL);
+               sall.sll_ifindex = ifr.ifr_ifindex;
+               ret = sendto(s, buffer, 78, 0, (struct sockaddr *)&sall, sizeof(sall));
+               if (ret < 0 ){
+                       close(s);
+                       DEBUG(DEBUG_CRIT,(__location__ " failed sendto\n"));
+                       return -1;
+               }       
+
+               close(s);
+               break;
+       default:
+               DEBUG(DEBUG_CRIT,(__location__ " not an ipv4/ipv6 address (family is %u)\n", addr->ip.sin_family));
+               return -1;
+       }
+
+       return 0;
+}
+
+
+/*
+  Simple TCP checksum for a segment carried over IPv4 - assumes data
+  is a multiple of 2 bytes long.
+
+  The sum covers the n bytes at data, the source and destination
+  addresses of ip, and the protocol number plus n. A computed value
+  of 0 is returned as 0xFFFF.
+ */
+static uint16_t tcp_checksum(uint16_t *data, size_t n, struct iphdr *ip)
+{
+       uint32_t total;
+       uint16_t cksum;
+       int pass;
+
+       total  = uint16_checksum(data, n);
+       total += uint16_checksum((uint16_t *)(void *)&ip->saddr,
+                                sizeof(ip->saddr));
+       total += uint16_checksum((uint16_t *)(void *)&ip->daddr,
+                                sizeof(ip->daddr));
+       total += ip->protocol + n;
+
+       /* two rounds of end-around carry */
+       for (pass = 0; pass < 2; pass++) {
+               total = (total & 0xFFFF) + (total >> 16);
+       }
+
+       cksum = htons(total);
+       cksum = ~cksum;
+       if (cksum != 0) {
+               return cksum;
+       }
+       return 0xFFFF;
+}
+
+/*
+  Send a bare TCP segment from the specified IP/port to the specified
+  destination IP/port.
+
+  This is used to trigger the receiving host into sending its own ACK,
+  which should trigger early detection of TCP reset by the client
+  after IP takeover.
+
+  This can also be used to send RST segments (if rst is true) and also
+  if correct seq and ack numbers are provided.
+
+  dest, src: destination and source address/port; the address family
+             of src selects the IPv4 or the IPv6 path
+  seq, ack:  stored unchanged in the TCP header (no byte swapping is
+             done here)
+  rst:       non-zero to set the RST flag in addition to ACK
+
+  Returns 0 when the whole segment was accepted by sendto(), -1 on any
+  failure (including an address family that is neither IPv4 nor IPv6).
+ */
+int ctdb_sys_send_tcp(const ctdb_sock_addr *dest, 
+                     const ctdb_sock_addr *src,
+                     uint32_t seq, uint32_t ack, int rst)
+{
+       int s;
+       int ret;
+       uint32_t one = 1;
+       struct sockaddr_in6 dest6;
+       struct {
+               struct iphdr ip;
+               struct tcphdr tcp;
+       } ip4pkt;
+       struct {
+               struct ip6_hdr ip6;
+               struct tcphdr tcp;
+       } ip6pkt;
+
+       switch (src->ip.sin_family) {
+       case AF_INET:
+               ZERO_STRUCT(ip4pkt);
+               ip4pkt.ip.version  = 4;
+               ip4pkt.ip.ihl      = sizeof(ip4pkt.ip)/4;
+               ip4pkt.ip.tot_len  = htons(sizeof(ip4pkt));
+               ip4pkt.ip.ttl      = 255;
+               ip4pkt.ip.protocol = IPPROTO_TCP;
+               ip4pkt.ip.saddr    = src->ip.sin_addr.s_addr;
+               ip4pkt.ip.daddr    = dest->ip.sin_addr.s_addr;
+               ip4pkt.ip.check    = 0;
+
+               ip4pkt.tcp.source   = src->ip.sin_port;
+               ip4pkt.tcp.dest     = dest->ip.sin_port;
+               ip4pkt.tcp.seq      = seq;
+               ip4pkt.tcp.ack_seq  = ack;
+               ip4pkt.tcp.ack      = 1;
+               if (rst) {
+                       ip4pkt.tcp.rst      = 1;
+               }
+               ip4pkt.tcp.doff     = sizeof(ip4pkt.tcp)/4;
+               /* this makes it easier to spot in a sniffer */
+               ip4pkt.tcp.window   = htons(1234);
+               ip4pkt.tcp.check    = tcp_checksum((uint16_t *)&ip4pkt.tcp, sizeof(ip4pkt.tcp), &ip4pkt.ip);
+
+               /* open a raw socket to send this segment from.
+                  The protocol argument of socket() is a plain host
+                  order integer, so it must not go through htons():
+                  on little endian hosts that yields a value outside
+                  the valid protocol range.
+               */
+               s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
+               if (s == -1) {
+                       DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket (%s)\n",
+                                strerror(errno)));
+                       return -1;
+               }
+
+               /* we supply the IP header ourselves */
+               ret = setsockopt(s, SOL_IP, IP_HDRINCL, &one, sizeof(one));
+               if (ret != 0) {
+                       DEBUG(DEBUG_CRIT,(__location__ " failed to setup IP headers (%s)\n",
+                                strerror(errno)));
+                       close(s);
+                       return -1;
+               }
+
+               set_nonblocking(s);
+               set_close_on_exec(s);
+
+               ret = sendto(s, &ip4pkt, sizeof(ip4pkt), 0,
+                            (const struct sockaddr *)&dest->ip,
+                            sizeof(dest->ip));
+               close(s);
+               if (ret != sizeof(ip4pkt)) {
+                       DEBUG(DEBUG_CRIT,(__location__ " failed sendto (%s)\n", strerror(errno)));
+                       return -1;
+               }
+               break;
+       case AF_INET6:
+               ZERO_STRUCT(ip6pkt);
+               ip6pkt.ip6.ip6_vfc  = 0x60;
+               ip6pkt.ip6.ip6_plen = htons(20);
+               ip6pkt.ip6.ip6_nxt  = IPPROTO_TCP;
+               ip6pkt.ip6.ip6_hlim = 64;
+               ip6pkt.ip6.ip6_src  = src->ip6.sin6_addr;
+               ip6pkt.ip6.ip6_dst  = dest->ip6.sin6_addr;
+
+               ip6pkt.tcp.source   = src->ip6.sin6_port;
+               ip6pkt.tcp.dest     = dest->ip6.sin6_port;
+               ip6pkt.tcp.seq      = seq;
+               ip6pkt.tcp.ack_seq  = ack;
+               ip6pkt.tcp.ack      = 1;
+               if (rst) {
+                       ip6pkt.tcp.rst      = 1;
+               }
+               ip6pkt.tcp.doff     = sizeof(ip6pkt.tcp)/4;
+               /* this makes it easier to spot in a sniffer */
+               ip6pkt.tcp.window   = htons(1234);
+               ip6pkt.tcp.check    = tcp_checksum6((uint16_t *)&ip6pkt.tcp, sizeof(ip6pkt.tcp), &ip6pkt.ip6);
+
+               s = socket(PF_INET6, SOCK_RAW, IPPROTO_RAW);
+               if (s == -1) {
+                       DEBUG(DEBUG_CRIT, (__location__ " Failed to open sending socket\n"));
+                       return -1;
+
+               }
+               /* sendto() dont like if the port is set and the socket is
+                  in raw mode. Clear the port in a private copy of the
+                  destination so the caller's const address is never
+                  written to.
+               */
+               dest6 = dest->ip6;
+               dest6.sin6_port = 0;
+
+               ret = sendto(s, &ip6pkt, sizeof(ip6pkt), 0,
+                            (const struct sockaddr *)&dest6,
+                            sizeof(dest6));
+               close(s);
+
+               if (ret != sizeof(ip6pkt)) {
+                       DEBUG(DEBUG_CRIT,(__location__ " failed sendto (%s)\n", strerror(errno)));
+                       return -1;
+               }
+               break;
+
+       default:
+               DEBUG(DEBUG_CRIT,(__location__ " not an ipv4/v6 address\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+/* 
+   Open the raw packet socket that tcp tickle traffic is captured from.
+
+   The socket receives all protocols (ETH_P_ALL) and is switched to
+   non-blocking, close-on-exec mode. Neither iface nor private_data is
+   used by this implementation.
+
+   Returns the socket, or -1 if it could not be created.
+ */
+int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
+{
+       int capture_fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
+
+       if (capture_fd != -1) {
+               DEBUG(DEBUG_DEBUG, (__location__ " Created RAW SOCKET FD:%d for tcp tickle\n", capture_fd));
+
+               set_nonblocking(capture_fd);
+               set_close_on_exec(capture_fd);
+
+               return capture_fd;
+       }
+
+       DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
+       return -1;
+}
+
+/* 
+   Hook for any extra cleanup needed when a capture socket is closed.
+   Nothing is needed here, so private_data is ignored and 0 is
+   returned.
+   Note that the socket itself is closed automatically in the caller.
+ */
+int ctdb_sys_close_capture_socket(void *private_data)
+{
+       const int ok = 0;
+
+       (void)private_data;
+       return ok;
+}
+
+
+/*
+  called when the raw socket becomes readable
+ */
+int ctdb_sys_read_tcp_packet(int s, void *private_data, 
+                       ctdb_sock_addr *src, ctdb_sock_addr *dst,
+                       uint32_t *ack_seq, uint32_t *seq)
+{
+       int ret;
+#define RCVPKTSIZE 100
+       char pkt[RCVPKTSIZE];
+       struct ether_header *eth;
+       struct iphdr *ip;
+       struct ip6_hdr *ip6;
+       struct tcphdr *tcp;
+
+       ret = recv(s, pkt, RCVPKTSIZE, MSG_TRUNC);
+       if (ret < sizeof(*eth)+sizeof(*ip)) {
+               return -1;
+       }
+
+       /* Ethernet */
+       eth = (struct ether_header *)pkt;
+
+       /* we want either IPv4 or IPv6 */
+       if (ntohs(eth->ether_type) == ETHERTYPE_IP) {
+               /* IP */
+               ip = (struct iphdr *)(eth+1);
+
+               /* We only want IPv4 packets */
+               if (ip->version != 4) {
+                       return -1;
+               }
+               /* Dont look at fragments */
+               if ((ntohs(ip->frag_off)&0x1fff) != 0) {
+                       return -1;
+               }
+               /* we only want TCP */
+               if (ip->protocol != IPPROTO_TCP) {
+                       return -1;
+               }
+
+               /* make sure its not a short packet */
+               if (offsetof(struct tcphdr, ack_seq) + 4 + 
+                   (ip->ihl*4) + sizeof(*eth) > ret) {
+                       return -1;
+               }
+               /* TCP */
+               tcp = (struct tcphdr *)((ip->ihl*4) + (char *)ip);
+
+               /* tell the caller which one we've found */
+               src->ip.sin_family      = AF_INET;
+               src->ip.sin_addr.s_addr = ip->saddr;
+               src->ip.sin_port        = tcp->source;
+               dst->ip.sin_family      = AF_INET;
+               dst->ip.sin_addr.s_addr = ip->daddr;
+               dst->ip.sin_port        = tcp->dest;
+               *ack_seq                = tcp->ack_seq;
+               *seq                    = tcp->seq;
+
+               return 0;
+       } else if (ntohs(eth->ether_type) == ETHERTYPE_IP6) {
+               /* IP6 */
+               ip6 = (struct ip6_hdr *)(eth+1);
+
+               /* we only want TCP */
+               if (ip6->ip6_nxt != IPPROTO_TCP) {
+                       return -1;
+               }
+
+               /* TCP */
+               tcp = (struct tcphdr *)(ip6+1);
+
+               /* tell the caller which one we've found */
+               src->ip6.sin6_family = AF_INET6;
+               src->ip6.sin6_port   = tcp->source;
+               src->ip6.sin6_addr   = ip6->ip6_src;
+
+               dst->ip6.sin6_family = AF_INET6;
+               dst->ip6.sin6_port   = tcp->dest;
+               dst->ip6.sin6_addr   = ip6->ip6_dst;
+
+               *ack_seq             = tcp->ack_seq;
+               *seq                 = tcp->seq;
+
+               return 0;
+       }
+
+       return -1;
+}
+
+
+/*
+ * Check whether a network interface with the given name exists.
+ *
+ * Returns false only when the SIOCGIFINDEX ioctl fails with ENODEV.
+ * Every other outcome, including failure to open the probe socket,
+ * returns true.
+ */
+bool ctdb_sys_check_iface_exists(const char *iface)
+{
+       int s;
+       struct ifreq ifr;
+
+       s = socket(PF_PACKET, SOCK_RAW, 0);
+       if (s == -1){
+               /* We don't know if the interface exists, so assume yes */
+               DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
+               return true;
+       }
+
+       /* Zero the request and leave room for the terminator so that
+          ifr_name is always a NUL-terminated string, even when iface
+          is as long as or longer than the field. */
+       memset(&ifr, 0, sizeof(ifr));
+       strncpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name)-1);
+       if (ioctl(s, SIOCGIFINDEX, &ifr) < 0 && errno == ENODEV) {
+               DEBUG(DEBUG_CRIT,(__location__ " interface '%s' not found\n", iface));
+               close(s);
+               return false;
+       }
+       close(s);
+
+       return true;
+}
+
+/*
+ * Fetch the process ID of the peer connected to socket 'fd' using the
+ * SO_PEERCRED socket option.
+ *
+ * Returns the getsockopt() result: 0 on success, in which case
+ * *peer_pid is set, or -1 on failure, in which case *peer_pid is left
+ * untouched.
+ */
+int ctdb_get_peer_pid(const int fd, pid_t *peer_pid)
+{
+       struct ucred cr;
+       socklen_t crl = sizeof(struct ucred);
+       int ret;
+
+       /* Keep the assignment separate from the comparison: writing
+          "ret = getsockopt(...) == 0" stores the result of the
+          comparison in ret, not the return code. */
+       ret = getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cr, &crl);
+       if (ret == 0) {
+               *peer_pid = cr.pid;
+       }
+       return ret;
+}
+
+/*
+ * Find the process name from process ID
+ *
+ * Resolves the /proc/<pid>/exe symlink and returns a newly allocated
+ * copy of its target up to the first space; the caller must free()
+ * it.  Returns NULL when the link cannot be read, when the target
+ * contains no non-space characters, or when strdup() fails.
+ */
+char *ctdb_get_process_name(pid_t pid)
+{
+       char path[32];
+       char buf[PATH_MAX];
+       char *ptr;
+       ssize_t n;
+
+       snprintf(path, sizeof(path), "/proc/%d/exe", (int)pid);
+       /* readlink() does not NUL-terminate and may fill the whole
+          buffer, so reserve one byte for the terminator */
+       n = readlink(path, buf, sizeof(buf)-1);
+       if (n < 0) {
+               return NULL;
+       }
+       buf[n] = '\0';
+
+       /* Remove any extra fields: keep the first space-delimited token.
+          Done with strspn/strcspn so that no global strtok() state
+          is disturbed. */
+       ptr = buf + strspn(buf, " ");
+       if (*ptr == '\0') {
+               return NULL;
+       }
+       ptr[strcspn(ptr, " ")] = '\0';
+
+       return strdup(ptr);
+}
+
+/*
+ * Set process name
+ *
+ * Passes at most the first 15 characters of 'name' to
+ * prctl(PR_SET_NAME) and returns the prctl() result.
+ */
+int ctdb_set_process_name(const char *name)
+{
+       char comm[16];
+       const size_t last = sizeof(comm) - 1;
+       int rc;
+
+       /* copy at most 'last' characters and always terminate */
+       strncpy(comm, name, last);
+       comm[last] = '\0';
+
+       rc = prctl(PR_SET_NAME, (unsigned long)comm, 0, 0, 0);
+       return rc;
+}
+
+/*
+ * Parse one line of /proc/locks.
+ *
+ * 'line' is tokenised in place.  Only POSIX READ/WRITE entries are
+ * accepted.  On success the owning process ID is stored in *pid, the
+ * lock details in *curlock, and true is returned.  On failure false is
+ * returned and *curlock may have been partially written.
+ */
+static bool parse_proc_locks_line(char *line, pid_t *pid,
+                                 struct ctdb_lock_info *curlock)
+{
+       char *tok;
+       char *state;
+
+       /* output of /proc/locks
+        *
+        * lock assigned
+        * 1: POSIX  ADVISORY  WRITE 25945 fd:00:6424820 212 212
+        *
+        * lock waiting
+        * 1: -> POSIX  ADVISORY  WRITE 25946 fd:00:6424820 212 212
+        */
+
+       /* "<id>:" - value not used */
+       tok = strtok_r(line, " ", &state);
+       if (tok == NULL) {
+               return false;
+       }
+
+       /* optional "->" marker in front of the lock class */
+       tok = strtok_r(NULL, " ", &state);
+       if (tok == NULL) {
+               return false;
+       }
+       if (strcmp(tok, "->") != 0) {
+               curlock->waiting = false;
+       } else {
+               curlock->waiting = true;
+               tok = strtok_r(NULL, " ", &state);
+       }
+
+       /* lock class: only POSIX is accepted */
+       if (tok == NULL) {
+               return false;
+       }
+       if (strcmp(tok, "POSIX") != 0) {
+               return false;
+       }
+
+       /* ADVISORY column - must be present, value not inspected */
+       tok = strtok_r(NULL, " ", &state);
+       if (tok == NULL) {
+               return false;
+       }
+
+       /* access mode: READ or WRITE */
+       tok = strtok_r(NULL, " ", &state);
+       if (tok == NULL) {
+               return false;
+       }
+       if (strcmp(tok, "READ") == 0) {
+               curlock->read_only = true;
+       } else if (strcmp(tok, "WRITE") == 0) {
+               curlock->read_only = false;
+       } else {
+               return false;
+       }
+
+       /* owning process ID */
+       tok = strtok_r(NULL, " ", &state);
+       if (tok == NULL) {
+               return false;
+       }
+       *pid = atoi(tok);
+
+       /* MAJOR:MINOR:INODE - the first two parts are skipped */
+       tok = strtok_r(NULL, " :", &state);
+       if (tok == NULL) {
+               return false;
+       }
+       tok = strtok_r(NULL, " :", &state);
+       if (tok == NULL) {
+               return false;
+       }
+       tok = strtok_r(NULL, " :", &state);
+       if (tok == NULL) {
+               return false;
+       }
+       curlock->inode = atol(tok);
+
+       /* start offset */
+       tok = strtok_r(NULL, " ", &state);
+       if (tok == NULL) {
+               return false;
+       }
+       curlock->start = atol(tok);
+
+       /* end offset: a token beginning with "EOF" is stored as (off_t)-1 */
+       tok = strtok_r(NULL, " ", &state);
+       if (tok == NULL) {
+               return false;
+       }
+       if (strncmp(tok, "EOF", 3) != 0) {
+               curlock->end = atol(tok);
+       } else {
+               curlock->end = (off_t)-1;
+       }
+
+       return true;
+}
+
+/*
+ * Find information of lock being waited on for given process ID
+ *
+ * Scans /proc/locks for the first entry that belongs to 'req_pid' and
+ * is marked as waiting.  When one is found it is copied to *lock_info
+ * and true is returned.  Returns false if /proc/locks cannot be opened
+ * or no such entry exists; *lock_info is then left untouched.
+ */
+bool ctdb_get_lock_info(pid_t req_pid, struct ctdb_lock_info *lock_info)
+{
+       FILE *fp;
+       struct ctdb_lock_info curlock;
+       pid_t pid;
+       char buf[1024];
+       bool status = false;
+
+       if ((fp = fopen("/proc/locks", "r")) == NULL) {
+               DEBUG(DEBUG_ERR, ("Failed to read locks information\n"));
+               return false;
+       }
+       while (fgets(buf, sizeof(buf), fp) != NULL) {
+               /* lines that are not POSIX READ/WRITE entries are skipped */
+               if (! parse_proc_locks_line(buf, &pid, &curlock)) {
+                       continue;
+               }
+               if (pid == req_pid && curlock.waiting) {
+                       *lock_info = curlock;
+                       status = true;
+                       break;
+               }
+       }
+       fclose(fp);
+
+       return status;
+}
+
+/*
+ * Find process ID which holds an overlapping byte lock for required
+ * inode and byte range.
+ *
+ * Scans /proc/locks for the first granted (non-waiting) entry on the
+ * same inode as 'reqlock' whose byte range overlaps it.  An end offset
+ * of (off_t)-1, which is how the parser records "EOF", is treated as
+ * an unbounded range.  On a match the holder's pid is stored in
+ * *blocker_pid and true is returned; otherwise false is returned and
+ * *blocker_pid is left untouched.
+ */
+bool ctdb_get_blocker_pid(struct ctdb_lock_info *reqlock, pid_t *blocker_pid)
+{
+       FILE *fp;
+       struct ctdb_lock_info curlock;
+       pid_t pid;
+       char buf[1024];
+       bool status = false;
+       /* (off_t)-1 is the "extends to end of file" marker, not an offset */
+       const bool req_to_eof = (reqlock->end == (off_t)-1);
+
+       if ((fp = fopen("/proc/locks", "r")) == NULL) {
+               DEBUG(DEBUG_ERR, ("Failed to read locks information\n"));
+               return false;
+       }
+       while (fgets(buf, sizeof(buf), fp) != NULL) {
+               if (! parse_proc_locks_line(buf, &pid, &curlock)) {
+                       continue;
+               }
+
+               if (curlock.waiting) {
+                       continue;
+               }
+
+               if (curlock.inode != reqlock->inode) {
+                       continue;
+               }
+
+               /* Starts after the requested range ends; impossible when
+                  the requested range runs to EOF */
+               if (!req_to_eof && curlock.start > reqlock->end) {
+                       continue;
+               }
+
+               /* Ends before the requested range starts; impossible when
+                  the current lock runs to EOF */
+               if (curlock.end != (off_t)-1 &&
+                   curlock.end < reqlock->start) {
+                       continue;
+               }
+
+               *blocker_pid = pid;
+               status = true;
+               break;
+       }
+       fclose(fp);
+
+       return status;
+}
diff --git a/ctdb/config.guess b/ctdb/config.guess
new file mode 100644 (file)
index 0000000..0aee604
--- /dev/null
@@ -0,0 +1,1535 @@
+#! /bin/sh
+# Attempt to guess a canonical system name.
+#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
+#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+#   2011, 2012, 2013 Free Software Foundation, Inc.
+
+timestamp='2012-12-30'
+
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that
+# program.  This Exception is an additional permission under section 7
+# of the GNU General Public License, version 3 ("GPLv3").
+#
+# Originally written by Per Bothner.
+#
+# You can get the latest version of this script from:
+# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
+#
+# Please send patches with a ChangeLog entry to config-patches@gnu.org.
+
+
+# Base name of this script (everything up to the last '/' removed);
+# used in the messages below.
+me=`echo "$0" | sed -e 's,.*/,,'`
+
+# Text printed for --help.
+usage="\
+Usage: $0 [OPTION]
+
+Output the configuration name of the system \`$me' is run on.
+
+Operation modes:
+  -h, --help         print this help, then exit
+  -t, --time-stamp   print date of last modification, then exit
+  -v, --version      print version number, then exit
+
+Report bugs and patches to <config-patches@gnu.org>."
+
+# Text printed for --version; embeds $timestamp.
+version="\
+GNU config.guess ($timestamp)
+
+Originally written by Per Bothner.
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
+2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
+2012, 2013 Free Software Foundation, Inc.
+
+This is free software; see the source for copying conditions.  There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
+
+# Hint appended to the error messages below.
+help="
+Try \`$me --help' for more information."
+
+# Parse command line
+# Every branch either exits or breaks out of the loop, so only the
+# first argument is ever inspected.
+while test $# -gt 0 ; do
+  case $1 in
+    --time-stamp | --time* | -t )
+       echo "$timestamp" ; exit ;;
+    --version | -v )
+       echo "$version" ; exit ;;
+    --help | --h* | -h )
+       echo "$usage"; exit ;;
+    -- )     # Stop option processing
+       shift; break ;;
+    - )        # Use stdin as input.
+       break ;;
+    -* )
+       echo "$me: invalid option $1$help" >&2
+       exit 1 ;;
+    * )
+       break ;;
+  esac
+done
+
+# Any argument still left at this point is rejected.
+if test $# != 0; then
+  echo "$me: too many arguments$help" >&2
+  exit 1
+fi
+
+trap 'exit 1' 1 2 15
+
+# CC_FOR_BUILD -- compiler used by this script. Note that the use of a
+# compiler to aid in system detection is discouraged as it requires
+# temporary files to be created and, as you can see below, it is a
+# headache to deal with in a portable fashion.
+
+# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. We still
+# use `HOST_CC' if defined, but it is deprecated.
+
+# Portable tmp directory creation inspired by the Autoconf team.
+
+set_cc_for_build='
+trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ;
+trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ;
+: ${TMPDIR=/tmp} ;
+ { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } ||
+ { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } ||
+ { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } ||
+ { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ;
+dummy=$tmp/dummy ;
+tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ;
+case $CC_FOR_BUILD,$HOST_CC,$CC in
+ ,,)    echo "int x;" > $dummy.c ;
+       for c in cc gcc c89 c99 ; do
+         if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then
+            CC_FOR_BUILD="$c"; break ;
+         fi ;
+       done ;
+       if test x"$CC_FOR_BUILD" = x ; then
+         CC_FOR_BUILD=no_compiler_found ;
+       fi
+       ;;
+ ,,*)   CC_FOR_BUILD=$CC ;;
+ ,*,*)  CC_FOR_BUILD=$HOST_CC ;;
+esac ; set_cc_for_build= ;'
+
+# This is needed to find uname on a Pyramid OSx when run in the BSD universe.
+# (ghazi@noc.rutgers.edu 1994-08-24)
+if (test -f /.attbin/uname) >/dev/null 2>&1 ; then
+       PATH=$PATH:/.attbin ; export PATH
+fi
+
+# Capture the four uname fields once.  Each one falls back to the
+# literal "unknown" when the uname invocation fails.
+UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown
+UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
+UNAME_SYSTEM=`(uname -s) 2>/dev/null`  || UNAME_SYSTEM=unknown
+UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
+
+# Note: order is significant - the case branches are not exclusive.
+
+case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
+    *:NetBSD:*:*)
+       # NetBSD (nbsd) targets should (where applicable) match one or
+       # more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*,
+       # *-*-netbsdecoff* and *-*-netbsd*.  For targets that recently
+       # switched to ELF, *-*-netbsd* would select the old
+       # object file format.  This provides both forward
+       # compatibility and a consistent mechanism for selecting the
+       # object file format.
+       #
+       # Note: NetBSD doesn't particularly care about the vendor
+       # portion of the name.  We always set it to "unknown".
+       sysctl="sysctl -n hw.machine_arch"
+       UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \
+           /usr/sbin/$sysctl 2>/dev/null || echo unknown)`
+       case "${UNAME_MACHINE_ARCH}" in
+           armeb) machine=armeb-unknown ;;
+           arm*) machine=arm-unknown ;;
+           sh3el) machine=shl-unknown ;;
+           sh3eb) machine=sh-unknown ;;
+           sh5el) machine=sh5le-unknown ;;
+           *) machine=${UNAME_MACHINE_ARCH}-unknown ;;
+       esac
+       # The Operating System including object format, if it has switched
+       # to ELF recently, or will in the future.
+       case "${UNAME_MACHINE_ARCH}" in
+           arm*|i386|m68k|ns32k|sh3*|sparc|vax)
+               eval $set_cc_for_build
+               if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
+                       | grep -q __ELF__
+               then
+                   # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout).
+                   # Return netbsd for either.  FIX?
+                   os=netbsd
+               else
+                   os=netbsdelf
+               fi
+               ;;
+           *)
+               os=netbsd
+               ;;
+       esac
+       # The OS release
+       # Debian GNU/NetBSD machines have a different userland, and
+       # thus, need a distinct triplet. However, they do not need
+       # kernel version information, so it can be replaced with a
+       # suitable tag, in the style of linux-gnu.
+       case "${UNAME_VERSION}" in
+           Debian*)
+               release='-gnu'
+               ;;
+           *)
+               release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
+               ;;
+       esac
+       # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
+       # contains redundant information, the shorter form:
+       # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
+       echo "${machine}-${os}${release}"
+       exit ;;
+    *:Bitrig:*:*)
+       UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'`
+       echo ${UNAME_MACHINE_ARCH}-unknown-bitrig${UNAME_RELEASE}
+       exit ;;
+    *:OpenBSD:*:*)
+       UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
+       echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE}
+       exit ;;
+    *:ekkoBSD:*:*)
+       echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE}
+       exit ;;
+    *:SolidBSD:*:*)
+       echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE}
+       exit ;;
+    macppc:MirBSD:*:*)
+       echo powerpc-unknown-mirbsd${UNAME_RELEASE}
+       exit ;;
+    *:MirBSD:*:*)
+       echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE}
+       exit ;;
+    alpha:OSF1:*:*)
+       case $UNAME_RELEASE in
+       *4.0)
+               UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'`
+               ;;
+       *5.*)
+               UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'`
+               ;;
+       esac
+       # According to Compaq, /usr/sbin/psrinfo has been available on
+       # OSF/1 and Tru64 systems produced since 1995.  I hope that
+       # covers most systems running today.  This code pipes the CPU
+       # types through head -n 1, so we only detect the type of CPU 0.
+       ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^  The alpha \(.*\) processor.*$/\1/p' | head -n 1`
+       case "$ALPHA_CPU_TYPE" in
+           "EV4 (21064)")
+               UNAME_MACHINE="alpha" ;;
+           "EV4.5 (21064)")
+               UNAME_MACHINE="alpha" ;;
+           "LCA4 (21066/21068)")
+               UNAME_MACHINE="alpha" ;;
+           "EV5 (21164)")
+               UNAME_MACHINE="alphaev5" ;;
+           "EV5.6 (21164A)")
+               UNAME_MACHINE="alphaev56" ;;
+           "EV5.6 (21164PC)")
+               UNAME_MACHINE="alphapca56" ;;
+           "EV5.7 (21164PC)")
+               UNAME_MACHINE="alphapca57" ;;
+           "EV6 (21264)")
+               UNAME_MACHINE="alphaev6" ;;
+           "EV6.7 (21264A)")
+               UNAME_MACHINE="alphaev67" ;;
+           "EV6.8CB (21264C)")
+               UNAME_MACHINE="alphaev68" ;;
+           "EV6.8AL (21264B)")
+               UNAME_MACHINE="alphaev68" ;;
+           "EV6.8CX (21264D)")
+               UNAME_MACHINE="alphaev68" ;;
+           "EV6.9A (21264/EV69A)")
+               UNAME_MACHINE="alphaev69" ;;
+           "EV7 (21364)")
+               UNAME_MACHINE="alphaev7" ;;
+           "EV7.9 (21364A)")
+               UNAME_MACHINE="alphaev79" ;;
+       esac
+       # A Pn.n version is a patched version.
+       # A Vn.n version is a released version.
+       # A Tn.n version is a released field test version.
+       # A Xn.n version is an unreleased experimental baselevel.
+       # 1.2 uses "1.2" for uname -r.
+       echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
+       # Reset EXIT trap before exiting to avoid spurious non-zero exit code.
+       exitcode=$?
+       trap '' 0
+       exit $exitcode ;;
+    Alpha\ *:Windows_NT*:*)
+       # How do we know it's Interix rather than the generic POSIX subsystem?
+       # Should we change UNAME_MACHINE based on the output of uname instead
+       # of the specific Alpha model?
+       echo alpha-pc-interix
+       exit ;;
+    21064:Windows_NT:50:3)
+       echo alpha-dec-winnt3.5
+       exit ;;
+    Amiga*:UNIX_System_V:4.0:*)
+       echo m68k-unknown-sysv4
+       exit ;;
+    *:[Aa]miga[Oo][Ss]:*:*)
+       echo ${UNAME_MACHINE}-unknown-amigaos
+       exit ;;
+    *:[Mm]orph[Oo][Ss]:*:*)
+       echo ${UNAME_MACHINE}-unknown-morphos
+       exit ;;
+    *:OS/390:*:*)
+       echo i370-ibm-openedition
+       exit ;;
+    *:z/VM:*:*)
+       echo s390-ibm-zvmoe
+       exit ;;
+    *:OS400:*:*)
+       echo powerpc-ibm-os400
+       exit ;;
+    arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
+       echo arm-acorn-riscix${UNAME_RELEASE}
+       exit ;;
+    arm*:riscos:*:*|arm*:RISCOS:*:*)
+       echo arm-unknown-riscos
+       exit ;;
+    SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
+       echo hppa1.1-hitachi-hiuxmpp
+       exit ;;
+    Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*)
+       # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE.
+       if test "`(/bin/universe) 2>/dev/null`" = att ; then
+               echo pyramid-pyramid-sysv3
+       else
+               echo pyramid-pyramid-bsd
+       fi
+       exit ;;
+    NILE*:*:*:dcosx)
+       echo pyramid-pyramid-svr4
+       exit ;;
+    DRS?6000:unix:4.0:6*)
+       echo sparc-icl-nx6
+       exit ;;
+    DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*)
+       case `/usr/bin/uname -p` in
+           sparc) echo sparc-icl-nx7; exit ;;
+       esac ;;
+    s390x:SunOS:*:*)
+       echo ${UNAME_MACHINE}-ibm-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+       exit ;;
+    sun4H:SunOS:5.*:*)
+       echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+       exit ;;
+    sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
+       echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+       exit ;;
+    i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*)
+       echo i386-pc-auroraux${UNAME_RELEASE}
+       exit ;;
+    i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*)
+       eval $set_cc_for_build
+       SUN_ARCH="i386"
+       # If there is a compiler, see if it is configured for 64-bit objects.
+       # Note that the Sun cc does not turn __LP64__ into 1 like gcc does.
+       # This test works for both compilers.
+       if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
+           if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \
+               (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
+               grep IS_64BIT_ARCH >/dev/null
+           then
+               SUN_ARCH="x86_64"
+           fi
+       fi
+       echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+       exit ;;
+    sun4*:SunOS:6*:*)
+       # According to config.sub, this is the proper way to canonicalize
+       # SunOS6.  Hard to guess exactly what SunOS6 will be like, but
+       # it's likely to be more like Solaris than SunOS4.
+       echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+       exit ;;
+    sun4*:SunOS:*:*)
+       case "`/usr/bin/arch -k`" in
+           Series*|S4*)
+               UNAME_RELEASE=`uname -v`
+               ;;
+       esac
+       # Japanese Language versions have a version number like `4.1.3-JL'.
+       echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'`
+       exit ;;
+    sun3*:SunOS:*:*)
+       echo m68k-sun-sunos${UNAME_RELEASE}
+       exit ;;
+    sun*:*:4.2BSD:*)
+       UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
+       test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3
+       case "`/bin/arch`" in
+           sun3)
+               echo m68k-sun-sunos${UNAME_RELEASE}
+               ;;
+           sun4)
+               echo sparc-sun-sunos${UNAME_RELEASE}
+               ;;
+       esac
+       exit ;;
+    aushp:SunOS:*:*)
+       echo sparc-auspex-sunos${UNAME_RELEASE}
+       exit ;;
+    # The situation for MiNT is a little confusing.  The machine name
+    # can be virtually everything (everything which is not
+    # "atarist" or "atariste" at least should have a processor
+    # > m68000).  The system name ranges from "MiNT" over "FreeMiNT"
+    # to the lowercase version "mint" (or "freemint").  Finally
+    # the system name "TOS" denotes a system which is actually not
+    # MiNT.  But MiNT is downward compatible to TOS, so this should
+    # be no problem.
+    atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*)
+       echo m68k-atari-mint${UNAME_RELEASE}
+       exit ;;
+    atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*)
+       echo m68k-atari-mint${UNAME_RELEASE}
+       exit ;;
+    *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*)
+       echo m68k-atari-mint${UNAME_RELEASE}
+       exit ;;
+    milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*)
+       echo m68k-milan-mint${UNAME_RELEASE}
+       exit ;;
+    hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*)
+       echo m68k-hades-mint${UNAME_RELEASE}
+       exit ;;
+    *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*)
+       echo m68k-unknown-mint${UNAME_RELEASE}
+       exit ;;
+    m68k:machten:*:*)
+       echo m68k-apple-machten${UNAME_RELEASE}
+       exit ;;
+    powerpc:machten:*:*)
+       echo powerpc-apple-machten${UNAME_RELEASE}
+       exit ;;
+    RISC*:Mach:*:*)
+       echo mips-dec-mach_bsd4.3
+       exit ;;
+    RISC*:ULTRIX:*:*)
+       echo mips-dec-ultrix${UNAME_RELEASE}
+       exit ;;
+    VAX*:ULTRIX*:*:*)
+       echo vax-dec-ultrix${UNAME_RELEASE}
+       exit ;;
+    2020:CLIX:*:* | 2430:CLIX:*:*)
+       echo clipper-intergraph-clix${UNAME_RELEASE}
+       exit ;;
+    mips:*:*:UMIPS | mips:*:*:RISCos)
+       eval $set_cc_for_build
+       sed 's/^        //' << EOF >$dummy.c
+#ifdef __cplusplus
+#include <stdio.h>  /* for printf() prototype */
+       int main (int argc, char *argv[]) {
+#else
+       int main (argc, argv) int argc; char *argv[]; {
+#endif
+       #if defined (host_mips) && defined (MIPSEB)
+       #if defined (SYSTYPE_SYSV)
+         printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0);
+       #endif
+       #if defined (SYSTYPE_SVR4)
+         printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0);
+       #endif
+       #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD)
+         printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0);
+       #endif
+       #endif
+         exit (-1);
+       }
+EOF
+       $CC_FOR_BUILD -o $dummy $dummy.c &&
+         dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` &&
+         SYSTEM_NAME=`$dummy $dummyarg` &&
+           { echo "$SYSTEM_NAME"; exit; }
+       echo mips-mips-riscos${UNAME_RELEASE}
+       exit ;;
+    Motorola:PowerMAX_OS:*:*)
+       echo powerpc-motorola-powermax
+       exit ;;
+    Motorola:*:4.3:PL8-*)
+       echo powerpc-harris-powermax
+       exit ;;
+    Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*)
+       echo powerpc-harris-powermax
+       exit ;;
+    Night_Hawk:Power_UNIX:*:*)
+       echo powerpc-harris-powerunix
+       exit ;;
+    m88k:CX/UX:7*:*)
+       echo m88k-harris-cxux7
+       exit ;;
+    m88k:*:4*:R4*)
+       echo m88k-motorola-sysv4
+       exit ;;
+    m88k:*:3*:R3*)
+       echo m88k-motorola-sysv3
+       exit ;;
+    AViiON:dgux:*:*)
+       # DG/UX returns AViiON for all architectures
+       UNAME_PROCESSOR=`/usr/bin/uname -p`
+       if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ]
+       then
+           if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \
+              [ ${TARGET_BINARY_INTERFACE}x = x ]
+           then
+               echo m88k-dg-dgux${UNAME_RELEASE}
+           else
+               echo m88k-dg-dguxbcs${UNAME_RELEASE}
+           fi
+       else
+           echo i586-dg-dgux${UNAME_RELEASE}
+       fi
+       exit ;;
+    M88*:DolphinOS:*:*)        # DolphinOS (SVR3)
+       echo m88k-dolphin-sysv3
+       exit ;;
+    M88*:*:R3*:*)
+       # Delta 88k system running SVR3
+       echo m88k-motorola-sysv3
+       exit ;;
+    XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3)
+       echo m88k-tektronix-sysv3
+       exit ;;
+    Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD)
+       echo m68k-tektronix-bsd
+       exit ;;
+    *:IRIX*:*:*)
+       echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'`
+       exit ;;
+    ????????:AIX?:[12].1:2)   # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX.
+       echo romp-ibm-aix     # uname -m gives an 8 hex-code CPU id
+       exit ;;               # Note that: echo "'`uname -s`'" gives 'AIX '
+    i*86:AIX:*:*)
+       echo i386-ibm-aix
+       exit ;;
+    ia64:AIX:*:*)
+       if [ -x /usr/bin/oslevel ] ; then
+               IBM_REV=`/usr/bin/oslevel`
+       else
+               IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
+       fi
+       echo ${UNAME_MACHINE}-ibm-aix${IBM_REV}
+       exit ;;
+    *:AIX:2:3)
+       if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then
+               eval $set_cc_for_build
+               sed 's/^                //' << EOF >$dummy.c
+               #include <sys/systemcfg.h>
+
+               main()
+                       {
+                       if (!__power_pc())
+                               exit(1);
+                       puts("powerpc-ibm-aix3.2.5");
+                       exit(0);
+                       }
+EOF
+               if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy`
+               then
+                       echo "$SYSTEM_NAME"
+               else
+                       echo rs6000-ibm-aix3.2.5
+               fi
+       elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then
+               echo rs6000-ibm-aix3.2.4
+       else
+               echo rs6000-ibm-aix3.2
+       fi
+       exit ;;
+    *:AIX:*:[4567])
+       IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'`
+       if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then
+               IBM_ARCH=rs6000
+       else
+               IBM_ARCH=powerpc
+       fi
+       if [ -x /usr/bin/oslevel ] ; then
+               IBM_REV=`/usr/bin/oslevel`
+       else
+               IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
+       fi
+       echo ${IBM_ARCH}-ibm-aix${IBM_REV}
+       exit ;;
+    *:AIX:*:*)
+       echo rs6000-ibm-aix
+       exit ;;
+    ibmrt:4.4BSD:*|romp-ibm:BSD:*)
+       echo romp-ibm-bsd4.4
+       exit ;;
+    ibmrt:*BSD:*|romp-ibm:BSD:*)            # covers RT/PC BSD and
+       echo romp-ibm-bsd${UNAME_RELEASE}   # 4.3 with uname added to
+       exit ;;                             # report: romp-ibm BSD 4.3
+    *:BOSX:*:*)
+       echo rs6000-bull-bosx
+       exit ;;
+    DPX/2?00:B.O.S.:*:*)
+       echo m68k-bull-sysv3
+       exit ;;
+    9000/[34]??:4.3bsd:1.*:*)
+       echo m68k-hp-bsd
+       exit ;;
+    hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*)
+       echo m68k-hp-bsd4.4
+       exit ;;
+    9000/[34678]??:HP-UX:*:*)
+       HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
+       case "${UNAME_MACHINE}" in
+           9000/31? )            HP_ARCH=m68000 ;;
+           9000/[34]?? )         HP_ARCH=m68k ;;
+           9000/[678][0-9][0-9])
+               if [ -x /usr/bin/getconf ]; then
+                   sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
+                   sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
+                   case "${sc_cpu_version}" in
+                     523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0
+                     528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1
+                     532)                      # CPU_PA_RISC2_0
+                       case "${sc_kernel_bits}" in
+                         32) HP_ARCH="hppa2.0n" ;;
+                         64) HP_ARCH="hppa2.0w" ;;
+                         '') HP_ARCH="hppa2.0" ;;   # HP-UX 10.20
+                       esac ;;
+                   esac
+               fi
+               if [ "${HP_ARCH}" = "" ]; then
+                   eval $set_cc_for_build
+                   sed 's/^            //' << EOF >$dummy.c
+
+               #define _HPUX_SOURCE
+               #include <stdlib.h>
+               #include <unistd.h>
+
+               int main ()
+               {
+               #if defined(_SC_KERNEL_BITS)
+                   long bits = sysconf(_SC_KERNEL_BITS);
+               #endif
+                   long cpu  = sysconf (_SC_CPU_VERSION);
+
+                   switch (cpu)
+                       {
+                       case CPU_PA_RISC1_0: puts ("hppa1.0"); break;
+                       case CPU_PA_RISC1_1: puts ("hppa1.1"); break;
+                       case CPU_PA_RISC2_0:
+               #if defined(_SC_KERNEL_BITS)
+                           switch (bits)
+                               {
+                               case 64: puts ("hppa2.0w"); break;
+                               case 32: puts ("hppa2.0n"); break;
+                               default: puts ("hppa2.0"); break;
+                               } break;
+               #else  /* !defined(_SC_KERNEL_BITS) */
+                           puts ("hppa2.0"); break;
+               #endif
+                       default: puts ("hppa1.0"); break;
+                       }
+                   exit (0);
+               }
+EOF
+                   (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
+                   test -z "$HP_ARCH" && HP_ARCH=hppa
+               fi ;;
+       esac
+       if [ ${HP_ARCH} = "hppa2.0w" ]
+       then
+           eval $set_cc_for_build
+
+           # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating
+           # 32-bit code.  hppa64-hp-hpux* has the same kernel and a compiler
+           # generating 64-bit code.  GNU and HP use different nomenclature:
+           #
+           # $ CC_FOR_BUILD=cc ./config.guess
+           # => hppa2.0w-hp-hpux11.23
+           # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess
+           # => hppa64-hp-hpux11.23
+
+           if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) |
+               grep -q __LP64__
+           then
+               HP_ARCH="hppa2.0w"
+           else
+               HP_ARCH="hppa64"
+           fi
+       fi
+       echo ${HP_ARCH}-hp-hpux${HPUX_REV}
+       exit ;;
+    ia64:HP-UX:*:*)
+       HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'`
+       echo ia64-hp-hpux${HPUX_REV}
+       exit ;;
+    3050*:HI-UX:*:*)
+       eval $set_cc_for_build
+       sed 's/^        //' << EOF >$dummy.c
+       #include <unistd.h>
+       int
+       main ()
+       {
+         long cpu = sysconf (_SC_CPU_VERSION);
+         /* The order matters, because CPU_IS_HP_MC68K erroneously returns
+            true for CPU_PA_RISC1_0.  CPU_IS_PA_RISC returns correct
+            results, however.  */
+         if (CPU_IS_PA_RISC (cpu))
+           {
+             switch (cpu)
+               {
+                 case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break;
+                 case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break;
+                 case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break;
+                 default: puts ("hppa-hitachi-hiuxwe2"); break;
+               }
+           }
+         else if (CPU_IS_HP_MC68K (cpu))
+           puts ("m68k-hitachi-hiuxwe2");
+         else puts ("unknown-hitachi-hiuxwe2");
+         exit (0);
+       }
+EOF
+       $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` &&
+               { echo "$SYSTEM_NAME"; exit; }
+       echo unknown-hitachi-hiuxwe2
+       exit ;;
+    9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* )
+       echo hppa1.1-hp-bsd
+       exit ;;
+    9000/8??:4.3bsd:*:*)
+       echo hppa1.0-hp-bsd
+       exit ;;
+    *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*)
+       echo hppa1.0-hp-mpeix
+       exit ;;
+    hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* )
+       echo hppa1.1-hp-osf
+       exit ;;
+    hp8??:OSF1:*:*)
+       echo hppa1.0-hp-osf
+       exit ;;
+    i*86:OSF1:*:*)
+       if [ -x /usr/sbin/sysversion ] ; then
+           echo ${UNAME_MACHINE}-unknown-osf1mk
+       else
+           echo ${UNAME_MACHINE}-unknown-osf1
+       fi
+       exit ;;
+    parisc*:Lites*:*:*)
+       echo hppa1.1-hp-lites
+       exit ;;
+    C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
+       echo c1-convex-bsd
+       exit ;;
+    C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
+       if getsysinfo -f scalar_acc
+       then echo c32-convex-bsd
+       else echo c2-convex-bsd
+       fi
+       exit ;;
+    C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
+       echo c34-convex-bsd
+       exit ;;
+    C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
+       echo c38-convex-bsd
+       exit ;;
+    C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
+       echo c4-convex-bsd
+       exit ;;
+    CRAY*Y-MP:*:*:*)
+       echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+       exit ;;
+    CRAY*[A-Z]90:*:*:*)
+       echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \
+       | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \
+             -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \
+             -e 's/\.[^.]*$/.X/'
+       exit ;;
+    CRAY*TS:*:*:*)
+       echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+       exit ;;
+    CRAY*T3E:*:*:*)
+       echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+       exit ;;
+    CRAY*SV1:*:*:*)
+       echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+       exit ;;
+    *:UNICOS/mp:*:*)
+       echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
+       exit ;;
+    F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
+       FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
+       FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
+       FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
+       echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
+       exit ;;
+    5000:UNIX_System_V:4.*:*)
+       FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
+       FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'`
+       echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
+       exit ;;
+    i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
+       echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE}
+       exit ;;
+    sparc*:BSD/OS:*:*)
+       echo sparc-unknown-bsdi${UNAME_RELEASE}
+       exit ;;
+    *:BSD/OS:*:*)
+       echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE}
+       exit ;;
+    *:FreeBSD:*:*)
+       UNAME_PROCESSOR=`/usr/bin/uname -p`
+       case ${UNAME_PROCESSOR} in
+           amd64)
+               echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
+           *)
+               echo ${UNAME_PROCESSOR}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
+       esac
+       exit ;;
+    i*:CYGWIN*:*)
+       echo ${UNAME_MACHINE}-pc-cygwin
+       exit ;;
+    *:MINGW64*:*)
+       echo ${UNAME_MACHINE}-pc-mingw64
+       exit ;;
+    *:MINGW*:*)
+       echo ${UNAME_MACHINE}-pc-mingw32
+       exit ;;
+    i*:MSYS*:*)
+       echo ${UNAME_MACHINE}-pc-msys
+       exit ;;
+    i*:windows32*:*)
+       # uname -m includes "-pc" on this system.
+       echo ${UNAME_MACHINE}-mingw32
+       exit ;;
+    i*:PW*:*)
+       echo ${UNAME_MACHINE}-pc-pw32
+       exit ;;
+    *:Interix*:*)
+       case ${UNAME_MACHINE} in
+           x86)
+               echo i586-pc-interix${UNAME_RELEASE}
+               exit ;;
+           authenticamd | genuineintel | EM64T)
+               echo x86_64-unknown-interix${UNAME_RELEASE}
+               exit ;;
+           IA64)
+               echo ia64-unknown-interix${UNAME_RELEASE}
+               exit ;;
+       esac ;;
+    [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*)
+       echo i${UNAME_MACHINE}-pc-mks
+       exit ;;
+    8664:Windows_NT:*)
+       echo x86_64-pc-mks
+       exit ;;
+    i*:Windows_NT*:* | Pentium*:Windows_NT*:*)
+       # How do we know it's Interix rather than the generic POSIX subsystem?
+       # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we set
+       # UNAME_MACHINE based on the output of uname instead of hard-coding i586?
+       echo i586-pc-interix
+       exit ;;
+    i*:UWIN*:*)
+       echo ${UNAME_MACHINE}-pc-uwin
+       exit ;;
+    amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*)
+       echo x86_64-unknown-cygwin
+       exit ;;
+    p*:CYGWIN*:*)
+       echo powerpcle-unknown-cygwin
+       exit ;;
+    prep*:SunOS:5.*:*)
+       echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
+       exit ;;
+    *:GNU:*:*)
+       # the GNU system
+       echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
+       exit ;;
+    *:GNU/*:*:*)
+       # other systems with GNU libc and userland
+       echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu
+       exit ;;
+    i*86:Minix:*:*)
+       echo ${UNAME_MACHINE}-pc-minix
+       exit ;;
+    aarch64:Linux:*:*)
+       echo ${UNAME_MACHINE}-unknown-linux-gnu
+       exit ;;
+    aarch64_be:Linux:*:*)
+       UNAME_MACHINE=aarch64_be
+       echo ${UNAME_MACHINE}-unknown-linux-gnu
+       exit ;;
+    alpha:Linux:*:*)
+       case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
+         EV5)   UNAME_MACHINE=alphaev5 ;;
+         EV56)  UNAME_MACHINE=alphaev56 ;;
+         PCA56) UNAME_MACHINE=alphapca56 ;;
+         PCA57) UNAME_MACHINE=alphapca56 ;;
+         EV6)   UNAME_MACHINE=alphaev6 ;;
+         EV67)  UNAME_MACHINE=alphaev67 ;;
+         EV68*) UNAME_MACHINE=alphaev68 ;;
+       esac
+       objdump --private-headers /bin/sh | grep -q ld.so.1
+       if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
+       echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
+       exit ;;
+    arm*:Linux:*:*)
+       eval $set_cc_for_build
+       if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
+           | grep -q __ARM_EABI__
+       then
+           echo ${UNAME_MACHINE}-unknown-linux-gnu
+       else
+           if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \
+               | grep -q __ARM_PCS_VFP
+           then
+               echo ${UNAME_MACHINE}-unknown-linux-gnueabi
+           else
+               echo ${UNAME_MACHINE}-unknown-linux-gnueabihf
+           fi
+       fi
+       exit ;;
+    avr32*:Linux:*:*)
+       echo ${UNAME_MACHINE}-unknown-linux-gnu
+       exit ;;
+    cris:Linux:*:*)
+       echo ${UNAME_MACHINE}-axis-linux-gnu
+       exit ;;
+    crisv32:Linux:*:*)
+       echo ${UNAME_MACHINE}-axis-linux-gnu
+       exit ;;
+    frv:Linux:*:*)
+       echo ${UNAME_MACHINE}-unknown-linux-gnu
+       exit ;;
+    hexagon:Linux:*:*)
+       echo ${UNAME_MACHINE}-unknown-linux-gnu
+       exit ;;
+    i*86:Linux:*:*)
+       LIBC=gnu
+       eval $set_cc_for_build
+       sed 's/^        //' << EOF >$dummy.c
+       #ifdef __dietlibc__
+       LIBC=dietlibc
+       #endif
+EOF
+       eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'`
+       echo "${UNAME_MACHINE}-pc-linux-${LIBC}"
+       exit ;;
+    ia64:Linux:*:*)
+       echo ${UNAME_MACHINE}-unknown-linux-gnu
+       exit ;;
+    m32r*:Linux:*:*)
+       echo ${UNAME_MACHINE}-unknown-linux-gnu
+       exit ;;
+    m68*:Linux:*:*)
+       echo ${UNAME_MACHINE}-unknown-linux-gnu
+       exit ;;
+    mips:Linux:*:* | mips64:Linux:*:*)
+       eval $set_cc_for_build
+       sed 's/^        //' << EOF >$dummy.c
+       #undef CPU
+       #undef ${UNAME_MACHINE}
+       #undef ${UNAME_MACHINE}el
+       #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
+       CPU=${UNAME_MACHINE}el
+       #else
+       #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
+       CPU=${UNAME_MACHINE}
+       #else
+       CPU=
+       #endif
+       #endif
+EOF
+       eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'`
+       test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
+       ;;
+    or32:Linux:*:*)
+       echo ${UNAME_MACHINE}-unknown-linux-gnu
+       exit ;;
+    padre:Linux:*:*)
+       echo sparc-unknown-linux-gnu
+       exit ;;
+    parisc64:Linux:*:* | hppa64:Linux:*:*)
+       echo hppa64-unknown-linux-gnu
+       exit ;;
+    parisc:Linux:*:* | hppa:Linux:*:*)
+       # Look for CPU level
+       case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
+         PA7*) echo hppa1.1-unknown-linux-gnu ;;
+         PA8*) echo hppa2.0-unknown-linux-gnu ;;
+         *)    echo hppa-unknown-linux-gnu ;;
+       esac
+       exit ;;
+    ppc64:Linux:*:*)
+       echo powerpc64-unknown-linux-gnu
+       exit ;;
+    ppc:Linux:*:*)
+       echo powerpc-unknown-linux-gnu
+       exit ;;
+    s390:Linux:*:* | s390x:Linux:*:*)
+       echo ${UNAME_MACHINE}-ibm-linux
+       exit ;;
+    sh64*:Linux:*:*)
+       echo ${UNAME_MACHINE}-unknown-linux-gnu
+       exit ;;
+    sh*:Linux:*:*)
+       echo ${UNAME_MACHINE}-unknown-linux-gnu
+       exit ;;
+    sparc:Linux:*:* | sparc64:Linux:*:*)
+       echo ${UNAME_MACHINE}-unknown-linux-gnu
+       exit ;;
+    tile*:Linux:*:*)
+       echo ${UNAME_MACHINE}-unknown-linux-gnu
+       exit ;;
+    vax:Linux:*:*)
+       echo ${UNAME_MACHINE}-dec-linux-gnu
+       exit ;;
+    x86_64:Linux:*:*)
+       echo ${UNAME_MACHINE}-unknown-linux-gnu
+       exit ;;
+    xtensa*:Linux:*:*)
+       echo ${UNAME_MACHINE}-unknown-linux-gnu
+       exit ;;
+    i*86:DYNIX/ptx:4*:*)
+       # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
+       # earlier versions are messed up and put the nodename in both
+       # sysname and nodename.
+       echo i386-sequent-sysv4
+       exit ;;
+    i*86:UNIX_SV:4.2MP:2.*)
+       # Unixware is an offshoot of SVR4, but it has its own version
+       # number series starting with 2...
+       # I am not positive that other SVR4 systems won't match this,
+       # I just have to hope.  -- rms.
+       # Use sysv4.2uw... so that sysv4* matches it.
+       echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION}
+       exit ;;
+    i*86:OS/2:*:*)
+       # If we were able to find `uname', then EMX Unix compatibility
+       # is probably installed.
+       echo ${UNAME_MACHINE}-pc-os2-emx
+       exit ;;
+    i*86:XTS-300:*:STOP)
+       echo ${UNAME_MACHINE}-unknown-stop
+       exit ;;
+    i*86:atheos:*:*)
+       echo ${UNAME_MACHINE}-unknown-atheos
+       exit ;;
+    i*86:syllable:*:*)
+       echo ${UNAME_MACHINE}-pc-syllable
+       exit ;;
+    i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*)
+       echo i386-unknown-lynxos${UNAME_RELEASE}
+       exit ;;
+    i*86:*DOS:*:*)
+       echo ${UNAME_MACHINE}-pc-msdosdjgpp
+       exit ;;
+    i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*)
+       UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'`
+       if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then
+               echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL}
+       else
+               echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL}
+       fi
+       exit ;;
+    i*86:*:5:[678]*)
+       # UnixWare 7.x, OpenUNIX and OpenServer 6.
+       case `/bin/uname -X | grep "^Machine"` in
+           *486*)           UNAME_MACHINE=i486 ;;
+           *Pentium)        UNAME_MACHINE=i586 ;;
+           *Pent*|*Celeron) UNAME_MACHINE=i686 ;;
+       esac
+       echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION}
+       exit ;;
+    i*86:*:3.2:*)
+       if test -f /usr/options/cb.name; then
+               UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name`
+               echo ${UNAME_MACHINE}-pc-isc$UNAME_REL
+       elif /bin/uname -X 2>/dev/null >/dev/null ; then
+               UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')`
+               (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486
+               (/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \
+                       && UNAME_MACHINE=i586
+               (/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \
+                       && UNAME_MACHINE=i686
+               (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \
+                       && UNAME_MACHINE=i686
+               echo ${UNAME_MACHINE}-pc-sco$UNAME_REL
+       else
+               echo ${UNAME_MACHINE}-pc-sysv32
+       fi
+       exit ;;
+    pc:*:*:*)
+       # Left here for compatibility:
+       # uname -m prints for DJGPP always 'pc', but it prints nothing about
+       # the processor, so we play safe by assuming i586.
+       # Note: whatever this is, it MUST be the same as what config.sub
+       # prints for the "djgpp" host, or else GDB configury will decide that
+       # this is a cross-build.
+       echo i586-pc-msdosdjgpp
+       exit ;;
+    Intel:Mach:3*:*)
+       echo i386-pc-mach3
+       exit ;;
+    paragon:*:*:*)
+       echo i860-intel-osf1
+       exit ;;
+    i860:*:4.*:*) # i860-SVR4
+       if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then
+         echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4
+       else # Add other i860-SVR4 vendors below as they are discovered.
+         echo i860-unknown-sysv${UNAME_RELEASE}  # Unknown i860-SVR4
+       fi
+       exit ;;
+    mini*:CTIX:SYS*5:*)
+       # "miniframe"
+       echo m68010-convergent-sysv
+       exit ;;
+    mc68k:UNIX:SYSTEM5:3.51m)
+       echo m68k-convergent-sysv
+       exit ;;
+    M680?0:D-NIX:5.3:*)
+       echo m68k-diab-dnix
+       exit ;;
+    M68*:*:R3V[5678]*:*)
+       test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;;
+    3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0)
+       OS_REL=''
+       test -r /etc/.relid \
+       && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
+       /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+         && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
+       /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
+         && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
+    3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
+       /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+         && { echo i486-ncr-sysv4; exit; } ;;
+    NCR*:*:4.2:* | MPRAS*:*:4.2:*)
+       OS_REL='.3'
+       test -r /etc/.relid \
+           && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid`
+       /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+           && { echo i486-ncr-sysv4.3${OS_REL}; exit; }
+       /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
+           && { echo i586-ncr-sysv4.3${OS_REL}; exit; }
+       /bin/uname -p 2>/dev/null | /bin/grep pteron >/dev/null \
+           && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
+    m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*)
+       echo m68k-unknown-lynxos${UNAME_RELEASE}
+       exit ;;
+    mc68030:UNIX_System_V:4.*:*)
+       echo m68k-atari-sysv4
+       exit ;;
+    TSUNAMI:LynxOS:2.*:*)
+       echo sparc-unknown-lynxos${UNAME_RELEASE}
+       exit ;;
+    rs6000:LynxOS:2.*:*)
+       echo rs6000-unknown-lynxos${UNAME_RELEASE}
+       exit ;;
+    PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*)
+       echo powerpc-unknown-lynxos${UNAME_RELEASE}
+       exit ;;
+    SM[BE]S:UNIX_SV:*:*)
+       echo mips-dde-sysv${UNAME_RELEASE}
+       exit ;;
+    RM*:ReliantUNIX-*:*:*)
+       echo mips-sni-sysv4
+       exit ;;
+    RM*:SINIX-*:*:*)
+       echo mips-sni-sysv4
+       exit ;;
+    *:SINIX-*:*:*)
+       if uname -p 2>/dev/null >/dev/null ; then
+               UNAME_MACHINE=`(uname -p) 2>/dev/null`
+               echo ${UNAME_MACHINE}-sni-sysv4
+       else
+               echo ns32k-sni-sysv
+       fi
+       exit ;;
+    PENTIUM:*:4.0*:*)  # Unisys `ClearPath HMP IX 4000' SVR4/MP effort
+                       # says <Richard.M.Bartel@ccMail.Census.GOV>
+       echo i586-unisys-sysv4
+       exit ;;
+    *:UNIX_System_V:4*:FTX*)
+       # From Gerald Hewes <hewes@openmarket.com>.
+       # How about differentiating between stratus architectures? -djm
+       echo hppa1.1-stratus-sysv4
+       exit ;;
+    *:*:*:FTX*)
+       # From seanf@swdc.stratus.com.
+       echo i860-stratus-sysv4
+       exit ;;
+    i*86:VOS:*:*)
+       # From Paul.Green@stratus.com.
+       echo ${UNAME_MACHINE}-stratus-vos
+       exit ;;
+    *:VOS:*:*)
+       # From Paul.Green@stratus.com.
+       echo hppa1.1-stratus-vos
+       exit ;;
+    mc68*:A/UX:*:*)
+       echo m68k-apple-aux${UNAME_RELEASE}
+       exit ;;
+    news*:NEWS-OS:6*:*)
+       echo mips-sony-newsos6
+       exit ;;
+    R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*)
+       if [ -d /usr/nec ]; then
+               echo mips-nec-sysv${UNAME_RELEASE}
+       else
+               echo mips-unknown-sysv${UNAME_RELEASE}
+       fi
+       exit ;;
+    BeBox:BeOS:*:*)    # BeOS running on hardware made by Be, PPC only.
+       echo powerpc-be-beos
+       exit ;;
+    BeMac:BeOS:*:*)    # BeOS running on Mac or Mac clone, PPC only.
+       echo powerpc-apple-beos
+       exit ;;
+    BePC:BeOS:*:*)     # BeOS running on Intel PC compatible.
+       echo i586-pc-beos
+       exit ;;
+    BePC:Haiku:*:*)    # Haiku running on Intel PC compatible.
+       echo i586-pc-haiku
+       exit ;;
+    x86_64:Haiku:*:*)
+       echo x86_64-unknown-haiku
+       exit ;;
+    SX-4:SUPER-UX:*:*)
+       echo sx4-nec-superux${UNAME_RELEASE}
+       exit ;;
+    SX-5:SUPER-UX:*:*)
+       echo sx5-nec-superux${UNAME_RELEASE}
+       exit ;;
+    SX-6:SUPER-UX:*:*)
+       echo sx6-nec-superux${UNAME_RELEASE}
+       exit ;;
+    SX-7:SUPER-UX:*:*)
+       echo sx7-nec-superux${UNAME_RELEASE}
+       exit ;;
+    SX-8:SUPER-UX:*:*)
+       echo sx8-nec-superux${UNAME_RELEASE}
+       exit ;;
+    SX-8R:SUPER-UX:*:*)
+       echo sx8r-nec-superux${UNAME_RELEASE}
+       exit ;;
+    Power*:Rhapsody:*:*)
+       echo powerpc-apple-rhapsody${UNAME_RELEASE}
+       exit ;;
+    *:Rhapsody:*:*)
+       echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE}
+       exit ;;
+    *:Darwin:*:*)
+       UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown
+       case $UNAME_PROCESSOR in
+           i386)
+               eval $set_cc_for_build
+               if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
+                 if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
+                     (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
+                     grep IS_64BIT_ARCH >/dev/null
+                 then
+                     UNAME_PROCESSOR="x86_64"
+                 fi
+               fi ;;
+           unknown) UNAME_PROCESSOR=powerpc ;;
+       esac
+       echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
+       exit ;;
+    *:procnto*:*:* | *:QNX:[0123456789]*:*)
+       UNAME_PROCESSOR=`uname -p`
+       if test "$UNAME_PROCESSOR" = "x86"; then
+               UNAME_PROCESSOR=i386
+               UNAME_MACHINE=pc
+       fi
+       echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE}
+       exit ;;
+    *:QNX:*:4*)
+       echo i386-pc-qnx
+       exit ;;
+    NEO-?:NONSTOP_KERNEL:*:*)
+       echo neo-tandem-nsk${UNAME_RELEASE}
+       exit ;;
+    NSE-*:NONSTOP_KERNEL:*:*)
+       echo nse-tandem-nsk${UNAME_RELEASE}
+       exit ;;
+    NSR-?:NONSTOP_KERNEL:*:*)
+       echo nsr-tandem-nsk${UNAME_RELEASE}
+       exit ;;
+    *:NonStop-UX:*:*)
+       echo mips-compaq-nonstopux
+       exit ;;
+    BS2000:POSIX*:*:*)
+       echo bs2000-siemens-sysv
+       exit ;;
+    DS/*:UNIX_System_V:*:*)
+       echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE}
+       exit ;;
+    *:Plan9:*:*)
+       # "uname -m" is not consistent, so use $cputype instead. 386
+       # is converted to i386 for consistency with other x86
+       # operating systems.
+       if test "$cputype" = "386"; then
+           UNAME_MACHINE=i386
+       else
+           UNAME_MACHINE="$cputype"
+       fi
+       echo ${UNAME_MACHINE}-unknown-plan9
+       exit ;;
+    *:TOPS-10:*:*)
+       echo pdp10-unknown-tops10
+       exit ;;
+    *:TENEX:*:*)
+       echo pdp10-unknown-tenex
+       exit ;;
+    KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*)
+       echo pdp10-dec-tops20
+       exit ;;
+    XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*)
+       echo pdp10-xkl-tops20
+       exit ;;
+    *:TOPS-20:*:*)
+       echo pdp10-unknown-tops20
+       exit ;;
+    *:ITS:*:*)
+       echo pdp10-unknown-its
+       exit ;;
+    SEI:*:*:SEIUX)
+       echo mips-sei-seiux${UNAME_RELEASE}
+       exit ;;
+    *:DragonFly:*:*)
+       echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
+       exit ;;
+    *:*VMS:*:*)
+       UNAME_MACHINE=`(uname -p) 2>/dev/null`
+       case "${UNAME_MACHINE}" in
+           A*) echo alpha-dec-vms ; exit ;;
+           I*) echo ia64-dec-vms ; exit ;;
+           V*) echo vax-dec-vms ; exit ;;
+       esac ;;
+    *:XENIX:*:SysV)
+       echo i386-pc-xenix
+       exit ;;
+    i*86:skyos:*:*)
+       echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//'
+       exit ;;
+    i*86:rdos:*:*)
+       echo ${UNAME_MACHINE}-pc-rdos
+       exit ;;
+    i*86:AROS:*:*)
+       echo ${UNAME_MACHINE}-pc-aros
+       exit ;;
+    x86_64:VMkernel:*:*)
+       echo ${UNAME_MACHINE}-unknown-esx
+       exit ;;
+esac
+
+eval $set_cc_for_build
+cat >$dummy.c <<EOF
+#ifdef _SEQUENT_
+# include <sys/types.h>
+# include <sys/utsname.h>
+#endif
+main ()
+{
+#if defined (sony)
+#if defined (MIPSEB)
+  /* BFD wants "bsd" instead of "newsos".  Perhaps BFD should be changed,
+     I don't know....  */
+  printf ("mips-sony-bsd\n"); exit (0);
+#else
+#include <sys/param.h>
+  printf ("m68k-sony-newsos%s\n",
+#ifdef NEWSOS4
+       "4"
+#else
+       ""
+#endif
+       ); exit (0);
+#endif
+#endif
+
+#if defined (__arm) && defined (__acorn) && defined (__unix)
+  printf ("arm-acorn-riscix\n"); exit (0);
+#endif
+
+#if defined (hp300) && !defined (hpux)
+  printf ("m68k-hp-bsd\n"); exit (0);
+#endif
+
+#if defined (NeXT)
+#if !defined (__ARCHITECTURE__)
+#define __ARCHITECTURE__ "m68k"
+#endif
+  int version;
+  version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
+  if (version < 4)
+    printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
+  else
+    printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
+  exit (0);
+#endif
+
+#if defined (MULTIMAX) || defined (n16)
+#if defined (UMAXV)
+  printf ("ns32k-encore-sysv\n"); exit (0);
+#else
+#if defined (CMU)
+  printf ("ns32k-encore-mach\n"); exit (0);
+#else
+  printf ("ns32k-encore-bsd\n"); exit (0);
+#endif
+#endif
+#endif
+
+#if defined (__386BSD__)
+  printf ("i386-pc-bsd\n"); exit (0);
+#endif
+
+#if defined (sequent)
+#if defined (i386)
+  printf ("i386-sequent-dynix\n"); exit (0);
+#endif
+#if defined (ns32000)
+  printf ("ns32k-sequent-dynix\n"); exit (0);
+#endif
+#endif
+
+#if defined (_SEQUENT_)
+    struct utsname un;
+
+    uname(&un);
+
+    if (strncmp(un.version, "V2", 2) == 0) {
+       printf ("i386-sequent-ptx2\n"); exit (0);
+    }
+    if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
+       printf ("i386-sequent-ptx1\n"); exit (0);
+    }
+    printf ("i386-sequent-ptx\n"); exit (0);
+
+#endif
+
+#if defined (vax)
+# if !defined (ultrix)
+#  include <sys/param.h>
+#  if defined (BSD)
+#   if BSD == 43
+      printf ("vax-dec-bsd4.3\n"); exit (0);
+#   else
+#    if BSD == 199006
+      printf ("vax-dec-bsd4.3reno\n"); exit (0);
+#    else
+      printf ("vax-dec-bsd\n"); exit (0);
+#    endif
+#   endif
+#  else
+    printf ("vax-dec-bsd\n"); exit (0);
+#  endif
+# else
+    printf ("vax-dec-ultrix\n"); exit (0);
+# endif
+#endif
+
+#if defined (alliant) && defined (i860)
+  printf ("i860-alliant-bsd\n"); exit (0);
+#endif
+
+  exit (1);
+}
+EOF
+
+$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` &&
+       { echo "$SYSTEM_NAME"; exit; }
+
+# Apollos put the system type in the environment.
+
+test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; }
+
+# Convex versions that predate uname can use getsysinfo(1)
+
+if [ -x /usr/convex/getsysinfo ]
+then
+    case `getsysinfo -f cpu_type` in
+    c1*)
+       echo c1-convex-bsd
+       exit ;;
+    c2*)
+       if getsysinfo -f scalar_acc
+       then echo c32-convex-bsd
+       else echo c2-convex-bsd
+       fi
+       exit ;;
+    c34*)
+       echo c34-convex-bsd
+       exit ;;
+    c38*)
+       echo c38-convex-bsd
+       exit ;;
+    c4*)
+       echo c4-convex-bsd
+       exit ;;
+    esac
+fi
+
+cat >&2 <<EOF
+$0: unable to guess system type
+
+This script, last modified $timestamp, has failed to recognize
+the operating system you are using. It is advised that you
+download the most up to date version of the config scripts from
+
+  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
+and
+  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
+
+If the version you run ($0) is already up to date, please
+send the following data and any information you think might be
+pertinent to <config-patches@gnu.org> in order to provide the needed
+information to handle your system.
+
+config.guess timestamp = $timestamp
+
+uname -m = `(uname -m) 2>/dev/null || echo unknown`
+uname -r = `(uname -r) 2>/dev/null || echo unknown`
+uname -s = `(uname -s) 2>/dev/null || echo unknown`
+uname -v = `(uname -v) 2>/dev/null || echo unknown`
+
+/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null`
+/bin/uname -X     = `(/bin/uname -X) 2>/dev/null`
+
+hostinfo               = `(hostinfo) 2>/dev/null`
+/bin/universe          = `(/bin/universe) 2>/dev/null`
+/usr/bin/arch -k       = `(/usr/bin/arch -k) 2>/dev/null`
+/bin/arch              = `(/bin/arch) 2>/dev/null`
+/usr/bin/oslevel       = `(/usr/bin/oslevel) 2>/dev/null`
+/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null`
+
+UNAME_MACHINE = ${UNAME_MACHINE}
+UNAME_RELEASE = ${UNAME_RELEASE}
+UNAME_SYSTEM  = ${UNAME_SYSTEM}
+UNAME_VERSION = ${UNAME_VERSION}
+EOF
+
+exit 1
+
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "timestamp='"
+# time-stamp-format: "%:y-%02m-%02d"
+# time-stamp-end: "'"
+# End:
diff --git a/ctdb/config.mk b/ctdb/config.mk
new file mode 100644 (file)
index 0000000..61f1e36
--- /dev/null
@@ -0,0 +1,18 @@
+##################
+[SUBSYSTEM::brlock_ctdb]
+OBJ_FILES = brlock_ctdb.o
+
+##################
+[SUBSYSTEM::opendb_ctdb]
+OBJ_FILES = opendb_ctdb.o
+
+##################
+[SUBSYSTEM::ctdb]
+OBJ_FILES = \
+               ctdb_cluster.o \
+               client/ctdb_client.o \
+               common/ctdb_io.o \
+               common/ctdb_ltdb.o \
+               common/ctdb_message.o \
+               common/ctdb_util.o
+PUBLIC_DEPENDENCIES = LIBTDB LIBTALLOC
diff --git a/ctdb/config.sub b/ctdb/config.sub
new file mode 100644 (file)
index 0000000..707e9e2
--- /dev/null
@@ -0,0 +1,1789 @@
+#! /bin/sh
+# Configuration validation subroutine script.
+#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
+#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+#   2011, 2012, 2013 Free Software Foundation, Inc.
+
+timestamp='2013-01-11'
+
+# This file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+#
+# As a special exception to the GNU General Public License, if you
+# distribute this file as part of a program that contains a
+# configuration script generated by Autoconf, you may include it under
+# the same distribution terms that you use for the rest of that
+# program.  This Exception is an additional permission under section 7
+# of the GNU General Public License, version 3 ("GPLv3").
+
+
+# Please send patches with a ChangeLog entry to config-patches@gnu.org.
+#
+# Configuration subroutine to validate and canonicalize a configuration type.
+# Supply the specified configuration type as an argument.
+# If it is invalid, we print an error message on stderr and exit with code 1.
+# Otherwise, we print the canonical config type on stdout and succeed.
+
+# You can get the latest version of this script from:
+# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
+
+# This file is supposed to be the same for all GNU packages
+# and recognize all the CPU types, system types and aliases
+# that are meaningful with *any* GNU software.
+# Each package is responsible for reporting which valid configurations
+# it does not support.  The user should be able to distinguish
+# a failure to support a valid configuration from a meaningless
+# configuration.
+
+# The goal of this file is to map all the various variations of a given
+# machine specification into a single specification in the form:
+#      CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM
+# or in some cases, the newer four-part form:
+#      CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM
+# It is wrong to echo any other type of specification.
+
+me=`echo "$0" | sed -e 's,.*/,,'`
+
+usage="\
+Usage: $0 [OPTION] CPU-MFR-OPSYS
+       $0 [OPTION] ALIAS
+
+Canonicalize a configuration name.
+
+Operation modes:
+  -h, --help         print this help, then exit
+  -t, --time-stamp   print date of last modification, then exit
+  -v, --version      print version number, then exit
+
+Report bugs and patches to <config-patches@gnu.org>."
+
+version="\
+GNU config.sub ($timestamp)
+
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
+2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
+2012, 2013 Free Software Foundation, Inc.
+
+This is free software; see the source for copying conditions.  There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
+
+help="
+Try \`$me --help' for more information."
+
+# Parse command line
+while test $# -gt 0 ; do      # Leading options only; every arm exits or breaks.
+  case $1 in
+    --time-stamp | --time* | -t )
+       echo "$timestamp" ; exit ;;
+    --version | -v )
+       echo "$version" ; exit ;;
+    --help | --h* | -h )
+       echo "$usage"; exit ;;
+    -- )     # Stop option processing
+       shift; break ;;
+    - )        # Use stdin as input.
+       break ;;
+    -* )     # Unknown option: report on stderr, like the other usage errors.
+       echo "$me: invalid option $1$help" >&2
+       exit 1 ;;
+
+    *local*)
+       # First pass through any local machine types, echoed verbatim (quoted).
+       echo "$1"
+       exit ;;
+
+    * )
+       break ;;
+  esac
+done
+
+if test $# -eq 0 ; then       # No configuration name was supplied.
+    echo "$me: missing argument$help" >&2
+    exit 1
+elif test $# -ne 1 ; then     # More than one name left after the options.
+    echo "$me: too many arguments$help" >&2
+    exit 1
+fi
+
+# Split $1 into CPU-COMPANY and OS or KERNEL-OS (if any); every valid KERNEL-OS
+# pair must be listed below.  $1 is quoted so odd input cannot split or glob.
+maybe_os=`echo "$1" | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
+case $maybe_os in
+  nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \
+  linux-musl* | linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \
+  knetbsd*-gnu* | netbsd*-gnu* | \
+  kopensolaris*-gnu* | \
+  storm-chaos* | os2-emx* | rtmk-nova*)
+    os=-$maybe_os
+    basic_machine=`echo "$1" | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`
+    ;;
+  android-linux)
+    os=-linux-android
+    basic_machine=`echo "$1" | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'`-unknown
+    ;;
+  *)
+    basic_machine=`echo "$1" | sed 's/-[^-]*$//'`
+    if [ "$basic_machine" != "$1" ]
+    then os=`echo "$1" | sed 's/.*-/-/'`
+    else os=; fi
+    ;;
+esac
+
+### Let's recognize common machines as not being operating systems so
+### that things like config.sub decstation-3100 work.  We also
+### recognize some manufacturers as not being operating systems, so we
+### can provide default operating systems below.
+case $os in
+       -sun*os*)
+               # Prevent following clause from handling this invalid input.
+               ;;
+       -dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \
+       -att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \
+       -unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \
+       -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
+       -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
+       -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
+       -apple | -axis | -knuth | -cray | -microblaze*)
+               os=
+               basic_machine=$1
+               ;;
+       -bluegene*)
+               os=-cnk
+               ;;
+       -sim | -cisco | -oki | -wec | -winbond)
+               os=
+               basic_machine=$1
+               ;;
+       -scout)
+               ;;
+       -wrs)
+               os=-vxworks
+               basic_machine=$1
+               ;;
+       -chorusos*)
+               os=-chorusos
+               basic_machine=$1
+               ;;
+       -chorusrdb)
+               os=-chorusrdb
+               basic_machine=$1
+               ;;
+       -hiux*)
+               os=-hiuxwe2
+               ;;
+       -sco6)
+               os=-sco5v6
+               basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+               ;;
+       -sco5)
+               os=-sco3.2v5
+               basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+               ;;
+       -sco4)
+               os=-sco3.2v4
+               basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+               ;;
+       -sco3.2.[4-9]*)
+               os=`echo $os | sed -e 's/sco3.2./sco3.2v/'`
+               basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+               ;;
+       -sco3.2v[4-9]*)
+               # Don't forget version if it is 3.2v4 or newer.
+               basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+               ;;
+       -sco5v6*)
+               # Don't forget version if it is 3.2v4 or newer.
+               basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+               ;;
+       -sco*)
+               os=-sco3.2v2
+               basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+               ;;
+       -udk*)
+               basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+               ;;
+       -isc)
+               os=-isc2.2
+               basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+               ;;
+       -clix*)
+               basic_machine=clipper-intergraph
+               ;;
+       -isc*)
+               basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'`
+               ;;
+       -lynx*178)
+               os=-lynxos178
+               ;;
+       -lynx*5)
+               os=-lynxos5
+               ;;
+       -lynx*)
+               os=-lynxos
+               ;;
+       -ptx*)
+               basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'`
+               ;;
+       -windowsnt*)
+               os=`echo $os | sed -e 's/windowsnt/winnt/'`
+               ;;
+       -psos*)
+               os=-psos
+               ;;
+       -mint | -mint[0-9]*)
+               basic_machine=m68k-atari
+               os=-mint
+               ;;
+esac
+
+# Decode aliases for certain CPU-COMPANY combinations.
+case $basic_machine in
+       # Recognize the basic CPU types without company name.
+       # Some are omitted here because they have special meanings below.
+       1750a | 580 \
+       | a29k \
+       | aarch64 | aarch64_be \
+       | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \
+       | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \
+       | am33_2.0 \
+       | arc \
+       | arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \
+       | avr | avr32 \
+       | be32 | be64 \
+       | bfin \
+       | c4x | clipper \
+       | d10v | d30v | dlx | dsp16xx \
+       | epiphany \
+       | fido | fr30 | frv \
+       | h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
+       | hexagon \
+       | i370 | i860 | i960 | ia64 \
+       | ip2k | iq2000 \
+       | le32 | le64 \
+       | lm32 \
+       | m32c | m32r | m32rle | m68000 | m68k | m88k \
+       | maxq | mb | microblaze | microblazeel | mcore | mep | metag \
+       | mips | mipsbe | mipseb | mipsel | mipsle \
+       | mips16 \
+       | mips64 | mips64el \
+       | mips64octeon | mips64octeonel \
+       | mips64orion | mips64orionel \
+       | mips64r5900 | mips64r5900el \
+       | mips64vr | mips64vrel \
+       | mips64vr4100 | mips64vr4100el \
+       | mips64vr4300 | mips64vr4300el \
+       | mips64vr5000 | mips64vr5000el \
+       | mips64vr5900 | mips64vr5900el \
+       | mipsisa32 | mipsisa32el \
+       | mipsisa32r2 | mipsisa32r2el \
+       | mipsisa64 | mipsisa64el \
+       | mipsisa64r2 | mipsisa64r2el \
+       | mipsisa64sb1 | mipsisa64sb1el \
+       | mipsisa64sr71k | mipsisa64sr71kel \
+       | mipsr5900 | mipsr5900el \
+       | mipstx39 | mipstx39el \
+       | mn10200 | mn10300 \
+       | moxie \
+       | mt \
+       | msp430 \
+       | nds32 | nds32le | nds32be \
+       | nios | nios2 \
+       | ns16k | ns32k \
+       | open8 \
+       | or32 \
+       | pdp10 | pdp11 | pj | pjl \
+       | powerpc | powerpc64 | powerpc64le | powerpcle \
+       | pyramid \
+       | rl78 | rx \
+       | score \
+       | sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
+       | sh64 | sh64le \
+       | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \
+       | sparcv8 | sparcv9 | sparcv9b | sparcv9v \
+       | spu \
+       | tahoe | tic4x | tic54x | tic55x | tic6x | tic80 | tron \
+       | ubicom32 \
+       | v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \
+       | we32k \
+       | x86 | xc16x | xstormy16 | xtensa \
+       | z8k | z80)
+               basic_machine=$basic_machine-unknown
+               ;;
+       c54x)
+               basic_machine=tic54x-unknown
+               ;;
+       c55x)
+               basic_machine=tic55x-unknown
+               ;;
+       c6x)
+               basic_machine=tic6x-unknown
+               ;;
+       m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | picochip)
+               basic_machine=$basic_machine-unknown
+               os=-none
+               ;;
+       m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k)
+               ;;
+       ms1)
+               basic_machine=mt-unknown
+               ;;
+
+       strongarm | thumb | xscale)
+               basic_machine=arm-unknown
+               ;;
+       xgate)
+               basic_machine=$basic_machine-unknown
+               os=-none
+               ;;
+       xscaleeb)
+               basic_machine=armeb-unknown
+               ;;
+
+       xscaleel)
+               basic_machine=armel-unknown
+               ;;
+
+       # We use `pc' rather than `unknown'
+       # because (1) that's what they normally are, and
+       # (2) the word "unknown" tends to confuse beginning users.
+       i*86 | x86_64)
+         basic_machine=$basic_machine-pc
+         ;;
+       # Object if more than one company name word.
+       *-*-*)
+               echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
+               exit 1
+               ;;
+       # Recognize the basic CPU types with company name.
+       580-* \
+       | a29k-* \
+       | aarch64-* | aarch64_be-* \
+       | alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \
+       | alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \
+       | alphapca5[67]-* | alpha64pca5[67]-* | arc-* \
+       | arm-*  | armbe-* | armle-* | armeb-* | armv*-* \
+       | avr-* | avr32-* \
+       | be32-* | be64-* \
+       | bfin-* | bs2000-* \
+       | c[123]* | c30-* | [cjt]90-* | c4x-* \
+       | clipper-* | craynv-* | cydra-* \
+       | d10v-* | d30v-* | dlx-* \
+       | elxsi-* \
+       | f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \
+       | h8300-* | h8500-* \
+       | hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \
+       | hexagon-* \
+       | i*86-* | i860-* | i960-* | ia64-* \
+       | ip2k-* | iq2000-* \
+       | le32-* | le64-* \
+       | lm32-* \
+       | m32c-* | m32r-* | m32rle-* \
+       | m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \
+       | m88110-* | m88k-* | maxq-* | mcore-* | metag-* \
+       | microblaze-* | microblazeel-* \
+       | mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \
+       | mips16-* \
+       | mips64-* | mips64el-* \
+       | mips64octeon-* | mips64octeonel-* \
+       | mips64orion-* | mips64orionel-* \
+       | mips64r5900-* | mips64r5900el-* \
+       | mips64vr-* | mips64vrel-* \
+       | mips64vr4100-* | mips64vr4100el-* \
+       | mips64vr4300-* | mips64vr4300el-* \
+       | mips64vr5000-* | mips64vr5000el-* \
+       | mips64vr5900-* | mips64vr5900el-* \
+       | mipsisa32-* | mipsisa32el-* \
+       | mipsisa32r2-* | mipsisa32r2el-* \
+       | mipsisa64-* | mipsisa64el-* \
+       | mipsisa64r2-* | mipsisa64r2el-* \
+       | mipsisa64sb1-* | mipsisa64sb1el-* \
+       | mipsisa64sr71k-* | mipsisa64sr71kel-* \
+       | mipsr5900-* | mipsr5900el-* \
+       | mipstx39-* | mipstx39el-* \
+       | mmix-* \
+       | mt-* \
+       | msp430-* \
+       | nds32-* | nds32le-* | nds32be-* \
+       | nios-* | nios2-* \
+       | none-* | np1-* | ns16k-* | ns32k-* \
+       | open8-* \
+       | orion-* \
+       | pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \
+       | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \
+       | pyramid-* \
+       | rl78-* | romp-* | rs6000-* | rx-* \
+       | sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \
+       | shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \
+       | sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \
+       | sparclite-* \
+       | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx?-* \
+       | tahoe-* \
+       | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \
+       | tile*-* \
+       | tron-* \
+       | ubicom32-* \
+       | v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \
+       | vax-* \
+       | we32k-* \
+       | x86-* | x86_64-* | xc16x-* | xps100-* \
+       | xstormy16-* | xtensa*-* \
+       | ymp-* \
+       | z8k-* | z80-*)
+               ;;
+       # Recognize the basic CPU types without company name, with glob match.
+       xtensa*)
+               basic_machine=$basic_machine-unknown
+               ;;
+       # Recognize the various machine names and aliases which stand
+       # for a CPU type and a company and sometimes even an OS.
+       386bsd)
+               basic_machine=i386-unknown
+               os=-bsd
+               ;;
+       3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc)
+               basic_machine=m68000-att
+               ;;
+       3b*)
+               basic_machine=we32k-att
+               ;;
+       a29khif)
+               basic_machine=a29k-amd
+               os=-udi
+               ;;
+       abacus)
+               basic_machine=abacus-unknown
+               ;;
+       adobe68k)
+               basic_machine=m68010-adobe
+               os=-scout
+               ;;
+       alliant | fx80)
+               basic_machine=fx80-alliant
+               ;;
+       altos | altos3068)
+               basic_machine=m68k-altos
+               ;;
+       am29k)
+               basic_machine=a29k-none
+               os=-bsd
+               ;;
+       amd64)
+               basic_machine=x86_64-pc
+               ;;
+       amd64-*)
+               basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'`
+               ;;
+       amdahl)
+               basic_machine=580-amdahl
+               os=-sysv
+               ;;
+       amiga | amiga-*)
+               basic_machine=m68k-unknown
+               ;;
+       amigaos | amigados)
+               basic_machine=m68k-unknown
+               os=-amigaos
+               ;;
+       amigaunix | amix)
+               basic_machine=m68k-unknown
+               os=-sysv4
+               ;;
+       apollo68)
+               basic_machine=m68k-apollo
+               os=-sysv
+               ;;
+       apollo68bsd)
+               basic_machine=m68k-apollo
+               os=-bsd
+               ;;
+       aros)
+               basic_machine=i386-pc
+               os=-aros
+               ;;
+       aux)
+               basic_machine=m68k-apple
+               os=-aux
+               ;;
+       balance)
+               basic_machine=ns32k-sequent
+               os=-dynix
+               ;;
+       blackfin)
+               basic_machine=bfin-unknown
+               os=-linux
+               ;;
+       blackfin-*)
+               basic_machine=bfin-`echo $basic_machine | sed 's/^[^-]*-//'`
+               os=-linux
+               ;;
+       bluegene*)
+               basic_machine=powerpc-ibm
+               os=-cnk
+               ;;
+       c54x-*)
+               basic_machine=tic54x-`echo $basic_machine | sed 's/^[^-]*-//'`
+               ;;
+       c55x-*)
+               basic_machine=tic55x-`echo $basic_machine | sed 's/^[^-]*-//'`
+               ;;
+       c6x-*)
+               basic_machine=tic6x-`echo $basic_machine | sed 's/^[^-]*-//'`
+               ;;
+       c90)
+               basic_machine=c90-cray
+               os=-unicos
+               ;;
+       cegcc)
+               basic_machine=arm-unknown
+               os=-cegcc
+               ;;
+       convex-c1)
+               basic_machine=c1-convex
+               os=-bsd
+               ;;
+       convex-c2)
+               basic_machine=c2-convex
+               os=-bsd
+               ;;
+       convex-c32)
+               basic_machine=c32-convex
+               os=-bsd
+               ;;
+       convex-c34)
+               basic_machine=c34-convex
+               os=-bsd
+               ;;
+       convex-c38)
+               basic_machine=c38-convex
+               os=-bsd
+               ;;
+       cray | j90)
+               basic_machine=j90-cray
+               os=-unicos
+               ;;
+       craynv)
+               basic_machine=craynv-cray
+               os=-unicosmp
+               ;;
+       cr16 | cr16-*)
+               basic_machine=cr16-unknown
+               os=-elf
+               ;;
+       crds | unos)
+               basic_machine=m68k-crds
+               ;;
+       crisv32 | crisv32-* | etraxfs*)
+               basic_machine=crisv32-axis
+               ;;
+       cris | cris-* | etrax*)
+               basic_machine=cris-axis
+               ;;
+       crx)
+               basic_machine=crx-unknown
+               os=-elf
+               ;;
+       da30 | da30-*)
+               basic_machine=m68k-da30
+               ;;
+       decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn)
+               basic_machine=mips-dec
+               ;;
+       decsystem10* | dec10*)
+               basic_machine=pdp10-dec
+               os=-tops10
+               ;;
+       decsystem20* | dec20*)
+               basic_machine=pdp10-dec
+               os=-tops20
+               ;;
+       delta | 3300 | motorola-3300 | motorola-delta \
+             | 3300-motorola | delta-motorola)
+               basic_machine=m68k-motorola
+               ;;
+       delta88)
+               basic_machine=m88k-motorola
+               os=-sysv3
+               ;;
+       dicos)
+               basic_machine=i686-pc
+               os=-dicos
+               ;;
+       djgpp)
+               basic_machine=i586-pc
+               os=-msdosdjgpp
+               ;;
+       dpx20 | dpx20-*)
+               basic_machine=rs6000-bull
+               os=-bosx
+               ;;
+       dpx2* | dpx2*-bull)
+               basic_machine=m68k-bull
+               os=-sysv3
+               ;;
+       ebmon29k)
+               basic_machine=a29k-amd
+               os=-ebmon
+               ;;
+       elxsi)
+               basic_machine=elxsi-elxsi
+               os=-bsd
+               ;;
+       encore | umax | mmax)
+               basic_machine=ns32k-encore
+               ;;
+       es1800 | OSE68k | ose68k | ose | OSE)
+               basic_machine=m68k-ericsson
+               os=-ose
+               ;;
+       fx2800)
+               basic_machine=i860-alliant
+               ;;
+       genix)
+               basic_machine=ns32k-ns
+               ;;
+       gmicro)
+               basic_machine=tron-gmicro
+               os=-sysv
+               ;;
+       go32)
+               basic_machine=i386-pc
+               os=-go32
+               ;;
+       h3050r* | hiux*)
+               basic_machine=hppa1.1-hitachi
+               os=-hiuxwe2
+               ;;
+       h8300hms)
+               basic_machine=h8300-hitachi
+               os=-hms
+               ;;
+       h8300xray)
+               basic_machine=h8300-hitachi
+               os=-xray
+               ;;
+       h8500hms)
+               basic_machine=h8500-hitachi
+               os=-hms
+               ;;
+       harris)
+               basic_machine=m88k-harris
+               os=-sysv3
+               ;;
+       hp300-*)
+               basic_machine=m68k-hp
+               ;;
+       hp300bsd)
+               basic_machine=m68k-hp
+               os=-bsd
+               ;;
+       hp300hpux)
+               basic_machine=m68k-hp
+               os=-hpux
+               ;;
+       hp3k9[0-9][0-9] | hp9[0-9][0-9])
+               basic_machine=hppa1.0-hp
+               ;;
+       hp9k2[0-9][0-9] | hp9k31[0-9])
+               basic_machine=m68000-hp
+               ;;
+       hp9k3[2-9][0-9])
+               basic_machine=m68k-hp
+               ;;
+       hp9k6[0-9][0-9] | hp6[0-9][0-9])
+               basic_machine=hppa1.0-hp
+               ;;
+       hp9k7[0-79][0-9] | hp7[0-79][0-9])
+               basic_machine=hppa1.1-hp
+               ;;
+       hp9k78[0-9] | hp78[0-9])
+               # FIXME: really hppa2.0-hp
+               basic_machine=hppa1.1-hp
+               ;;
+       hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893)
+               # FIXME: really hppa2.0-hp
+               basic_machine=hppa1.1-hp
+               ;;
+       hp9k8[0-9][13679] | hp8[0-9][13679])
+               basic_machine=hppa1.1-hp
+               ;;
+       hp9k8[0-9][0-9] | hp8[0-9][0-9])
+               basic_machine=hppa1.0-hp
+               ;;
+       hppa-next)
+               os=-nextstep3
+               ;;
+       hppaosf)
+               basic_machine=hppa1.1-hp
+               os=-osf
+               ;;
+       hppro)
+               basic_machine=hppa1.1-hp
+               os=-proelf
+               ;;
+       i370-ibm* | ibm*)
+               basic_machine=i370-ibm
+               ;;
+       i*86v32)
+               basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+               os=-sysv32
+               ;;
+       i*86v4*)
+               basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+               os=-sysv4
+               ;;
+       i*86v)
+               basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+               os=-sysv
+               ;;
+       i*86sol2)
+               basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'`
+               os=-solaris2
+               ;;
+       i386mach)
+               basic_machine=i386-mach
+               os=-mach
+               ;;
+       i386-vsta | vsta)
+               basic_machine=i386-unknown
+               os=-vsta
+               ;;
+       iris | iris4d)
+               basic_machine=mips-sgi
+               case $os in
+                   -irix*)
+                       ;;
+                   *)
+                       os=-irix4
+                       ;;
+               esac
+               ;;
+       isi68 | isi)
+               basic_machine=m68k-isi
+               os=-sysv
+               ;;
+       m68knommu)
+               basic_machine=m68k-unknown
+               os=-linux
+               ;;
+       m68knommu-*)
+               basic_machine=m68k-`echo $basic_machine | sed 's/^[^-]*-//'`
+               os=-linux
+               ;;
+       m88k-omron*)
+               basic_machine=m88k-omron
+               ;;
+       magnum | m3230)
+               basic_machine=mips-mips
+               os=-sysv
+               ;;
+       merlin)
+               basic_machine=ns32k-utek
+               os=-sysv
+               ;;
+       microblaze*)
+               basic_machine=microblaze-xilinx
+               ;;
+       mingw64)
+               basic_machine=x86_64-pc
+               os=-mingw64
+               ;;
+       mingw32)
+               basic_machine=i386-pc
+               os=-mingw32
+               ;;
+       mingw32ce)
+               basic_machine=arm-unknown
+               os=-mingw32ce
+               ;;
+       miniframe)
+               basic_machine=m68000-convergent
+               ;;
+       *mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*)
+               basic_machine=m68k-atari
+               os=-mint
+               ;;
+       mips3*-*)
+               basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`
+               ;;
+       mips3*)
+               basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown
+               ;;
+       monitor)
+               basic_machine=m68k-rom68k
+               os=-coff
+               ;;
+       morphos)
+               basic_machine=powerpc-unknown
+               os=-morphos
+               ;;
+       msdos)
+               basic_machine=i386-pc
+               os=-msdos
+               ;;
+       ms1-*)
+               basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'`
+               ;;
+       msys)
+               basic_machine=i386-pc
+               os=-msys
+               ;;
+       mvs)
+               basic_machine=i370-ibm
+               os=-mvs
+               ;;
+       nacl)
+               basic_machine=le32-unknown
+               os=-nacl
+               ;;
+       ncr3000)
+               basic_machine=i486-ncr
+               os=-sysv4
+               ;;
+       netbsd386)
+               basic_machine=i386-unknown
+               os=-netbsd
+               ;;
+       netwinder)
+               basic_machine=armv4l-rebel
+               os=-linux
+               ;;
+       news | news700 | news800 | news900)
+               basic_machine=m68k-sony
+               os=-newsos
+               ;;
+       news1000)
+               basic_machine=m68030-sony
+               os=-newsos
+               ;;
+       news-3600 | risc-news)
+               basic_machine=mips-sony
+               os=-newsos
+               ;;
+       necv70)
+               basic_machine=v70-nec
+               os=-sysv
+               ;;
+       next | m*-next )
+               basic_machine=m68k-next
+               case $os in
+                   -nextstep* )
+                       ;;
+                   -ns2*)
+                     os=-nextstep2
+                       ;;
+                   *)
+                     os=-nextstep3
+                       ;;
+               esac
+               ;;
+       nh3000)
+               basic_machine=m68k-harris
+               os=-cxux
+               ;;
+       nh[45]000)
+               basic_machine=m88k-harris
+               os=-cxux
+               ;;
+       nindy960)
+               basic_machine=i960-intel
+               os=-nindy
+               ;;
+       mon960)
+               basic_machine=i960-intel
+               os=-mon960
+               ;;
+       nonstopux)
+               basic_machine=mips-compaq
+               os=-nonstopux
+               ;;
+       np1)
+               basic_machine=np1-gould
+               ;;
+       neo-tandem)
+               basic_machine=neo-tandem
+               ;;
+       nse-tandem)
+               basic_machine=nse-tandem
+               ;;
+       nsr-tandem)
+               basic_machine=nsr-tandem
+               ;;
+       op50n-* | op60c-*)
+               basic_machine=hppa1.1-oki
+               os=-proelf
+               ;;
+       openrisc | openrisc-*)
+               basic_machine=or32-unknown
+               ;;
+       os400)
+               basic_machine=powerpc-ibm
+               os=-os400
+               ;;
+       OSE68000 | ose68000)
+               basic_machine=m68000-ericsson
+               os=-ose
+               ;;
+       os68k)
+               basic_machine=m68k-none
+               os=-os68k
+               ;;
+       pa-hitachi)
+               basic_machine=hppa1.1-hitachi
+               os=-hiuxwe2
+               ;;
+       paragon)
+               basic_machine=i860-intel
+               os=-osf
+               ;;
+       parisc)
+               basic_machine=hppa-unknown
+               os=-linux
+               ;;
+       parisc-*)
+               basic_machine=hppa-`echo $basic_machine | sed 's/^[^-]*-//'`
+               os=-linux
+               ;;
+       pbd)
+               basic_machine=sparc-tti
+               ;;
+       pbb)
+               basic_machine=m68k-tti
+               ;;
+       pc532 | pc532-*)
+               basic_machine=ns32k-pc532
+               ;;
+       pc98)
+               basic_machine=i386-pc
+               ;;
+       pc98-*)
+               basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'`
+               ;;
+       pentium | p5 | k5 | k6 | nexgen | viac3)
+               basic_machine=i586-pc
+               ;;
+       pentiumpro | p6 | 6x86 | athlon | athlon_*)
+               basic_machine=i686-pc
+               ;;
+       pentiumii | pentium2 | pentiumiii | pentium3)
+               basic_machine=i686-pc
+               ;;
+       pentium4)
+               basic_machine=i786-pc
+               ;;
+       pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*)
+               basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'`
+               ;;
+       pentiumpro-* | p6-* | 6x86-* | athlon-*)
+               basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
+               ;;
+       pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*)
+               basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'`
+               ;;
+       pentium4-*)
+               basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'`
+               ;;
+       pn)
+               basic_machine=pn-gould
+               ;;
+       power)  basic_machine=power-ibm
+               ;;
+       ppc | ppcbe)    basic_machine=powerpc-unknown
+               ;;
+       ppc-* | ppcbe-*)
+               basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'`
+               ;;
+       ppcle | powerpclittle | ppc-le | powerpc-little)
+               basic_machine=powerpcle-unknown
+               ;;
+       ppcle-* | powerpclittle-*)
+               basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'`
+               ;;
+       ppc64)  basic_machine=powerpc64-unknown
+               ;;
+       ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'`
+               ;;
+       ppc64le | powerpc64little | ppc64-le | powerpc64-little)
+               basic_machine=powerpc64le-unknown
+               ;;
+       ppc64le-* | powerpc64little-*)
+               basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'`
+               ;;
+       ps2)
+               basic_machine=i386-ibm
+               ;;
+       pw32)
+               basic_machine=i586-unknown
+               os=-pw32
+               ;;
+       rdos | rdos64)
+               basic_machine=x86_64-pc
+               os=-rdos
+               ;;
+       rdos32)
+               basic_machine=i386-pc
+               os=-rdos
+               ;;
+       rom68k)
+               basic_machine=m68k-rom68k
+               os=-coff
+               ;;
+       rm[46]00)
+               basic_machine=mips-siemens
+               ;;
+       rtpc | rtpc-*)
+               basic_machine=romp-ibm
+               ;;
+       s390 | s390-*)
+               basic_machine=s390-ibm
+               ;;
+       s390x | s390x-*)
+               basic_machine=s390x-ibm
+               ;;
+       sa29200)
+               basic_machine=a29k-amd
+               os=-udi
+               ;;
+       sb1)
+               basic_machine=mipsisa64sb1-unknown
+               ;;
+       sb1el)
+               basic_machine=mipsisa64sb1el-unknown
+               ;;
+       sde)
+               basic_machine=mipsisa32-sde
+               os=-elf
+               ;;
+       sei)
+               basic_machine=mips-sei
+               os=-seiux
+               ;;
+       sequent)
+               basic_machine=i386-sequent
+               ;;
+       sh)
+               basic_machine=sh-hitachi
+               os=-hms
+               ;;
+       sh5el)
+               basic_machine=sh5le-unknown
+               ;;
+       sh64)
+               basic_machine=sh64-unknown
+               ;;
+       sparclite-wrs | simso-wrs)
+               basic_machine=sparclite-wrs
+               os=-vxworks
+               ;;
+       sps7)
+               basic_machine=m68k-bull
+               os=-sysv2
+               ;;
+       spur)
+               basic_machine=spur-unknown
+               ;;
+       st2000)
+               basic_machine=m68k-tandem
+               ;;
+       stratus)
+               basic_machine=i860-stratus
+               os=-sysv4
+               ;;
+       strongarm-* | thumb-*)
+               basic_machine=arm-`echo $basic_machine | sed 's/^[^-]*-//'`
+               ;;
+       sun2)
+               basic_machine=m68000-sun
+               ;;
+       sun2os3)
+               basic_machine=m68000-sun
+               os=-sunos3
+               ;;
+       sun2os4)
+               basic_machine=m68000-sun
+               os=-sunos4
+               ;;
+       sun3os3)
+               basic_machine=m68k-sun
+               os=-sunos3
+               ;;
+       sun3os4)
+               basic_machine=m68k-sun
+               os=-sunos4
+               ;;
+       sun4os3)
+               basic_machine=sparc-sun
+               os=-sunos3
+               ;;
+       sun4os4)
+               basic_machine=sparc-sun
+               os=-sunos4
+               ;;
+       sun4sol2)
+               basic_machine=sparc-sun
+               os=-solaris2
+               ;;
+       sun3 | sun3-*)
+               basic_machine=m68k-sun
+               ;;
+       sun4)
+               basic_machine=sparc-sun
+               ;;
+       sun386 | sun386i | roadrunner)
+               basic_machine=i386-sun
+               ;;
+       sv1)
+               basic_machine=sv1-cray
+               os=-unicos
+               ;;
+       symmetry)
+               basic_machine=i386-sequent
+               os=-dynix
+               ;;
+       t3e)
+               basic_machine=alphaev5-cray
+               os=-unicos
+               ;;
+       t90)
+               basic_machine=t90-cray
+               os=-unicos
+               ;;
+       tile*)
+               basic_machine=$basic_machine-unknown
+               os=-linux-gnu
+               ;;
+       tx39)
+               basic_machine=mipstx39-unknown
+               ;;
+       tx39el)
+               basic_machine=mipstx39el-unknown
+               ;;
+       toad1)
+               basic_machine=pdp10-xkl
+               os=-tops20
+               ;;
+       tower | tower-32)
+               basic_machine=m68k-ncr
+               ;;
+       tpf)
+               basic_machine=s390x-ibm
+               os=-tpf
+               ;;
+       udi29k)
+               basic_machine=a29k-amd
+               os=-udi
+               ;;
+       ultra3)
+               basic_machine=a29k-nyu
+               os=-sym1
+               ;;
+       v810 | necv810)
+               basic_machine=v810-nec
+               os=-none
+               ;;
+       vaxv)
+               basic_machine=vax-dec
+               os=-sysv
+               ;;
+       vms)
+               basic_machine=vax-dec
+               os=-vms
+               ;;
+       vpp*|vx|vx-*)
+               basic_machine=f301-fujitsu
+               ;;
+       vxworks960)
+               basic_machine=i960-wrs
+               os=-vxworks
+               ;;
+       vxworks68)
+               basic_machine=m68k-wrs
+               os=-vxworks
+               ;;
+       vxworks29k)
+               basic_machine=a29k-wrs
+               os=-vxworks
+               ;;
+       w65*)
+               basic_machine=w65-wdc
+               os=-none
+               ;;
+       w89k-*)
+               basic_machine=hppa1.1-winbond
+               os=-proelf
+               ;;
+       xbox)
+               basic_machine=i686-pc
+               os=-mingw32
+               ;;
+       xps | xps100)
+               basic_machine=xps100-honeywell
+               ;;
+       xscale-* | xscalee[bl]-*)
+               basic_machine=`echo $basic_machine | sed 's/^xscale/arm/'`
+               ;;
+       ymp)
+               basic_machine=ymp-cray
+               os=-unicos
+               ;;
+       z8k-*-coff)
+               basic_machine=z8k-unknown
+               os=-sim
+               ;;
+       z80-*-coff)
+               basic_machine=z80-unknown
+               os=-sim
+               ;;
+       none)
+               basic_machine=none-none
+               os=-none
+               ;;
+
+# Here we handle the default manufacturer of certain CPU types.  It is in
+# some cases the only manufacturer, in others, it is the most popular.
+       w89k)
+               basic_machine=hppa1.1-winbond
+               ;;
+       op50n)
+               basic_machine=hppa1.1-oki
+               ;;
+       op60c)
+               basic_machine=hppa1.1-oki
+               ;;
+       romp)
+               basic_machine=romp-ibm
+               ;;
+       mmix)
+               basic_machine=mmix-knuth
+               ;;
+       rs6000)
+               basic_machine=rs6000-ibm
+               ;;
+       vax)
+               basic_machine=vax-dec
+               ;;
+       pdp10)
+               # there are many clones, so DEC is not a safe bet
+               basic_machine=pdp10-unknown
+               ;;
+       pdp11)
+               basic_machine=pdp11-dec
+               ;;
+       we32k)
+               basic_machine=we32k-att
+               ;;
+       sh[1234] | sh[24]a | sh[24]aeb | sh[34]eb | sh[1234]le | sh[23]ele)
+               basic_machine=sh-unknown
+               ;;
+       sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v)
+               basic_machine=sparc-sun
+               ;;
+       cydra)
+               basic_machine=cydra-cydrome
+               ;;
+       orion)
+               basic_machine=orion-highlevel
+               ;;
+       orion105)
+               basic_machine=clipper-highlevel
+               ;;
+       mac | mpw | mac-mpw)
+               basic_machine=m68k-apple
+               ;;
+       pmac | pmac-mpw)
+               basic_machine=powerpc-apple
+               ;;
+       *-unknown)
+               # Make sure to match an already-canonicalized machine name.
+               ;;
+       *)
+               echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2
+               exit 1
+               ;;
+esac
+
+# Here we canonicalize certain aliases for manufacturers.
+case $basic_machine in
+       *-digital*)
+               basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'`
+               ;;
+       *-commodore*)
+               basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'`
+               ;;
+       *)
+               ;;
+esac
+
+# Decode manufacturer-specific aliases for certain operating systems.
+
+if [ x"$os" != x"" ]
+then
+case $os in
+       # First match some system type aliases
+       # that might get confused with valid system types.
+       # -solaris* is a basic system type, with this one exception.
+       -auroraux)
+               os=-auroraux
+               ;;
+       -solaris1 | -solaris1.*)
+               os=`echo $os | sed -e 's|solaris1|sunos4|'`
+               ;;
+       -solaris)
+               os=-solaris2
+               ;;
+       -svr4*)
+               os=-sysv4
+               ;;
+       -unixware*)
+               os=-sysv4.2uw
+               ;;
+       -gnu/linux*)
+               os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'`
+               ;;
+       # First accept the basic system types.
+       # The portable systems comes first.
+       # Each alternative MUST END IN A *, to match a version number.
+       # -sysv* is not here because it comes later, after sysvr4.
+       -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
+             | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\
+             | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \
+             | -sym* | -kopensolaris* | -plan9* \
+             | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
+             | -aos* | -aros* \
+             | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
+             | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
+             | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \
+             | -bitrig* | -openbsd* | -solidbsd* \
+             | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \
+             | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
+             | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
+             | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \
+             | -chorusos* | -chorusrdb* | -cegcc* \
+             | -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
+             | -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \
+             | -linux-newlib* | -linux-musl* | -linux-uclibc* \
+             | -uxpv* | -beos* | -mpeix* | -udk* \
+             | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \
+             | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \
+             | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \
+             | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \
+             | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \
+             | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \
+             | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es*)
+       # Remember, each alternative MUST END IN *, to match a version number.
+               ;;
+       -qnx*)
+               case $basic_machine in
+                   x86-* | i*86-*)
+                       ;;
+                   *)
+                       os=-nto$os
+                       ;;
+               esac
+               ;;
+       -nto-qnx*)
+               ;;
+       -nto*)
+               os=`echo $os | sed -e 's|nto|nto-qnx|'`
+               ;;
+       -sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \
+             | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \
+             | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*)
+               ;;
+       -mac*)
+               os=`echo $os | sed -e 's|mac|macos|'`
+               ;;
+       -linux-dietlibc)
+               os=-linux-dietlibc
+               ;;
+       -linux*)
+               os=`echo $os | sed -e 's|linux|linux-gnu|'`
+               ;;
+       -sunos5*)
+               os=`echo $os | sed -e 's|sunos5|solaris2|'`
+               ;;
+       -sunos6*)
+               os=`echo $os | sed -e 's|sunos6|solaris3|'`
+               ;;
+       -opened*)
+               os=-openedition
+               ;;
+       -os400*)
+               os=-os400
+               ;;
+       -wince*)
+               os=-wince
+               ;;
+       -osfrose*)
+               os=-osfrose
+               ;;
+       -osf*)
+               os=-osf
+               ;;
+       -utek*)
+               os=-bsd
+               ;;
+       -dynix*)
+               os=-bsd
+               ;;
+       -acis*)
+               os=-aos
+               ;;
+       -atheos*)
+               os=-atheos
+               ;;
+       -syllable*)
+               os=-syllable
+               ;;
+       -386bsd)
+               os=-bsd
+               ;;
+       -ctix* | -uts*)
+               os=-sysv
+               ;;
+       -nova*)
+               os=-rtmk-nova
+               ;;
+       -ns2 )
+               os=-nextstep2
+               ;;
+       -nsk*)
+               os=-nsk
+               ;;
+       # Preserve the version number of sinix5.
+       -sinix5.*)
+               os=`echo $os | sed -e 's|sinix|sysv|'`
+               ;;
+       -sinix*)
+               os=-sysv4
+               ;;
+       -tpf*)
+               os=-tpf
+               ;;
+       -triton*)
+               os=-sysv3
+               ;;
+       -oss*)
+               os=-sysv3
+               ;;
+       -svr4)
+               os=-sysv4
+               ;;
+       -svr3)
+               os=-sysv3
+               ;;
+       -sysvr4)
+               os=-sysv4
+               ;;
+       # This must come after -sysvr4.
+       -sysv*)
+               ;;
+       -ose*)
+               os=-ose
+               ;;
+       -es1800*)
+               os=-ose
+               ;;
+       -xenix)
+               os=-xenix
+               ;;
+       -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*)
+               os=-mint
+               ;;
+       -aros*)
+               os=-aros
+               ;;
+       -zvmoe)
+               os=-zvmoe
+               ;;
+       -dicos*)
+               os=-dicos
+               ;;
+       -nacl*)
+               ;;
+       -none)
+               ;;
+       *)
+               # Get rid of the `-' at the beginning of $os.
+               os=`echo $os | sed 's/[^-]*-//'`
+               echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2
+               exit 1
+               ;;
+esac
+else
+
+# Here we handle the default operating systems that come with various machines.
+# The value should be what the vendor currently ships out the door with their
+# machine or put another way, the most popular os provided with the machine.
+
+# Note that if you're going to try to match "-MANUFACTURER" here (say,
+# "-sun"), then you have to tell the case statement up towards the top
+# that MANUFACTURER isn't an operating system.  Otherwise, code above
+# will signal an error saying that MANUFACTURER isn't an operating
+# system, and we'll never get to this point.
+
+case $basic_machine in
+       score-*)
+               os=-elf
+               ;;
+       spu-*)
+               os=-elf
+               ;;
+       *-acorn)
+               os=-riscix1.2
+               ;;
+       arm*-rebel)
+               os=-linux
+               ;;
+       arm*-semi)
+               os=-aout
+               ;;
+       c4x-* | tic4x-*)
+               os=-coff
+               ;;
+       hexagon-*)
+               os=-elf
+               ;;
+       tic54x-*)
+               os=-coff
+               ;;
+       tic55x-*)
+               os=-coff
+               ;;
+       tic6x-*)
+               os=-coff
+               ;;
+       # This must come before the *-dec entry.
+       pdp10-*)
+               os=-tops20
+               ;;
+       pdp11-*)
+               os=-none
+               ;;
+       *-dec | vax-*)
+               os=-ultrix4.2
+               ;;
+       m68*-apollo)
+               os=-domain
+               ;;
+       i386-sun)
+               os=-sunos4.0.2
+               ;;
+       m68000-sun)
+               os=-sunos3
+               ;;
+       m68*-cisco)
+               os=-aout
+               ;;
+       mep-*)
+               os=-elf
+               ;;
+       mips*-cisco)
+               os=-elf
+               ;;
+       mips*-*)
+               os=-elf
+               ;;
+       or32-*)
+               os=-coff
+               ;;
+       *-tti)  # must be before sparc entry or we get the wrong os.
+               os=-sysv3
+               ;;
+       sparc-* | *-sun)
+               os=-sunos4.1.1
+               ;;
+       *-be)
+               os=-beos
+               ;;
+       *-haiku)
+               os=-haiku
+               ;;
+       *-ibm)
+               os=-aix
+               ;;
+       *-knuth)
+               os=-mmixware
+               ;;
+       *-wec)
+               os=-proelf
+               ;;
+       *-winbond)
+               os=-proelf
+               ;;
+       *-oki)
+               os=-proelf
+               ;;
+       *-hp)
+               os=-hpux
+               ;;
+       *-hitachi)
+               os=-hiux
+               ;;
+       i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent)
+               os=-sysv
+               ;;
+       *-cbm)
+               os=-amigaos
+               ;;
+       *-dg)
+               os=-dgux
+               ;;
+       *-dolphin)
+               os=-sysv3
+               ;;
+       m68k-ccur)
+               os=-rtu
+               ;;
+       m88k-omron*)
+               os=-luna
+               ;;
+       *-next )
+               os=-nextstep
+               ;;
+       *-sequent)
+               os=-ptx
+               ;;
+       *-crds)
+               os=-unos
+               ;;
+       *-ns)
+               os=-genix
+               ;;
+       i370-*)
+               os=-mvs
+               ;;
+       *-next)
+               os=-nextstep3
+               ;;
+       *-gould)
+               os=-sysv
+               ;;
+       *-highlevel)
+               os=-bsd
+               ;;
+       *-encore)
+               os=-bsd
+               ;;
+       *-sgi)
+               os=-irix
+               ;;
+       *-siemens)
+               os=-sysv4
+               ;;
+       *-masscomp)
+               os=-rtu
+               ;;
+       f30[01]-fujitsu | f700-fujitsu)
+               os=-uxpv
+               ;;
+       *-rom68k)
+               os=-coff
+               ;;
+       *-*bug)
+               os=-coff
+               ;;
+       *-apple)
+               os=-macos
+               ;;
+       *-atari*)
+               os=-mint
+               ;;
+       *)
+               os=-none
+               ;;
+esac
+fi
+
+# Here we handle the case where we know the os, and the CPU type, but not the
+# manufacturer.  We pick the logical manufacturer.
+vendor=unknown
+case $basic_machine in
+       *-unknown)
+               case $os in
+                       -riscix*)
+                               vendor=acorn
+                               ;;
+                       -sunos*)
+                               vendor=sun
+                               ;;
+                       -cnk*|-aix*)
+                               vendor=ibm
+                               ;;
+                       -beos*)
+                               vendor=be
+                               ;;
+                       -hpux*)
+                               vendor=hp
+                               ;;
+                       -mpeix*)
+                               vendor=hp
+                               ;;
+                       -hiux*)
+                               vendor=hitachi
+                               ;;
+                       -unos*)
+                               vendor=crds
+                               ;;
+                       -dgux*)
+                               vendor=dg
+                               ;;
+                       -luna*)
+                               vendor=omron
+                               ;;
+                       -genix*)
+                               vendor=ns
+                               ;;
+                       -mvs* | -opened*)
+                               vendor=ibm
+                               ;;
+                       -os400*)
+                               vendor=ibm
+                               ;;
+                       -ptx*)
+                               vendor=sequent
+                               ;;
+                       -tpf*)
+                               vendor=ibm
+                               ;;
+                       -vxsim* | -vxworks* | -windiss*)
+                               vendor=wrs
+                               ;;
+                       -aux*)
+                               vendor=apple
+                               ;;
+                       -hms*)
+                               vendor=hitachi
+                               ;;
+                       -mpw* | -macos*)
+                               vendor=apple
+                               ;;
+                       -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*)
+                               vendor=atari
+                               ;;
+                       -vos*)
+                               vendor=stratus
+                               ;;
+               esac
+               basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"`
+               ;;
+esac
+
+echo $basic_machine$os
+exit
+
+# Local variables:
+# eval: (add-hook 'write-file-hooks 'time-stamp)
+# time-stamp-start: "timestamp='"
+# time-stamp-format: "%:y-%02m-%02d"
+# time-stamp-end: "'"
+# End:
diff --git a/ctdb/config/README b/ctdb/config/README
new file mode 100644 (file)
index 0000000..ffbeb0e
--- /dev/null
@@ -0,0 +1,31 @@
+This directory contains run-time support scripts for CTDB.
+
+Selected highlights:
+
+  ctdb.init
+
+    An initscript for starting ctdbd at boot time.
+
+  events.d/
+
+    Eventscripts.  See events.d/README for more details.
+
+  functions
+
+    Support functions, sourced by eventscripts and other scripts.
+
+  statd-callout
+
+    rpc.statd high-availability callout to support lock migration on
+    failover.
+
+Notes:
+
+* All of these scripts are written in POSIX Bourne shell.  Please
+  avoid bash-isms, including the use of "local" variables (which are
+  not available in POSIX shell).
+
+* Do not use absolute paths for commands.  Unit tests attempt to
+  replace many commands with stubs and cannot do this if commands are
+  specified with absolute paths.  The functions file controls $PATH so
+  absolute paths should not be required.
diff --git a/ctdb/config/ctdb-crash-cleanup.sh b/ctdb/config/ctdb-crash-cleanup.sh
new file mode 100755 (executable)
index 0000000..78eaa93
--- /dev/null
@@ -0,0 +1,29 @@
+#!/bin/sh
+#
+# This script can be called from a cronjob to automatically drop/release
+# all public ip addresses if CTDBD has crashed or stopped running.
+#
+
+[ -n "$CTDB_BASE" ] || \
+    export CTDB_BASE=$(cd -P $(dirname "$0") ; echo "$PWD")
+
+. "$CTDB_BASE/functions"
+
+# If ctdb is running, just exit
+if service ctdb status >/dev/null 2>&1 ; then
+    exit 0
+fi
+
+loadconfig ctdb
+
+[ -n "$CTDB_PUBLIC_ADDRESSES" ] || \
+       CTDB_PUBLIC_ADDRESSES="$CTDB_BASE/public_addresses"
+
+[ -f "$CTDB_PUBLIC_ADDRESSES" ] || \
+    die "No public addresses file found. Can't clean up."
+
+drop_all_public_ips 2>&1 | script_log "ctdb-crash-cleanup.sh"
+
+if [ -n "$CTDB_NATGW_PUBLIC_IP" ] ; then
+    drop_ip "$CTDB_NATGW_PUBLIC_IP" "ctdb-crash-cleanup.sh"
+fi
diff --git a/ctdb/config/ctdb.init b/ctdb/config/ctdb.init
new file mode 100755 (executable)
index 0000000..0e0d379
--- /dev/null
@@ -0,0 +1,171 @@
+#!/bin/sh
+
+# Start and stop CTDB (Clustered TDB daemon)
+#
+# chkconfig: - 90 01
+#
+# description: Starts and stops CTDB
+# pidfile: /var/run/ctdb/ctdbd.pid
+# config: /etc/sysconfig/ctdb
+
+### BEGIN INIT INFO
+# Provides:            ctdb
+# Required-Start:      $local_fs $syslog $network $remote_fs
+# Required-Stop:       $local_fs $syslog $network $remote_fs
+# Default-Start:       2 3 4 5
+# Default-Stop:        0 1 6
+# Short-Description:   start and stop ctdb service
+# Description:         Start and stop CTDB (Clustered TDB daemon)
+### END INIT INFO
+
+# Source the distribution's init function library (first match wins).
+if [ -f /etc/init.d/functions ] ; then
+    # Red Hat
+    . /etc/init.d/functions
+elif [ -f /etc/rc.d/init.d/functions ] ; then
+    # Red Hat (alternate location)
+    . /etc/rc.d/init.d/functions
+elif [ -f /etc/rc.status ] ; then
+    # SUSE
+    . /etc/rc.status
+    rc_reset
+    LC_ALL=en_US.UTF-8  # NOTE(review): assigned but not exported here - confirm intent
+elif [ -f /lib/lsb/init-functions ] ; then
+    # Debian
+    . /lib/lsb/init-functions
+fi
+
+# Avoid using root's TMPDIR
+unset TMPDIR
+
+[ -n "$CTDB_BASE" ] || export CTDB_BASE="/etc/ctdb"  # default config directory
+
+. "${CTDB_BASE}/functions"
+loadconfig "network"
+loadconfig "ctdb"
+
+# Exit quietly when networking is disabled (Red Hat: NETWORKING=no)
+if [ "$NETWORKING" = "no" ] ; then
+    exit 0
+fi
+
+detect_init_style  # presumably sets CTDB_INIT_STYLE (defined outside this file) - confirm
+export CTDB_INIT_STYLE
+
+ctdbd="${CTDBD:-/usr/sbin/ctdbd}"  # each default below can be overridden via the named variable
+ctdbd_wrapper="${CTDBD_WRAPPER:-/usr/sbin/ctdbd_wrapper}"
+pidfile="${CTDB_PIDFILE:-/var/run/ctdb/ctdbd.pid}"
+
+############################################################
+
+start()
+{
+    # Launch ctdbd via ctdbd_wrapper with the helper for $CTDB_INIT_STYLE.
+    printf '%s' "Starting ctdbd service: "   # portable form of "echo -n"
+    case "$CTDB_INIT_STYLE" in
+       suse)
+           startproc \
+               "$ctdbd_wrapper" "$pidfile" "start"
+           rc_status -v
+           ;;
+       redhat)
+           daemon --pidfile "$pidfile" \
+               "$ctdbd_wrapper" "$pidfile" "start"
+           RETVAL=$?
+           echo
+           [ $RETVAL -eq 0 ] && touch /var/lock/subsys/ctdb || RETVAL=1  # any failure becomes 1
+           return $RETVAL
+           ;;
+       debian)
+           start-stop-daemon --start --quiet --background --exec \
+               "$ctdbd_wrapper" -- "$pidfile" "start"  # no eval: keep quoting intact
+           ;;
+    esac
+}
+
+stop()
+{
+    # Ask ctdbd_wrapper to stop ctdbd and report the result per $CTDB_INIT_STYLE.
+    printf '%s' "Shutting down ctdbd service: "   # portable form of "echo -n"
+    case "$CTDB_INIT_STYLE" in
+       suse)
+           "$ctdbd_wrapper" "$pidfile" "stop"
+           rc_status -v
+           ;;
+       redhat)
+           "$ctdbd_wrapper" "$pidfile" "stop"
+           RETVAL=$?
+           [ $RETVAL -eq 0 ] && success || failure
+           echo ""
+           [ $RETVAL -eq 0 ] && rm -f /var/lock/subsys/ctdb  # lock removed only on success
+           return $RETVAL
+           ;;
+       debian)
+           "$ctdbd_wrapper" "$pidfile" "stop"
+           log_end_msg $?
+           ;;
+    esac
+}
+
+restart()
+{
+    # Full stop followed by a start; the result is whatever start reports.
+    stop ; start
+}
+
+check_status ()
+{
+    # Backward compatibility.  When we arrange to pass --pidfile to
+    # ctdbd we also create the directory that will contain it.  If
+    # that directory is missing then we don't use the pidfile to check
+    # status.  Note that this probably won't work if
+    # $CTDB_VALGRIND="yes" but this doesn't need full backward
+    # compatibility because it is a debug option.
+    if [ -d "$(dirname "$pidfile")" ] ; then
+       _pf_opt="-p $pidfile"
+    else
+       _pf_opt=""
+    fi
+    # $_pf_opt must stay unquoted below: it expands to two words or to none.
+    case "$CTDB_INIT_STYLE" in
+       suse)
+           checkproc $_pf_opt "$ctdbd"
+           rc_status -v
+           ;;
+       redhat)
+           status $_pf_opt -l "ctdb" "$ctdbd"
+           ;;
+       debian)
+           status_of_proc $_pf_opt "$ctdbd" "ctdb"
+           ;;
+    esac
+}
+
+############################################################
+
+case "$1" in  # dispatch on the requested init action
+    start)
+       start
+       ;;
+    stop)
+       stop
+       ;;
+    restart|reload|force-reload)  # all three perform a full stop + start
+       restart
+       ;;
+    status)
+       check_status
+       ;;
+    condrestart|try-restart)  # restart only when the status check succeeds
+       if check_status >/dev/null ; then
+           restart
+       fi
+       ;;
+    cron)
+       # used from cron to auto-restart ctdb when the status check fails
+       check_status >/dev/null 2>&1 || restart
+       ;;
+    *)  # unknown or missing action
+       echo "Usage: $0 {start|stop|restart|reload|force-reload|status|cron|condrestart|try-restart}"
+       exit 1
+esac
diff --git a/ctdb/config/ctdb.service b/ctdb/config/ctdb.service
new file mode 100644 (file)
index 0000000..ea37c30
--- /dev/null
@@ -0,0 +1,15 @@
+[Unit]
+Description=CTDB
+After=network.target
+
+[Service]
+Type=forking
+LimitCORE=infinity
+PIDFile=/run/ctdb/ctdbd.pid
+ExecStart=/usr/sbin/ctdbd_wrapper /run/ctdb/ctdbd.pid start
+ExecStop=/usr/sbin/ctdbd_wrapper /run/ctdb/ctdbd.pid stop
+KillMode=control-group
+Restart=no
+
+[Install]
+WantedBy=multi-user.target
diff --git a/ctdb/config/ctdb.sudoers b/ctdb/config/ctdb.sudoers
new file mode 100644 (file)
index 0000000..1c6619b
--- /dev/null
@@ -0,0 +1,3 @@
+Defaults!/etc/ctdb/statd-callout       !requiretty
+
+rpcuser                ALL=(ALL)       NOPASSWD: /etc/ctdb/statd-callout
diff --git a/ctdb/config/ctdb.sysconfig b/ctdb/config/ctdb.sysconfig
new file mode 100644 (file)
index 0000000..35bf5f8
--- /dev/null
@@ -0,0 +1,336 @@
+# Options to ctdbd. This is read by /etc/init.d/ctdb
+
+# You must specify the location of a shared lock file across all the
+# nodes for split brain prevention to work.
+# This must be on shared storage.
+# CTDB can operate without a reclock file, but this means that there is no
+# protection against a split brain.
+# It is strongly suggested to NOT run ctdb without a reclock file.
+CTDB_RECOVERY_LOCK="/some/place/on/shared/storage"
+
+# when doing IP takeover you also may specify what network interface
+# to use by default for the public addresses. Otherwise you must
+# specify an interface on each line of the public addresses file
+# there is no default
+# CTDB_PUBLIC_INTERFACE=eth0
+
+# Should ctdb do IP takeover? If it should, then specify a file
+# containing the list of public IP addresses that ctdb will manage
+# Note that these IPs must be different from those in $NODES above
+# there is no default.
+# The syntax is one line per public address of the form :
+#   <ipaddress>/<netmask> <interface>
+# Example: 10.1.1.1/24 eth0
+#
+# CTDB_PUBLIC_ADDRESSES=/etc/ctdb/public_addresses
+
+# Should CTDB present the cluster using a single public ip address to clients
+# and multiplex clients across all CONNECTED nodes ?
+# This is based on LVS 
+# When this is enabled, the entire cluster will present one single ip address
+# which clients will connect to.
+# CTDB_LVS_PUBLIC_IP=10.1.1.1
+
+
+# should ctdb manage starting/stopping the Samba service for you?
+# default is to not manage Samba
+# CTDB_MANAGES_SAMBA=yes
+
+# If there are very many shares it may not be feasible to check that all
+# of them are available during each monitoring interval.
+# In that case this check can be disabled
+# CTDB_SAMBA_SKIP_SHARE_CHECK=yes
+# CTDB_NFS_SKIP_SHARE_CHECK=yes
+
+# specify which ports we should check that there is a daemon listening to
+# by default we use testparm and look in smb.conf to figure out.
+# CTDB_SAMBA_CHECK_PORTS="445"
+
+# should ctdb manage starting/stopping Winbind service?
+# if left commented out then it will be autodetected based on smb.conf
+# CTDB_MANAGES_WINBIND=yes
+
+# should ctdb manage starting/stopping the VSFTPD service
+# CTDB_MANAGES_VSFTPD=yes
+
+# should ctdb manage starting/stopping the ISCSI service
+# CTDB_MANAGES_ISCSI=yes
+
+# should ctdb manage starting/stopping the NFS service
+# CTDB_MANAGES_NFS=yes
+
+# should ctdb manage starting/stopping the Apache web server httpd?
+# CTDB_MANAGES_HTTPD=yes
+
+# The init style (redhat/suse/debian...) is usually auto-detected.
+# The names of init scripts of services managed by CTDB are set
+# based on the detected init style. You can override the init style
+# auto-detection here to explicitly use a scheme. This might be
+# useful when you have installed packages (for instance samba
+# packages) with a different init script layout.
+# There is no default.
+# CTDB_INIT_STYLE=redhat
+
+# The following are specific Samba init scripts / services that you
+# can override from auto-detection.
+# There are no defaults.
+# CTDB_SERVICE_SMB=smb
+# CTDB_SERVICE_NMB=nmb
+# CTDB_SERVICE_WINBIND=winbind
+
+# you may wish to raise the file descriptor limit for ctdb
+# use a ulimit command here. ctdb needs one file descriptor per
+# connected client (ie. one per connected client in Samba)
+#  ulimit -n 10000
+
+# the NODES file must be specified or ctdb won't start
+# it should contain a list of IPs that ctdb will use
+# it must be exactly the same on all cluster nodes
+# defaults to /etc/ctdb/nodes
+# CTDB_NODES=/etc/ctdb/nodes
+
+# a script to run when node health changes
+# CTDB_NOTIFY_SCRIPT=/etc/ctdb/notify.sh
+
+# a script to collect data when an eventscript has hung
+# CTDB_DEBUG_HUNG_SCRIPT=/etc/ctdb/debug-hung-script.sh
+
+# the directory to put the local ctdb database files in
+# defaults to /var/ctdb
+# CTDB_DBDIR=/var/ctdb
+
+# the directory to put the local persistent ctdb database files in
+# defaults to /var/ctdb/persistent
+# CTDB_DBDIR_PERSISTENT=/var/ctdb/persistent
+
+# the directory where service specific event scripts are stored
+# defaults to /etc/ctdb/events.d
+# CTDB_EVENT_SCRIPT_DIR=/etc/ctdb/events.d
+
+# the location of the local ctdb socket
+# defaults to /var/run/ctdb/ctdbd.socket
+# CTDB_SOCKET=/var/run/ctdb/ctdbd.socket
+
+# what transport to use. Only tcp is currently supported
+# defaults to tcp
+# CTDB_TRANSPORT="tcp"
+
+# These settings allow monitoring for low/out-of-memory conditions.
+#
+# If set, once available memory drops below CTDB_MONITOR_FREE_MEMORY_WARN
+# ctdb will start logging messages that memory is low, but will not
+# take any further action.
+#
+# If the amount of free memory drops below CTDB_MONITOR_FREE_MEMORY 
+# ctdb will fail all services over to a different node and finally shut down.
+# Once this occurs, the administrator needs to find the reason for the OOM
+# situation, rectify it and restart ctdb with "service ctdb start"
+# The unit is MByte
+# CTDB_MONITOR_FREE_MEMORY_WARN=100
+# CTDB_MONITOR_FREE_MEMORY=10
+
+# Should the 60.nfs monitor event try to correct the number of nfsd
+# threads?  This works around a limitation in some NFS initscripts
+# where some threads can be stuck in host filesystem calls (perhaps
+# due to slow storage), a restart occurs, some threads don't exit, the
+# start only adds the missing number of threads, the stuck threads
+# exit, and the result is a lower than expected thread count.  Note
+# that you must also set $RPCNFSDCOUNT (RedHat/Debian) or
+# $USE_KERNEL_NFSD_NUMBER (SUSE) in your NFS configuration so the
+# monitoring code knows how many threads there should be - if neither
+# of these are set then this option will be ignored.  The default is
+# to not do this check.
+# CTDB_MONITOR_NFS_THREAD_COUNT="yes"
+
+
+# The number of nfsd threads to dump stack traces for if some are
+# still alive after stopping NFS during a restart.  The default is to
+# dump no stack traces.
+# CTDB_NFS_DUMP_STUCK_THREADS=5
+
+# Host to use for rpcinfo checks.  Using "localhost" causes
+# /etc/services to be read several times by rpcinfo, so this allows
+# optimisation.
+CTDB_RPCINFO_LOCALHOST="127.0.0.1"
+
+# When set to yes, the CTDB node will start in DISABLED mode and not host
+# any public ip addresses. The administrator needs to explicitly enable
+# the node with "ctdb enable"
+# CTDB_START_AS_DISABLED="yes"
+
+# LMASTER and RECMASTER capabilities.
+# By default all nodes are capable of both being LMASTER for records and
+# also for taking the RECMASTER role and perform recovery.
+# These parameters can be used to disable these two roles on a node.
+# Note: If there are NO available nodes left in a cluster that can perform
+# the RECMASTER role, the cluster will not be able to recover from a failure
+# and will remain in RECOVERY mode until a RECMASTER capable node becomes
+# available. Same for LMASTER.
+# These parameters are useful for scenarios where you have one "remote" node
+# in a cluster and you do not want the remote node to be fully participating
+# in the cluster and slow things down.
+# For that case, set both roles to "no" for the remote node on the remote site
+# but leave the roles default to "yes" on the primary nodes in the central
+# datacentre.
+# CTDB_CAPABILITY_RECMASTER=yes
+# CTDB_CAPABILITY_LMASTER=yes
+
+# NAT-GW configuration
+# Some services running on the CTDB node may need to originate traffic to
+# remote servers before the node is assigned any IP addresses.
+# This is problematic since before the node has public addresses the node might
+# not be able to route traffic to the public networks.
+# One solution is to have static public addresses assigned with routing
+# in addition to the public address interfaces, thus guaranteeing that
+# a node always can route traffic to the external network.
+# This is the most simple solution but it uses up a large number of 
+# additional ip addresses.
+#
+# A more complex solution is NAT-GW.
+# In this mode we only need one additional ip address for the cluster from
+# the external public network.
+# One of the nodes in the cluster is elected to be hosting this ip address
+# so it can reach the external services. This node is also configured
+# to use NAT MASQUERADING for all traffic from the internal private network
+# to the external network. This node is the NAT-GW node.
+#
+# All other nodes are set up with a default route with a metric of 10 to point
+# to the nat-gw node.
+# 
+# The effect of this is that only when a node does not have a public address
+# and thus no proper routes to the external world it will instead
+# route all packets through the nat-gw node.
+#
+# CTDB_NATGW_NODES is the list of nodes that belong to this natgw group.
+# You can have multiple natgw groups in one cluster but each node
+# can only belong to one single natgw group.
+#
+# CTDB_NATGW_PUBLIC_IP=10.0.0.227/24
+# CTDB_NATGW_PUBLIC_IFACE=eth0
+# CTDB_NATGW_DEFAULT_GATEWAY=10.0.0.1
+# CTDB_NATGW_PRIVATE_NETWORK=10.1.1.0/24
+# CTDB_NATGW_NODES=/etc/ctdb/natgw_nodes
+#
+# Normally any node in the natgw group can act as the natgw master.
+# In some configurations you may have special nodes that are part of the
+# cluster/natgw group, but where the node lacks connectivity to the 
+# public network.
+# For these cases, set this variable to make these nodes not able to
+# become natgw master.
+#
+# CTDB_NATGW_SLAVE_ONLY=yes
+
+
+# PER_IP_ROUTING configuration
+#
+# Some setups have multiple network interfaces connected to the
+# same network. By default all traffic for a network is routed
+# through only one interface, while the others are idle.
+#
+# On Linux it is possible to use policy based routing to spread the load
+# across all interfaces. This is implemented by using a separate
+# routing table per public ip address.
+#
+# The configuration file configured by CTDB_PER_IP_ROUTING_CONF
+# contains the list of additional routes. The routes are bound to the
+# interface that is holding the public ip address.
+#
+# The format of the config file looks like this:
+# <public_ip_address> <network> [<gateway>]
+# and it's possible to have multiple routes per public ip address.
+#
+# If the special value "__auto_link_local__" is used, the config
+# file is autogenerated. Each public ip address gets a special route
+# for its own subnet bound to its current interface.
+# E.g. 10.1.2.3/24 will result in a config file line
+# 10.1.2.3 10.1.2.0/24
+#
+# The CTDB_PER_IP_ROUTING_RULE_PREF option needs to be configured.
+# The value will be passed as "pref" argument of "ip rule".
+# The value should be between 1 and 32765. So that the rule
+# comes after the rule for "local" routing table and before
+# the rule for the "main" routing table. This way the specific
+# routing table just overloads the "main" routing table,
+# this is useful because with the "__auto_link_local__" setup
+# the default route still comes from the "main" routing table.
+#
+# The routing table ids are automatically allocated. On
+# Linux the routing table ids must be in the range of 0 to 255.
+# But some are reserved values, see /etc/iproute2/rt_tables.
+# You need to configure a range (CTDB_PER_IP_ROUTING_TABLE_ID_LOW
+# and CTDB_PER_IP_ROUTING_TABLE_ID_HIGH) from which the table ids can be taken.
+#
+# The default value for CTDB_PER_IP_ROUTING_CONF is "",
+# which means the feature is disabled by default.
+#
+# CTDB_PER_IP_ROUTING_CONF="/etc/ctdb/per_ip_routing.conf"
+# CTDB_PER_IP_ROUTING_CONF="__auto_link_local__"
+# CTDB_PER_IP_ROUTING_TABLE_ID_LOW=10
+# CTDB_PER_IP_ROUTING_TABLE_ID_HIGH=250
+# CTDB_PER_IP_ROUTING_RULE_PREF=10000
+
+# Make offline interfaces not a reason for being UNHEALTHY.
+# The CTDB_PARTIALLY_ONLINE_INTERFACES option changes
+# the behavior of the 10.interface monitor event.
+# In some setups it's desired that interfaces without
+# an active link don't change the node to unhealthy.
+# ctdbd is just informed about the interface status
+# and "ctdb status" displays the node as "PARTIALLYONLINE".
+#
+# CTDB_PARTIALLY_ONLINE_INTERFACES="yes"
+
+# where to log messages
+# the default is /var/log/log.ctdb
+# CTDB_LOGFILE=/var/log/log.ctdb
+
+# what debug level to run at. Higher means more verbose
+# the default is ERR
+CTDB_DEBUGLEVEL=ERR
+
+# whether to suppress core files.  Default is no.
+# CTDB_SUPPRESS_COREFILE=yes
+
+# Write debug messages to syslog instead of logfile?
+# The default is not to use syslog.
+# CTDB_SYSLOG=no
+
+# Should the 99.timeout monitor event script be run?
+# This event script just sleeps long enough to trigger the
+# event script timeout. Might be useful for debugging.
+# The default is "no".
+# CTDB_RUN_TIMEOUT_MONITOR=no
+
+# Should ctdbd start with corrupted/unhealthy persistent databases?
+# This parameter specifies the max error count for persistent health
+# checks before the "startup" event. The value must be a positive
+# integer value, "0" or "-1".
+# The default is "0", which means ctdbd will not start.
+# "-1" means wait forever.
+# CTDB_MAX_PERSISTENT_CHECK_ERRORS=0
+
+# All log entries up to level 9 are also collected into a in-memory ringbuffer
+# in addition to the log that is written to the log file.
+# This parameter controls how many entries we allow for this in memory log
+# CTDB_LOG_RINGBUF_SIZE=500000
+
+# Monitor filesystem usage.
+# when set, and the 40.fs_use eventscript is enabled, this variable
+# allows one to monitor the filesystem use and flag a node as unhealthy when
+# the filesystem becomes too full.
+# This is useful for example when /var grows too big.
+# Example: monitor both / and /var and make the node unhealthy when either goes
+# above 90%
+# CTDB_CHECK_FS_USE="/:90 /var:90"
+
+# Should CTDB automatically start and stop services when it is told to
+# newly manage or no longer manage them?
+CTDB_SERVICE_AUTOSTARTSTOP=yes
+
+# 
+#
+# set any default tuning options for ctdb
+# use CTDB_SET_XXXX=value where XXXX is the name of the tuning
+# variable
+# for example
+#    CTDB_SET_TRAVERSETIMEOUT=60
+# you can get a list of variables using "ctdb listvars"
diff --git a/ctdb/config/ctdbd_wrapper b/ctdb/config/ctdbd_wrapper
new file mode 100755 (executable)
index 0000000..f0b032d
--- /dev/null
@@ -0,0 +1,275 @@
+#!/bin/sh
+
+# ctdbd wrapper - start or stop CTDB
+
+usage ()
+{
+    echo "usage: ctdbd_wrapper <pidfile> { start | stop }"
+    exit 1
+}
+
+[ $# -eq 2 ] || usage
+
+pidfile="$1"
+action="$2"
+
+############################################################
+
+[ -n "$CTDB_BASE" ] || export CTDB_BASE="/etc/ctdb"
+
+. "${CTDB_BASE}/functions"
+loadconfig "ctdb"
+
+export CTDB_SOCKET
+
+ctdbd="${CTDBD:-/usr/sbin/ctdbd}"
+
+############################################################
+
+# ctdbd_is_running()
+
+# 1. Check if ctdbd is running.
+#    - If the PID file is being used then, if the PID file is present,
+#      ctdbd is only considered to be running if the PID in the file is
+#      active.
+#    - If the PID file is not being used (i.e. we're upgrading from a
+#      version that doesn't support it) then the presence of any ctdbd
+#      processes is enough proof.
+
+# 2. Print a comma-separated list of PIDs that can be
+#    used with "pkill -s".
+#    - If the PID file is being used then this is just the PID in that
+#      file.  This also happens to be the session ID, so can be used
+#      to kill all CTDB processes.
+#    - If the PID file is not being used (i.e. upgrading) then this is
+#      just any ctdbd processes that are running.  Hopefully one of
+#      them is the session ID so that it can be used to kill all CTDB
+#      processes.
+
+# Combining these 2 checks is an optimisation to avoid potentially
+# running too many pgrep/pkill processes on an already loaded system.
+# Trawling through /proc/ can be very expensive.
+
+ctdbd_is_running ()
+{
+    # If the directory for the PID file exists then respect the
+    # existence of a PID file.
+    _pidfile_dir=$(dirname "$pidfile")
+    if [ -d "$_pidfile_dir" ] ; then
+       if read _pid 2>/dev/null <"$pidfile" ; then
+           echo "$_pid"
+
+           # Return value of kill is used
+           kill -0 $_pid 2>/dev/null
+       else
+           # Missing/empty PID file
+           return 1
+       fi
+    else
+       if _pid=$(pgrep -f "${ctdbd}\>") ; then
+           echo $_pid | sed -e 's@ @,@g'
+           return 0
+       else
+           return 1
+       fi
+    fi
+}
+
+############################################################
+
+# Build the ctdbd command line in the global $ctdb_options from the
+# CTDB_* configuration variables and $pidfile.  Values are wrapped in
+# single quotes because the result is later passed through eval.
+build_ctdb_options ()
+{
+    ctdb_options=""
+
+    # maybe_set <option> <value> [<required-value>]
+    #
+    # Append <option> to $ctdb_options if <value> is non-empty.  With
+    # a third argument the option is a value-less flag that is only
+    # added when <value> equals <required-value>.
+    maybe_set ()
+    {
+       # If the given variable isn't set then do nothing
+       [ -n "$2" ] || return
+       # If a required value for the variable and it doesn't match,
+       # then do nothing
+       [ -z "$3" -o "$3" = "$2" ] || return
+
+       val="'$2'"
+       # Long options take "=value", short options take " value"
+       case "$1" in
+           --*) sep="=" ;;
+           -*)  sep=" " ;;
+       esac
+       # For these options we're only passing a value-less flag.
+       if [ -n "$3" ] ; then
+           val=""
+           sep=""
+       fi
+
+       ctdb_options="${ctdb_options}${ctdb_options:+ }${1}${sep}${val}"
+    }
+
+    if [ -z "$CTDB_RECOVERY_LOCK" ] ; then
+        echo "No recovery lock specified. Starting CTDB without split brain prevention"
+    fi
+    maybe_set "--reclock"                "$CTDB_RECOVERY_LOCK"
+
+    maybe_set "--pidfile"                "$pidfile"
+
+    # build up ctdb_options variable from optional parameters
+    maybe_set "--logfile"                "$CTDB_LOGFILE"
+    maybe_set "--nlist"                  "$CTDB_NODES"
+    maybe_set "--socket"                 "$CTDB_SOCKET"
+    maybe_set "--public-addresses"       "$CTDB_PUBLIC_ADDRESSES"
+    maybe_set "--public-interface"       "$CTDB_PUBLIC_INTERFACE"
+    maybe_set "--dbdir"                  "$CTDB_DBDIR"
+    maybe_set "--dbdir-persistent"       "$CTDB_DBDIR_PERSISTENT"
+    maybe_set "--dbdir-state"            "$CTDB_DBDIR_STATE"
+    maybe_set "--event-script-dir"       "$CTDB_EVENT_SCRIPT_DIR"
+    maybe_set "--transport"              "$CTDB_TRANSPORT"
+    maybe_set "-d"                       "$CTDB_DEBUGLEVEL"
+    maybe_set "--notification-script"    "$CTDB_NOTIFY_SCRIPT"
+    maybe_set "--start-as-disabled"      "$CTDB_START_AS_DISABLED"    "yes"
+    maybe_set "--start-as-stopped"       "$CTDB_START_AS_STOPPED"     "yes"
+    maybe_set "--no-recmaster"           "$CTDB_CAPABILITY_RECMASTER" "no"
+    maybe_set "--no-lmaster"             "$CTDB_CAPABILITY_LMASTER"   "no"
+    maybe_set "--lvs --single-public-ip" "$CTDB_LVS_PUBLIC_IP"
+    maybe_set "--script-log-level"       "$CTDB_SCRIPT_LOG_LEVEL"
+    maybe_set "--log-ringbuf-size"       "$CTDB_LOG_RINGBUF_SIZE"
+    maybe_set "--syslog"                 "$CTDB_SYSLOG"               "yes"
+    maybe_set "--max-persistent-check-errors" "$CTDB_MAX_PERSISTENT_CHECK_ERRORS"
+}
+
+export_debug_variables ()
+{
+    export CTDB_DEBUG_HUNG_SCRIPT CTDB_EXTERNAL_TRACE CTDB_DEBUG_LOCKS
+}
+
+# kill_ctdbd <session>
+#
+# Send SIGKILL to every process in the given session ID(s), if any
+# were given, and then remove the PID file named by the global
+# $pidfile.  Errors from pkill are discarded.
+kill_ctdbd ()
+{
+    _session="$1"
+
+    # An empty session means there is nothing known to kill
+    if [ -n "$_session" ] ; then
+       pkill -9 -s "$_session" 2>/dev/null
+    fi
+    rm -f "$pidfile"
+}
+
+############################################################
+
+start()
+{
+    if _session=$(ctdbd_is_running) ; then
+       echo $"CTDB is already running"
+       return 0
+    fi
+
+    # About to start new $ctdbd.  The main daemon is not running but
+    # there may still be other processes around, so do some cleanup.
+    # Note that starting ctdbd below will destroy the Unix domain
+    # socket, so any processes that aren't yet completely useless soon
+    # will be, so this can really do no harm.
+    kill_ctdbd "$_session"
+
+    build_ctdb_options
+
+    export_debug_variables
+
+    if [ "$CTDB_SUPPRESS_COREFILE" = "yes" ]; then
+       ulimit -c 0
+    else
+       ulimit -c unlimited
+    fi
+
+    mkdir -p $(dirname "$pidfile")
+
+    if [ -n "$CTDB_VALGRIND" -a "$CTDB_VALGRIND" != "no" ] ; then
+       if [ "$CTDB_VALGRIND" = "yes" ] ; then
+           ctdbd="valgrind -q --log-file=/var/log/ctdb_valgrind ${ctdbd}"
+       else
+           ctdbd="${CTDB_VALGRIND} ${ctdbd}"
+       fi
+       ctdb_options="${ctdb_options} --valgrinding"
+    fi
+
+    if [ "$CTDB_SYSLOG" != "yes" ] ; then
+       logger -t ctdbd "CTDB is being run without syslog enabled.  Logs will be in ${CTDB_LOGFILE:-/var/log/log.ctdb}"
+    fi
+
+    eval "$ctdbd" "$ctdb_options" || return 1
+
+    # Wait until ctdbd has started and is ready to respond to clients.
+    _pid=""
+    _timeout="${CTDB_STARTUP_TIMEOUT:-10}"
+    _count=0
+    while [ $_count -lt $_timeout ] ; do
+       # If we don't have the PID then try to read it.
+       [ -n "$_pid" ] || read _pid 2>/dev/null <"$pidfile"
+
+       # If we got the PID but the PID file has gone or the process
+       # is no longer running then stop waiting... CTDB is dead.
+       if [ -n "$_pid" ] ; then
+           if [ ! -e "$pidfile" ] || ! kill -0 "$_pid" 2>/dev/null ; then
+               echo "CTDB exited during initialisation - check logs."
+               kill_ctdbd "$_pid"
+               drop_all_public_ips >/dev/null 2>&1
+               return 1
+           fi
+
+           if ctdb runstate first_recovery startup running >/dev/null 2>&1 ; then
+               return 0
+           fi
+       fi
+
+       _count=$(($_count + 1))
+       sleep 1
+    done
+
+    echo "Timed out waiting for initialisation - check logs - killing CTDB"
+    kill_ctdbd "$_pid"
+    drop_all_public_ips >/dev/null 2>&1
+    return 1
+}
+
+# Shut down ctdbd, escalating to SIGKILL if it does not exit in time.
+#
+# Returns 0 if CTDB was not running or all processes in its session
+# have exited.  Returns 1 only if processes are still present one
+# second after they were sent SIGKILL.
+stop()
+{
+    if ! _session=$(ctdbd_is_running) ; then
+       echo "CTDB is not running"
+       return 0
+    fi
+
+    ctdb shutdown
+
+    # Wait for remaining CTDB processes to exit...
+    # Polls once per second, up to $CTDB_SHUTDOWN_TIMEOUT (default 30)
+    _timeout=${CTDB_SHUTDOWN_TIMEOUT:-30}
+    _count=0
+    while [ $_count -lt $_timeout ] ; do
+       # pkill -0 only tests whether any process in the session exists
+       pkill -0 -s "$_session" 2>/dev/null || return 0
+
+       _count=$(($_count + 1))
+       sleep 1
+    done
+
+    echo "Timed out waiting for CTDB to shutdown.  Killing CTDB processes."
+    kill_ctdbd "$_session"
+    drop_all_public_ips >/dev/null 2>&1
+
+    # Give the killed processes a moment to disappear before checking
+    sleep 1
+
+    if pkill -0 -s "$_session" ; then
+       # If SIGKILL didn't work then things are bad...
+       echo "Failed to kill all CTDB processes.  Giving up."
+       return 1
+    fi
+
+    return 0
+}
+
+############################################################
+
+# Allow notifications for start/stop.
+if [ -x "$CTDB_BASE/rc.ctdb" ] ; then
+    "$CTDB_BASE/rc.ctdb" "$action"
+fi
+
+case "$action" in
+    start) start ;;
+    stop)  stop  ;;
+    *)
+       echo "usage: $0 {start|stop}"
+       exit 1
+esac
diff --git a/ctdb/config/debug-hung-script.sh b/ctdb/config/debug-hung-script.sh
new file mode 100755 (executable)
index 0000000..1984242
--- /dev/null
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+(
+    flock --wait 2 9 || exit 1
+
+    echo "===== Start of hung script debug for PID=\"$1\", event=\"$2\" ====="
+
+    echo "pstree -p -a ${1}:"
+    pstree -p -a $1
+
+    if [ "$2" = "init" ] ; then
+       exit 0
+    fi
+
+    echo "ctdb scriptstatus ${2}:"
+    # No use running several of these in parallel if, say, "releaseip"
+    # event hangs for multiple IPs.  In that case the output would be
+    # interleaved in the log and would just be confusing.
+    ctdb scriptstatus "$2"
+
+    echo "===== End of hung script debug for PID=\"$1\", event=\"$2\" ====="
+
+) 9>"${CTDB_VARDIR}/debug-hung-script.lock"
diff --git a/ctdb/config/debug_locks.sh b/ctdb/config/debug_locks.sh
new file mode 100644 (file)
index 0000000..0dde861
--- /dev/null
@@ -0,0 +1,45 @@
+#!/bin/sh
+
+# This script parses /proc/locks and finds the processes that are holding
+# locks on CTDB databases.  For all those processes the script dumps a
+# stack trace using gstack.
+#
+# This script can be used only if Samba is configured to use fcntl locks
+# rather than mutex locks.
+
+# Create sed expression to convert inodes to names
+sed_cmd=$( ls -li /var/ctdb/*.tdb.* /var/ctdb/persistent/*.tdb.* |
+          sed -e "s#/var/ctdb[/persistent]*/\(.*\)#\1#" |
+          awk '{printf "s#[0-9]*:[0-9]*:%s #%s #\n", $1, $10}' )
+
+# Parse /proc/locks and extract following information
+#    pid process_name tdb_name offsets [W]
+out=$( cat /proc/locks |
+    grep -F "POSIX  ADVISORY  WRITE" |
+    awk '{ if($2 == "->") { print $6, $7, $8, $9, "W" } else { print $5, $6, $7, $8 } }' |
+    while read pid rest ; do
+       pname=$(readlink /proc/$pid/exe)
+       echo $pid $pname $rest
+    done | sed -e "$sed_cmd" | grep "\.tdb" )
+
+if [ -n "$out" ]; then
+    # Log information about locks
+    echo "$out" | logger -t "ctdbd-lock"
+
+    # Find processes that are waiting for locks
+    dbs=$(echo "$out" | grep "W$" | awk '{print $3}')
+    all_pids=""
+    for db in $dbs ; do
+       pids=$(echo "$out" | grep -v "W$" | grep "$db" | grep -v ctdbd | awk '{print $1}')
+       all_pids="$all_pids $pids"
+    done
+    pids=$(echo $all_pids | sort -u)
+
+    # For each process waiting, log stack trace
+    for pid in $pids ; do
+       gstack $pid | logger -t "ctdbd-lock $pid"
+#      gcore -o /var/log/core-deadlock-ctdb $pid
+    done
+fi
+
+exit 0
diff --git a/ctdb/config/events.d/00.ctdb b/ctdb/config/events.d/00.ctdb
new file mode 100755 (executable)
index 0000000..880d07f
--- /dev/null
@@ -0,0 +1,217 @@
+#!/bin/sh
+
+# Event script for ctdb-specific setup and other things that don't fit
+# elsewhere.
+
+[ -n "$CTDB_BASE" ] || \
+    export CTDB_BASE=$(cd -P $(dirname "$0") ; dirname "$PWD")
+
+. $CTDB_BASE/functions
+loadconfig
+
+ctdb_setup_service_state_dir "ctdb"
+
+############################################################
+
+select_tdb_checker ()
+{
+    # Find the best TDB consistency check available.
+    use_tdb_tool_check=false
+    if which tdbtool >/dev/null 2>&1 && \
+       echo "help" | tdbtool | grep -q check ; then
+
+       use_tdb_tool_check=true
+    elif which tdbtool >/dev/null 2>&1 && which tdbdump >/dev/null 2>&1 ; then
+           cat <<EOF
+WARNING: The installed 'tdbtool' does not offer the 'check' subcommand.
+ Using 'tdbdump' for database checks.
+ Consider updating 'tdbtool' for better checks!
+EOF
+    elif which tdbdump >/dev/null 2>&1 ; then
+       cat <<EOF
+WARNING: 'tdbtool' is not available.
+ Using 'tdbdump' to check the databases.
+ Consider installing a recent 'tdbtool' for better checks!
+EOF
+    else
+       cat <<EOF
+WARNING: Cannot check databases since neither
+ 'tdbdump' nor 'tdbtool check' is available.
+ Consider installing tdbtool or at least tdbdump!
+EOF
+        return 1
+    fi
+}
+
+check_tdb ()
+{
+    _db="$1"
+
+    if $use_tdb_tool_check ; then
+       # tdbtool always exits with 0  :-(
+       if tdbtool "$_db" check 2>/dev/null |
+           grep -q "Database integrity is OK" ; then
+           return 0
+       else
+           return 1
+       fi
+    else
+       tdbdump "$_db" >/dev/null 2>/dev/null
+       return $?
+    fi
+}
+
+check_persistent_databases ()
+{
+    _dir="${CTDB_DBDIR_PERSISTENT:-${CTDB_DBDIR:-/var/ctdb}/persistent}"
+    mkdir -p "$_dir" 2>/dev/null
+
+    [ "${CTDB_MAX_PERSISTENT_CHECK_ERRORS:-0}" = "0" ] || return 0
+
+    for _db in $(ls "$_dir/"*.tdb.*[0-9] 2>/dev/null) ; do
+       check_tdb $_db || {
+           echo "Persistent database $_db is corrupted! CTDB will not start."
+           return 1
+       }
+    done
+}
+
+check_non_persistent_databases ()
+{
+    _dir="${CTDB_DBDIR:-/var/ctdb}"
+    mkdir -p "$_dir" 2>/dev/null
+
+    for _db in $(ls "${_dir}/"*.tdb.*[0-9] 2>/dev/null) ; do
+       check_tdb $_db || {
+           _backup="${_db}.$(date +'%Y%m%d.%H%M%S.%N').corrupt"
+           cat <<EOF
+WARNING: database ${_db} is corrupted.
+ Moving to backup ${_backup} for later analysis.
+EOF
+           mv "$_db" "$_backup"
+
+           # Now remove excess backups
+           ls -td "${_db}."*".corrupt" |
+           tail -n +$((${CTDB_MAX_CORRUPT_DB_BACKUPS:-10} + 1)) |
+           xargs rm -f
+           
+       }
+    done
+}
+
+update_config_from_tdb() {
+
+    # Pull optional ctdb configuration data out of config.tdb
+    _key="public_addresses:node#$(ctdb -t 1 xpnn|sed -e 's/.*://')"
+    _t="$service_state_dir/public_addresses"
+    rm -f "$_t"
+
+    if ctdb pfetch config.tdb "$_key" "$_t" 2>/dev/null && \
+       [ -s "$_t" -a -n "$CTDB_PUBLIC_ADDRESSES"] && \
+       ! cmp -s "$_t" "$CTDB_PUBLIC_ADDRESSES" ; then
+
+       echo "CTDB public address configuration has changed."
+       echo "Extracting new configuration from database."
+       diff "$_t" "$CTDB_PUBLIC_ADDRESSES"
+       cp "$_t" "$CTDB_PUBLIC_ADDRESSES"
+       echo "Restarting CTDB"
+       service ctdb restart &
+    fi
+}
+
+set_ctdb_variables () {
+    # set any tunables from the config file
+    set | grep ^CTDB_SET_ | cut -d_ -f3- | 
+    while read v; do
+       varname=`echo $v | cut -d= -f1`
+       value=`echo $v | cut -d= -f2`
+       ctdb setvar $varname $value || return 1
+       echo "Set $varname to $value"
+    done
+}
+
+monitor_system_memory ()
+{
+    # If monitoring free memory then calculate how much there is
+    if [ -n "$CTDB_MONITOR_FREE_MEMORY_WARN" -o \
+       -n "$CTDB_MONITOR_FREE_MEMORY" ] ; then
+       free_mem=$(free -m | awk '$2 == "buffers/cache:" { print $4 }')
+    fi
+
+    # Shutdown CTDB when memory is below the configured limit
+    if [ -n "$CTDB_MONITOR_FREE_MEMORY" ] ; then
+       if [ $free_mem -le $CTDB_MONITOR_FREE_MEMORY ] ; then
+           echo "CRITICAL: OOM - ${free_mem}MB free <= ${CTDB_MONITOR_FREE_MEMORY}MB (CTDB threshold)"
+           echo "CRITICAL: Shutting down CTDB!!!"
+           get_proc "meminfo"
+           ps auxfww
+           set_proc "sysrq-trigger" "m"
+           ctdb disable
+           sleep 3
+           ctdb shutdown
+       fi
+    fi
+
+    # Warn when low on memory
+    if [ -n "$CTDB_MONITOR_FREE_MEMORY_WARN" ] ; then
+       if [ $free_mem -le $CTDB_MONITOR_FREE_MEMORY_WARN ] ; then
+           echo "WARNING: free memory is low - ${free_mem}MB free <=  ${CTDB_MONITOR_FREE_MEMORY_WARN}MB (CTDB threshold)"
+       fi
+    fi
+
+    # We should never enter swap, so SwapTotal == SwapFree.
+    if [ "$CTDB_CHECK_SWAP_IS_NOT_USED" = "yes" ] ; then
+       set -- $(get_proc "meminfo" | awk '$1 ~ /Swap(Total|Free):/ { print $2 }')
+       if [ "$1" != "$2" ] ; then
+           echo We are swapping:
+           get_proc "meminfo"
+           ps auxfww
+       fi
+    fi
+}
+
+############################################################
+
+ctdb_check_args "$@"
+
+case "$1" in 
+     init)
+        # make sure we have a blank state directory for the scripts to work with
+       rm -rf $CTDB_VARDIR/state
+       # Look at the pattern - this should not be -rf!!!
+       rm -f $ctdb_managed_dir/*
+       mkdir -p $CTDB_VARDIR/state || {
+           ret=$?
+           echo "mkdir -p $CTDB_VARDIR/state - failed - $ret"
+           exit $ret
+       }
+
+       # make sure we drop any ips that might still be held if
+       # previous instance of ctdb got killed with -9 or similar
+       drop_all_public_ips
+
+       if select_tdb_checker ; then
+           check_persistent_databases || exit $?
+           check_non_persistent_databases
+       fi
+       ;;
+
+     setup)
+       # Set any tunables from the config file
+       set_ctdb_variables || die "Failed to set CTDB tunables"
+       ;;
+
+    startup)
+       update_config_from_tdb &
+       ;;
+    monitor)
+       monitor_system_memory
+       ;;
+
+    *)
+       ctdb_standard_event_handler "$@"
+       ;;
+esac
+
+# all OK
+exit 0
diff --git a/ctdb/config/events.d/01.reclock b/ctdb/config/events.d/01.reclock
new file mode 100755 (executable)
index 0000000..ed7afdd
--- /dev/null
@@ -0,0 +1,44 @@
+#!/bin/sh
+# script to check accessibility to the reclock file on a node
+
+[ -n "$CTDB_BASE" ] || \
+    export CTDB_BASE=$(cd -P $(dirname "$0") ; dirname "$PWD")
+
+. $CTDB_BASE/functions
+loadconfig
+
+case "$1" in
+    init)
+       ctdb_counter_init
+       ;;
+
+    monitor)
+       # Early exit if not using a reclock file
+       [ -n "$CTDB_RECOVERY_LOCK" ] || exit 0
+
+       # Try to stat the reclock file as a background process so that
+       # we don't block in case the cluster filesystem is unavailable.
+       # The path is quoted so a lock file path containing whitespace
+       # is still passed to stat as a single argument.
+       (
+           if stat "$CTDB_RECOVERY_LOCK" ; then
+               ctdb_counter_init       # stat worked: reset the counter
+           fi
+       ) >/dev/null 2>&1 &
+       # Bumped on every monitor event; only a successful stat resets it
+       ctdb_counter_incr
+       if ! ctdb_check_counter "quiet" -ge 200 ; then
+           echo "Reclock file \"$CTDB_RECOVERY_LOCK\" can not be accessed. Shutting down."
+           df
+           sleep 1
+           ctdb shutdown
+       fi
+       # NOTE(review): presumably fails the event past 3 - confirm in functions
+       ctdb_check_counter "error" -gt 3
+       ;;
+
+    *)
+       ctdb_standard_event_handler "$@"
+       ;;
+esac
+
+exit 0
diff --git a/ctdb/config/events.d/10.interface b/ctdb/config/events.d/10.interface
new file mode 100755 (executable)
index 0000000..f44c674
--- /dev/null
@@ -0,0 +1,274 @@
+#!/bin/sh
+
+#################################
+# interface event script for ctdb
+# this adds/removes IPs from your 
+# public interface
+
+[ -n "$CTDB_BASE" ] || \
+    export CTDB_BASE=$(cd -P $(dirname "$0") ; dirname "$PWD")
+
+. $CTDB_BASE/functions
+loadconfig
+
+if [ -z "$CTDB_PUBLIC_ADDRESSES" ] ; then
+       CTDB_PUBLIC_ADDRESSES=$CTDB_BASE/public_addresses
+fi
+# No public addresses file: mention it for "init" only, then leave quietly
+if [ ! -f "$CTDB_PUBLIC_ADDRESSES" ] ; then
+       [ "$1" != "init" ] || \
+               echo "No public addresses file found. Nothing to do for 10.interfaces"
+
+       exit 0
+fi
+
+mark_up ()
+{   # $1: interface name.  Returns the status of the ctdb call.
+    up_interfaces_found=true    # read by monitor_interfaces after its loop
+    ctdb setifacelink $1 up >/dev/null 2>&1
+}
+
+mark_down ()
+{   # $1: interface name.  Returns the status of the ctdb call.
+    fail=true                   # read by monitor_interfaces after its loop
+    ctdb setifacelink $1 down >/dev/null 2>&1
+}
+
+# This sets $all_interfaces (and $ctdb_ifaces) as a side-effect.
+get_all_interfaces ()
+{
+    # Get all the interfaces listed in the public_addresses file: drop the 1st field, commas become spaces, trim the end
+    all_interfaces=$(sed -e "s/^[^\t ]*[\t ]*//" -e "s/,/ /g" -e "s/[\t ]*$//" $CTDB_PUBLIC_ADDRESSES)
+
+    # Add some special interfaces if they're defined
+    [ "$CTDB_PUBLIC_INTERFACE" ] && all_interfaces="$CTDB_PUBLIC_INTERFACE $all_interfaces"
+    [ "$CTDB_NATGW_PUBLIC_IFACE" ] && all_interfaces="$CTDB_NATGW_PUBLIC_IFACE $all_interfaces"
+
+    # Get the interfaces for which CTDB has public IPs configured.
+    # That is, for all but the 1st line, get the 1st field.
+    ctdb_ifaces=$(ctdb -Y ifaces | sed -e '1d' -e 's@^:@@' -e 's@:.*@@')
+
+    # Add $ctdb_ifaces and uniquify (one interface per line, sorted)
+    all_interfaces=$(echo $all_interfaces $ctdb_ifaces | tr ' ' '\n' | sort -u)
+}
+
+monitor_interfaces()
+{   # Check every interface from get_all_interfaces; returns 0 or 1 (see the end)
+       get_all_interfaces
+
+       fail=false
+       up_interfaces_found=false
+
+       # Note that this loop must not exit early.  It must process
+       # all interfaces so that the correct state for each interface
+       # is set in CTDB using mark_up/mark_down.  If there is a
+       # problem with an interface then set fail=true and continue.
+       for iface in $all_interfaces ; do
+
+           ip link show $iface 2>/dev/null >/dev/null || {
+               echo "ERROR: Interface $iface does not exist but it is used by public addresses."
+               mark_down $iface
+               continue
+           }
+
+           # These interfaces are sometimes bond devices
+           # When we use VLANs for bond interfaces, there will only
+           # be an entry in /proc for the underlying real interface
+           realiface=`echo $iface |sed -e 's/\..*$//'`
+           bi=$(get_proc "net/bonding/$realiface" 2>/dev/null) && {
+               echo "$bi" | grep -q 'Currently Active Slave: None' && {
+                       echo "ERROR: No active slaves for bond device $realiface"
+                       mark_down $iface
+                       continue
+               }
+               echo "$bi" | grep -q '^MII Status: up' || {
+                       echo "ERROR: public network interface $realiface is down"
+                       mark_down $iface
+                       continue
+               }
+               echo "$bi" | grep -q '^Bonding Mode: IEEE 802.3ad Dynamic link aggregation' && {
+                       # This works around a bug in the driver where the overall
+                       # bond status can be up but no physical interface has a link:
+                       # skip the 1st "MII Status:" line, need an "up" among the rest.
+                       echo "$bi" | grep 'MII Status:' | tail -n +2 | grep -q '^MII Status: up' || {
+                               echo "ERROR: No active slaves for 802.ad bond device $realiface"
+                               mark_down $iface
+                               continue
+                       }
+               }
+               mark_up $iface
+               continue
+           }
+
+           case $iface in
+           lo*)
+               # loopback is always working
+               mark_up $iface
+               ;;
+           ib*)
+               # we don't know how to test ib links
+               mark_up $iface
+               ;;
+           *)
+               [ -z "$iface" ] || {
+                   [ "$(basename $(readlink /sys/class/net/$iface/device/driver) 2>/dev/null)" = virtio_net ] ||
+                   ethtool $iface | grep -q 'Link detected: yes' || {
+                       # On some systems, this is not successful when a
+                       # cable is plugged but the interface has not been
+                       # brought up previously. Bring the interface up and
+                       # try again...
+                       ip link set $iface up
+                       ethtool $iface | grep -q 'Link detected: yes' || {
+                           echo "ERROR: No link on the public network interface $iface"
+                           mark_down $iface
+                           continue
+                       }
+                   }
+                   mark_up $iface
+               }
+               ;;
+           esac
+
+       done
+       # Success if no interface was marked down
+       $fail || return 0
+       # Failures are tolerated if something is up and partial outages are allowed
+       $up_interfaces_found && \
+           [ "$CTDB_PARTIALLY_ONLINE_INTERFACES" = "yes" ] && \
+           return 0
+
+       return 1
+}
+
+ctdb_check_args "$@"
+
+case "$1" in 
+     #############################
+     # called when ctdbd starts up
+     init)
+       # make sure that we only respond to ARP messages from the NIC where
+       # a particular ip address is associated.
+       get_proc sys/net/ipv4/conf/all/arp_filter >/dev/null 2>&1 && {
+           set_proc sys/net/ipv4/conf/all/arp_filter 1
+       }
+       ;;
+
+     #############################
+     # called after ctdbd has done its initial recovery
+     # and we start the services to become healthy
+     startup)
+       monitor_interfaces      # result ignored here; "monitor" below acts on it
+       ;;
+
+
+     ################################################
+     # called when ctdbd wants to claim an IP address
+     takeip)
+       iface=$2
+       ip=$3
+       maskbits=$4
+
+       add_ip_to_iface $iface $ip $maskbits || {
+               exit 1;
+       }
+
+       # cope with the script being killed while we have the interface blocked
+       iptables -D INPUT -i $iface -d $ip -j DROP 2> /dev/null
+
+       # flush our route cache
+       set_proc sys/net/ipv4/route/flush 1
+       ;;
+
+
+     ##################################################
+     # called when ctdbd wants to release an IP address
+     releaseip)
+       # releasing an IP is a bit more complex than it seems. Once the IP
+       # is released, any open tcp connections to that IP on this host will end
+       # up being stuck. Some of them (such as NFS connections) will be unkillable
+       # so we need to use the killtcp ctdb function to kill them off. We also
+       # need to make sure that no new connections get established while we are
+       # doing this! So what we do is this:
+       # 1) firewall this IP, so no new external packets arrive for it
+       # 2) use netstat -tn to find existing connections, and kill them
+       # 3) remove the IP from the interface
+       # 4) remove the firewall rule
+       iface=$2
+       ip=$3
+       maskbits=$4
+
+       failed=0
+       # we do an extra delete to cope with the script being killed
+       iptables -D INPUT -i $iface -d $ip -j DROP 2> /dev/null
+       iptables -I INPUT -i $iface -d $ip -j DROP
+       kill_tcp_connections $ip
+
+       delete_ip_from_iface $iface $ip $maskbits || {
+               iptables -D INPUT -i $iface -d $ip -j DROP 2> /dev/null
+               exit 1;
+       }
+
+       iptables -D INPUT -i $iface -d $ip -j DROP 2> /dev/null
+
+       # flush our route cache
+       set_proc sys/net/ipv4/route/flush 1
+       ;;
+
+     ##################################################
+     # called when ctdbd wants to update an IP address
+     updateip)
+       # moving an IP is a bit more complex than it seems.
+       # First we drop all traffic on the old interface.
+       # Then we remove the ip from both interfaces before
+       # we finally add it to the new interface.
+       #
+       # 1) firewall this IP, so no new external packets arrive for it
+       # 2) remove the IP from the old (and the new) interface
+       # 3) add the IP to the new interface
+       # 4) remove the firewall rule
+       # 5) use ctdb gratiousarp to propagate the new mac address
+       # 6) tickle existing connections via tickle_tcp_connections
+       oiface=$2
+       niface=$3
+       ip=$4
+       maskbits=$5
+
+       failed=0
+       # we do an extra delete to cope with the script being killed
+       iptables -D INPUT -i $oiface -d $ip -j DROP 2> /dev/null
+       iptables -I INPUT -i $oiface -d $ip -j DROP
+
+       delete_ip_from_iface $oiface $ip $maskbits 2>/dev/null
+       delete_ip_from_iface $niface $ip $maskbits 2>/dev/null
+
+       add_ip_to_iface $niface $ip $maskbits || {
+               iptables -D INPUT -i $oiface -d $ip -j DROP 2> /dev/null
+               exit 1;
+       }
+
+       # cope with the script being killed while we have the interface blocked
+       iptables -D INPUT -i $oiface -d $ip -j DROP 2> /dev/null
+
+       # flush our route cache
+       set_proc sys/net/ipv4/route/flush 1
+
+       # propagate the new mac address
+       ctdb gratiousarp $ip $niface
+
+       # tickle all existing connections, so that dropped packets
+       # are retransmitted and the tcp streams work
+
+       tickle_tcp_connections $ip
+
+       ;;
+
+     monitor)
+       monitor_interfaces || exit 1
+       ;;
+    *)
+       ctdb_standard_event_handler "$@"
+       ;;
+esac
+
+exit 0
+
diff --git a/ctdb/config/events.d/11.natgw b/ctdb/config/events.d/11.natgw
new file mode 100755 (executable)
index 0000000..8555005
--- /dev/null
@@ -0,0 +1,118 @@
+#!/bin/sh
+# Script to set up one of the nodes as a NAT gateway for all other nodes.
+# This is used to ensure that all nodes in the cluster can still originate
+# traffic to the external network even if there are no public addresses
+# available.
+#
+
+[ -n "$CTDB_BASE" ] || \
+    export CTDB_BASE=$(cd -P $(dirname "$0") ; dirname "$PWD")
+
+. $CTDB_BASE/functions
+loadconfig
+
+[ -n "$CTDB_NATGW_NODES" ] || exit 0
+export CTDB_NATGW_NODES
+
+set_natgw_capability ()
+{
+    # Switch this node's NATGW capability off when
+    # CTDB_NATGW_SLAVE_ONLY is "yes", and on for any other value.
+    case "$CTDB_NATGW_SLAVE_ONLY" in
+       yes) ctdb setnatgwstate off ;;
+       *)   ctdb setnatgwstate on ;;
+    esac
+}
+
+delete_all() {
+       # CTDB_NATGW_PUBLIC_IP is split at the "/" into address and mask bits
+       _ip="${CTDB_NATGW_PUBLIC_IP%/*}"
+       _maskbits="${CTDB_NATGW_PUBLIC_IP#*/}"
+       if [ -n "$CTDB_NATGW_PUBLIC_IFACE" ] ; then
+           delete_ip_from_iface $CTDB_NATGW_PUBLIC_IFACE $_ip $_maskbits >/dev/null 2>&1
+       fi
+       ip route del 0.0.0.0/0 metric 10 >/dev/null 2>&1
+
+       # Undo the masquerading that is set up when this node acts as
+       # the NAT-GW (it may be left over from a previous iteration)
+       iptables -D POSTROUTING -t nat -s $CTDB_NATGW_PRIVATE_NETWORK ! -d $CTDB_NATGW_PRIVATE_NETWORK -j MASQUERADE >/dev/null 2>&1
+
+       # Drop the REJECT rule that may exist for this address
+       iptables -D INPUT -p tcp --syn -d $_ip/32 -j REJECT 2>/dev/null
+}
+
+ensure_natgwmaster ()
+{
+    _event="$1"    # recorded but not used below
+    # The first two words of the natgwlist output are kept
+    set -- $(ctdb natgwlist)
+    natgwip="$2"
+    natgwmaster="${1:--1}" # -1 stands in when natgwlist printed nothing
+
+    # die (from the functions file) is called when there is no master
+    case "$natgwmaster" in
+       -1) die "There is no NATGW master node" ;;
+    esac
+}
+
+case "$1" in
+    setup)
+       set_natgw_capability
+       ;;
+
+    startup)
+       # Error if CTDB_NATGW_PUBLIC_IP is listed in public addresses
+       grep -q "^$CTDB_NATGW_PUBLIC_IP[[:space:]]" "${CTDB_PUBLIC_ADDRESSES:-/etc/ctdb/public_addresses}" && \
+           die "ERROR: NATGW configured to use a public address. NATGW must not use a public address."
+
+       # do not send out arp requests from loopback addresses
+       echo 2 > /proc/sys/net/ipv4/conf/all/arp_announce
+       ;;
+
+    updatenatgw|ipreallocated)
+       mypnn=$(ctdb pnn | cut -d: -f2)
+
+       set_natgw_capability
+       ensure_natgwmaster "$1"
+       # Always start from a clean slate, whichever role this node had
+       delete_all
+
+       if [ "$mypnn" = "$natgwmaster" ]; then
+               # This is the NAT GW
+               echo 1 >/proc/sys/net/ipv4/ip_forward
+               iptables -A POSTROUTING -t nat -s $CTDB_NATGW_PRIVATE_NETWORK ! -d $CTDB_NATGW_PRIVATE_NETWORK -j MASQUERADE
+
+               # block all incoming connections to the natgw address
+               ctdb_natgw_public_ip_host="${CTDB_NATGW_PUBLIC_IP%/*}/32"
+               iptables -D INPUT -p tcp --syn -d $ctdb_natgw_public_ip_host -j REJECT 2>/dev/null
+               iptables -I INPUT -p tcp --syn -d $ctdb_natgw_public_ip_host -j REJECT 2>/dev/null
+
+               ip addr add $CTDB_NATGW_PUBLIC_IP dev $CTDB_NATGW_PUBLIC_IFACE
+               ip route add 0.0.0.0/0 metric 10 via $CTDB_NATGW_DEFAULT_GATEWAY >/dev/null 2>/dev/null
+       else
+               # This is NOT the NAT GW
+               ip route add 0.0.0.0/0 via $natgwip metric 10
+               # Make sure winbindd does not stay bound to this address
+               # if we are no longer natgwmaster
+               smbcontrol winbindd ip-dropped $CTDB_NATGW_PUBLIC_IP >/dev/null 2>/dev/null
+       fi
+
+       # flush our route cache
+       echo 1 > /proc/sys/net/ipv4/route/flush
+       ;;
+
+    shutdown|removenatgw)
+       delete_all
+       ;;
+
+    monitor)
+       set_natgw_capability
+       ensure_natgwmaster "$1"
+       ;;
+
+    *)
+       ctdb_standard_event_handler "$@"        # the real arguments, as in the other scripts
+       ;;
+esac
+
+exit 0
diff --git a/ctdb/config/events.d/11.routing b/ctdb/config/events.d/11.routing
new file mode 100755 (executable)
index 0000000..ccc60e7
--- /dev/null
@@ -0,0 +1,47 @@
+#!/bin/sh
+# script to add entries to the routing table after we have performed a
+# take ip event
+# (when we do a "releaseip" event and remove an ip address from an interface
+#  the kernel might automatically remove associated entries from
+#  the routing table. This is where we add them back)
+#
+# Routes to add are defined in /etc/ctdb/static-routes.
+# Syntax is :
+# IFACE NET/MASK GATEWAY
+#
+# Example
+# bond1 10.3.3.0/24 10.0.0.1
+
+[ -n "$CTDB_BASE" ] || \
+    export CTDB_BASE=$(cd -P $(dirname "$0") ; dirname "$PWD")
+
+. $CTDB_BASE/functions
+loadconfig
+
+[ -f $CTDB_BASE/static-routes ] || {
+    exit 0
+}
+
+case "$1" in
+    ipreallocated)
+       # Try to add every listed route; output and failures are discarded
+       while read iface dest gw; do
+           ip route add "$dest" via "$gw" dev "$iface" >/dev/null 2>&1
+       done <"${CTDB_BASE}/static-routes"
+       ;;
+    updateip)
+       oiface=$2
+       niface=$3
+       # Only routes on the old or the new interface are tried
+       while read iface dest gw; do
+           [ "$niface" = "$iface" ] || [ "$oiface" = "$iface" ] || continue
+           ip route add "$dest" via "$gw" dev "$iface" >/dev/null 2>&1
+       done <"${CTDB_BASE}/static-routes"
+       ;;
+
+    *)
+       ctdb_standard_event_handler "$@"
+       ;;
+esac
+
+exit 0
diff --git a/ctdb/config/events.d/13.per_ip_routing b/ctdb/config/events.d/13.per_ip_routing
new file mode 100755 (executable)
index 0000000..de153a6
--- /dev/null
@@ -0,0 +1,416 @@
+#!/bin/sh
+
+[ -n "$CTDB_BASE" ] || \
+    export CTDB_BASE=$(cd -P $(dirname "$0") ; dirname "$PWD")
+
+. $CTDB_BASE/functions
+loadconfig
+
+service_name=per_ip_routing
+table_id_prefix="ctdb."
+
+# Without a configuration there is nothing for this script to do
+if [ -z "$CTDB_PER_IP_ROUTING_CONF" ] ; then
+    exit 0
+fi
+if [ -z "$CTDB_PER_IP_ROUTING_RULE_PREF" ] ; then
+    die "error: CTDB_PER_IP_ROUTING_RULE_PREF not configured"
+fi
+# A non-numeric bound makes the test itself fail (quietly) and dies too
+if ! [ "$CTDB_PER_IP_ROUTING_TABLE_ID_LOW" -lt "$CTDB_PER_IP_ROUTING_TABLE_ID_HIGH" ] 2>/dev/null ; then
+    die "error: CTDB_PER_IP_ROUTING_TABLE_ID_LOW[$CTDB_PER_IP_ROUTING_TABLE_ID_LOW] and/or CTDB_PER_IP_ROUTING_TABLE_ID_HIGH[$CTDB_PER_IP_ROUTING_TABLE_ID_HIGH] improperly configured"
+fi
+# Succeeds when the magic "__auto_link_local__" value is configured
+have_link_local_config ()
+{
+    [ "$CTDB_PER_IP_ROUTING_CONF" = "__auto_link_local__" ]
+}
+have_link_local_config || [ -r "$CTDB_PER_IP_ROUTING_CONF" ] || \
+    die "error: CTDB_PER_IP_ROUTING_CONF=$CTDB_PER_IP_ROUTING_CONF file not found"
+
+######################################################################
+
+ipv4_is_valid_addr()
+{
+    _ip="$1"
+    _count=0
+
+    # Splitting on "." inside the command substitution hands the
+    # loop one word per octet.
+    for _o in $(export IFS="." ; echo $_ip) ; do
+       # A non-numeric "octet" makes the test itself error out.  The
+       # redirect hides that message; the result is still a failure,
+       # which rejects the address.
+       [ 0 -le $_o -a $_o -le 255 ] 2>/dev/null || return 1
+       _count=$(($_count + 1))
+    done
+
+    # Succeed only if exactly 4 in-range octets were seen
+    [ $_count -eq 4 ]
+}
+
+ensure_ipv4_is_valid_addr ()
+{
+    _event="$1"
+    _ip="$2"
+    # Anything that is not an IPv4 address ends the script, with success
+    if ! ipv4_is_valid_addr "$_ip" ; then
+       echo "$0: $_event not an ipv4 address skipping IP:$_ip"
+       exit 0
+    fi
+}
+
+ipv4_host_addr_to_net ()
+{
+    _host="$1"
+    _maskbits="$2"
+
+    # Fold the dotted quad into a single integer, most significant
+    # octet first.
+    _host_ul=0
+    for _o in $(export IFS="." ; echo $_host) ; do
+       _host_ul=$(( $_host_ul * 256 + $_o ))
+    done
+
+    # Keep only the top $_maskbits bits of the address.
+    _mask_ul=$(( 0xffffffff << (32 - $_maskbits) ))
+    _net_ul=$(( $_host_ul & $_mask_ul ))
+    # Peel 4 octets off the low end, prepending each to the result.
+    _net=""
+    for _o in 1 2 3 4 ; do
+       _net="$(($_net_ul & 255))${_net:+.}${_net}"
+       _net_ul=$(($_net_ul >> 8))
+    done
+
+    echo "${_net}/${_maskbits}"
+}
+
+######################################################################
+
+# Setup a table id to use for the given IP.  We don't need to know it,
+# it just needs to exist in /etc/iproute2/rt_tables.  Fail if no free
+# table id could be found in the configured range.
+ensure_table_id_for_ip ()
+{
+    _ip=$1
+
+    _f="$CTDB_ETCDIR/iproute2/rt_tables"
+    # This file should always exist, but...
+    if [ ! -f "$_f" ] ; then
+       mkdir -p $(dirname "$_f")
+       touch "$_f"
+    fi
+
+    # Maintain a table id for each IP address we've ever seen in
+    # rt_tables.  We use a "ctdb." prefix on the label.
+    _label="${table_id_prefix}${_ip}"
+
+    # This finds either the table id corresponding to the label or a
+    # new unused one (that is greater than all the used ones in the
+    # range).  The subshell reads $_f on stdin, which is also the
+    # descriptor (0) that flock locks.
+    (
+       # Note that die() just gets us out of the subshell...
+       flock --timeout 30 0 || \
+           die "ensure_table_id_for_ip: failed to lock file $_f"
+
+       _new=$CTDB_PER_IP_ROUTING_TABLE_ID_LOW
+       while read _t _l ; do
+           # Skip comments
+           case "$_t" in
+               \#*) continue ;;
+           esac
+           # Found existing: done
+           if [ "$_l" = "$_label" ] ; then
+               return 0
+           fi
+           # Potentially update the new table id to be used.  The
+           # redirect stops error spam for a non-numeric value.
+           if [ $_new -le $_t -a \
+               $_t -le $CTDB_PER_IP_ROUTING_TABLE_ID_HIGH ] 2>/dev/null ; then
+               _new=$(($_t + 1))
+           fi
+       done
+
+       # If the new table id is legal then append it to the file
+       # (nothing is printed); otherwise report failure.
+       if [ $_new -le $CTDB_PER_IP_ROUTING_TABLE_ID_HIGH ] ; then
+           printf "%d\t%s\n" "$_new" "$_label" >>"$_f"
+           return 0
+       else
+           return 1
+       fi
+    ) <"$_f"
+}
+
+# Clean up all the table ids that we might own.
+clean_up_table_ids ()
+{
+    _f="$CTDB_ETCDIR/iproute2/rt_tables"
+    # Even if this didn't exist on the system, adding a route will
+    # have created it.  What if we startup and immediately shutdown?
+    if [ ! -f "$_f" ] ; then
+       mkdir -p $(dirname "$_f")
+       touch "$_f"
+    fi
+    # The subshell has $_f on stdin, which is the descriptor (0) flock locks
+    (
+       # Note that die() just gets us out of the subshell...
+       flock --timeout 30 0 || \
+           die "clean_up_table_ids: failed to lock file $_f"
+
+       # Delete any items from the file that have a table id in our
+       # range or a label matching our label.  Preserve comments.
+       _tmp="${_f}.$$.ctdb"
+       awk -v min="$CTDB_PER_IP_ROUTING_TABLE_ID_LOW" \
+           -v max="$CTDB_PER_IP_ROUTING_TABLE_ID_HIGH" \
+           -v pre="$table_id_prefix" \
+           '/^#/ || \
+            !(min <= $1 && $1 <= max) && \
+            !(index($2, pre) == 1) \
+            { print $0 }' "$_f" >"$_tmp"
+       # Replace the file with the filtered copy
+       mv "$_tmp" "$_f"
+       # The lock is gone - don't do anything else here
+    ) <"$_f"
+}
+
+######################################################################
+
+# This prints the config for an IP, which is either relevant entries
+# from the config file or, if set to the magic link local value, some
+# link local routing config for the IP.
+get_config_for_ip ()
+{
+    _ip="$1"
+    # Output: "IP NET/MASKBITS" (link local) or "IP<TAB>rest of line"
+    if have_link_local_config ; then
+       # When parsing public_addresses also split on '/'.  This means
+       # that we get the maskbits as item #2 without further parsing.
+       while IFS="/$IFS" read _i _maskbits _x ; do
+           if [ "$_ip" = "$_i" ] ; then
+               printf "%s " "$_ip"; ipv4_host_addr_to_net "$_ip" "$_maskbits"    # printf: "echo -n" is not portable
+           fi
+       done <"${CTDB_PUBLIC_ADDRESSES:-/dev/null}"
+    else
+       while read _i _rest ; do
+           if [ "$_ip" = "$_i" ] ; then
+               printf "%s\t%s\n" "$_ip" "$_rest"
+           fi
+       done <"$CTDB_PER_IP_ROUTING_CONF"
+    fi
+}
+
+ip_has_configuration ()
+{
+    _ip="$1"
+    # Succeeds when get_config_for_ip prints anything at all for the IP
+    [ -n "$(get_config_for_ip $_ip)" ]
+}
+
+add_routing_for_ip ()
+{   # $1: interface, $2: IP.  Adds a policy rule plus the configured routes.
+    _iface="$1"
+    _ip="$2"
+
+    # Do nothing if no config for this IP.
+    ip_has_configuration "$_ip" || return 0
+
+    ensure_table_id_for_ip "$_ip" || \
+       die "add_routing_for_ip: out of table ids in range $CTDB_PER_IP_ROUTING_TABLE_ID_LOW - $CTDB_PER_IP_ROUTING_TABLE_ID_HIGH"
+
+    _pref="$CTDB_PER_IP_ROUTING_RULE_PREF"
+    _table_id="${table_id_prefix}${_ip}"
+    # Remove any existing rule/routes for this IP first, quietly
+    del_routing_for_ip "$_ip" >/dev/null 2>&1
+
+    ip rule add from "$_ip" pref "$_pref" table "$_table_id" || \
+       die "add_routing_for_ip: failed to add rule for $_ip"
+
+    # Add routes to table for any lines matching the IP.
+    # NOTE(review): die here presumably only leaves the pipeline's subshell - confirm
+    get_config_for_ip "$_ip" |
+    while read _i _dest _gw ; do
+       _r="$_dest ${_gw:+via} $_gw dev $_iface table $_table_id"
+       ip route add $_r || \
+           die "add_routing_for_ip: failed to add route: $_r"
+    done
+}
+
+del_routing_for_ip ()
+{   # $1: IP.  Deletes the policy rule for the IP and flushes its table.
+    _ip="$1"
+
+    _pref="$CTDB_PER_IP_ROUTING_RULE_PREF"
+    _table_id="${table_id_prefix}${_ip}"
+
+    # Do this unconditionally since we own any matching table ids.
+    # However, print a meaningful message if something goes wrong.
+    _cmd="ip rule del from $_ip pref $_pref table $_table_id"
+    _out=$($_cmd 2>&1) || \
+       cat <<EOF
+WARNING: Failed to delete policy routing rule
+  Command "$_cmd" failed:
+  $_out
+EOF
+    # This should never usually fail, so don't redirect output.
+    # However, it can fail when deleting a rogue IP, since there will
+    # be no routes for that IP.  In this case it should only fail when
+    # the rule deletion above has already failed because the table id
+    # is invalid.  Therefore, go to a little bit of trouble to indent
+    # the failure message so that it is associated with the above
+    # warning message and doesn't look too nasty.
+    ip route flush table $_table_id 2>&1 | sed -e 's@^.@  &@'
+}
+
+######################################################################
+
+flush_rules_and_routes ()
+{
+       # Fields are read positionally: preference, (skipped), address, (skipped), table
+       ip rule show |
+       while read _p _x _i _x _t ; do
+           _p="${_p%:}"        # drop the colon that trails the preference
+           # Leave alone any rule that does not carry our preference
+           [ "$CTDB_PER_IP_ROUTING_RULE_PREF" != "$_p" ] && continue
+
+           echo "Removing ip rule for public address $_i for routing table $_t"
+           ip rule del from "$_i" table "$_t" pref "$_p"
+           ip route flush table "$_t" 2>/dev/null
+       done
+}
+
+# Add any missing routes.  Some might have gone missing if, for
+# example, all IPs on the network were removed (possibly if the
+# primary was removed).  If $1 is "force" then (re-)add all the
+# routes.
+add_missing_routes ()
+{
+    ctdb ip -v -Y | {
+       read _x # skip header line
+
+       # Read the rest of the lines.  We're only interested in the
+       # "IP" and "ActiveInterface" columns.  The latter is only set
+       # for addresses local to this node, making it easy to skip
+       # non-local addresses.  For each IP local address we check if
+       # the relevant routing table is populated and populate it if
+       # not.
+       while IFS=":" read _x _ip _x _iface _x ; do
+           [ -n "$_iface" ] || continue
+           # An empty (or unlistable) table counts as "missing"
+           _table_id="${table_id_prefix}${_ip}"
+           if [ -z "$(ip route show table $_table_id 2>/dev/null)" -o \
+               "$1" = "force" ]  ; then
+               add_routing_for_ip "$_iface" "$_ip"
+           fi
+       done
+    } || exit $?
+}
+
+# Remove rules/routes for addresses that we're not hosting.  If a
+# releaseip event failed in an earlier script then we might not have
+# had a chance to remove the corresponding rules/routes.
+remove_bogus_routes ()
+{
+    # Get the IPs currently hosted by this node, each anchored with '@'.
+    _ips=$(ctdb ip -v -Y | awk -F: 'NR > 1 && $4 != "" {printf "@%s@\n", $2}')
+
+    ip rule show |
+    while read _p _x _i _x _t ; do
+       # Remove trailing colon after priority/preference.
+       _p="${_p%:}"
+       # Only remove rules that match our priority/preference.
+       [ "$CTDB_PER_IP_ROUTING_RULE_PREF" = "$_p" ] || continue
+       # Only remove rules for which we don't have an IP.  This could
+       # be done with grep, but let's do it with shell prefix removal
+       # to avoid unnecessary processes.  This falls through if
+       # "@${_i}@" isn't present in $_ips.
+       [ "$_ips" = "${_ips#*@${_i}@}" ] || continue
+
+       echo "Removing ip rule/routes for unhosted public address $_i"
+       del_routing_for_ip "$_i"
+    done
+}
+
+######################################################################
+
+service_reconfigure ()
+{   # Re-add all routes for hosted IPs, then drop those for unhosted ones
+    add_missing_routes "force"
+    remove_bogus_routes
+
+    # flush our route cache
+    set_proc sys/net/ipv4/route/flush 1
+}
+
+######################################################################
+
+ctdb_check_args "$@"
+
+ctdb_service_check_reconfigure
+
+case "$1" in
+    startup)
+       flush_rules_and_routes
+
+       # make sure that we only respond to ARP messages from the NIC
+       # where a particular ip address is associated.
+       get_proc sys/net/ipv4/conf/all/arp_filter >/dev/null 2>&1 && {
+           set_proc sys/net/ipv4/conf/all/arp_filter 1
+       }
+       ;;
+
+    shutdown)
+       flush_rules_and_routes
+       clean_up_table_ids
+       ;;
+
+    takeip)
+       iface=$2
+       ip=$3
+       maskbits=$4     # not referenced again in this arm
+
+       ensure_ipv4_is_valid_addr "$1" "$ip"    # exits 0 for non-IPv4
+       add_routing_for_ip "$iface" "$ip"
+
+       # flush our route cache
+       set_proc sys/net/ipv4/route/flush 1
+
+       ctdb gratiousarp "$ip" "$iface"
+       ;;
+
+    updateip)
+       oiface=$2
+       niface=$3
+       ip=$4
+       maskbits=$5     # not referenced again in this arm
+
+       ensure_ipv4_is_valid_addr "$1" "$ip"    # exits 0 for non-IPv4
+       add_routing_for_ip "$niface" "$ip"
+
+       # flush our route cache
+       set_proc sys/net/ipv4/route/flush 1
+
+       ctdb gratiousarp "$ip" "$niface"
+       tickle_tcp_connections "$ip"
+       ;;
+
+    releaseip)
+       iface=$2
+       ip=$3
+       maskbits=$4     # not referenced again in this arm
+
+       ensure_ipv4_is_valid_addr "$1" "$ip"    # exits 0 for non-IPv4
+       del_routing_for_ip "$ip"
+       ;;
+
+    ipreallocated)
+       add_missing_routes      # no "force": only tables found empty are filled
+       remove_bogus_routes
+       ;;
+
+    *)
+       ctdb_standard_event_handler "$@"
+       ;;
+esac
+
+exit 0
diff --git a/ctdb/config/events.d/20.multipathd b/ctdb/config/events.d/20.multipathd
new file mode 100755 (executable)
index 0000000..64748da
--- /dev/null
@@ -0,0 +1,84 @@
+#!/bin/sh
+# ctdb event script for monitoring the multipath daemon
+#
+# Configure monitoring of multipath devices by listing the device serials
+# in /etc/ctdb/multipathd :
+#   CTDB_MONITOR_MPDEVICES="device1 device2 ..."
+#
+
+[ -n "$CTDB_BASE" ] || \
+    export CTDB_BASE=$(cd -P $(dirname "$0") ; dirname "$PWD")
+
+. $CTDB_BASE/functions
+
+service_name="multipathd"
+
+loadconfig 
+
+[ -n "$CTDB_MONITOR_MPDEVICES" ] || exit 0
+
+ctdb_setup_service_state_dir
+
+multipath_fail="${service_state_dir}/fail"
+
+multipathd_check_background()
+{
+    for _device in $CTDB_MONITOR_MPDEVICES; do
+       # Empty output is taken to mean multipath does not know the device
+       _out=$(multipath -ll "$_device")
+       [ -n "$_out" ] || {
+           echo "device \"${_device}\" not known to multipathd" >"$multipath_fail"
+           exit 1
+       }
+
+       # At least 1 path has to be reported as active
+       echo "$_out" | grep 'prio=.* status=active' >/dev/null 2>&1 || {
+           echo "multipath device \"${_device}\" has no active paths" >"$multipath_fail"
+           exit 1
+       }
+    done
+    exit 0
+}
+
+multipathd_check()
+{
+    # Run the actual check in the background since the call to
+    # multipath may block.  Poll it for up to 10 iterations.
+    multipathd_check_background </dev/null >/dev/null 2>&1 &
+    _pid="$!"
+    _timeleft=10
+
+    while [ $_timeleft -gt 0 ]; do
+       _timeleft=$(($_timeleft - 1))
+
+       # see if the process still exists; once it is gone, wait
+       # collects its exit status
+       kill -0 $_pid >/dev/null 2>&1 || {
+           if wait $_pid ; then
+               return 0
+           else
+               printf "ERROR: "        # printf: "echo -n" is not portable
+               cat "$multipath_fail"
+               rm -f "$multipath_fail"
+               return 1
+           fi
+       }
+       sleep 1
+    done
+
+    echo "ERROR: callout to multipath checks hung"
+    # If hung then this probably won't work, but worth trying...
+    kill -9 $_pid >/dev/null 2>&1
+    return 1
+}
+
+case "$1" in
+    monitor)
+       multipathd_check || die "multipath monitoring failed"
+       ;;
+    # Every other event goes to the handler from the functions file
+    *)
+       ctdb_standard_event_handler "$@"
+       ;;
+esac
+
+exit 0
diff --git a/ctdb/config/events.d/31.clamd b/ctdb/config/events.d/31.clamd
new file mode 100755 (executable)
index 0000000..15751a9
--- /dev/null
@@ -0,0 +1,52 @@
+#!/bin/sh
+# event script to manage clamd in a cluster environment
+
+[ -n "$CTDB_BASE" ] || \
+    export CTDB_BASE=$(cd -P $(dirname "$0") ; dirname "$PWD")
+
+. $CTDB_BASE/functions
+
+detect_init_style
+
+case $CTDB_INIT_STYLE in
+       redhat)
+               service_name="clamd"
+               service_config="clamd"
+               ;;
+       *)
+               service_name="clamav"
+               service_config="clamav"
+               ;;
+esac
+
+service_start ()
+{
+    # Stop first (output discarded), then start
+    service "$service_name" stop >/dev/null 2>&1
+    service "$service_name" start
+}
+
+loadconfig
+
+ctdb_start_stop_service
+
+is_ctdb_managed_service || exit 0
+
+case "$1" in 
+    startup)
+       ctdb_service_start
+        ;;
+
+    shutdown)
+        ctdb_service_stop
+        ;;
+
+    monitor)
+        ctdb_check_unix_socket ${CTDB_CLAMD_SOCKET} || exit $?
+        ;;
+
+    *)
+       ctdb_standard_event_handler "$@"
+       ;;
+esac
+
+exit 0
diff --git a/ctdb/config/events.d/40.fs_use b/ctdb/config/events.d/40.fs_use
new file mode 100644 (file)
index 0000000..603b463
--- /dev/null
@@ -0,0 +1,55 @@
+#!/bin/sh
+# ctdb event script for checking local file system utilization
+
+[ -n "$CTDB_BASE" ] || \
+    export CTDB_BASE=$(cd -P $(dirname "$0") ; dirname "$PWD")
+
+. $CTDB_BASE/functions
+loadconfig
+
+case "$1" in
+    monitor)
+        # Check every entry of $CTDB_CHECK_FS_USE.
+        # Entry format is <fs_mount>:<fs_threshold>, where the
+        # threshold is a whole-number percentage.  Any failed check
+        # prints a message and exits with status 1.
+        for fs in $CTDB_CHECK_FS_USE
+        do
+            # Split both halves at the LAST colon so that a mount
+            # point which itself contains ':' is parsed consistently
+            fs_mount="${fs%:*}"
+            fs_threshold="${fs##*:}"
+
+            # check if given fs_mount is existing directory
+            if [ ! -d "$fs_mount" ]; then
+                echo "Directory $fs_mount does not exist"
+                exit 1
+            fi
+
+            # check if given fs_threshold is a number: reject the
+            # empty string and anything containing a non-digit
+            case "$fs_threshold" in
+                ''|*[!0-9]*)
+                    echo "Threshold $fs_threshold is invalid number"
+                    exit 1
+                    ;;
+            esac
+
+            # get utilization of given fs from df: extract the digits
+            # in front of the '%' sign
+            fs_usage=$(df -kP "$fs_mount" | sed -n -e 's@.*[[:space:]]\([[:digit:]]*\)%.*@\1@p')
+
+            # check that a usage figure was actually found
+            if [ -z "$fs_usage" ] ; then
+                echo "Unable to get FS utilization for $fs_mount"
+                exit 1
+            fi
+
+            # check if fs_usage is higher than or equal to fs_threshold
+            if [ "$fs_usage" -ge "$fs_threshold" ] ; then
+                echo "ERROR: Utilization of $fs_mount ($fs_usage%) is higher than threshold ($fs_threshold%)"
+                exit 1
+            fi
+        done
+
+       ;;
+
+    *)
+       ctdb_standard_event_handler "$@"
+       ;;
+esac
+
+exit 0
diff --git a/ctdb/config/events.d/40.vsftpd b/ctdb/config/events.d/40.vsftpd
new file mode 100755 (executable)
index 0000000..92a0e99
--- /dev/null
@@ -0,0 +1,68 @@
+#!/bin/sh
+# event script to manage vsftpd in a cluster environment
+
+[ -n "$CTDB_BASE" ] || \
+    export CTDB_BASE=$(cd -P $(dirname "$0") ; dirname "$PWD")
+
+. $CTDB_BASE/functions
+
+service_name="vsftpd"
+# make sure the service is stopped first
+service_start ()
+{
+    # Stop first (output discarded), then start
+    service "$service_name" stop >/dev/null 2>&1
+    service "$service_name" start
+}
+service_stop ()
+{
+    # Plain stop through the init system
+    service "$service_name" stop
+}
+
+service_reconfigure ()
+{
+    # Reconfiguration is done with a full restart
+    service "$service_name" restart
+}
+
+service_fail_limit=2
+service_tcp_ports=21
+
+loadconfig
+
+ctdb_start_stop_service
+
+is_ctdb_managed_service || exit 0
+
+ctdb_service_check_reconfigure
+
+case "$1" in 
+    startup)
+       ctdb_service_start
+       ;;
+
+    shutdown)
+       ctdb_service_stop
+       ;;
+
+    takeip|releaseip)
+       ctdb_service_set_reconfigure
+       ;;
+
+    monitor)
+       if [ -n "$service_tcp_ports" ] ; then
+           if ctdb_check_tcp_ports $service_tcp_ports ; then
+               ctdb_counter_init
+           else
+               ctdb_counter_incr
+               ctdb_check_counter
+               ctdb_check_counter "quiet" -ge 1 || \
+                   echo "WARNING: vsftpd not listening but less than $service_fail_limit consecutive failures, not unhealthy yet" 
+           fi
+       fi      
+       ;;
+
+    *)
+       ctdb_standard_event_handler "$@"
+       ;;
+esac
+
+exit 0
diff --git a/ctdb/config/events.d/41.httpd b/ctdb/config/events.d/41.httpd
new file mode 100755 (executable)
index 0000000..ac0c941
--- /dev/null
@@ -0,0 +1,86 @@
+#!/bin/sh
+# event script to manage httpd in a cluster environment
+
+[ -n "$CTDB_BASE" ] || \
+    export CTDB_BASE=$(cd -P $(dirname "$0") ; dirname "$PWD")
+
+. $CTDB_BASE/functions
+
+detect_init_style
+
+case $CTDB_INIT_STYLE in
+    redhat)
+       service_name="httpd"
+       service_config="http"
+       ;;
+    suse|debian|*)
+       service_name="apache2"
+       service_config="apache2"
+       ;;
+esac
+
+# RHEL5 sometimes use a SIGKILL to terminate httpd, which then leaks
+# semaphores.  This is a hack to clean them up.
+cleanup_httpd_semaphore_leak() {
+    killall -q -0 "$service_name" ||
+    for i in $(ipcs -s | awk '$3 == "apache" { print $2 }') ; do
+       ipcrm -s $i
+    done
+}
+
+##########
+
+service_start ()
+{
+    # Clear leaked semaphores before starting
+    cleanup_httpd_semaphore_leak
+    service "$service_name" start
+}
+service_stop ()
+{
+    # Stop through the init system, then SIGKILL anything left over;
+    # the kill is best-effort and never makes this function fail
+    service "$service_name" stop
+    killall -q -9 "$service_name" || true
+}
+
+loadconfig
+
+ctdb_start_stop_service
+
+is_ctdb_managed_service || exit 0
+
+case "$1" in
+    startup)
+       ctdb_service_start
+       ;;
+
+    shutdown)
+       ctdb_service_stop
+       ;;
+
+    monitor)
+       if ctdb_check_tcp_ports 80 >/dev/null 2>/dev/null ; then
+           ctdb_counter_init
+       else
+           ctdb_counter_incr
+
+           ctdb_check_counter warn -eq 2 || {
+               echo "HTTPD is not running. Trying to restart HTTPD."
+               service_stop
+               service_start
+               exit 0
+           }
+            ctdb_check_counter warn -ge 5 || {
+               echo "HTTPD is not running. Trying to restart HTTPD."
+               service_stop
+               service_start
+               exit 1
+           }
+       fi
+       ;;
+
+    *)
+       ctdb_standard_event_handler "$@"
+       ;;
+esac
+
+exit 0
+
diff --git a/ctdb/config/events.d/49.winbind b/ctdb/config/events.d/49.winbind
new file mode 100755 (executable)
index 0000000..dee3c90
--- /dev/null
@@ -0,0 +1,70 @@
+#!/bin/sh
+# ctdb event script for winbind
+
+[ -n "$CTDB_BASE" ] || \
+    export CTDB_BASE=$(cd -P $(dirname "$0") ; dirname "$PWD")
+
+. $CTDB_BASE/functions
+
+detect_init_style
+
+CTDB_SERVICE_WINBIND=${CTDB_SERVICE_WINBIND:-winbind}
+
+service_name="winbind"
+
+loadconfig
+
+ctdb_setup_service_state_dir
+
+service_start ()
+{
+    # Stop any running instance; output is not interesting here
+    service "$CTDB_SERVICE_WINBIND" stop >/dev/null 2>&1
+
+    # If a winbindd process is still around, wait a second and then
+    # make absolutely sure it is dead
+    if killall -0 -q winbindd ; then
+       sleep 1
+       killall -q -9 winbindd
+    fi
+
+    if ! service "$CTDB_SERVICE_WINBIND" start ; then
+       die "Failed to start winbind"
+    fi
+}
+
+service_stop ()
+{
+    # Stop the init service named by $CTDB_SERVICE_WINBIND
+    service "$CTDB_SERVICE_WINBIND" stop
+}
+
+###########################
+
+ctdb_start_stop_service
+
+is_ctdb_managed_service || exit 0
+
+###########################
+
+case "$1" in 
+     startup)
+       ctdb_service_start
+       ;;
+       
+     shutdown)
+       ctdb_service_stop
+       ;;
+
+     monitor)
+       ctdb_check_command wbinfo -p
+       ;;
+
+     takeip|releaseip)
+       iface=$2
+       ip=$3
+       maskbits=$4
+
+       smbcontrol winbindd ip-dropped $ip >/dev/null 2>/dev/null
+       ;;
+    *)
+       ctdb_standard_event_handler "$@"
+       ;;
+esac
+
+exit 0
diff --git a/ctdb/config/events.d/50.samba b/ctdb/config/events.d/50.samba
new file mode 100755 (executable)
index 0000000..4b53cba
--- /dev/null
@@ -0,0 +1,166 @@
+#!/bin/sh
+# ctdb event script for Samba
+
+[ -n "$CTDB_BASE" ] || \
+    export CTDB_BASE=$(cd -P $(dirname "$0") ; dirname "$PWD")
+
+. $CTDB_BASE/functions
+
+detect_init_style
+
+case $CTDB_INIT_STYLE in
+       suse)
+               CTDB_SERVICE_SMB=${CTDB_SERVICE_SMB:-smb}
+               CTDB_SERVICE_NMB=${CTDB_SERVICE_NMB:-nmb}
+               ;;
+       debian)
+               CTDB_SERVICE_SMB=${CTDB_SERVICE_SMB:-samba}
+               CTDB_SERVICE_NMB=${CTDB_SERVICE_NMB:-""}
+               ;;
+       *)
+               # Use redhat style as default:
+               CTDB_SERVICE_SMB=${CTDB_SERVICE_SMB:-smb}
+               CTDB_SERVICE_NMB=${CTDB_SERVICE_NMB:-""}
+               ;;
+esac
+
+service_name="samba"
+
+loadconfig
+
+ctdb_setup_service_state_dir
+
+service_start ()
+{
+    # make sure samba is not already started
+    service "$CTDB_SERVICE_SMB" stop > /dev/null 2>&1
+    [ -z "$CTDB_SERVICE_NMB" ] || \
+       service "$CTDB_SERVICE_NMB" stop > /dev/null 2>&1
+
+    # Any daemon still present after the stop gets one second of
+    # grace and is then killed with SIGKILL
+    if killall -0 -q smbd ; then
+       sleep 1
+       killall -q -9 smbd
+    fi
+    if killall -0 -q nmbd ; then
+       sleep 1
+       killall -q -9 nmbd
+    fi
+
+    net serverid wipe
+
+    # start Samba service. Start it reniced, as under very heavy load
+    # the number of smbd processes will mean that it leaves few cycles
+    # for anything else
+    if [ -n "$CTDB_SERVICE_NMB" ] ; then
+       nice_service "$CTDB_SERVICE_NMB" start || die "Failed to start nmbd"
+    fi
+
+    nice_service "$CTDB_SERVICE_SMB" start || die "Failed to start samba"
+}
+
+service_stop ()
+{
+    # Stop smb, and nmb too when a separate nmb service is configured
+    service "$CTDB_SERVICE_SMB" stop
+    [ -z "$CTDB_SERVICE_NMB" ] || service "$CTDB_SERVICE_NMB" stop
+}
+
+######################################################################
+# Show the testparm output using a cached smb.conf to avoid delays due
+# to registry access.
+
+smbconf_cache="$service_state_dir/smb.conf.cache"
+
+testparm_foreground_update ()
+{
+    # $1 - time limit handed to timeout(1) for the testparm run
+    #
+    # Regenerates $smbconf_cache from "testparm -v -s".  Returns 0
+    # when the cache was rewritten and 1 when testparm failed but an
+    # older cache file exists.  With no cache at all it calls die.
+    _timeout="$1"
+
+    _out=$(timeout $_timeout testparm -v -s 2>/dev/null) || {
+       if [ ! -f "$smbconf_cache" ] ; then
+           die "ERROR: smb.conf cache create failed"
+       else
+           echo "WARNING: smb.conf cache update failed - using old cache file"
+           return 1
+       fi
+    }
+
+    # Options that are filtered out of the cached copy
+    pat='^[[:space:]]+(registry[[:space:]]+shares|include|copy|winbind[[:space:]]+separator)[[:space:]]+='
+
+    # Write to a per-process temporary name, then rename it over the
+    # cache so the replacement is atomic
+    _tmpfile="${smbconf_cache}.$$"
+    echo "$_out" | grep -Ev "$pat" >"$_tmpfile"
+    mv "$_tmpfile" "$smbconf_cache"
+
+    return 0
+}
+
+testparm_background_update ()
+{
+    # $1 - time limit passed on to testparm_foreground_update
+    _timeout="$1"
+
+    # Same update as the foreground variant, but run as a background
+    # job with stdin, stdout and stderr all detached
+    testparm_foreground_update $_timeout >/dev/null 2>&1 </dev/null &
+}
+
+testparm_cat ()
+{
+    # Run testparm against the cached smb.conf, passing any extra
+    # arguments through; stderr is discarded
+    testparm -s "$smbconf_cache" "$@" 2>/dev/null
+}
+
+list_samba_shares ()
+{
+    # Print the value of every "path = ..." line from the cached
+    # config, one per line, with all double quotes removed
+    testparm_cat |
+    sed -n \
+       -e '/^[[:space:]]*path[[:space:]]*=[[:space:]]/!d' \
+       -e 's@^[[:space:]]*path[[:space:]]*=[[:space:]]@@' \
+       -e 's/"//g' \
+       -e 'p'
+}
+
+list_samba_ports ()
+{
+    # Print the "smb ports" parameter from the cached config with
+    # commas turned into spaces
+    testparm_cat --parameter-name="smb ports" |
+    sed -e 's@,@ @g'
+}
+
+###########################
+
+ctdb_start_stop_service
+
+is_ctdb_managed_service || exit 0
+
+###########################
+
+case "$1" in
+     startup)
+       ctdb_service_start
+       ;;
+
+     shutdown)
+       ctdb_service_stop
+       ;;
+
+     monitor)
+       testparm_foreground_update 10
+       ret=$?
+
+       smb_ports="$CTDB_SAMBA_CHECK_PORTS"
+       if [ -z "$smb_ports" ] ; then
+           smb_ports=$(list_samba_ports)
+           [ -n "$smb_ports" ] || die "Failed to set smb ports"
+       fi
+       ctdb_check_tcp_ports $smb_ports || exit $?
+
+       if [ "$CTDB_SAMBA_SKIP_SHARE_CHECK" != "yes" ] ; then
+           list_samba_shares | ctdb_check_directories || exit $?
+       fi
+
+       if [ $ret -ne 0 ] ; then
+           testparm_background_update 10
+       fi
+       ;;
+
+    *)
+       ctdb_standard_event_handler "$@"
+       ;;
+esac
+
+exit 0
diff --git a/ctdb/config/events.d/60.ganesha b/ctdb/config/events.d/60.ganesha
new file mode 100755 (executable)
index 0000000..744c5ce
--- /dev/null
@@ -0,0 +1,229 @@
+#!/bin/sh
+# script to manage nfs in a clustered environment
+
+[ -n "$CTDB_BASE" ] || \
+    export CTDB_BASE=$(cd -P $(dirname "$0") ; dirname "$PWD")
+
+. $CTDB_BASE/functions
+
+GANRECDIR="/var/lib/nfs/ganesha"
+GANRECDIR2="/var/lib/nfs/ganesha/recevents"
+GPFS_STATE="/usr/lpp/mmfs/bin/mmgetstate"
+GANRECDIR3="/var/lib/nfs/ganesha_local"
+
+
+service_start ()
+{
+    # Always stop before starting
+    startstop_ganesha stop
+    startstop_ganesha start
+    # Set sys/net/ipv4/tcp_tw_recycle to 1 through the set_proc helper
+    set_proc "sys/net/ipv4/tcp_tw_recycle" 1
+}
+
+service_stop ()
+{
+    # Delegate to the startstop_ganesha helper
+    startstop_ganesha stop
+}
+
+service_reconfigure ()
+{
+    # if the ips have been reallocated, we must restart ganesha
+    # across all nodes and ping all statd listeners
+
+    # Nothing to do (non-zero status) when statd-callout is not an
+    # executable in $CTDB_BASE
+    [ -x "$CTDB_BASE/statd-callout" ] || return $?
+
+    # Fire the notification in the background with output discarded
+    "$CTDB_BASE/statd-callout" notify >/dev/null 2>&1 &
+}
+
+loadconfig "nfs"
+
+
+[ -n "$CTDB_CLUSTER_FILESYSTEM_TYPE" ] || CTDB_CLUSTER_FILESYSTEM_TYPE="gpfs"
+
+service_name="nfs-ganesha-$CTDB_CLUSTER_FILESYSTEM_TYPE"
+
+[ "${CTDB_NFS_SERVER_MODE:-${NFS_SERVER_MODE}}" = "ganesha" ] || exit 0
+
+ctdb_setup_service_state_dir
+
+ctdb_start_stop_service
+
+is_ctdb_managed_service || exit 0
+
+ctdb_service_check_reconfigure
+
+get_cluster_fs_state  ()
+{
+    # Print the cluster filesystem state.  Only gpfs is handled; any
+    # other filesystem type ends in die.
+    if [ "$CTDB_CLUSTER_FILESYSTEM_TYPE" = "gpfs" ] ; then
+        # Skip the first three lines of $GPFS_STATE output and take
+        # the third column of what remains
+        STATE=$($GPFS_STATE | awk 'NR > 3 {printf "%-6s", $3}')
+        echo $STATE
+    else
+        die "File system $CTDB_CLUSTER_FILESYSTEM_TYPE not supported"
+    fi
+}
+
+create_ganesha_recdirs ()
+{
+    # Give up (exit 1) while no filesystem of the cluster type is
+    # listed by mount
+    [ -n "$(mount -t $CTDB_CLUSTER_FILESYSTEM_TYPE)" ] || {
+      echo "startup $CTDB_CLUSTER_FILESYSTEM_TYPE not ready"
+      exit 1
+    }
+
+    # Use the first mount point, in sorted order, of that type
+    MNTPT=$(mount -t $CTDB_CLUSTER_FILESYSTEM_TYPE | sort | awk '{print $3}' | head -n 1)
+    mkdir -p $MNTPT/.ganesha
+
+    # $GANRECDIR must be a symlink to $MNTPT/.ganesha: create it when
+    # missing, replace it when it exists but is not a symlink, and
+    # leave an existing symlink alone
+    if [ ! -e $GANRECDIR ] ; then
+        ln -sf $MNTPT/.ganesha  $GANRECDIR || echo "ln failed"
+    elif [ ! -L $GANRECDIR ] ; then
+        rm -rf $GANRECDIR
+        ln -s $MNTPT/.ganesha  $GANRECDIR || echo "ln failed"
+    fi
+
+    mkdir -p $GANRECDIR2
+    mkdir -p $GANRECDIR3
+}
+
+monitor_ganesha_nfsd ()
+{
+       create_ganesha_recdirs
+       service_name=${service_name}_process
+
+       PIDFILE="/var/run/ganesha.pid"
+       CUR_STATE=`get_cluster_fs_state`
+       GANESHA="/usr/bin/$CTDB_CLUSTER_FILESYSTEM_TYPE.ganesha.nfsd"
+       if { read PID < $PIDFILE && \
+           grep "$GANESHA" "/proc/$PID/cmdline" ; } >/dev/null 2>&1 ; then
+               ctdb_counter_init "$service_name"
+       else
+           if [ $CUR_STATE = "active" ]; then
+               echo "Trying fast restart of NFS service"
+               startstop_ganesha restart
+               ctdb_counter_incr "$service_name"
+               ctdb_check_counter "error" "-ge" "6" "$service_name"
+           fi
+       fi
+
+       service_name="nfs-ganesha-$CTDB_CLUSTER_FILESYSTEM_TYPE"_service
+       # check that NFS is posting forward progress
+       if [ $CUR_STATE = "active" -a "$CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK" != "yes" ] ; then
+           MAXREDS=2
+           MAXSTALL=120
+           RESTART=0
+
+           NUMREDS=`ls $GANRECDIR3 | grep "red" | wc -l`
+           LASTONE=`ls -t $GANRECDIR3 | sed 's/_/ /' | awk 'NR > 1 {next} {printf $1} '`
+           # Beware of startup
+           if [ -z $LASTONE ] ; then
+               LASTONE=`date +"%s"`
+           fi
+           TNOW=$(date +"%s")
+           TSTALL=$(($TNOW - $LASTONE))
+           if [ $NUMREDS -ge $MAXREDS ] ; then
+               echo restarting because of $NUMREDS red conditions
+               RESTART=1
+               ctdb_counter_incr "$service_name"
+               ctdb_check_counter "error" "-ge" "6" "$service_name"
+           fi
+           if [ $TSTALL -ge $MAXSTALL ] ; then
+               echo restarting because of $TSTALL second stall
+               RESTART=1
+               ctdb_counter_incr "$service_name"
+               ctdb_check_counter "error" "-ge" "6" "$service_name"
+           fi
+           if [ $RESTART -gt 0 ] ; then
+               startstop_ganesha restart
+           else
+               ctdb_counter_init "$service_name"
+           fi
+       fi
+}
+
+############################################################
+
+case "$1" in
+     init)
+       # read statd from persistent database
+       ;;
+     startup)
+       create_ganesha_recdirs
+       ctdb_service_start
+       ;;
+
+     shutdown)
+       ctdb_service_stop
+       ;;
+
+     takeip)
+       if [ -n "$2" ] ; then
+           case  $CTDB_CLUSTER_FILESYSTEM_TYPE in
+               gpfs)
+                   NNUM=`/usr/lpp/mmfs/bin/mmlsconfig myNodeConfigNumber | awk '{print $2}'`
+                   TDATE=`date +"%s"`
+                   TOUCHTGT=$1"_"$TDATE"_"$NNUM"_"$3"_"$4"_"$2
+                   touch $GANRECDIR2/$TOUCHTGT
+                   ;;
+           esac
+       fi
+       ctdb_service_set_reconfigure
+       ;;
+
+     releaseip)
+       if [ -n "$2" ] ; then
+           case  $CTDB_CLUSTER_FILESYSTEM_TYPE in
+               gpfs)
+                   NNUM=`/usr/lpp/mmfs/bin/mmlsconfig myNodeConfigNumber | awk '{print $2}'`
+                   TDATE=`date +"%s"`
+                   TOUCHTGT=$1"_"$TDATE"_"$NNUM"_"$3"_"$4"_"$2
+                   touch $GANRECDIR2/$TOUCHTGT
+               ;;
+           esac
+       fi
+       ctdb_service_set_reconfigure
+       ;;
+
+     monitor)
+       update_tickles 2049
+
+       # check that statd responds to rpc requests
+       # if statd is not running we try to restart it
+       # we only do this IF we have a rpc.statd command.
+       # For platforms where rpc.statd does not exist, we skip
+        # the check completely
+       p="rpc.statd"
+       which $p >/dev/null 2>/dev/null && \
+           nfs_check_rpc_service "statd" \
+               -ge 6 "verbose unhealthy" \
+               -eq 4 "verbose restart" \
+               -eq 2 "restart:b"
+
+       if [ "$CTDB_SKIP_GANESHA_NFSD_CHECK" != "yes" ] ; then
+           monitor_ganesha_nfsd
+       fi
+
+       # rquotad is sometimes not started correctly on RHEL5
+       # not a critical service so we dont flag the node as unhealthy
+       nfs_check_rpc_service "rquotad" \
+           -gt 0 "verbose restart:b"
+
+       # Check that directories for shares actually exist.
+       [ "$CTDB_NFS_SKIP_SHARE_CHECK" = "yes" ] || {
+           grep Path /etc/ganesha/$CTDB_CLUSTER_FILESYSTEM_TYPE.ganesha.exports.conf |
+           cut -f2 -d\" | ctdb_check_directories
+       } || exit $?
+
+       # once every 60 seconds, update the statd state database for which
+       # clients need notifications
+       nfs_statd_update 60
+       ;;
+
+     *)
+       ctdb_standard_event_handler "$@"
+        ;;
+esac
+
+exit 0
diff --git a/ctdb/config/events.d/60.nfs b/ctdb/config/events.d/60.nfs
new file mode 100755 (executable)
index 0000000..bd6cc7f
--- /dev/null
@@ -0,0 +1,109 @@
+#!/bin/sh
+# script to manage nfs in a clustered environment
+
+[ -n "$CTDB_BASE" ] || \
+    export CTDB_BASE=$(cd -P $(dirname "$0") ; dirname "$PWD")
+
+. $CTDB_BASE/functions
+
+service_name="nfs"
+service_start ()
+{
+    # Always stop before starting
+    startstop_nfs stop
+    startstop_nfs start
+    # Set sys/net/ipv4/tcp_tw_recycle to 1 through the set_proc helper
+    set_proc "sys/net/ipv4/tcp_tw_recycle" 1
+}
+service_stop ()
+{
+    # Delegate to the startstop_nfs helper
+    startstop_nfs stop
+}
+service_reconfigure ()
+{
+    # if the ips have been reallocated, we must restart the lockmanager
+    # across all nodes and ping all statd listeners
+
+    # Nothing to do (non-zero status) when statd-callout is not an
+    # executable in $CTDB_BASE
+    [ -x "$CTDB_BASE/statd-callout" ] || return $?
+
+    # Fire the notification in the background with output discarded
+    "$CTDB_BASE/statd-callout" notify >/dev/null 2>&1 &
+}
+
+nfs_check_thread_count ()
+{
+    # Opt-in feature: do nothing unless explicitly switched on
+    if [ "$CTDB_MONITOR_NFS_THREAD_COUNT" != "yes" ] ; then
+       return 0
+    fi
+
+    # The wanted thread count comes from $RPCNFSDCOUNT or, failing
+    # that, $USE_KERNEL_NFSD_NUMBER.  When neither is set, no default
+    # is guessed: anyone relying on the default presumably does not
+    # care about the thread count and enabled this check by mistake.
+    _configured_threads="${RPCNFSDCOUNT:-${USE_KERNEL_NFSD_NUMBER}}"
+    if [ -z "$_configured_threads" ] ; then
+       return 0
+    fi
+
+    _running_threads=$(get_proc "fs/nfsd/threads")
+
+    # String comparison on purpose: it avoids extra errors when
+    # get_proc() fails and yields something non-numeric
+    [ "$_running_threads" = "$_configured_threads" ] && return 0
+
+    # Mismatch: tell nfsd the configured number
+    echo "Attempting to correct number of nfsd threads from ${_running_threads} to ${_configured_threads}"
+    set_proc "fs/nfsd/threads" "$_configured_threads"
+}
+
+loadconfig
+
+[ "${CTDB_NFS_SERVER_MODE:-${NFS_SERVER_MODE}}" != "ganesha" ] || exit 0
+
+ctdb_setup_service_state_dir
+
+ctdb_start_stop_service
+
+is_ctdb_managed_service || exit 0
+
+ctdb_service_check_reconfigure
+
+case "$1" in 
+     init)
+       # read statd from persistent database
+       ;;
+     startup)
+       ctdb_service_start
+       ;;
+
+     shutdown)
+       ctdb_service_stop
+       ;;
+
+     takeip)
+       ctdb_service_set_reconfigure
+       ;;
+
+     releaseip)
+       ctdb_service_set_reconfigure
+       ;;
+
+      monitor)
+       # Check that directories for shares actually exist.
+       [ "$CTDB_NFS_SKIP_SHARE_CHECK" = "yes" ] || {
+           exportfs -v | grep '^/' | 
+           sed -r -e 's@[[:space:]]+[^[:space:]()]+\([^[:space:]()]+\)$@@' | 
+           sort -u | 
+           ctdb_check_directories 
+       } || exit $?
+
+       update_tickles 2049
+
+       nfs_check_rpc_services
+
+       nfs_check_thread_count
+
+       # Every 10 minutes, update the statd state database for which
+       # clients need notifications
+       nfs_statd_update 600
+               ;;
+
+    *)
+       ctdb_standard_event_handler "$@"
+       ;;
+esac
+
+exit 0
diff --git a/ctdb/config/events.d/62.cnfs b/ctdb/config/events.d/62.cnfs
new file mode 100755 (executable)
index 0000000..da02acc
--- /dev/null
@@ -0,0 +1,78 @@
+#!/bin/sh
+# event script to integrate with gpfs cnfs
+
+[ -n "$CTDB_BASE" ] || \
+    export CTDB_BASE=$(cd -P $(dirname "$0") ; dirname "$PWD")
+
+. $CTDB_BASE/functions
+
+loadconfig
+
+ctdb_setup_service_state_dir "gpfs"
+
+check_if_healthy() {
+        mkdir -p "$service_state_dir/fs"
+
+        [ -f "$service_state_dir/gpfsnoquorum" ] && {
+                logger No GPFS quorum. Node is UNHEALTHY
+                $CTDB_BASE/events.d/62.cnfs unhealthy "No GPFS quorum. Nodfe is UNHEALTHY."
+               exit 0
+       }
+
+        logger All required GPFS resources are available. CNFS part is healthy.
+        $CTDB_BASE/events.d/62.cnfs healthy
+}
+
+case "$1" in
+    startup)
+        check_if_healthy
+        ;;
+
+
+    gpfsquorumreached)
+        rm -f "$service_state_dir/gpfsnoquorum"
+        logger "GPFS quorum has been reached."
+        check_if_healthy
+        ;;
+
+    gpfsquorumloss)
+        touch "$service_state_dir/gpfsnoquorum"
+        logger "GPFS quorum has been lost."
+        $CTDB_BASE/events.d/62.cnfs unhealthy "GPFS quorum was lost! Marking node as UNHEALTHY."
+        ;;
+
+    unhealthy)
+        # Mark the node as UNHEALTHY which means all public addresses
+        # will be migrated off the node.
+        shift
+        echo "$*" | ctdb_setstatus unhealthy -
+
+        # force a monitor event so we pick up immediately that this script
+        # will now fail and make the node unhealthy.
+        ctdb eventscript monitor
+
+        # Wait until we no longer serve any ip addresses at all
+        PNN=`ctdb pnn | cut -d: -f2`
+        while `ctdb -Y ip | cut -d: -f3 | egrep "^$PNN$" >/dev/null`; do
+                sleep 1
+        done
+        ;;
+
+    healthy)
+        # mark the node as healthy
+        ctdb_setstatus healthy
+        ;;
+
+
+    monitor)
+        ctdb_checkstatus
+        exit $?
+        ;;
+
+    *)
+        ctdb_standard_event_handler "$@"
+        ;;
+esac
+
+exit 0
+
diff --git a/ctdb/config/events.d/70.iscsi b/ctdb/config/events.d/70.iscsi
new file mode 100755 (executable)
index 0000000..cedaf40
--- /dev/null
@@ -0,0 +1,69 @@
+#!/bin/sh
+# ctdb event script for TGTD based iSCSI
+
+[ -n "$CTDB_BASE" ] || \
+    export CTDB_BASE=$(cd -P $(dirname "$0") ; dirname "$PWD")
+
+. $CTDB_BASE/functions
+
+service_name="iscsi"
+
+loadconfig
+
+ctdb_start_stop_service
+
+is_ctdb_managed_service || exit 0
+
+[ -z "$CTDB_START_ISCSI_SCRIPTS" ] && {
+       echo "No iscsi start script directory found"
+       exit 0
+}
+
+case "$1" in 
+    ipreallocated)
+       # Restart tgtd behind a temporary firewall rule and re-run the
+       # per-address start scripts for the addresses this node holds.
+
+       # block the iscsi port
+       iptables -I INPUT 1 -p tcp --dport 3260 -j DROP
+       
+       # shut down the iscsi service
+       killall -9 tgtd >/dev/null 2>/dev/null
+
+       this_node=$(ctdb xpnn | sed -e 's@PNN:@@')
+       if [ -z "$this_node" ] ; then
+               echo "Failed to get node pnn"
+               # NOTE(review): this exit happens after the DROP rule
+               # was inserted and tgtd was killed, but before the
+               # rule-removal loop below - confirm that leaving the
+               # port blocked here is intended.
+               exit 0
+       fi
+
+       # start the iscsi daemon
+       tgtd >/dev/null 2>/dev/null
+
+       # Second field of "ctdb -Y ip" for every line whose third
+       # field equals this node's pnn
+       ips=$(ctdb -Y ip | awk -F: -v pnn=$this_node '$3 == pnn {print $2}')
+       for ip in $ips ; do
+           # Only addresses with an executable <ip>.sh script in
+           # $CTDB_START_ISCSI_SCRIPTS are started
+           script="${CTDB_START_ISCSI_SCRIPTS}/${ip}.sh"
+           if [ -x "$script" ] ; then
+               echo "Starting iscsi service for public address ${ip}"
+               "$script"
+           fi
+       done
+
+       # remove all iptables rules
+       # (keep deleting until iptables reports there is none left)
+       while iptables -D INPUT -p tcp --dport 3260 -j DROP >/dev/null 2>&1 ; do
+           :
+       done
+
+       ;;
+
+    shutdown)
+       # shutdown iscsi when ctdb goes down
+       killall -9 tgtd >/dev/null 2>/dev/null
+       ;;
+
+    monitor)
+       # Fail the monitor event with the helper's status when the
+       # port check fails
+       ctdb_check_tcp_ports 3260 || exit $?
+       ;;
+
+    *)
+       ctdb_standard_event_handler "$@"
+       ;;
+esac
+
+exit 0
diff --git a/ctdb/config/events.d/91.lvs b/ctdb/config/events.d/91.lvs
new file mode 100755 (executable)
index 0000000..bdbcfa3
--- /dev/null
@@ -0,0 +1,89 @@
+#!/bin/sh
+# script to manage the lvs ip multiplexer for a single public address cluster
+
+[ -n "$CTDB_BASE" ] || \
+    export CTDB_BASE=$(cd -P $(dirname "$0") ; dirname "$PWD")
+
+. $CTDB_BASE/functions
+
+loadconfig ctdb
+
+[ -z "$CTDB_LVS_PUBLIC_IP" ] && exit 0
+[ -z "$CTDB_PUBLIC_INTERFACE" ] && exit 0
+
+[ -x /sbin/ipvsadm ] || {
+    echo "LVS configured but /sbin/ipvsadm is not installed."
+    exit 0
+}
+
+case "$1" in 
+     startup)
+       ipvsadm -D -t $CTDB_LVS_PUBLIC_IP:0
+       ipvsadm -D -u $CTDB_LVS_PUBLIC_IP:0
+
+       ip addr add $CTDB_LVS_PUBLIC_IP/32 dev lo scope host >/dev/null 2>/dev/null
+
+       # do not respond to ARPs that are for ip addresses with scope 'host'
+       echo 3 > /proc/sys/net/ipv4/conf/all/arp_ignore
+       # do not send out arp requests from loopback addresses
+       echo 2 > /proc/sys/net/ipv4/conf/all/arp_announce
+       ;;
+
+     shutdown)
+       ipvsadm -D -t $CTDB_LVS_PUBLIC_IP:0
+       ipvsadm -D -u $CTDB_LVS_PUBLIC_IP:0
+
+       # remove the ip
+       ip addr del $CTDB_LVS_PUBLIC_IP/32 dev lo >/dev/null 2>/dev/null
+
+       # flush our route cache
+       echo 1 > /proc/sys/net/ipv4/route/flush
+       ;;
+
+     recovered|ipreallocated)
+       # kill off any tcp connections
+       ipvsadm -D -t $CTDB_LVS_PUBLIC_IP:0
+       ipvsadm -D -u $CTDB_LVS_PUBLIC_IP:0
+       kill_tcp_connections_local_only $CTDB_LVS_PUBLIC_IP
+
+       PNN=`ctdb pnn | sed -e "s/.*PNN://"`
+       LVSMASTER=`ctdb lvsmaster | sed -e "s/.*Node //" -e "s/ .*//"`
+
+       [ "$PNN" != "$LVSMASTER" ] && {
+           # we are not the lvs master so we have to
+           # change the ip address to have scope host so we wont respond
+           # to arps
+           ip addr del $CTDB_LVS_PUBLIC_IP/32 dev lo >/dev/null 2>/dev/null
+           ip addr add $CTDB_LVS_PUBLIC_IP/32 dev lo scope host >/dev/null 2>/dev/null
+           exit 0
+       }
+
+       # change the scope so we start responding to arps
+       ip addr del $CTDB_LVS_PUBLIC_IP/32 dev lo >/dev/null 2>/dev/null
+       ip addr add $CTDB_LVS_PUBLIC_IP/32 dev lo >/dev/null 2>/dev/null
+
+       ipvsadm -A -t $CTDB_LVS_PUBLIC_IP:0 -p 1999999 -s lc
+       ipvsadm -A -u $CTDB_LVS_PUBLIC_IP:0 -p 1999999 -s lc
+
+       # add all nodes (except ourselves) to the lvs config
+       ctdb lvs | egrep -v "^$PNN:" | sed -e "s/.*://" | while read IP; do
+               ipvsadm -a -t $CTDB_LVS_PUBLIC_IP:0 -r $IP -g
+               ipvsadm -a -u $CTDB_LVS_PUBLIC_IP:0 -r $IP -g
+       done
+       # and add the localhost too
+       ipvsadm -a -t $CTDB_LVS_PUBLIC_IP:0 -r 127.0.0.1
+       ipvsadm -a -u $CTDB_LVS_PUBLIC_IP:0 -r 127.0.0.1
+
+       # send out a gratuitous arp so our peers will update their arp tables
+       ctdb gratiousarp $CTDB_LVS_PUBLIC_IP $CTDB_PUBLIC_INTERFACE >/dev/null 2>/dev/null
+
+       # flush our route cache
+       echo 1 > /proc/sys/net/ipv4/route/flush
+       ;;
+
+    *)
+       ctdb_standard_event_handler "$@"
+       ;;
+esac
+
+exit 0
diff --git a/ctdb/config/events.d/99.timeout b/ctdb/config/events.d/99.timeout
new file mode 100755 (executable)
index 0000000..2a6495a
--- /dev/null
@@ -0,0 +1,29 @@
+#!/bin/sh
+#
+# Event script to just sleep longer than the timeout
+# in the monitor action. The purpose is to trigger
+# the event timeout mechanism.
+
+[ -n "$CTDB_BASE" ] || \
+    export CTDB_BASE=$(cd -P $(dirname "$0") ; dirname "$PWD")
+
+. $CTDB_BASE/functions
+loadconfig ctdb
+
+[ "$CTDB_RUN_TIMEOUT_MONITOR" = "yes" ] || exit 0
+
+case "$1" in
+    monitor)
+       TIMEOUT=$(ctdb getvar EventScriptTimeout | awk '{print $3}')
+       echo "sleeping for $((TIMEOUT * 2)) seconds..."
+       sleep $((TIMEOUT * 2))
+       ;;
+
+
+    *)
+       ctdb_standard_event_handler "$@"
+       ;;
+esac
+
+exit 0
+
diff --git a/ctdb/config/events.d/README b/ctdb/config/events.d/README
new file mode 100644 (file)
index 0000000..ea9048f
--- /dev/null
@@ -0,0 +1,170 @@
+This directory is where you should put any local or application
+specific event scripts for ctdb to call.
+
+All event scripts start with the prefix 'NN.' where N is a digit.
+The event scripts are run in sequence based on NN.
+Thus 10.interfaces will be run before 60.nfs.
+
+Each NN must be unique and duplicates will cause undefined behaviour.
+I.e. having both 10.interfaces and 10.otherstuff is not allowed.
+
+
+As a special case, any eventscript that ends with a '~' character will be 
+ignored since this is a common postfix that some editors will append to 
+older versions of a file.
+
+Only event scripts with executable permissions are run from CTDB. Any event
+script that does not have executable permission is ignored.
+
+The eventscripts are called with varying number of arguments.
+The first argument is the "event" and the rest of the arguments depend
+on which event was triggered.
+
+All of the events except the 'shutdown' and 'startrecovery' events will be
+called with the ctdb daemon in NORMAL mode (i.e. not in recovery).
+
+The events currently implemented are
+init
+       This event does not take any additional arguments.
+       This event is only invoked once, when ctdb is starting up.
+       This event is used to do some cleanup work from earlier runs
+       and prepare the basic setup.
+       At this stage 'ctdb' commands won't work.
+
+       Example: 00.ctdb cleans up $CTDB_VARDIR/state
+
+setup
+       This event does not take any additional arguments.
+       This event is only invoked once, after init event is completed.
+       This event is used to set up any tunables defined in the ctdb
+       configuration file.
+
+startup
+       This event does not take any additional arguments.
+       This event is only invoked once, when ctdb has finished
+       the initial recoveries. This event is used to wait for
+       the service to start and all resources for the service
+       becoming available.
+
+       This is used to prevent ctdb from starting up and advertising its
+       services until all dependent services have become available.
+
+       All services that are managed by ctdb should implement this
+       event and use it to start the service.
+
+       Example: 50.samba uses this event to start the samba daemon
+       and then wait until samba and all its associated services have
+       become available. It then also proceeds to wait until all
+       shares have become available.
+
+shutdown
+       This event is called when the ctdb service is shutting down.
+       
+       All services that are managed by ctdb should implement this event
+       and use it to perform a controlled shutdown of the service.
+
+       Example: 60.nfs uses this event to shut down nfs and all associated
+       services and stop exporting any shares when this event is invoked.
+
+monitor
+       This event is invoked every X number of seconds.
+       The interval can be configured using the MonitorInterval tunable
+       but defaults to 15 seconds.
+
+       This event is triggered by ctdb to continuously monitor that all
+       managed services are healthy.
+       When invoked, the event script will check that the service is healthy
+       and return 0 if so. If the service is not healthy the event script
+       should return non zero.
+
+       If a service returns nonzero from this script this will cause ctdb
+       to consider the node status as UNHEALTHY and will cause the public
+       address and all associated services to be failed over to a different
+       node in the cluster.
+
+       All managed services should implement this event.
+
+       Example: 10.interfaces which checks that the public interface (if used)
+       is healthy, i.e. it has a physical link established.
+
+takeip
+       This event is triggered every time the node takes over a public ip
+       address during recovery.
+       This event takes three additional arguments :
+       'interface' 'ipaddress' and 'netmask'
+
+       Before this event there will always be a 'startrecovery' event.
+
+       This event will always be followed by a 'recovered' event once
+       all ipaddresses have been reassigned to new nodes and the ctdb database
+       has been recovered.
+       If multiple ip addresses are reassigned during recovery it is
+       possible to get several 'takeip' events followed by a single 
+       'recovered' event.
+
+       Since substantial work might be involved for the service when an ip
+       address is taken over and since multiple ip addresses might be taken 
+       over in a single recovery it is often best to only mark which addresses
+       are being taken over in this event and defer the actual work to 
+       reconfigure or restart the services until the 'recovered' event.
+
+       Example: 60.nfs which just records which ip addresses are being taken
+       over into a local state directory and which defers the actual
+       restart of the services until the 'recovered' event.
+
+
+releaseip
+       This event is triggered every time the node releases a public ip
+       address during recovery.
+       This event takes three additional arguments :
+       'interface' 'ipaddress' and 'netmask'
+
+       In all other regards this event is analogous to the 'takeip' event above.
+
+       Example: 60.nfs
+
+updateip
+       This event is triggered every time the node moves a public ip
+       address between interfaces.
+       This event takes four additional arguments :
+       'old-interface' 'new-interface' 'ipaddress' and 'netmask'
+
+       Example: 10.interfaces
+
+startrecovery
+       This event is triggered every time we start a recovery process
+       or before we start changing ip address allocations.
+
+recovered
+       This event is triggered every time we have finished a full recovery
+       and also after we have finished reallocating the public ip addresses
+       across the cluster.
+
+       Example: 60.nfs which if the ip address configuration has changed
+       during the recovery (i.e. if addresses have been taken over or
+       released) will kill off any tcp connections that exist for that
+       service and also send out statd notifications to all registered 
+       clients.
+       
+ipreallocated
+
+       This event is triggered after releaseip and takeip events in a
+       takeover run.  It can be used to reconfigure services, update
+       routing and many other things.
+
+Additional note for takeip, releaseip, recovered:
+
+ALL services that depend on the ip address configuration of the node must 
+implement all three of these events.
+
+ALL services that use TCP should also implement these events and at least
+kill off any tcp connections to the service if the ip address config has 
+changed in a similar fashion to how 60.nfs does it.
+The reason one must do this is that ESTABLISHED tcp connections may survive
+when an ip address is released and removed from the host until the ip address
+is taken over again.
+Any tcp connections that survive a release/takeip sequence can potentially
+cause the client/server tcp connection to get out of sync with sequence and 
+ack numbers and cause a disruptive ack storm.
+
+
diff --git a/ctdb/config/functions b/ctdb/config/functions
new file mode 100755 (executable)
index 0000000..aa31f89
--- /dev/null
@@ -0,0 +1,1486 @@
+# Hey Emacs, this is a -*- shell-script -*- !!!
+
+# utility functions for ctdb event scripts
+
+[ -z "$CTDB_VARDIR" ] && {
+    if [ -d "/var/lib/ctdb" ] ; then
+       export CTDB_VARDIR="/var/lib/ctdb"
+    else
+       export CTDB_VARDIR="/var/ctdb"
+    fi
+}
+[ -z "$CTDB_ETCDIR" ] && {
+    export CTDB_ETCDIR="/etc"
+}
+
+#######################################
+# pull in a system config file, if any
+_loadconfig() {  # Source the config for service $1 (default: $service_config, else $service_name).
+
+    if [ -z "$1" ] ; then
+       foo="${service_config:-${service_name}}"  # no argument: fall back to the caller's service
+       if [ -n "$foo" ] ; then
+           loadconfig "$foo"
+           return
+       fi
+    fi
+
+    if [ "$1" != "ctdb" ] ; then  # the ctdb config is always loaded before a service config
+       loadconfig "ctdb"
+    fi
+
+    if [ -z "$1" ] ; then  # no service known at all: only the ctdb config is loaded
+       return
+    fi
+
+    if [ -f $CTDB_ETCDIR/sysconfig/$1 ]; then  # the first existing candidate is sourced, the rest are ignored
+       . $CTDB_ETCDIR/sysconfig/$1
+    elif [ -f $CTDB_ETCDIR/default/$1 ]; then
+       . $CTDB_ETCDIR/default/$1
+    elif [ -f $CTDB_BASE/sysconfig/$1 ]; then
+       . $CTDB_BASE/sysconfig/$1
+    fi
+
+    if [ "$1" = "ctdb" ] ; then  # for ctdb, additionally source ctdbd.conf when it is readable
+       _config="${CTDB_BASE}/ctdbd.conf"
+       if [ -r "$_config" ] ; then
+           . "$_config"
+       fi
+    fi
+}
+
+loadconfig () {  # Public entry point: forwards all arguments to _loadconfig.
+    _loadconfig "$@"
+}
+
+##############################################################
+
+# CTDB_SCRIPT_DEBUGLEVEL can be overwritten by setting it in a
+# configuration file.
+debug ()  # Print "DEBUG: "-prefixed text from the arguments or stdin at debug level 4 or higher.
+{
+    if [ ${CTDB_SCRIPT_DEBUGLEVEL:-2} -ge 4 ] ; then  # level defaults to 2, so nothing is printed by default
+       # If there are arguments then echo them.  Otherwise expect to
+       # use stdin, which allows us to pass lots of debug using a
+       # here document.
+       if [ -n "$1" ] ; then
+           echo "DEBUG: $*"
+       elif ! tty -s ; then  # only consume stdin when it is not a terminal
+           sed -e 's@^@DEBUG: @'
+       fi
+    fi
+}
+
+die ()  # Print message $1 on stdout and exit with status $2 (default 1).
+{
+    _msg="$1"
+    _rc="${2:-1}"  # exit status defaults to 1
+
+    echo "$_msg"
+    exit $_rc
+}
+
+# Log given message or stdin to either syslog or a CTDB log file
+# $1 is the tag passed to logger if syslog is in use.
+script_log ()  # $1 is the log tag; remaining arguments form the message (stdin is used when none are given).
+{
+    _tag="$1" ; shift  # after the shift, $* holds only the message words
+
+    if [ "$CTDB_SYSLOG" = "yes" ] ; then
+       logger -t "ctdbd: ${_tag}" $*  # NOTE(review): unquoted $* is word-split and glob-expanded
+    else
+       {
+           if [ -n "$*" ] ; then
+               echo "$*"
+           else
+               cat  # no message arguments: copy stdin to the log
+           fi
+       } >>"${CTDB_LOGFILE:-/var/log/log.ctdb}"
+    fi
+}
+
+# When things are run in the background in an eventscript then logging
+# output might get lost.  This is the "solution".  :-)
+background_with_logging ()  # Run the given command in the background, sending its output to script_log.
+{
+    (
+       "$@" 2>&1 </dev/null |
+       script_log "${script_name}&"
+    )&
+
+    return 0  # always succeeds; the command's own status is not reported
+}
+
+##############################################################
+# check number of args for different events
+ctdb_check_args ()  # Exit with status 1 when an IP event is called with the wrong argument count.
+{
+    case "$1" in
+       takeip|releaseip)
+           [ $# = 4 ] || {  # event name plus three arguments
+               echo "ERROR: must supply interface, IP and maskbits"
+               exit 1
+           }
+           ;;
+       updateip)
+           [ $# = 5 ] || {  # event name plus four arguments
+               echo "ERROR: must supply old interface, new interface, IP and maskbits"
+               exit 1
+           }
+           ;;
+    esac
+}
+
+##############################################################
+# determine on what type of system (init style) we are running
+detect_init_style()  # Set CTDB_INIT_STYLE to suse, debian or redhat unless it is already set.
+{
+    # only do detection if not already set:
+    [ -z "$CTDB_INIT_STYLE" ] || return
+
+    if [ -x /sbin/startproc ]; then  # chosen by which helper binary is installed
+        CTDB_INIT_STYLE="suse"
+    elif [ -x /sbin/start-stop-daemon ]; then
+        CTDB_INIT_STYLE="debian"
+    else
+        CTDB_INIT_STYLE="redhat"  # default when neither helper is found
+    fi
+}
+
+######################################################
+# simulate /sbin/service on platforms that don't have it
+# _service() makes it easier to hook the service() function for
+# testing.
+_service ()  # Run operation $2 on service $1, prefixed by $_nice (set by the calling wrapper).
+{
+  _service_name="$1"
+  _op="$2"
+
+  # do nothing, when no service was specified
+  [ -z "$_service_name" ] && return
+
+  if [ -x /sbin/service ]; then  # prefer the system's service command
+      $_nice /sbin/service "$_service_name" "$_op"
+  elif [ -x $CTDB_ETCDIR/init.d/$_service_name ]; then  # otherwise call the init script directly
+      $_nice $CTDB_ETCDIR/init.d/$_service_name "$_op"
+  elif [ -x $CTDB_ETCDIR/rc.d/init.d/$_service_name ]; then
+      $_nice $CTDB_ETCDIR/rc.d/init.d/$_service_name "$_op"
+  fi
+}
+
+service()  # Run _service with an empty $_nice, i.e. without a nice prefix.
+{
+    _nice=""
+    _service "$@"
+}
+
+######################################################
+# simulate /sbin/service (niced) on platforms that don't have it
+nice_service()  # Run _service with the command prefixed by "nice".
+{
+    _nice="nice"
+    _service "$@"
+}
+
+######################################################
+# wrapper around /proc/ settings to allow them to be hooked
+# for testing
+# 1st arg is relative path under /proc/, 2nd arg is value to set
+set_proc ()  # Write value $2 to /proc/$1.
+{
+    echo "$2" >"/proc/$1"
+}
+
+######################################################
+# wrapper around getting file contents from /proc/ to allow
+# this to be hooked for testing
+# 1st arg is relative path under /proc/
+get_proc ()  # Print the contents of /proc/$1.
+{
+    cat "/proc/$1"
+}
+
+######################################################
+# Check that an RPC service is healthy -
+# this includes allowing a certain number of failures
+# before marking the NFS service unhealthy.
+#
+# usage: nfs_check_rpc_service SERVICE_NAME [ triple ...]
+#
+# each triple is a set of 3 arguments: an operator, a 
+# fail count limit and an action string.
+#
+# For example:
+#
+#      nfs_check_rpc_service "lockd" \
+#          -ge 15 "verbose restart unhealthy" \
+#          -eq 10 "restart:b"
+#
+# says that if lockd is down for 15 iterations then do
+# a verbose restart of lockd and mark the node unhealthy.
+# Before this, after 10 iterations of failure, the
+# service is restarted silently in the background.
+# Order is important: the number of failures need to be
+# specified in reverse order because processing stops
+# after the first condition that is true.
+######################################################
+nfs_check_rpc_service ()  # $1 is the RPC program name; the rest are (operator, limit, actions) triples.
+{
+    _prog_name="$1" ; shift
+
+    if _nfs_check_rpc_common "$_prog_name" ; then  # service is up: nothing more to do
+       return
+    fi
+
+    while [ -n "$3" ] ; do  # consume one triple per iteration
+       if _nfs_check_rpc_action "$1" "$2" "$3" ; then
+           break  # first matching triple wins
+       fi
+       shift 3
+    done
+}
+
+# The new way of doing things...
+nfs_check_rpc_services ()  # Check every RPC service that has an NN.<name>.check file in nfs-rpc-checks.d.
+{
+    # Files must end with .check - avoids editor backups, RPM fu, ...
+    for _f in "${CTDB_BASE}/nfs-rpc-checks.d/"[0-9][0-9].*.check ; do  # NOTE(review): an unmatched glob stays literal - confirm the directory is never empty
+       _t="${_f%.check}"  # strip the .check suffix
+       _prog_name="${_t##*/[0-9][0-9].}"  # strip the directory and the NN. prefix
+
+       if _nfs_check_rpc_common "$_prog_name" ; then
+           # This RPC service is up, check next service...
+           continue
+       fi
+
+       # Check each line in the file in turn until one of the limit
+       # checks is hit...
+       while read _cmp _lim _rest ; do  # fields: operator, limit, then the action list
+           # Skip comments
+           case "$_cmp" in
+               \#*) continue ;;
+           esac
+
+           if _nfs_check_rpc_action "$_cmp" "$_lim" "$_rest" ; then
+               # Limit was hit on this line, no further checking...
+               break
+           fi
+       done <"$_f"
+    done
+}
+
+_nfs_check_rpc_common ()  # Return 0 if RPC program $1 answers; otherwise bump its failure counter and return 1.
+{
+    _prog_name="$1"
+
+    # Some platforms don't have separate programs for all services.
+    case "$_prog_name" in
+       statd)
+           which "rpc.${_prog_name}" >/dev/null 2>&1 || return 0  # no rpc.statd binary: report success
+    esac
+
+    case "$_prog_name" in  # map the program name to the rpcinfo program name and version to probe
+       nfsd)
+           _rpc_prog=nfs
+           _version=3
+           ;;
+       mountd)
+           _rpc_prog=mountd
+           _version=1
+           ;;
+       rquotad)
+           _rpc_prog=rquotad
+           _version=1
+           ;;
+       lockd)
+           _rpc_prog=nlockmgr
+           _version=4
+           ;;
+       statd)
+           _rpc_prog=status
+           _version=1
+           ;;
+       *)
+           echo "Internal error: unknown RPC program \"$_prog_name\"."
+           exit 1
+    esac
+
+    _service_name="nfs_${_prog_name}"  # counter name; also read later by _nfs_check_rpc_action
+
+    if ctdb_check_rpc "$_rpc_prog" $_version >/dev/null ; then
+       ctdb_counter_init "$_service_name"  # presumably resets the failure count - confirm against ctdb_counter_init
+       return 0
+    fi
+
+    ctdb_counter_incr "$_service_name"
+
+    return 1
+}
+
+_nfs_check_rpc_action ()  # $1 operator, $2 limit, $3 space-separated actions; returns 0 if the actions were run.
+{
+    _cmp="$1"
+    _limit="$2"
+    _actions="$3"
+
+    if ctdb_check_counter "quiet" "$_cmp" "$_limit" "$_service_name" ; then  # presumably succeeds while the limit is not hit - confirm
+       return 1
+    fi
+
+    for _action in $_actions ; do  # deliberately unquoted: one action per word
+       case "$_action" in
+           verbose)
+               echo "$ctdb_check_rpc_out"  # error text saved by ctdb_check_rpc
+               ;;
+           restart)
+               _nfs_restart_rpc_service "$_prog_name"
+               ;;
+           restart:b)
+               _nfs_restart_rpc_service "$_prog_name" true  # true = restart in the background
+               ;;
+           unhealthy)
+               exit 1  # terminates the calling script with a failure status
+               ;;
+           *)
+               echo "Internal error: unknown action \"$_action\"."
+               exit 1
+       esac
+    done
+
+    return 0
+}
+
+_nfs_restart_rpc_service ()  # Restart RPC program $1; $2 = true runs the restart in the background.
+{
+    _prog_name="$1"
+    _background="${2:-false}"
+
+    if $_background ; then  # runs the command "true" or "false"
+       _maybe_background="background_with_logging"
+    else
+       _maybe_background=""
+    fi
+
+    _p="rpc.${_prog_name}"  # daemon binary name used for the directly started services
+
+    case "$_prog_name" in
+       nfsd)
+           echo "Trying to restart NFS service"
+           $_maybe_background startstop_nfs restart
+           ;;
+       mountd)
+           echo "Trying to restart $_prog_name [${_p}]"
+           killall -q -9 "$_p"
+           $_maybe_background $_p ${MOUNTD_PORT:+-p} $MOUNTD_PORT  # -p is passed only when a port is configured
+           ;;
+       rquotad)
+           echo "Trying to restart $_prog_name [${_p}]"
+           killall -q -9 "$_p"
+           $_maybe_background $_p ${RQUOTAD_PORT:+-p} $RQUOTAD_PORT  # -p is passed only when a port is configured
+           ;;
+       lockd)
+           echo "Trying to restart lock manager service"
+           $_maybe_background startstop_nfslock restart
+           ;;
+       statd)
+           echo "Trying to restart $_prog_name [${_p}]"
+           killall -q -9 "$_p"
+           $_maybe_background $_p \
+               ${STATD_HOSTNAME:+-n} $STATD_HOSTNAME \
+               ${STATD_PORT:+-p} $STATD_PORT \
+               ${STATD_OUTGOING_PORT:+-o} $STATD_OUTGOING_PORT
+           ;;
+       *)
+           echo "Internal error: unknown RPC program \"$_prog_name\"."
+           exit 1
+    esac
+}
+
+######################################################
+# check that a rpc server is registered with portmap
+# and responding to requests
+# usage: ctdb_check_rpc SERVICE_NAME VERSION
+######################################################
+ctdb_check_rpc ()  # Probe RPC program $1 version $2 with rpcinfo; on failure print an error and return 1.
+{
+    progname="$1"
+    version="$2"
+
+    _localhost="${CTDB_RPCINFO_LOCALHOST:-127.0.0.1}"  # host to probe; overridable
+
+    if ! ctdb_check_rpc_out=$(rpcinfo -u $_localhost $progname $version 2>&1) ; then
+       ctdb_check_rpc_out="ERROR: $progname failed RPC check:
+$ctdb_check_rpc_out"
+       echo "$ctdb_check_rpc_out"  # the text also stays in $ctdb_check_rpc_out for the "verbose" action
+       return 1
+    fi
+}
+
+######################################################
+# Ensure $service_name is set
+assert_service_name ()  # Exit via die() unless $service_name is non-empty.
+{
+    [ -n "$service_name" ] || die "INTERNAL ERROR: \$service_name not set"
+}
+
+######################################################
+# check a set of directories is available
+# return 1 on a missing directory
+# directories are read from stdin
+######################################################
+ctdb_check_directories_probe()  # Read one path per line from stdin; return 1 at the first that is not a directory.
+{
+    while IFS="" read d ; do  # IFS="" keeps leading and trailing whitespace in the path
+       case "$d" in
+           *%*)
+               continue  # entries containing '%' are skipped; presumably substitution patterns - confirm
+               ;;
+           *)
+               [ -d "${d}/." ] || return 1  # $d is left set to the failing path for the caller
+       esac
+    done
+}
+
+######################################################
+# check a set of directories is available
+# directories are read from stdin
+######################################################
+ctdb_check_directories()  # As the probe, but print an error and exit 1 when a directory is missing.
+{
+    ctdb_check_directories_probe || {
+       echo "ERROR: $service_name directory \"$d\" not available"  # $d was set by the probe's read loop
+       exit 1
+    }
+}
+
+######################################################
+# check a set of tcp ports
+# usage: ctdb_check_tcp_ports <ports...>
+######################################################
+
+# This flag file is created when a service is initially started.  It
+# is deleted the first time TCP port checks for that service succeed.
+# Until then ctdb_check_tcp_ports() prints a more subtle "error"
+# message if a port check fails.
+_ctdb_check_tcp_common ()  # Set $_ctdb_service_started_file to the flag file path for $service_name.
+{
+    assert_service_name
+    _ctdb_service_started_file="$ctdb_fail_dir/$service_name.started"
+}
+
+ctdb_check_tcp_init ()  # Create the "service started" flag file for $service_name.
+{
+    _ctdb_check_tcp_common
+    mkdir -p "${_ctdb_service_started_file%/*}" # dirname
+    touch "$_ctdb_service_started_file"
+}
+
+# Check whether something is listening on all of the given TCP ports
+# using the "ctdb checktcpport" command.
+ctdb_check_tcp_ports()  # Return 0 only when something is listening on every port given as an argument.
+{
+    if [ -z "$1" ] ; then
+       echo "INTERNAL ERROR: ctdb_check_tcp_ports - no ports specified"
+       exit 1
+    fi
+
+    for _p ; do  # process each function argument (port)
+       _cmd="ctdb checktcpport $_p"
+       _out=$($_cmd 2>&1)
+       _ret=$?
+       case "$_ret" in
+           0)  # the bind succeeded, so nothing is listening on this port
+               _ctdb_check_tcp_common
+               if [ ! -f "$_ctdb_service_started_file" ] ; then
+                   echo "ERROR: $service_name tcp port $_p is not responding"
+                   debug "\"ctdb checktcpport $_p\" was able to bind to port"
+               else
+                   echo "INFO: $service_name tcp port $_p is not responding"
+               fi
+
+               return 1
+               ;;
+           98)
+               # Couldn't bind, something already listening, next port...
+               continue
+               ;;
+           *)
+               echo "ERROR: unexpected error running \"ctdb checktcpport\""
+               debug <<EOF
+ctdb checktcpport (exited with $_ret) with output:
+$_out
+EOF
+               return $_ret
+       esac
+    done
+
+    # All ports listening
+    _ctdb_check_tcp_common
+    rm -f "$_ctdb_service_started_file"  # later failures are reported as ERROR rather than INFO
+    return 0
+}
+
+######################################################
+# check a unix socket
+# usage: ctdb_check_unix_socket <socket_path>
+######################################################
+ctdb_check_unix_socket() {  # Return 1 with an error message unless a listening unix socket ends in path $1.
+    socket_path="$1"
+    [ -z "$socket_path" ] && return  # no path given: nothing to check
+
+    if ! netstat --unix -a -n | grep -q "^unix.*LISTEN.*${socket_path}$"; then  # the path is used as a regex, unescaped
+        echo "ERROR: $service_name socket $socket_path not found"
+        return 1
+    fi
+}
+
+######################################################
+# check a command returns zero status
+# usage: ctdb_check_command <command>
+######################################################
+ctdb_check_command ()  # Run the given command; on non-zero status print an error and exit 1.
+{
+    _out=$("$@" 2>&1) || {
+       echo "ERROR: $* returned error"
+       echo "$_out" | debug  # the captured output is shown only at debug level
+       exit 1
+    }
+}
+
+################################################
+# kill off any TCP connections with the given IP
+################################################
+kill_tcp_connections ()  # $1 is the IP; $2 = "oneway" kills only the src->dst direction of each connection.
+{
+    _ip="$1"
+
+    _oneway=false
+    if [ "$2" = "oneway" ] ; then
+       _oneway=true
+    fi
+
+    get_tcp_connections_for_ip "$_ip" | {  # each input line is "<local addr:port> <remote addr:port>"
+       _killcount=0
+       _connections=""
+       _nl="
+"
+       while read _dst _src; do
+           _destport="${_dst##*:}"  # text after the last ':' is the port
+           __oneway=$_oneway
+           case $_destport in
+               # we only do one-way killtcp for CIFS
+               139|445) __oneway=true ;;
+           esac
+
+           echo "Killing TCP connection $_src $_dst"
+           _connections="${_connections}${_nl}${_src} ${_dst}"
+           if ! $__oneway ; then
+               _connections="${_connections}${_nl}${_dst} ${_src}"  # also queue the reverse direction
+           fi
+
+           _killcount=$(($_killcount + 1))
+       done
+
+       if [ $_killcount -eq 0 ] ; then  # nothing to kill
+           return
+       fi
+
+       echo "$_connections" | ctdb killtcp || {  # all queued pairs are sent in a single call
+           echo "Failed to send killtcp control"
+           return
+       }
+
+       _count=0
+       while : ; do  # re-check once per second until the connections are gone
+           _remaining=$(get_tcp_connections_for_ip $_ip | wc -l)
+
+           if [ $_remaining -eq 0 ] ; then
+               echo "Killed $_killcount TCP connections to released IP $_ip"
+               return
+           fi
+
+           _count=$(($_count + 1))
+           if [ $_count -gt 3 ] ; then  # give up after the fourth unsuccessful check
+               echo "Timed out killing tcp connections for IP $_ip"
+               return
+           fi
+
+           echo "Waiting for $_remaining connections to be killed for IP $_ip"
+           sleep 1
+       done
+    }
+}
+
+##################################################################
+# kill off the local end for any TCP connections with the given IP
+##################################################################
+kill_tcp_connections_local_only ()  # Shorthand for kill_tcp_connections $1 "oneway".
+{
+    kill_tcp_connections "$1" "oneway"
+}
+
+##################################################################
+# tickle any TCP connections with the given IP
+##################################################################
+tickle_tcp_connections ()  # Send "ctdb tickle" in both directions for each established connection of IP $1.
+{
+    _ip="$1"
+
+    get_tcp_connections_for_ip "$_ip" |
+    {
+       _failed=false
+
+       while read dest src; do  # each input line is "<local addr:port> <remote addr:port>"
+           echo "Tickle TCP connection $src $dest"
+           ctdb tickle $src $dest >/dev/null 2>&1 || _failed=true
+           echo "Tickle TCP connection $dest $src"
+           ctdb tickle $dest $src >/dev/null 2>&1 || _failed=true
+       done
+
+       if $_failed ; then  # reported once, whichever tickle failed
+           echo "Failed to send tickle control"
+       fi
+    }
+}
+
+get_tcp_connections_for_ip ()  # Print netstat columns 4 and 5 for ESTABLISHED tcp connections local to IP $1.
+{
+    _ip="$1"  # matched as "<ip>:" or "::ffff:<ip>:" at the start of netstat column 4
+
+    netstat -tn | awk -v ip=$_ip \
+       'index($1, "tcp") == 1 && \
+        (index($4, ip ":") == 1 || index($4, "::ffff:" ip ":") == 1) \
+        && $6 == "ESTABLISHED" \
+        {print $4" "$5}'
+}
+
+########################################################
+# start/stop the Ganesha nfs service
+########################################################
+startstop_ganesha()  # $1 is start, stop or restart; any other value does nothing.
+{
+    _service_name="nfs-ganesha-$CTDB_CLUSTER_FILESYSTEM_TYPE"  # service name depends on the cluster filesystem type
+    case "$1" in
+       start)
+           service "$_service_name" start
+           ;;
+       stop)
+           service "$_service_name" stop
+           ;;
+       restart)
+           service "$_service_name" restart
+           ;;
+    esac
+}
+
+########################################################
+# start/stop the nfs service on different platforms
+########################################################
+startstop_nfs() {  # $1 is start, stop or restart; exits 1 when the platform is not recognised.
+       PLATFORM="unknown"
+       [ -x $CTDB_ETCDIR/init.d/nfsserver ] && {  # an nfsserver init script marks the platform as sles
+               PLATFORM="sles"
+       }
+       [ -x $CTDB_ETCDIR/init.d/nfslock ] && {  # an nfslock init script marks it as rhel; checked last, so it wins
+               PLATFORM="rhel"
+       }
+
+       case $PLATFORM in
+       sles)
+               case $1 in
+               start)
+                       service nfsserver start
+                       ;;
+               stop)
+                       service nfsserver stop > /dev/null 2>&1
+                       ;;
+               restart)
+                       set_proc "fs/nfsd/threads" 0  # write 0 to /proc/fs/nfsd/threads before stopping
+                       service nfsserver stop > /dev/null 2>&1
+                       pkill -9 nfsd
+                       nfs_dump_some_threads  # log stacks of nfsd threads that are still present
+                       service nfsserver start
+                       ;;
+               esac
+               ;;
+       rhel)
+               case $1 in
+               start)
+                       service nfslock start  # nfslock is started before nfs ...
+                       service nfs start
+                       ;;
+               stop)
+                       service nfs stop  # ... and stopped after it
+                       service nfslock stop
+                       ;;
+               restart)
+                       set_proc "fs/nfsd/threads" 0  # write 0 to /proc/fs/nfsd/threads before stopping
+                       service nfs stop > /dev/null 2>&1
+                       service nfslock stop > /dev/null 2>&1
+                       pkill -9 nfsd
+                       nfs_dump_some_threads  # log stacks of nfsd threads that are still present
+                       service nfslock start
+                       service nfs start
+                       ;;
+               esac
+               ;;
+       *)
+               echo "Unknown platform. NFS is not supported with ctdb"
+               exit 1
+               ;;
+       esac
+}
+
+# Dump up to the configured number of nfsd thread backtraces.
+nfs_dump_some_threads ()  # Print /proc/<pid>/stack for at most $CTDB_NFS_DUMP_STUCK_THREADS nfsd pids.
+{
+    [ -n "$CTDB_NFS_DUMP_STUCK_THREADS" ] || return 0  # unset or empty disables dumping
+
+    # Optimisation to avoid running an unnecessary pidof
+    [ $CTDB_NFS_DUMP_STUCK_THREADS -gt 0 ] || return 0
+
+    _count=0
+    for _pid in $(pidof nfsd) ; do
+       [ $_count -lt $CTDB_NFS_DUMP_STUCK_THREADS ] || break  # -lt, not -le: stop after exactly the limit
+
+       # Do this first to avoid racing with thread exit
+       _stack=$(get_proc "${_pid}/stack" 2>/dev/null)
+       if [ -n "$_stack" ] ; then  # only non-empty stacks count towards the limit
+           echo "Stack trace for stuck nfsd thread [${_pid}]:"
+           echo "$_stack"
+           _count=$(($_count + 1))
+       fi
+    done
+}
+
+########################################################
+# start/stop the nfs lockmanager service on different platforms
+########################################################
+startstop_nfslock() {  # $1 is start, stop or restart; exits 1 when the platform is not recognised.
+       PLATFORM="unknown"
+       [ -x $CTDB_ETCDIR/init.d/nfsserver ] && {  # an nfsserver init script marks the platform as sles
+               PLATFORM="sles"
+       }
+       [ -x $CTDB_ETCDIR/init.d/nfslock ] && {  # an nfslock init script marks it as rhel; checked last, so it wins
+               PLATFORM="rhel"
+       }
+
+       case $PLATFORM in
+       sles)
+               # for sles there is no service for lockmanager
+               # so we instead just shutdown/restart nfs
+               case $1 in
+               start)
+                       service nfsserver start
+                       ;;
+               stop)
+                       service nfsserver stop > /dev/null 2>&1
+                       ;;
+               restart)
+                       service nfsserver stop > /dev/null 2>&1
+                       service nfsserver start
+                       ;;
+               esac
+               ;;
+       rhel)
+               case $1 in
+               start)
+                       service nfslock start
+                       ;;
+               stop)
+                       service nfslock stop > /dev/null 2>&1
+                       ;;
+               restart)
+                       service nfslock stop > /dev/null 2>&1  # stop output is discarded, start output is kept
+                       service nfslock start
+                       ;;
+               esac
+               ;;
+       *)
+               echo "Unknown platform. NFS locking is not supported with ctdb"
+               exit 1
+               ;;
+       esac
+}
+
+# Periodically update the statd database
+nfs_statd_update ()  # $1 is the minimum number of seconds between statd-callout update runs.
+{
+    _update_period="$1"
+
+    _statd_update_trigger="$service_state_dir/update-trigger"  # its mtime records the last update
+    [ -f "$_statd_update_trigger" ] || touch "$_statd_update_trigger"
+
+    _last_update=$(stat --printf="%Y" "$_statd_update_trigger")  # mtime in seconds since the epoch
+    _current_time=$(date +"%s")
+    if [ $(( $_current_time - $_last_update)) -ge $_update_period ] ; then
+       touch "$_statd_update_trigger"  # restart the period before launching the updates
+       $CTDB_BASE/statd-callout updatelocal &
+       $CTDB_BASE/statd-callout updateremote &
+    fi
+}
+
+add_ip_to_iface()  # Arguments: interface, IP, maskbits. Adds the address under a per-interface lock.
+{
+    _iface=$1
+    _ip=$2
+    _maskbits=$3
+
+    _lockfile="${CTDB_VARDIR}/state/interface_modify_${_iface}.flock"  # one lock file per interface
+    mkdir -p "${_lockfile%/*}" # dirname
+    [ -f "$_lockfile" ] || touch "$_lockfile"
+
+    (
+       # Note: use of return/exit/die() below only gets us out of the
+       # sub-shell, which is actually what we want.  That is, the
+       # function should just return non-zero.
+
+       flock --timeout 30 0 || \
+           die "add_ip_to_iface: unable to get lock for ${_iface}"
+
+       # Ensure interface is up
+       ip link set "$_iface" up || \
+           die "Failed to bringup interface $_iface"
+
+       ip addr add "$_ip/$_maskbits" brd + dev "$_iface" || \
+           die "Failed to add $_ip/$_maskbits on dev $_iface"
+    ) <"$_lockfile"  # fd 0 of the subshell is the lock file that flock locks
+
+    # Do nothing here - return above only gets us out of the subshell
+    # and doing anything here will affect the return code.
+}
+
+delete_ip_from_iface()  # Arguments: interface, IP, maskbits. Removes the address but keeps the secondaries.
+{
+    _iface=$1
+    _ip=$2
+    _maskbits=$3
+
+    _lockfile="${CTDB_VARDIR}/state/interface_modify_${_iface}.flock"  # same lock file as add_ip_to_iface
+    mkdir -p "${_lockfile%/*}" # dirname
+    [ -f "$_lockfile" ] || touch "$_lockfile"
+
+    (
+       # Note: use of return/exit/die() below only gets us out of the
+       # sub-shell, which is actually what we want.  That is, the
+       # function should just return non-zero.
+
+       flock --timeout 30 0 || \
+           die "delete_ip_from_iface: unable to get lock for ${_iface}"
+
+       _im="$_ip/$_maskbits"  # shorthand for readability
+
+       # "ip addr del" will delete all secondary IPs if this is the
+       # primary.  To work around this _very_ annoying behaviour we
+       # have to keep a record of the secondaries and re-add them
+       # afterwards.  Yuck!
+
+       _secondaries=""
+       if ip addr list dev "$_iface" primary | grep -Fq "inet $_im " ; then  # only when $_im is a primary address
+           _secondaries=$(ip addr list dev "$_iface" secondary | \
+               awk '$1 == "inet" { print $2 }')
+       fi
+
+       local _rc=0
+       ip addr del "$_im" dev "$_iface" || {
+           echo "Failed to del $_ip on dev $_iface"
+           _rc=1  # remember the failure but still try to restore the secondaries
+       }
+
+       if [ -n "$_secondaries" ] ; then
+           for _i in $_secondaries; do
+               if ip addr list dev "$_iface" | grep -Fq "inet $_i" ; then  # still present: no need to re-add
+                   echo "Kept secondary $_i on dev $_iface"
+               else
+                   echo "Re-adding secondary address $_i to dev $_iface"
+                   ip addr add $_i brd + dev $_iface || {
+                       echo "Failed to re-add address $_i to dev $_iface"
+                       _rc=1
+                   }
+               fi
+           done
+       fi
+
+       return $_rc
+    ) <"$_lockfile"  # fd 0 of the subshell is the lock file that flock locks
+
+    # Do nothing here - return above only gets us out of the subshell
+    # and doing anything here will affect the return code.
+}
+
+# If the given IP is hosted then print 2 items: maskbits and iface.
+#   $1 - IPv4 address to look up
+# Prints nothing if "ip addr show" reports no matching address.
+ip_maskbits_iface ()
+{
+    _addr="$1"
+
+    # Strip everything up to the "/" from the address/maskbits field,
+    # leaving just the maskbits.  sub() is used rather than gensub()
+    # because gensub() is a gawk extension and is missing from other
+    # awk implementations.  The last field is the interface name.
+    ip addr show to "${_addr}/32" 2>/dev/null | \
+       awk '$1 == "inet" { sub(".*/", "", $2); print $2, $NF }'
+}
+
+# Remove a public address from whichever interface currently hosts it.
+#   $1 - IP address, optionally followed by /maskbits (which is ignored)
+# Does nothing if ip_maskbits_iface reports that the address isn't
+# hosted.  Output from the actual delete is discarded.
+drop_ip ()
+{
+    _addr="${1%/*}"  # Remove optional maskbits
+
+    # Replace the positional parameters with "maskbits iface" (or
+    # nothing at all if the address isn't hosted).
+    set -- $(ip_maskbits_iface $_addr)
+    if [ -n "$1" ] ; then
+       _maskbits="$1"
+       _iface="$2"
+       echo "Removing public address $_addr/$_maskbits from device $_iface"
+       delete_ip_from_iface $_iface $_addr $_maskbits >/dev/null 2>&1
+    fi
+}
+
+# Call drop_ip for the first field of every line in the file named by
+# $CTDB_PUBLIC_ADDRESSES.  If that variable is unset or empty then
+# /dev/null is read instead, so nothing is dropped.
+drop_all_public_ips ()
+{
+    while read _ip _x ; do
+       drop_ip "$_ip"
+    done <"${CTDB_PUBLIC_ADDRESSES:-/dev/null}"
+}
+
+########################################################
+# Simple counters
+# Shared setup for the counter functions.
+#   $1 - service name (optional); falls back to $service_name and
+#        then to $script_name
+# Sets $_service_name and $_counter_file, and makes sure the directory
+# that holds the counter file exists.
+_ctdb_counter_common () {
+    _service_name="${1:-${service_name:-${script_name}}}"
+    _counter_file="$ctdb_fail_dir/$_service_name"
+    mkdir -p "${_counter_file%/*}" # dirname
+}
+# Reset the counter for a service to zero.
+#   $1 - service name (optional, see _ctdb_counter_common)
+ctdb_counter_init () {
+    _ctdb_counter_common "$1"
+
+    # The count is the size of the file, so truncating it resets it.
+    >"$_counter_file"
+}
+ctdb_counter_incr () {
+    _ctdb_counter_common "$1"
+
+    # unary counting!
+    echo -n 1 >> "$_counter_file"
+}
+ctdb_check_counter () {
+    _msg="${1:-error}"  # "error"  - anything else is silent on fail
+    _op="${2:--ge}"  # an integer operator supported by test
+    _limit="${3:-${service_fail_limit}}"
+    shift 3
+    _ctdb_counter_common "$1"
+
+    # unary counting!
+    _size=$(stat -c "%s" "$_counter_file" 2>/dev/null || echo 0)
+    _hit=false
+    if [ "$_op" != "%" ] ; then
+       if [ $_size $_op $_limit ] ; then
+           _hit=true
+       fi
+    else
+       if [ $(($_size $_op $_limit)) -eq 0 ] ; then
+           _hit=true
+       fi
+    fi
+    if $_hit ; then
+       if [ "$_msg" = "error" ] ; then
+           echo "ERROR: $_size consecutive failures for $_service_name, marking node unhealthy"
+           exit 1              
+       else
+           return 1
+       fi
+    fi
+}
+
+########################################################
+
+ctdb_status_dir="$CTDB_VARDIR/status"
+ctdb_fail_dir="$CTDB_VARDIR/failcount"
+
+# Set $service_state_dir to the state directory for a service and
+# create it.
+#   $1 - service name (optional, defaults to $service_name)
+# Exits the script with status 1 if the directory can't be created.
+ctdb_setup_service_state_dir ()
+{
+    service_state_dir="$CTDB_VARDIR/state/${1:-${service_name}}"
+    mkdir -p "$service_state_dir" || {
+       echo "Error creating state dir \"$service_state_dir\""
+       exit 1
+    }
+}
+
+########################################################
+# Managed status history, for auto-start/stop
+
+ctdb_managed_dir="$CTDB_VARDIR/managed_history"
+
+# Set $_ctdb_managed_file to the managed-history marker file for
+# $service_name.
+_ctdb_managed_common ()
+{
+    _ctdb_managed_file="$ctdb_managed_dir/$service_name"
+}
+
+# Record that $service_name is managed by creating its marker file.
+ctdb_service_managed ()
+{
+    _ctdb_managed_common
+    mkdir -p "$ctdb_managed_dir"
+    touch "$_ctdb_managed_file"
+}
+
+# Record that $service_name is not managed by removing its marker file.
+ctdb_service_unmanaged ()
+{
+    _ctdb_managed_common
+    rm -f "$_ctdb_managed_file"
+}
+
+# Succeed if the marker file for $service_name exists, that is, if
+# ctdb_service_managed was the last of the managed/unmanaged calls.
+is_ctdb_previously_managed_service ()
+{
+    _ctdb_managed_common
+    [ -f "$_ctdb_managed_file" ]
+}
+
+########################################################
+# Check and set status
+
+log_status_cat ()
+{
+    echo "node is \"$1\", \"${script_name}\" reports problem: $(cat $2)"
+}
+
+# Report any status recorded for $script_name by ctdb_setstatus.
+# Returns 1 (after logging) if an "unhealthy" file is readable, 2
+# (after logging) if a "banned" file is readable, and 0 otherwise.
+# "unhealthy" takes precedence over "banned".
+ctdb_checkstatus ()
+{
+    if [ -r "$ctdb_status_dir/$script_name/unhealthy" ] ; then
+       log_status_cat "unhealthy" "$ctdb_status_dir/$script_name/unhealthy"
+       return 1
+    elif [ -r "$ctdb_status_dir/$script_name/banned" ] ; then
+       log_status_cat "banned" "$ctdb_status_dir/$script_name/banned"
+       return 2
+    else
+       return 0
+    fi
+}
+
+# Record or clear a status for $script_name.
+#   $1 - "unhealthy" or "banned" to record that status; any other
+#        value clears both
+#   $2 - file whose contents are stored as the description (only used
+#        when recording)
+# Note that this sets the global variables $d and $i.
+ctdb_setstatus ()
+{
+    d="$ctdb_status_dir/$script_name"
+    case "$1" in
+       unhealthy|banned)
+           mkdir -p "$d"
+           cat "$2" >"$d/$1"
+           ;;
+       *)
+           for i in "banned" "unhealthy" ; do
+               rm -f "$d/$i"
+           done
+           ;;
+    esac
+}
+
+##################################################################
+# Reconfigure a service on demand
+
+# Shared setup for the reconfigure functions: sets $_d to the status
+# directory for $service_name (creating it) and sets
+# $_ctdb_service_reconfigure_flag to the flag file inside it.
+_ctdb_service_reconfigure_common ()
+{
+    _d="$ctdb_status_dir/${service_name}"
+    mkdir -p "$_d"
+    _ctdb_service_reconfigure_flag="$_d/reconfigure"
+}
+
+# Succeed if the reconfigure flag file for $service_name exists.
+ctdb_service_needs_reconfigure ()
+{
+    _ctdb_service_reconfigure_common
+    [ -e "$_ctdb_service_reconfigure_flag" ]
+}
+
+# Schedule a reconfigure of $service_name by creating (or truncating)
+# its flag file.
+ctdb_service_set_reconfigure ()
+{
+    _ctdb_service_reconfigure_common
+    >"$_ctdb_service_reconfigure_flag"
+}
+
+# Cancel any scheduled reconfigure of $service_name by removing its
+# flag file.
+ctdb_service_unset_reconfigure ()
+{
+    _ctdb_service_reconfigure_common
+    rm -f "$_ctdb_service_reconfigure_flag"
+}
+
+# Reconfigure $service_name now.  The flag is cleared before
+# service_reconfigure runs.  If service_reconfigure fails then its
+# status is returned and the failure counter is left alone; otherwise
+# the counter is reset.
+ctdb_service_reconfigure ()
+{
+    echo "Reconfiguring service \"${service_name}\"..."
+    ctdb_service_unset_reconfigure
+    service_reconfigure || return $?
+    ctdb_counter_init
+}
+
+# Default service_reconfigure() function does nothing and succeeds.
+# It is called by ctdb_service_reconfigure.
+service_reconfigure ()
+{
+    :
+}
+
+# Try to register this process as the one handling an event for
+# $service_name.  The lock file holds the name of an event on its
+# first line and PIDs on the following lines.
+# Returns 1 if the file names an event and lists a PID, other than
+# our own, that is still alive.  Otherwise rewrites the file with
+# $event_name and our PID, and returns 0.
+ctdb_reconfigure_try_lock ()
+{
+    _ctdb_service_reconfigure_common
+    _lock="${_d}/reconfigure_lock"
+    mkdir -p "${_lock%/*}" # dirname
+    touch "$_lock"
+
+    (
+       # File descriptor 0 is the lock file (see the redirection at
+       # the end of the sub-shell), so this serialises the check and
+       # update below.
+       flock 0
+       # This is overkill but will work if we need to extend this to
+       # allow certain events to run multiple times in parallel
+       # (e.g. takeip) and write multiple PIDs to the file.
+       read _locker_event 
+       if [ -n "$_locker_event" ] ; then
+           while read _pid ; do
+               # "kill -0" only tests whether the process exists.
+               if [ -n "$_pid" -a "$_pid" != $$ ] && \
+                   kill -0 "$_pid" 2>/dev/null ; then
+                   exit 1
+               fi
+           done
+       fi
+
+       printf "%s\n%s\n" "$event_name" $$ >"$_lock"
+       exit 0
+    ) <"$_lock"
+}
+
+# Re-emit the result that "ctdb scriptstatus" last recorded for this
+# script's monitor event: print the recorded error output (if any)
+# and exit the script with the recorded code.  TIMEDOUT is replayed
+# as code 1 and DISABLED as code 0, each with an explanatory prefix.
+# This function does not return.
+# NOTE(review): if no matching scriptstatus line is found then $_code
+# is empty and the bare "exit" uses the status of the previous command
+# - confirm that this is acceptable.
+ctdb_replay_monitor_status ()
+{
+    echo "Replaying previous status for this script due to reconfigure..."
+    # Leading colon (':') is missing in some versions...
+    _out=$(ctdb scriptstatus -Y | grep -E "^:?monitor:${script_name}:")
+    # Output looks like this:
+    # :monitor:60.nfs:1:ERROR:1314764004.030861:1314764004.035514:foo bar:
+    # This is the cheapest way of getting fields in the middle.
+    set -- $(IFS=":" ; echo $_out)
+    _code="$3"
+    _status="$4"
+    # The error output field can include colons so we'll try to
+    # preserve them.  The weak checking at the beginning tries to make
+    # this work for both broken (no leading ':') and fixed output.
+    _out="${_out%:}"
+    _err_out="${_out#*monitor:${script_name}:*:*:*:*:}"
+    case "$_status" in
+       OK) : ;;  # Do nothing special.
+       TIMEDOUT)
+           # Recast this as an error, since we can't exit with the
+           # correct negative number.
+           _code=1
+           _err_out="[Replay of TIMEDOUT scriptstatus - note incorrect return code.] ${_err_out}"
+           ;;
+       DISABLED)
+           # Recast this as an OK, since we can't exit with the
+           # correct negative number.
+           _code=0
+           _err_out="[Replay of DISABLED scriptstatus - note incorrect return code.] ${_err_out}"
+           ;;
+       *) : ;;  # Must be ERROR, do nothing special.
+    esac
+    if [ -n "$_err_out" ] ; then
+       echo "$_err_out"
+    fi
+    exit $_code
+}
+
+# Carry out, or defer, a scheduled reconfigure of $service_name.  Only
+# the monitor, ipreallocated and reconfigure events are handled; for
+# any other event this returns 0 straight away.
+# Depending on the event and on whether the reconfigure lock could be
+# taken, this either returns (so the caller continues with its event)
+# or exits the script: see the comments in each case below.
+ctdb_service_check_reconfigure ()
+{
+    assert_service_name
+
+    # We only care about some events in this function.  For others we
+    # return now.
+    case "$event_name" in
+       monitor|ipreallocated|reconfigure) : ;;
+       *) return 0 ;;
+    esac
+
+    if ctdb_reconfigure_try_lock ; then
+       # No events covered by this function are running, so proceed
+       # with gay abandon.
+       case "$event_name" in
+           reconfigure)
+               # Run in a sub-shell so that an exit inside the
+               # reconfigure still lets us exit with its status.
+               (ctdb_service_reconfigure)
+               exit $?
+               ;;
+           ipreallocated)
+               if ctdb_service_needs_reconfigure ; then
+                   ctdb_service_reconfigure
+               fi
+               ;;
+           monitor)
+               if ctdb_service_needs_reconfigure ; then
+                   ctdb_service_reconfigure
+                   # Given that the reconfigure might not have
+                   # resulted in the service being stable yet, we
+                   # replay the previous status since that's the best
+                   # information we have.
+                   ctdb_replay_monitor_status
+               fi
+               ;;
+       esac
+    else
+       # Somebody else is running an event we don't want to collide
+       # with.  We proceed with caution.
+       case "$event_name" in
+           reconfigure)
+               # Tell whoever called us to retry.
+               exit 2
+               ;;
+           ipreallocated)
+               # Defer any scheduled reconfigure and just run the
+               # rest of the ipreallocated event, as per the
+               # eventscript.  There's an assumption here that the
+               # event doesn't depend on any scheduled reconfigure.
+               # This is true in the current code.
+               return 0
+               ;;
+           monitor)
+               # There is most likely a reconfigure in progress so
+               # the service is possibly unstable.  As above, we
+               # defer any scheduled reconfigured.  We also replay
+               # the previous monitor status since that's the best
+               # information we have.
+               ctdb_replay_monitor_status
+               ;;
+       esac
+    fi
+}
+
+##################################################################
+# Does CTDB manage this service? - and associated auto-start/stop
+
+# Backward-compatibility helper for the old CTDB_MANAGES_* variables.
+#   $1 - value of the old variable
+#   $2 - service name that the old variable refers to
+# Appends $2 to $CTDB_MANAGED_SERVICES if $1 is "yes" and $2 is the
+# service being handled ($service_name).
+ctdb_compat_managed_service ()
+{
+    if [ "$1" = "yes" -a "$2" = "$service_name" ] ; then
+       CTDB_MANAGED_SERVICES="$CTDB_MANAGED_SERVICES $2"
+    fi
+}
+
+# Succeed if $service_name is listed in $CTDB_MANAGED_SERVICES, either
+# directly or after the old CTDB_MANAGES_* variables have been folded
+# in.  Note that this sets the global variable $t and may append to
+# $CTDB_MANAGED_SERVICES.
+is_ctdb_managed_service ()
+{
+    assert_service_name
+
+    # $t is used just for readability and to allow better accurate
+    # matching via leading/trailing spaces
+    t=" $CTDB_MANAGED_SERVICES "
+
+    # Return 0 if "<space>$service_name<space>" appears in $t
+    # (stripping a matching prefix changes $t only if there is a match)
+    if [ "${t#* ${service_name} }" != "${t}" ] ; then
+       return 0
+    fi
+
+    # If above didn't match then update $CTDB_MANAGED_SERVICES for
+    # backward compatibility and try again.
+    ctdb_compat_managed_service "$CTDB_MANAGES_VSFTPD"   "vsftpd"
+    ctdb_compat_managed_service "$CTDB_MANAGES_SAMBA"    "samba"
+    ctdb_compat_managed_service "$CTDB_MANAGES_WINBIND"  "winbind"
+    ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "apache2"
+    ctdb_compat_managed_service "$CTDB_MANAGES_HTTPD"    "httpd"
+    ctdb_compat_managed_service "$CTDB_MANAGES_ISCSI"    "iscsi"
+    ctdb_compat_managed_service "$CTDB_MANAGES_CLAMD"    "clamd"
+    ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs"
+    ctdb_compat_managed_service "$CTDB_MANAGES_NFS"      "nfs-ganesha-gpfs"
+
+    t=" $CTDB_MANAGED_SERVICES "
+
+    # Return 0 if "<space>$service_name<space>" appears in $t
+    [ "${t#* ${service_name} }" != "${t}" ]
+}
+
+# Start or stop $service_name when appropriate for the current event.
+#  * For the service-start/service-stop pseudo-events: refuse (via
+#    die) if the service is managed or auto-start/stop is enabled,
+#    otherwise start/stop the service and exit with the result.
+#  * For the monitor event with CTDB_SERVICE_AUTOSTARTSTOP=yes: if the
+#    managed state differs from the recorded one then start/stop the
+#    service via background_with_logging (defined elsewhere) and exit
+#    with its status.
+#  * In all other cases: return 0 so the caller carries on.
+ctdb_start_stop_service ()
+{
+    assert_service_name
+
+    # Allow service-start/service-stop pseudo-events to start/stop
+    # services when we're not auto-starting/stopping and we're not
+    # monitoring.
+    case "$event_name" in
+       service-start)
+           if is_ctdb_managed_service ; then
+               die 'service-start event not permitted when service is managed'
+           fi
+           if [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] ; then
+               die 'service-start event not permitted with $CTDB_SERVICE_AUTOSTARTSTOP = yes'
+           fi
+           ctdb_service_start
+           exit $?
+           ;;
+       service-stop)
+           if is_ctdb_managed_service ; then
+               die 'service-stop event not permitted when service is managed'
+           fi
+           if [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] ; then
+               die 'service-stop event not permitted with $CTDB_SERVICE_AUTOSTARTSTOP = yes'
+           fi
+           ctdb_service_stop
+           exit $?
+           ;;
+    esac
+
+    # Do nothing unless configured to...
+    [ "$CTDB_SERVICE_AUTOSTARTSTOP" = "yes" ] || return 0
+
+    [ "$event_name" = "monitor" ] || return 0
+
+    # Compare the configured state with the recorded one and act only
+    # on a change.
+    if is_ctdb_managed_service ; then
+       if ! is_ctdb_previously_managed_service ; then
+           echo "Starting service \"$service_name\" - now managed"
+           background_with_logging ctdb_service_start
+           exit $?
+       fi
+    else
+       if is_ctdb_previously_managed_service ; then
+           echo "Stopping service \"$service_name\" - no longer managed"
+           background_with_logging ctdb_service_stop
+           exit $?
+       fi
+    fi
+}
+
+# Mark $service_name as managed and start it.  If service_start fails
+# then its status is returned; otherwise the failure counter is reset
+# and ctdb_check_tcp_init (defined elsewhere) is run.
+ctdb_service_start ()
+{
+    # The service is marked managed if we've ever tried to start it.
+    ctdb_service_managed
+
+    service_start || return $?
+
+    ctdb_counter_init
+    ctdb_check_tcp_init
+}
+
+# Mark $service_name as unmanaged and stop it.  Returns the status of
+# service_stop.
+ctdb_service_stop ()
+{
+    ctdb_service_unmanaged
+    service_stop
+}
+
+# Default service_start() and service_stop() functions.
+# These may be overridden in an eventscript.  When overriding, the
+# following convention must be followed.  If these functions are
+# called with no arguments then they may use internal logic to
+# determine whether the service is managed and, therefore, whether
+# they should take any action.  However, if the service name is
+# specified as an argument then an attempt must be made to start or
+# stop the service.  This is because the auto-start/stop code calls
+# them with the service name as an argument.
+# Default: start $service_name using the "service" command.
+# NOTE(review): ctdb_service_start in this file calls this with no
+# arguments - confirm this against the convention described above.
+service_start ()
+{
+    service "$service_name" start
+}
+
+# Default: stop $service_name using the "service" command.
+# NOTE(review): ctdb_service_stop in this file calls this with no
+# arguments - confirm this against the convention described above.
+service_stop ()
+{
+    service "$service_name" stop
+}
+
+##################################################################
+
+# Handle the events that are common to all scripts.
+#   $1   - event name
+#   $2.. - for "setstatus", the arguments passed on to ctdb_setstatus
+# For "status" and "setstatus" the script exits with the status of the
+# corresponding function.  For any other event this returns so the
+# caller can handle it.
+ctdb_standard_event_handler ()
+{
+    case "$1" in
+       status)
+           ctdb_checkstatus
+           exit
+           ;;
+       setstatus)
+            shift
+           ctdb_setstatus "$@"
+           exit
+           ;;
+    esac
+}
+
+# iptables doesn't like being re-entered, so flock-wrap it.
+# All arguments are passed through to /sbin/iptables, which is run
+# while holding a lock on $CTDB_VARDIR/iptables-ctdb.flock (waiting at
+# most 30 seconds for it).  The absolute path stops this function from
+# calling itself.
+iptables()
+{
+       # Quote the lock file name so that a $CTDB_VARDIR containing
+       # whitespace doesn't split it into several words.
+       flock -w 30 "$CTDB_VARDIR/iptables-ctdb.flock" /sbin/iptables "$@"
+}
+
+# AIX (and perhaps others?) doesn't have mktemp
+# Minimal replacement, only defined when no mktemp command is found.
+#   -d - optional first argument: create a directory rather than a file
+# Creates ${TMPDIR:-/tmp}/tmp.<10 hex digits> with a umask of 077 and
+# prints the name.  Any other arguments (e.g. a template) are ignored.
+# Note that the name is printed even if the creation fails.
+if ! which mktemp >/dev/null 2>&1 ; then
+    mktemp ()
+    {
+       _dir=false
+       if [ "$1" = "-d" ] ; then
+           _dir=true
+           shift
+       fi
+       _d="${TMPDIR:-/tmp}"
+       # First 10 characters of the MD5 sum of some random data.
+       _hex10=$(dd if=/dev/urandom count=20 2>/dev/null | \
+           md5sum | \
+           sed -e 's@\(..........\).*@\1@')
+       _t="${_d}/tmp.${_hex10}"
+       (
+           # Sub-shell so the umask change doesn't affect the caller.
+           umask 077
+           if $_dir ; then
+               mkdir "$_t"
+           else
+               >"$_t"
+           fi
+       )
+       echo "$_t"
+    }
+fi
+
+########################################################
+# tickle handling
+########################################################
+
+# Bring the tickles registered with ctdb for a port into line with the
+# established TCP connections to this node's public IPs on that port.
+#   $1 - TCP port
+# Tickles are added for connections that have none and removed for
+# connections that no longer exist.
+update_tickles ()
+{
+       _port="$1"
+
+       tickledir="$CTDB_VARDIR/state/tickles"
+       mkdir -p "$tickledir"
+
+       # Who am I?
+       _pnn=$(ctdb pnn) ; _pnn=${_pnn#PNN:}
+
+       # What public IPs do I hold?
+       _ips=$(ctdb -Y ip | awk -F: -v pnn=$_pnn '$3 == pnn {print $2}')
+
+       # IPs as a regexp choice
+       # (dots are escaped so that they only match literally)
+       _ipschoice="($(echo $_ips | sed -e 's/ /|/g' -e 's/\./\\\\./g'))"
+
+       # Record connections to our public IPs in a temporary file
+       # (one sorted "source destination" pair per line)
+       _my_connections="${tickledir}/${_port}.connections"
+       rm -f "$_my_connections"
+       netstat -tn |
+       awk -v destpat="^${_ipschoice}:${_port}\$" \
+         '$1 == "tcp" && $6 == "ESTABLISHED" && $4 ~ destpat {print $5, $4}' |
+       sort >"$_my_connections"
+
+       # Record our current tickles in a temporary file
+       # (same format and ordering as above so that comm can be used)
+       _my_tickles="${tickledir}/${_port}.tickles"
+       rm -f "$_my_tickles"
+       for _i in $_ips ; do
+               ctdb -Y gettickles $_i $_port | 
+               awk -F: 'NR > 1 { printf "%s:%s %s:%s\n", $2, $3, $4, $5 }'
+       done |
+       sort >"$_my_tickles"
+
+       # Add tickles for connections that we haven't already got tickles for
+       # (comm -23: lines that are only in the connections file)
+       comm -23 "$_my_connections" "$_my_tickles" |
+       while read _src _dst ; do
+               ctdb addtickle $_src $_dst
+       done
+
+       # Remove tickles for connections that are no longer there
+       # (comm -13: lines that are only in the tickles file)
+       comm -13 "$_my_connections" "$_my_tickles" |
+       while read _src _dst ; do
+               ctdb deltickle $_src $_dst
+       done
+
+       rm -f "$_my_connections" "$_my_tickles" 
+}
+
+########################################################
+# load a site local config file
+########################################################
+
+[ -n "$CTDB_RC_LOCAL" -a -x "$CTDB_RC_LOCAL" ] && {
+       . "$CTDB_RC_LOCAL"
+}
+
+[ -x $CTDB_BASE/rc.local ] && {
+       . $CTDB_BASE/rc.local
+}
+
+[ -d $CTDB_BASE/rc.local.d ] && {
+       for i in $CTDB_BASE/rc.local.d/* ; do
+               [ -x "$i" ] && . "$i"
+       done
+}
+
+script_name="${0##*/}"       # basename
+service_fail_limit=1
+event_name="$1"
diff --git a/ctdb/config/gcore_trace.sh b/ctdb/config/gcore_trace.sh
new file mode 100755 (executable)
index 0000000..4d3e1d1
--- /dev/null
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+gcore -o "/var/log/core" "$1" 2>&1 | logger -t "ctdb:gcore_trace"
diff --git a/ctdb/config/nfs-rpc-checks.d/10.statd.check b/ctdb/config/nfs-rpc-checks.d/10.statd.check
new file mode 100644 (file)
index 0000000..d738a32
--- /dev/null
@@ -0,0 +1,3 @@
+-ge 6 verbose unhealthy
+-eq 4 verbose restart
+-eq 2 restart:b
diff --git a/ctdb/config/nfs-rpc-checks.d/20.nfsd.check b/ctdb/config/nfs-rpc-checks.d/20.nfsd.check
new file mode 100644 (file)
index 0000000..aa4a2e7
--- /dev/null
@@ -0,0 +1,2 @@
+%   10 verbose restart:b unhealthy
+-ge  2 verbose unhealthy
diff --git a/ctdb/config/nfs-rpc-checks.d/30.lockd.check b/ctdb/config/nfs-rpc-checks.d/30.lockd.check
new file mode 100644 (file)
index 0000000..95ae7b3
--- /dev/null
@@ -0,0 +1,2 @@
+-ge 15 verbose restart:b unhealthy
+-eq 10 restart:b
diff --git a/ctdb/config/nfs-rpc-checks.d/40.mountd.check b/ctdb/config/nfs-rpc-checks.d/40.mountd.check
new file mode 100644 (file)
index 0000000..6b4f801
--- /dev/null
@@ -0,0 +1,2 @@
+-ge 10 verbose restart:b unhealthy
+-eq 5 restart:b
diff --git a/ctdb/config/nfs-rpc-checks.d/50.rquotad.check b/ctdb/config/nfs-rpc-checks.d/50.rquotad.check
new file mode 100644 (file)
index 0000000..1ebb828
--- /dev/null
@@ -0,0 +1 @@
+-gt 0 verbose restart:b
diff --git a/ctdb/config/notify.d.README b/ctdb/config/notify.d.README
new file mode 100755 (executable)
index 0000000..ffce7fa
--- /dev/null
@@ -0,0 +1,44 @@
+This directory should contain executable programs to handle CTDB event
+notifications.  The first and only argument passed to each program is
+the event, which is one of:
+
+  init, setup, startup, unhealthy, healthy
+
+To use notifications with this directory you need to set:
+
+  CTDB_NOTIFY_SCRIPT=/etc/ctdb/notify.sh
+
+in your CTDB configuration file.
+
+An example script that sends SNMP traps for unhealthy/healthy might
+look like this:
+
+  #!/bin/sh
+
+  case "$1" in
+      unhealthy)
+          # Send an SNMP trap saying that the node is unhealthy:
+          snmptrap -m ALL -v 1 -c public 10.1.1.105 ctdb \
+              $(hostname) 0 0 $(date +"%s") ctdb.nodeHealth.0 i 1
+          ;;
+      healthy)
+          # Send an SNMP trap saying that the node is healthy again:
+          snmptrap -m ALL -v 1 -c public 10.1.1.105 ctdb \
+              $(hostname) 0 0 $(date +"%s") ctdb.nodeHealth.0 i 0
+          ;;
+  esac
+
+Alternatively, email could be sent:
+
+  #!/bin/sh
+
+  case "$1" in
+      unhealthy)
+          mail -s "$(hostname) is UNHEALTHY" foo@example.com </dev/null >/dev/null 2>&1
+          ;;
+      healthy)
+          mail -s "$(hostname) is HEALTHY" foo@example.com </dev/null >/dev/null 2>&1
+          ;;
+  esac
+
+When adding programs please note the exclusion patterns in notify.sh.
diff --git a/ctdb/config/notify.sh b/ctdb/config/notify.sh
new file mode 100755 (executable)
index 0000000..dfcb81a
--- /dev/null
@@ -0,0 +1,27 @@
+#!/bin/sh
+
+# This script is activated by setting CTDB_NOTIFY_SCRIPT=/etc/ctdb/notify.sh
+# in /etc/sysconfig/ctdb
+
+# This is script is invoked from ctdb when certain events happen.  See
+# /etc/ctdb/notify.d/README for more details.
+
+d=$(dirname $0)
+nd="${d}/notify.d"
+
+ok=true
+
+for i in "${nd}/"* ; do
+    # Don't run files matching basename
+    case "${i##*/}" in
+       *~|*,|*.rpm*|*.swp|README) continue ;;
+    esac
+
+    # Files must be executable
+    [ -x "$i" ] || continue
+
+    # Flag failures
+    "$i" "$1" || ok=false
+done
+
+$ok
diff --git a/ctdb/config/statd-callout b/ctdb/config/statd-callout
new file mode 100755 (executable)
index 0000000..cd259c6
--- /dev/null
@@ -0,0 +1,195 @@
+#!/bin/sh
+
+# This must run as root as CTDB tool commands need to access CTDB socket
+[ $(id -u) -eq 0 ] || exec sudo "$0" "$@"
+
+# this script needs to be installed so that statd points to it with the -H 
+# command line argument. The easiest way to do that is to put something like this in 
+# /etc/sysconfig/nfs:
+#   STATD_HOSTNAME="myhostname -H /etc/ctdb/statd-callout"
+
+[ -n "$CTDB_BASE" ] || \
+    export CTDB_BASE=$(cd -P $(dirname "$0") ; echo "$PWD")
+
+. $CTDB_BASE/functions
+loadconfig ctdb
+loadconfig nfs
+
+[ -z $NFS_HOSTNAME ] && {
+       echo NFS_HOSTNAME is not configured. statd-callout failed.
+       exit 0
+}
+
+case "$1" in
+  add-client)
+       # the callout does not tell us to which ip the client connected
+       # so we must add it to all the ips that we serve
+       PNN=`ctdb xpnn | sed -e "s/.*://"`
+       ctdb ip -Y | while read LINE; do
+               NODE=`echo $LINE | cut -f3 -d:`
+               [ "$NODE" = "$PNN" ] || {
+                       # not us
+                       continue
+               } 
+               IP=`echo $LINE | cut -f2 -d:`
+               mkdir -p $CTDB_VARDIR/state/statd/ip/$IP
+               touch $CTDB_VARDIR/state/statd/ip/$IP/$2
+       done
+       ;;
+  del-client)
+       # the callout does not tell us to which ip the client disconnected
+       # so we must remove it from all the ips that we serve
+       PNN=`ctdb xpnn | sed -e "s/.*://"`
+       ctdb ip -Y | while read LINE; do
+               NODE=`echo $LINE | cut -f3 -d:`
+               [ "$NODE" = "$PNN" ] || {
+                       # not us
+                       continue
+               } 
+               IP=`echo $LINE | cut -f2 -d:`
+               mkdir -p $CTDB_VARDIR/state/statd/ip/$IP
+               rm -f $CTDB_VARDIR/state/statd/ip/$IP/$2
+       done
+       ;;
+  updatelocal)
+       # For all IPs we serve, collect info and push to the config database
+       PNN=`ctdb xpnn | sed -e "s/.*://"`
+       # Skip the header line, then pick out the node (field 3) and
+       # the IP (field 2) from each colon-separated line.
+       ctdb ip -Y | tail -n +2 | while read LINE; do
+               NODE=`echo $LINE | cut -f3 -d:`
+               [ "$NODE" = "$PNN" ] || {
+                       continue
+               } 
+               IP=`echo $LINE | cut -f2 -d:`
+
+               mkdir -p $CTDB_VARDIR/state/statd/ip/$IP
+
+               # Archive the per-IP client directory.  "P" keeps the
+               # absolute path names in the archive.
+               rm -f $CTDB_VARDIR/state/statd/ip/$IP.tar
+               tar cfP $CTDB_VARDIR/state/statd/ip/$IP.tar $CTDB_VARDIR/state/statd/ip/$IP
+
+               # Fetch the record that is currently stored for this IP.
+               rm -f $CTDB_VARDIR/state/statd/ip/$IP.rec
+               ctdb pfetch ctdb.tdb statd-state:$IP $CTDB_VARDIR/state/statd/ip/$IP.rec 2>/dev/null
+               [ "$?" = "0" ] || {
+                       # something went wrong,  try storing this data
+                       echo No record. Store STATD state data for $IP
+                       ctdb pstore ctdb.tdb statd-state:$IP $CTDB_VARDIR/state/statd/ip/$IP.tar 2>/dev/null
+                       continue
+               }
+
+               # Only store the new archive if it differs from the
+               # stored record.
+               cmp $CTDB_VARDIR/state/statd/ip/$IP.tar $CTDB_VARDIR/state/statd/ip/$IP.rec >/dev/null 2>/dev/null
+               [ "$?" = "0" ] || {
+                       # the stored record is out of date,  replace it
+                       echo Updated record. Store STATD state data for $IP
+                       ctdb pstore ctdb.tdb statd-state:$IP $CTDB_VARDIR/state/statd/ip/$IP.tar 2>/dev/null
+                       continue
+               }
+       done
+       ;;
+
+  updateremote)
+       # For all IPs we don't serve, pull the state from the database
+       PNN=`ctdb xpnn | sed -e "s/.*://"`
+       # Skip the header line, then pick out the node (field 3) and
+       # the IP (field 2) from each colon-separated line.
+       ctdb ip -Y | tail -n +2 | while read LINE; do
+               NODE=`echo $LINE | cut -f3 -d:`
+               [ "$NODE" = "$PNN" ] && {
+                       continue
+               } 
+               IP=`echo $LINE | cut -f2 -d:`
+
+               mkdir -p $CTDB_VARDIR/state/statd/ip/$IP
+
+               # Fetch the stored record; skip this IP if there is none.
+               rm -f $CTDB_VARDIR/state/statd/ip/$IP.rec
+               ctdb pfetch ctdb.tdb statd-state:$IP $CTDB_VARDIR/state/statd/ip/$IP.rec 2>/dev/null
+               [ "$?" = "0" ] || {
+                       continue
+               }
+
+               # Replace the local client list with the fetched one.
+               # "P" makes tar keep the absolute path names found in
+               # the archive.
+               rm -f $CTDB_VARDIR/state/statd/ip/$IP/*
+               tar xfP $CTDB_VARDIR/state/statd/ip/$IP.rec
+       done
+       ;;
+
+  notify)
+       # we must restart the lockmanager (on all nodes) so that we get
+       # a clusterwide grace period (so other clients don't take out
+       # conflicting locks through other nodes before all locks have been
+       # reclaimed)
+
+       # we need these settings to make sure that no tcp connections survive
+       # across a very fast failover/failback
+       #echo 10 > /proc/sys/net/ipv4/tcp_fin_timeout
+       #echo 0 > /proc/sys/net/ipv4/tcp_max_tw_buckets
+       #echo 0 > /proc/sys/net/ipv4/tcp_max_orphans
+
+       # Delete the notification list for statd, we don't want it to
+       # ping any clients
+       rm -f /var/lib/nfs/statd/sm/*
+       rm -f /var/lib/nfs/statd/sm.bak/*
+
+       # we must keep a monotonically increasing state variable for the entire
+       # cluster  so state always increases when ip addresses fail from one
+       # node to another
+       # We use epoch and hope the nodes are close enough in clock.
+       # Even numbers mean service is shut down, odd numbers mean
+       # service is started.
+       # STATE is rounded down to an even number ("shut down") and
+       # STATE_UP is the following odd number ("started").
+       STATE=$(( $(date '+%s') / 2 * 2))
+       STATE_UP=$(($STATE + 1))
+
+
+       # we must also let some time pass between stopping and restarting the
+       # lockmanager since otherwise there is a window where the lockmanager
+       # will respond "strangely" immediately after restarting it, which
+       # causes clients to fail to reclaim the locks.
+       #
+       if [ "${CTDB_NFS_SERVER_MODE:-${NFS_SERVER_MODE}}" != "ganesha" ] ; then
+            startstop_nfslock stop >/dev/null 2>&1
+            sleep 2
+            startstop_nfslock start >/dev/null 2>&1
+       fi
+
+       # we now need to send out additional statd notifications to ensure
+       # that clients understand that the lockmanager has restarted.
+       # we have three cases:
+       # 1, clients that ignore the ip address the stat notification came from
+       #    and ONLY care about the 'name' in the notify packet.
+       #    these clients ONLY work with lock failover IFF that name
+       #    can be resolved into an ipaddress that matches the one used
+       #    to mount the share.  (==linux clients)
+       #    This is handled when starting lockmanager above,  but those
+       #    packets are sent from the "wrong" ip address, something linux
+       #    clients are ok with, but other clients will barf at.
+       # 2, Some clients only accept statd packets IFF they come from the
+       #    'correct' ip address.
+       # 2a,Send out the notification using the 'correct' ip address and also
+       #    specify the 'correct' hostname in the statd packet.
+       #    Some clients require both the correct source address and also the
+       #    correct name. (these clients also ONLY work if the ip addresses
+       #    used to map the share can be resolved into the name returned in
+       #    the notify packet.)
+       # 2b,Other clients require that the source ip address of the notify
+       #    packet matches the ip address used to take out the lock.
+       #    I.e. that the correct source address is used.
+       #    These clients also require that the statd notify packet contains
+       #    the name as the ip address used when the lock was taken out.
+       #
+       # Both 2a and 2b are commonly used in lockmanagers since they maximize
+       # probability that the client will accept the statd notify packet and
+       # not just ignore it.
+       # For all IPs we serve, notify every recorded client
+       PNN=`ctdb xpnn | sed -e "s/.*://"`
+       ctdb ip -Y | tail -n +2 | while read LINE; do
+               NODE=`echo $LINE | cut -f3 -d:`
+               [ "$NODE" = "$PNN" ] || {
+                       continue
+               }
+               IP=`echo $LINE | cut -f2 -d:`
+
+               ls "$CTDB_VARDIR/state/statd/ip/$IP" | while read CLIENT; do
+                       rm "$CTDB_VARDIR/state/statd/ip/$IP/$CLIENT"
+                       # Every client gets the same pair of state
+                       # values: first the even one (shut down) and
+                       # then the odd one (started).  For each value,
+                       # one notification names the server by IP
+                       # address (case 2b) and one by hostname (case
+                       # 2a).
+                       smnotify --client="$CLIENT" --ip="$IP" --server="$IP" --stateval="$STATE"
+                       smnotify --client="$CLIENT" --ip="$IP" --server="$NFS_HOSTNAME" --stateval="$STATE"
+                       smnotify --client="$CLIENT" --ip="$IP" --server="$IP" --stateval="$STATE_UP"
+                       smnotify --client="$CLIENT" --ip="$IP" --server="$NFS_HOSTNAME" --stateval="$STATE_UP"
+               done
+       done
+       ;;
+esac
diff --git a/ctdb/configure.ac b/ctdb/configure.ac
new file mode 100644 (file)
index 0000000..9621f42
--- /dev/null
@@ -0,0 +1,126 @@
+AC_PREREQ(2.50)
+AC_INIT(ctdb, m4_esyscmd([grep 'Version:' ./packaging/RPM/ctdb.spec 2>/dev/null | head -1 | sed -e 's/[ \t]*Version:[ \t]*\([^ \t]*\)[ \t]*.*/\1/' | tr -d '\n']))
+AC_DEFUN([SMB_MODULE_DEFAULT], [echo -n ""])
+AC_DEFUN([SMB_LIBRARY_ENABLE], [echo -n ""])
+AC_DEFUN([SMB_EXT_LIB], [echo -n ""])
+AC_DEFUN([SMB_ENABLE], [echo -n ""])
+AC_CONFIG_SRCDIR([server/ctdbd.c])
+
+if test "${libdir}" = '${exec_prefix}/lib'; then
+  case `uname -m` in
+    x86_64|ppc64|powerpc64)
+      libdir='${exec_prefix}/lib64'
+      ;;
+    *)
+      libdir='${exec_prefix}/lib'
+      ;;
+  esac
+fi
+
+case `uname` in  # select the per-OS system object, scsi_io tool and pcap link flags
+  Linux*)
+    CTDB_SYSTEM_OBJ=common/system_linux.o
+    CTDB_SCSI_IO=bin/scsi_io
+    CTDB_PCAP_LDFLAGS=
+    ;;
+  AIX*)
+    CTDB_SYSTEM_OBJ=common/system_aix.o
+    CTDB_SCSI_IO=
+    CPPFLAGS="$CPPFLAGS -D_AIX_=1"
+    CTDB_PCAP_LDFLAGS=-lpcap
+    ;;
+  GNU/kFreeBSD)
+    CTDB_SYSTEM_OBJ=common/system_kfreebsd.o
+    CTDB_SCSI_IO=
+    CTDB_PCAP_LDFLAGS=-lpcap
+    ;;
+  FreeBSD)
+    CTDB_SYSTEM_OBJ=common/system_freebsd.o
+    CTDB_SCSI_IO=
+    CTDB_PCAP_LDFLAGS=-lpcap
+    LDFLAGS="$LDFLAGS -L/usr/local/lib -lexecinfo"
+    AC_SUBST(LDFLAGS)
+    CPPFLAGS="$CPPFLAGS -I/usr/local/include -D_FREEBSD_=1"
+    AC_SUBST(CPPFLAGS)
+    ;;
+  GNU)
+    CTDB_SYSTEM_OBJ=common/system_gnu.o
+    CTDB_SCSI_IO=
+    CTDB_PCAP_LDFLAGS=-lpcap
+    ;;
+  *)
+    echo unknown system  cant configure >&2
+    exit 1  # a bare exit would return the status of echo, i.e. success
+    ;;
+esac
+
+AC_LIBREPLACE_ALL_CHECKS
+AC_LIBREPLACE_NETWORK_CHECKS
+
+if test "$ac_cv_prog_gcc" = yes; then
+   CFLAGS="$CFLAGS -Wall -Wshadow -Wstrict-prototypes -Wpointer-arith -Wcast-qual -Wcast-align -Wwrite-strings"
+fi
+
+LOGDIR='${localstatedir}/log'
+AC_ARG_WITH([logdir],
+       [  --with-logdir=DIR       path to log directory [[LOCALSTATEDIR/log]]],
+       LOGDIR=$withval)
+if test ! -z "$LOGDIR"; then
+  if test "$LOGDIR" = "yes" -o "$LOGDIR" = "no"; then
+    AC_MSG_ERROR([--with-logdir must specify a path])
+  fi
+fi
+AC_SUBST(LOGDIR)
+
+SOCKPATH='${localstatedir}/run/ctdb/ctdbd.socket'
+AC_ARG_WITH([socketpath],
+       [  --with-socketpath=FILE       path to CTDB daemon socket [[LOCALSTATEDIR/run/ctdb/ctdbd.socket]]],
+       SOCKPATH=$withval)
+if test ! -z "$SOCKPATH"; then
+  if test "$SOCKPATH" = "yes" -o "$SOCKPATH" = "no"; then
+         AC_MSG_ERROR([--with-socketpath must specify a file path])
+  fi
+fi
+AC_SUBST(SOCKPATH)
+
+AC_CONFIG_HEADER(config.h)
+
+EXTRA_OBJ=""
+
+m4_include(libpopt.m4)
+m4_include(libtalloc.m4)
+m4_include(libtdb.m4)
+m4_include(libtevent.m4)
+m4_include(ib/config.m4)
+m4_include(lib/util/signal.m4)
+m4_include(lib/util/fault.m4)
+m4_include(lib/socket_wrapper/config.m4)
+m4_include(utils/pmda/config.m4)
+
+AC_CHECK_HEADERS(sched.h)
+AC_CHECK_HEADERS(procinfo.h)
+
+AC_CHECK_DECL([ETIME], [],[AC_DEFINE([ETIME], ETIMEDOUT, [ETIME on non-supporting platforms])], [
+#include <errno.h>
+])
+
+AC_CHECK_FUNCS(sched_setscheduler)
+AC_CHECK_FUNCS(thread_setsched)
+AC_CHECK_FUNCS(mlockall)
+
+AC_CACHE_CHECK([for sin_len in sock],ctdb_cv_HAVE_SOCK_SIN_LEN,[
+AC_TRY_COMPILE([#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>],
+[struct sockaddr_in sock; sock.sin_len = sizeof(sock);],
+ctdb_cv_HAVE_SOCK_SIN_LEN=yes,ctdb_cv_HAVE_SOCK_SIN_LEN=no)])
+if test x"$ctdb_cv_HAVE_SOCK_SIN_LEN" = x"yes"; then
+    AC_DEFINE(HAVE_SOCK_SIN_LEN,1,[Whether the sockaddr_in struct has a sin_len property])
+fi
+
+AC_SUBST(EXTRA_OBJ)
+AC_SUBST(CTDB_SYSTEM_OBJ)
+AC_SUBST(CTDB_SCSI_IO)
+AC_SUBST(CTDB_PCAP_LDFLAGS)
+
+AC_OUTPUT(Makefile ctdb.pc)
diff --git a/ctdb/configure.rpm b/ctdb/configure.rpm
new file mode 100755 (executable)
index 0000000..4c82ecf
--- /dev/null
@@ -0,0 +1,7 @@
+#!/bin/sh
+CFLAGS="-Wall -g -D_GNU_SOURCE" ./configure \
+       --prefix=/usr \
+       --sysconfdir=/etc \
+       --mandir=/usr/man \
+       --localstatedir=/var \
+       "$@"
diff --git a/ctdb/ctdb.pc.in b/ctdb/ctdb.pc.in
new file mode 100644 (file)
index 0000000..5f5bfab
--- /dev/null
@@ -0,0 +1,19 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+datarootdir=@datarootdir@
+includedir=@includedir@
+libdir=@libdir@
+bindir=@bindir@
+sbindir=@sbindir@
+mandir=@mandir@
+localstatedir=@localstatedir@
+srcdir=@srcdir@
+etcdir=@sysconfdir@
+
+Name: ctdb
+Description: A clustered database to store temporary data
+Version: @PACKAGE_VERSION@
+Libs: -L${libdir}
+Cflags: -I${includedir}
+URL: http://ctdb.samba.org/
+
diff --git a/ctdb/doc/Makefile b/ctdb/doc/Makefile
new file mode 100644 (file)
index 0000000..34303a5
--- /dev/null
@@ -0,0 +1,20 @@
+DOCS = ctdb.1 ctdb.1.html \
+       ctdbd.1 ctdbd.1.html \
+       ctdbd_wrapper.1 ctdbd_wrapper.1.html \
+       onnode.1 onnode.1.html \
+       ltdbtool.1 ltdbtool.1.html \
+       ping_pong.1 ping_pong.1.html \
+       ctdbd.conf.5 ctdbd.conf.5.html \
+       ctdb.7 ctdb.7.html \
+       ctdb-tunables.7 ctdb-tunables.7.html
+
+all: $(DOCS)
+
+%: %.xml
+       xsltproc -o $@ http://docbook.sourceforge.net/release/xsl/current/manpages/docbook.xsl $<
+
+%.html: %.xml
+       xsltproc -o $@ http://docbook.sourceforge.net/release/xsl/current/html/docbook.xsl $<
+
+distclean:
+       rm -f $(DOCS)
diff --git a/ctdb/doc/ctdb-tunables.7.xml b/ctdb/doc/ctdb-tunables.7.xml
new file mode 100644 (file)
index 0000000..456e856
--- /dev/null
@@ -0,0 +1,708 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<!DOCTYPE refentry
+       PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
+       "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
+
+<refentry id="ctdb-tunables.7">
+
+  <refmeta>
+    <refentrytitle>ctdb-tunables</refentrytitle>
+    <manvolnum>7</manvolnum>
+    <refmiscinfo class="source">ctdb</refmiscinfo>
+    <refmiscinfo class="manual">CTDB - clustered TDB database</refmiscinfo>
+  </refmeta>
+
+  <refnamediv>
+    <refname>ctdb-tunables</refname>
+    <refpurpose>CTDB tunable configuration variables</refpurpose>
+  </refnamediv>
+
+  <refsect1>
+    <title>DESCRIPTION</title>
+
+    <para>
+      CTDB's behaviour can be configured by setting run-time tunable
+      variables.  This lists and describes all tunables.  See the
+      <citerefentry><refentrytitle>ctdb</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>
+      <command>listvars</command>, <command>setvar</command> and
+      <command>getvar</command> commands for more details.
+    </para>
+
+    <refsect2>
+      <title>MaxRedirectCount</title>
+      <para>Default: 3</para>
+      <para>
+       If we are not the DMASTER and need to fetch a record across the network
+       we first send the request to the LMASTER after which the record
+       is passed onto the current DMASTER. If the DMASTER changes before
+       the request has reached that node, the request will be passed onto the
+       "next" DMASTER. For very hot records that migrate rapidly across the
+       cluster this can cause a request to "chase" the record for many hops
+       before it catches up with the record.
+
+       This is how many hops we allow trying to chase the DMASTER before we
+       switch back to the LMASTER again to ask for new directions.
+      </para>
+      <para>
+       When chasing a record, this is how many hops we will chase the record
+       for before going back to the LMASTER to ask for new guidance.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>SeqnumInterval</title>
+      <para>Default: 1000</para>
+      <para>
+       Some databases have seqnum tracking enabled, so that samba will be able
+       to detect asynchronously when there have been updates to the database.
+       Every time a database is updated its sequence number is increased.
+      </para>
+      <para>
+       This tunable is used to specify in 'ms' how frequently ctdb will
+       send out updates to remote nodes to inform them that the sequence
+       number is increased.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>ControlTimeout</title>
+      <para>Default: 60</para>
+      <para>
+       This is the default
+       setting for timeout for when sending a control message to either the
+       local or a remote ctdb daemon.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>TraverseTimeout</title>
+      <para>Default: 20</para>
+      <para>
+       This setting controls how long we allow a traverse process to run.
+       After this timeout triggers, the main ctdb daemon will abort the
+       traverse if it has not yet finished.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>KeepaliveInterval</title>
+      <para>Default: 5</para>
+      <para>
+       How often in seconds should the nodes send keepalives to each other.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>KeepaliveLimit</title>
+      <para>Default: 5</para>
+      <para>
+       After how many keepalive intervals without any traffic should a node
+       wait until marking the peer as DISCONNECTED.
+      </para>
+      <para>
+       If a node has hung, it can thus take KeepaliveInterval*(KeepaliveLimit+1)
+       seconds before we determine that the node is DISCONNECTED and that we
+       require a recovery. This limit should not be set too high since we want
+       a hung node to be detected and expunged from the cluster well before
+       common CIFS timeouts (45-90 seconds) kick in.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>RecoverTimeout</title>
+      <para>Default: 20</para>
+      <para>
+       This is the default setting for timeouts for controls when sent from the
+       recovery daemon. We allow longer control timeouts from the recovery daemon
+       than from normal use since the recovery daemon often uses controls that
+       can take a lot longer than normal controls.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>RecoverInterval</title>
+      <para>Default: 1</para>
+      <para>
+       How frequently in seconds should the recovery daemon perform the
+       consistency checks that determine if we need to perform a recovery or not.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>ElectionTimeout</title>
+      <para>Default: 3</para>
+      <para>
+       When electing a new recovery master, this is how many seconds we allow
+       the election to take before we either deem the election finished
+       or we fail the election and start a new one.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>TakeoverTimeout</title>
+      <para>Default: 9</para>
+      <para>
+       This is how many seconds we allow controls to take for IP failover events.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>MonitorInterval</title>
+      <para>Default: 15</para>
+      <para>
+       How often should ctdb run the event scripts to check for a node's health.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>TickleUpdateInterval</title>
+      <para>Default: 20</para>
+      <para>
+       How often will ctdb record and store the "tickle" information used to
+       kickstart stalled tcp connections after a recovery.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>EventScriptTimeout</title>
+      <para>Default: 20</para>
+      <para>
+       How long should ctdb let an event script run before aborting it and
+       marking the node unhealthy.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>EventScriptTimeoutCount</title>
+      <para>Default: 1</para>
+      <para>
+       How many events in a row need to time out before we flag the node UNHEALTHY.
+       This setting is useful if your scripts can not be written so that they
+       do not hang for benign reasons.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>EventScriptUnhealthyOnTimeout</title>
+      <para>Default: 0</para>
+      <para>
+       This setting can be used to make ctdb never become UNHEALTHY if your
+       eventscripts keep hanging/timing out.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>RecoveryGracePeriod</title>
+      <para>Default: 120</para>
+      <para>
+       During recoveries, if a node has not caused recovery failures during the
+       last grace period, any records of earlier recovery failures caused by
+       that node will be forgiven. This resets the ban-counter back to
+       zero for that node.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>RecoveryBanPeriod</title>
+      <para>Default: 300</para>
+      <para>
+       If a node keeps causing repetitive recovery failures, the node will
+       eventually become banned from the cluster.
+       This controls how long the culprit node will be banned from the cluster
+       before it is allowed to try to join the cluster again.
+       Don't set too small. A node gets banned for a reason and it is usually due
+       to real problems with the node.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>DatabaseHashSize</title>
+      <para>Default: 100001</para>
+      <para>
+       Size of the hash chains for the local store of the tdbs that ctdb manages.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>DatabaseMaxDead</title>
+      <para>Default: 5</para>
+      <para>
+       How many dead records per hashchain in the TDB database do we allow before
+       the freelist needs to be processed.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>RerecoveryTimeout</title>
+      <para>Default: 10</para>
+      <para>
+       Once a recovery has completed, no additional recoveries are permitted
+       until this timeout has expired.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>EnableBans</title>
+      <para>Default: 1</para>
+      <para>
+       When set to 0, this disables BANNING completely in the cluster and thus
+       nodes can not get banned, even if they break. Don't set to 0 unless you
+       know what you are doing.  You should set this to the same value on
+       all nodes to avoid unexpected behaviour.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>DeterministicIPs</title>
+      <para>Default: 0</para>
+      <para>
+       When enabled, this tunable makes ctdb try to keep public IP addresses
+       locked to specific nodes as far as possible. This makes it easier for
+       debugging since you can know that as long as all nodes are healthy
+       public IP X will always be hosted by node Y. 
+      </para>
+      <para>
+       The cost of using deterministic IP address assignment is that it
+       disables part of the logic where ctdb tries to reduce the number of
+       public IP assignment changes in the cluster. This tunable may increase
+       the number of IP failover/failbacks that are performed on the cluster
+       by a small margin.
+      </para>
+
+    </refsect2>
+    <refsect2>
+      <title>LCP2PublicIPs</title>
+      <para>Default: 1</para>
+      <para>
+       When enabled this switches ctdb to use the LCP2 ip allocation
+       algorithm.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>ReclockPingPeriod</title>
+      <para>Default: x</para>
+      <para>
+       Obsolete
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>NoIPFailback</title>
+      <para>Default: 0</para>
+      <para>
+       When set to 1, ctdb will not perform failback of IP addresses when a node
+       becomes healthy. Ctdb WILL perform failover of public IP addresses when a
+       node becomes UNHEALTHY, but when the node becomes HEALTHY again, ctdb
+       will not fail the addresses back.
+      </para>
+      <para>
+       Use with caution! Normally when a node becomes available to the cluster
+       ctdb will try to reassign public IP addresses onto the new node as a way
+       to distribute the workload evenly across the cluster nodes. Ctdb tries to
+       make sure that all running nodes have approximately the same number of
+       public addresses it hosts.
+      </para>
+      <para>
+       When you enable this tunable, CTDB will no longer attempt to rebalance
+       the cluster by failing IP addresses back to the new nodes. An unbalanced
+       cluster will therefore remain unbalanced until there is manual
+       intervention from the administrator. When this parameter is set, you can
+       manually fail public IP addresses over to the new node(s) using the
+       'ctdb moveip' command.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>DisableIPFailover</title>
+      <para>Default: 0</para>
+      <para>
+       When enabled, ctdb will not perform failover or failback. Even if a
+       node fails while holding public IPs, ctdb will not recover the IPs or
+       assign them to another node.
+      </para>
+      <para>
+       When you enable this tunable, CTDB will no longer attempt to recover
+       the cluster by failing IP addresses over to other nodes. This leads to
+       a service outage until the administrator has manually performed failover
+       to replacement nodes using the 'ctdb moveip' command.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>NoIPTakeover</title>
+      <para>Default: 0</para>
+      <para>
+       When set to 1, ctdb will not allow IP addresses to be failed over
+       onto this node. Any IP addresses that the node currently hosts
+       will remain on the node but no new IP addresses can be failed over
+       to the node.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>NoIPHostOnAllDisabled</title>
+      <para>Default: 0</para>
+      <para>
+       If no nodes are healthy then by default ctdb will happily host
+       public IPs on disabled (unhealthy or administratively disabled)
+       nodes.  This can cause problems, for example if the underlying
+       cluster filesystem is not mounted.  When set to 1 on a node and
+       that node is disabled, any IPs hosted by this node will be
+       released and the node will not take over any IPs until it is no
+       longer disabled.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>DBRecordCountWarn</title>
+      <para>Default: 100000</para>
+      <para>
+       When set to non-zero, ctdb will log a warning when we try to recover a
+       database with more than this many records. This will produce a warning
+       if a database grows uncontrollably with orphaned records.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>DBRecordSizeWarn</title>
+      <para>Default: 10000000</para>
+      <para>
+       When set to non-zero, ctdb will log a warning when we try to recover a
+       database where a single record is bigger than this. This will produce
+       a warning if a database record grows uncontrollably with orphaned
+       sub-records.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>DBSizeWarn</title>
+      <para>Default: 1000000000</para>
+      <para>
+       When set to non-zero, ctdb will log a warning when we try to recover a
+       database bigger than this. This will produce
+       a warning if a database grows uncontrollably.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>VerboseMemoryNames</title>
+      <para>Default: 0</para>
+      <para>
+       This feature consumes additional memory. When used, the talloc library
+       will create more verbose names for all talloc allocated objects.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>RecdPingTimeout</title>
+      <para>Default: 60</para>
+      <para>
+       If the main daemon has not heard a "ping" from the recovery daemon for
+       this many seconds, the main daemon will log a message that the recovery
+       daemon is potentially hung.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>RecdFailCount</title>
+      <para>Default: 10</para>
+      <para>
+       If the recovery daemon has failed to ping the main daemon for this many
+       consecutive intervals, the main daemon will consider the recovery daemon
+       as hung and will try to restart it to recover.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>LogLatencyMs</title>
+      <para>Default: 0</para>
+      <para>
+       When set to non-zero, this will make the main daemon log any operation that
+       took longer than this value, in 'ms', to complete.
+       These include "how long time a lockwait child process needed", 
+       "how long time to write to a persistent database" but also
+       "how long did it take to get a response to a CALL from a remote node".
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>RecLockLatencyMs</title>
+      <para>Default: 1000</para>
+      <para>
+       When using a reclock file for split brain prevention, if set to non-zero
+       this tunable will make the recovery daemon log a message if the fcntl()
+       call to lock/testlock the recovery file takes longer than this number of 
+       ms.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>RecoveryDropAllIPs</title>
+      <para>Default: 120</para>
+      <para>
+       If we have been stuck in recovery, or stopped, or banned, mode for
+       this many seconds we will force drop all held public addresses.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>VerifyRecoveryLock</title>
+      <para>Default: 1</para>
+      <para>
+       Should we take a fcntl() lock on the reclock file to verify that we are the
+       sole recovery master node on the cluster or not.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>VacuumInterval</title>
+      <para>Default: 10</para>
+      <para>
+        Periodic interval in seconds when vacuuming is triggered for
+        volatile databases.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>VacuumMaxRunTime</title>
+      <para>Default: 120</para>
+      <para>
+        The maximum time in seconds for which the vacuuming process is
+        allowed to run.  If vacuuming process takes longer than this
+        value, then the vacuuming process is terminated.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>RepackLimit</title>
+      <para>Default: 10000</para>
+      <para>
+        During vacuuming, if the number of freelist records is more
+        than <varname>RepackLimit</varname>, then databases are
+        repacked to get rid of the freelist records to avoid
+        fragmentation.
+      </para>
+      <para>
+        Databases are repacked only if both
+        <varname>RepackLimit</varname> and
+        <varname>VacuumLimit</varname> are exceeded.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>VacuumLimit</title>
+      <para>Default: 5000</para>
+      <para>
+        During vacuuming, if the number of deleted records is more
+        than <varname>VacuumLimit</varname>, then databases are
+        repacked to avoid fragmentation.
+      </para>
+      <para>
+        Databases are repacked only if both
+        <varname>RepackLimit</varname> and
+        <varname>VacuumLimit</varname> are exceeded.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>VacuumFastPathCount</title>
+      <para>Default: 60</para>
+      <para>
+        When a record is deleted, it is marked for deletion during
+        vacuuming.  Vacuuming process usually processes this list to purge
+        the records from the database.  If the number of records marked
+        for deletion is more than VacuumFastPathCount, then vacuuming
+       process will scan the complete database for empty records instead
+       of using the list of records marked for deletion.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>DeferredAttachTO</title>
+      <para>Default: 120</para>
+      <para>
+       When databases are frozen we do not allow clients to attach to the
+       databases. Instead of returning an error immediately to the application
+       the attach request from the client is deferred until the database
+       becomes available again at which stage we respond to the client.
+      </para>
+      <para>
+       This timeout controls how long we will defer the request from the client
+       before timing it out and returning an error to the client.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>HopcountMakeSticky</title>
+      <para>Default: 50</para>
+      <para>
+       If the database is set to 'STICKY' mode, using the 'ctdb setdbsticky' 
+       command, any record that is seen as very hot and migrating so fast that
+       hopcount surpasses 50 is set to become a STICKY record for StickyDuration
+       seconds. This means that after each migration the record will be kept on
+       the node and prevented from being migrated off the node.
+      </para>
+      <para>
+       This setting allows one to try to identify such records and stop them from
+       migrating across the cluster so fast. This will improve performance for
+       certain workloads, such as locking.tdb if many clients are opening/closing
+       the same file concurrently.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>StickyDuration</title>
+      <para>Default: 600</para>
+      <para>
+       Once a record has been found to be fetch-lock hot and has been flagged to
+       become STICKY, this is for how long, in seconds, the record will be 
+       flagged as a STICKY record.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>StickyPindown</title>
+      <para>Default: 200</para>
+      <para>
+       Once a STICKY record has been migrated onto a node, it will be pinned down
+       on that node for this number of ms. Any request from other nodes to migrate
+       the record off the node will be deferred until the pindown timer expires.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>StatHistoryInterval</title>
+      <para>Default: 1</para>
+      <para>
+       Granularity of the statistics collected in the statistics history.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>AllowClientDBAttach</title>
+      <para>Default: 1</para>
+      <para>
+       When set to 0, clients are not allowed to attach to any databases.
+       This can be used to temporarily block any new processes from attaching
+       to and accessing the databases.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>RecoverPDBBySeqNum</title>
+      <para>Default: 1</para>
+      <para>
+       When set to zero, database recovery for persistent databases
+       is record-by-record and recovery process simply collects the
+       most recent version of every individual record.
+      </para>
+      <para>
+       When set to non-zero, persistent databases will instead be
+       recovered as a whole db and not by individual records. The
+       node that contains the highest value stored in the record
+       "__db_sequence_number__" is selected and the copy of that
+       node's database is used as the recovered database.
+      </para>
+      <para>
+       By default, recovery of persistent databases is done using
+       __db_sequence_number__ record.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>FetchCollapse</title>
+      <para>Default: 1</para>
+      <para>
+       When many clients across many nodes try to access the same record at the
+       same time this can lead to a fetch storm where the record becomes very
+       active and bounces between nodes very fast. This leads to high CPU
+       utilization of the ctdbd daemon, trying to bounce that record around
+       very fast, and poor performance.
+      </para>
+      <para>
+       This parameter is used to activate a fetch-collapse. A fetch-collapse
+       is when we track which records we have requests in flight so that we only
+       keep one request in flight from a certain node, even if multiple smbd
+       processes are attempting to fetch the record at the same time. This
+       can improve performance and reduce CPU utilization for certain
+       workloads.
+      </para>
+      <para>
+       This tunable controls if we should collapse multiple fetch operations
+       of the same record into a single request and defer all duplicates or not.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>Samba3AvoidDeadlocks</title>
+      <para>Default: 0</para>
+      <para>
+       Enable code that prevents deadlocks with Samba (only for Samba 3.x).
+      </para>
+      <para>
+       This should be set to 1 when using Samba version 3.x to enable special
+       code in CTDB to avoid deadlock with Samba version 3.x.  This code
+       is not required for Samba version 4.x and must not be enabled for
+       Samba 4.x.
+      </para>
+    </refsect2>
+  </refsect1>
+
+  <refsect1>
+    <title>SEE ALSO</title>
+    <para>
+      <citerefentry><refentrytitle>ctdb</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>ctdbd</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>ctdbd.conf</refentrytitle>
+      <manvolnum>5</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>ctdb</refentrytitle>
+      <manvolnum>7</manvolnum></citerefentry>,
+
+      <ulink url="http://ctdb.samba.org/"/>
+    </para>
+  </refsect1>
+
+  <refentryinfo>
+    <author>
+      <contrib>
+       This documentation was written by
+       Ronnie Sahlberg,
+       Amitay Isaacs,
+       Martin Schwenke
+      </contrib>
+    </author>
+
+    <copyright>
+      <year>2007</year>
+      <holder>Andrew Tridgell</holder>
+      <holder>Ronnie Sahlberg</holder>
+    </copyright>
+    <legalnotice>
+      <para>
+       This program is free software; you can redistribute it and/or
+       modify it under the terms of the GNU General Public License as
+       published by the Free Software Foundation; either version 3 of
+       the License, or (at your option) any later version.
+      </para>
+      <para>
+       This program is distributed in the hope that it will be
+       useful, but WITHOUT ANY WARRANTY; without even the implied
+       warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+       PURPOSE.  See the GNU General Public License for more details.
+      </para>
+      <para>
+       You should have received a copy of the GNU General Public
+       License along with this program; if not, see
+       <ulink url="http://www.gnu.org/licenses"/>.
+      </para>
+    </legalnotice>
+  </refentryinfo>
+
+</refentry>
diff --git a/ctdb/doc/ctdb.1.xml b/ctdb/doc/ctdb.1.xml
new file mode 100644 (file)
index 0000000..27e52cd
--- /dev/null
@@ -0,0 +1,1674 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<!DOCTYPE refentry
+       PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
+       "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
+<refentry id="ctdb.1">
+
+  <refmeta>
+    <refentrytitle>ctdb</refentrytitle>
+    <manvolnum>1</manvolnum>
+    <refmiscinfo class="source">ctdb</refmiscinfo>
+    <refmiscinfo class="manual">CTDB - clustered TDB database</refmiscinfo>
+  </refmeta>
+
+  <refnamediv>
+    <refname>ctdb</refname>
+    <refpurpose>CTDB management utility</refpurpose>
+  </refnamediv>
+
+  <refsynopsisdiv>
+    <cmdsynopsis>
+      <command>ctdb</command>
+      <arg rep="repeat"><replaceable>OPTION</replaceable></arg>
+      <arg choice="req"><replaceable>COMMAND</replaceable></arg>
+      <arg choice="opt"><replaceable>COMMAND-ARGS</replaceable></arg>
+    </cmdsynopsis>
+  </refsynopsisdiv>
+
+  <refsect1>
+    <title>DESCRIPTION</title>
+    <para>
+      ctdb is a utility to view and manage a CTDB cluster.
+    </para>
+
+    <para>
+      The following terms are used when referring to nodes in a
+      cluster:
+      <variablelist>
+       <varlistentry>
+         <term>PNN</term>
+         <listitem>
+           <para>
+             Physical Node Number.  The physical node number is an
+             integer that describes the node in the cluster. The
+             first node has physical node number 0.
+           </para>
+         </listitem>
+       </varlistentry>
+       <varlistentry>
+         <term>PNN-LIST</term>
+         <listitem>
+           <para>
+             This is either a single PNN, a comma-separated list of PNNs
+             or "all".
+           </para>
+         </listitem>
+       </varlistentry>
+      </variablelist>
+    </para>
+  </refsect1>
+
+  <refsect1>
+    <title>OPTIONS</title>
+
+    <variablelist>
+      <varlistentry><term>-n <parameter>PNN-LIST</parameter></term>
+      <listitem>
+       <para>
+         The nodes specified by PNN-LIST should be queried for the
+         requested information.  Default is to query the daemon
+         running on the local host.
+       </para>
+      </listitem>
+      </varlistentry>
+
+      <varlistentry><term>-Y</term>
+      <listitem>
+       <para>
+         Produce output in machine readable form for easier parsing
+         by scripts. Not all commands support this option.
+       </para>
+      </listitem>
+      </varlistentry>
+
+      <varlistentry><term>-t <parameter>TIMEOUT</parameter></term>
+      <listitem>
+       <para>
+         Indicates that ctdb should wait up to TIMEOUT seconds for
+         a response to most commands sent to the CTDB daemon.  The
+         default is 10 seconds.
+       </para>
+      </listitem>
+      </varlistentry>
+
+      <varlistentry><term>-T <parameter>TIMELIMIT</parameter></term>
+      <listitem>
+       <para>
+         Indicates that TIMELIMIT is the maximum run time (in
+         seconds) for the ctdb command.  When TIMELIMIT is exceeded
+         the ctdb command will terminate with an error.  The default
+         is 120 seconds.
+       </para>
+      </listitem>
+      </varlistentry>
+
+      <varlistentry><term>-? --help</term>
+      <listitem>
+       <para>
+         Print some help text to the screen.
+       </para>
+      </listitem>
+      </varlistentry>
+
+      <varlistentry><term>--usage</term>
+      <listitem>
+       <para>
+         Print usage information to the screen.
+       </para>
+      </listitem>
+      </varlistentry>
+
+      <varlistentry><term>-d --debug=<parameter>DEBUGLEVEL</parameter></term>
+      <listitem>
+       <para>
+         Change the debug level for the command. Default is ERR (0).
+       </para>
+      </listitem>
+      </varlistentry>
+
+      <varlistentry><term>--socket=<parameter>FILENAME</parameter></term>
+      <listitem>
+       <para>
+         Specify that FILENAME is the name of the Unix domain
+         socket to use when connecting to the local CTDB
+         daemon. The default is
+         <filename>/tmp/ctdb.socket</filename>.
+       </para>
+      </listitem>
+      </varlistentry>
+    </variablelist>
+  </refsect1>
+
+
+  <refsect1>
+    <title>Administrative Commands</title>
+    <para>
+      These are commands used to monitor and administer a CTDB cluster.
+    </para>
+
+    <refsect2>
+      <title>pnn</title>
+      <para>
+       This command displays the PNN of the current node.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>xpnn</title>
+      <para>
+       This command displays the PNN of the current node without
+       contacting the CTDB daemon.  It parses the nodes file
+       directly, so can produce unexpected output if the nodes file
+       has been edited but has not been reloaded.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>status</title>
+      <para>
+       This command shows the current status of all CTDB nodes based
+       on information from the queried node.
+      </para>
+
+      <para>
+       Note: If the queried node is INACTIVE then the status
+       might not be current.
+      </para>
+
+      <refsect3>
+       <title>Node status</title>
+       <para>
+         This includes the number of physical nodes and the status of
+         each node.  See <citerefentry><refentrytitle>ctdb</refentrytitle>
+         <manvolnum>7</manvolnum></citerefentry> for information
+         about node states.
+       </para>
+      </refsect3>
+
+      <refsect3>
+       <title>Generation</title>
+       <para>
+         The generation id is a number that indicates the current generation 
+         of a cluster instance. Each time a cluster goes through a 
+         reconfiguration or a recovery its generation id will be changed.
+       </para>
+       <para>
+         This number does not have any particular meaning other than
+         to keep track of when a cluster has gone through a
+         recovery. It is a random number that represents the current
+         instance of a ctdb cluster and its databases.  The CTDB
+         daemon uses this number internally to be able to tell when
+         commands to operate on the cluster and the databases were
+         issued in a different generation of the cluster, to ensure
+         that commands that operate on the databases will not survive
+         across a cluster database recovery.  After a recovery, all
+         old outstanding commands will automatically become invalid.
+       </para>
+       <para>
+         Sometimes this number will be shown as "INVALID". This only means that
+         the ctdbd daemon has started but it has not yet merged with the cluster through a recovery.
+         All nodes start with generation "INVALID" and are not assigned a real
+         generation id until they have successfully been merged with a cluster
+         through a recovery.
+       </para>
+      </refsect3>
+
+      <refsect3>
+       <title>Virtual Node Number (VNN) map</title>
+       <para>
+         Consists of the number of virtual nodes and mapping from
+         virtual node numbers to physical node numbers.  Virtual
+         nodes host CTDB databases.  Only nodes that are
+         participating in the VNN map can become lmaster or dmaster
+         for database records.
+       </para>
+      </refsect3>
+
+      <refsect3>
+       <title>Recovery mode</title>
+       <para>
+         This is the current recovery mode of the cluster. There are two possible modes:
+       </para>
+       <para>
+         NORMAL - The cluster is fully operational.
+       </para>
+       <para>
+         RECOVERY - The cluster databases have all been frozen, pausing all services while the cluster awaits a recovery process to complete. A recovery process should finish within seconds. If a cluster is stuck in the RECOVERY state this would indicate a cluster malfunction which needs to be investigated.
+       </para>
+       <para>
+         Once the recovery master detects an inconsistency, for example a node 
+         becomes disconnected/connected, the recovery daemon will trigger a 
+         cluster recovery process, where all databases are remerged across the
+         cluster. When this process starts, the recovery master will first
+         "freeze" all databases to prevent applications such as samba from 
+         accessing the databases and it will also mark the recovery mode as
+         RECOVERY.
+       </para>
+       <para>
+         When the CTDB daemon starts up, it will start in RECOVERY
+         mode.  Once the node has been merged into a cluster and all
+         databases have been recovered, the node mode will change into
+         NORMAL mode and the databases will be "thawed", allowing samba
+         to access the databases again.
+       </para>
+      </refsect3>
+      <refsect3>
+       <title>Recovery master</title>
+       <para>
+         This is the cluster node that is currently designated as the recovery master. This node is responsible for monitoring the consistency of the cluster and for performing the actual recovery process when required.
+       </para>
+       <para>
+         Only one node at a time can be the designated recovery master. Which
+         node is designated the recovery master is decided by an election
+         process in the recovery daemons running on each node.
+       </para>
+      </refsect3>
+
+      <refsect3>
+       <title>Example</title>
+       <screen format="linespecific">
+# ctdb status
+Number of nodes:4
+pnn:0 192.168.2.200       OK (THIS NODE)
+pnn:1 192.168.2.201       OK
+pnn:2 192.168.2.202       OK
+pnn:3 192.168.2.203       OK
+Generation:1362079228
+Size:4
+hash:0 lmaster:0
+hash:1 lmaster:1
+hash:2 lmaster:2
+hash:3 lmaster:3
+Recovery mode:NORMAL (0)
+Recovery master:0
+       </screen>
+      </refsect3>
+    </refsect2>
+
+    <refsect2>
+      <title>nodestatus <optional><parameter>PNN-LIST</parameter></optional></title>
+      <para>
+       This command is similar to the <command>status</command>
+       command.  It displays the "node status" subset of output.  The
+       main differences are:
+      </para>
+
+      <itemizedlist>
+       <listitem>
+         <para>
+           The exit code is the bitwise-OR of the flags for each
+           specified node, while <command>ctdb status</command> exits
+           with 0 if it was able to retrieve status for all nodes.
+         </para>
+       </listitem>
+
+       <listitem>
+         <para>
+           <command>ctdb status</command> provides status information
+           for all nodes.  <command>ctdb nodestatus</command>
+           defaults to providing status for only the current node.
+           If PNN-LIST is provided then status is given for
+           the indicated node(s).
+         </para>
+
+         <para>
+           By default, <command>ctdb nodestatus</command> gathers
+           status from the local node.  However, if invoked with "-n
+           all" (or similar) then status is gathered from the given
+           node(s).  In particular <command>ctdb nodestatus
+           all</command> and <command>ctdb nodestatus -n
+           all</command> will produce different output.  It is
+           possible to provide 2 different nodespecs (with and
+           without "-n") but the output is usually confusing!
+         </para>
+       </listitem>
+      </itemizedlist>
+
+      <para>
+       A common invocation in scripts is <command>ctdb nodestatus
+       all</command> to check whether all nodes in a cluster are
+       healthy.
+      </para>
+
+      <refsect3>
+       <title>Example</title>
+       <screen format="linespecific">
+# ctdb nodestatus
+pnn:0 10.0.0.30        OK (THIS NODE)
+
+# ctdb nodestatus all
+Number of nodes:2
+pnn:0 10.0.0.30        OK (THIS NODE)
+pnn:1 10.0.0.31        OK
+       </screen>
+      </refsect3>
+    </refsect2>
+
+    <refsect2>
+      <title>recmaster</title>
+      <para>
+       This command shows the pnn of the node which is currently the recmaster.
+      </para>
+
+      <para>
+       Note: If the queried node is INACTIVE then the status
+       might not be current.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>uptime</title>
+      <para>
+       This command shows the uptime for the ctdb daemon, as well as when the last recovery or ip-failover completed and how long it took. If the "duration" is shown as a negative number, this indicates that there is a recovery/failover in progress and it started that many seconds ago.
+      </para>
+
+      <refsect3>
+       <title>Example</title>
+       <screen format="linespecific">
+# ctdb uptime
+Current time of node          :                Thu Oct 29 10:38:54 2009
+Ctdbd start time              : (000 16:54:28) Wed Oct 28 17:44:26 2009
+Time of last recovery/failover: (000 16:53:31) Wed Oct 28 17:45:23 2009
+Duration of last recovery/failover: 2.248552 seconds
+       </screen>
+      </refsect3>
+    </refsect2>
+
+    <refsect2>
+      <title>listnodes</title>
+      <para>
+       This command lists the ip addresses of all the nodes in the cluster.
+      </para>
+
+      <refsect3>
+       <title>Example</title>
+       <screen format="linespecific">
+# ctdb listnodes
+192.168.2.200
+192.168.2.201
+192.168.2.202
+192.168.2.203
+       </screen>
+      </refsect3>
+    </refsect2>
+
+    <refsect2>
+      <title>natgwlist</title>
+      <para>
+       Show the current NAT gateway master and the status of all
+       nodes in the current NAT gateway group.  See the
+       <citetitle>NAT GATEWAY</citetitle> section in
+       <citerefentry><refentrytitle>ctdb</refentrytitle>
+       <manvolnum>7</manvolnum></citerefentry> for more details.
+      </para>
+
+      <refsect3>
+       <title>Example</title>
+       <screen format="linespecific">
+# ctdb natgwlist
+0 192.168.2.200
+Number of nodes:4
+pnn:0 192.168.2.200       OK (THIS NODE)
+pnn:1 192.168.2.201       OK
+pnn:2 192.168.2.202       OK
+pnn:3 192.168.2.203       OK
+       </screen>
+      </refsect3>
+    </refsect2>
+
+    <refsect2>
+      <title>ping</title>
+      <para>
+       This command will "ping" specified CTDB nodes in the cluster
+       to verify that they are running.
+      </para>
+      <refsect3>
+       <title>Example</title>
+       <screen format="linespecific">
+# ctdb ping -n all
+response from 0 time=0.000054 sec  (3 clients)
+response from 1 time=0.000144 sec  (2 clients)
+response from 2 time=0.000105 sec  (2 clients)
+response from 3 time=0.000114 sec  (2 clients)
+       </screen>
+      </refsect3>
+    </refsect2>
+
+    <refsect2>
+      <title>ifaces</title>
+      <para>
+       This command will display the list of network interfaces, which could
+       host public addresses, along with their status.
+      </para>
+      <refsect3>
+       <title>Example</title>
+       <screen format="linespecific">
+# ctdb ifaces
+Interfaces on node 0
+name:eth5 link:up references:2
+name:eth4 link:down references:0
+name:eth3 link:up references:1
+name:eth2 link:up references:1
+
+# ctdb ifaces -Y
+:Name:LinkStatus:References:
+:eth5:1:2
+:eth4:0:0
+:eth3:1:1
+:eth2:1:1
+       </screen>
+      </refsect3>
+    </refsect2>
+
+    <refsect2>
+      <title>ip</title>
+      <para>
+       This command will display the list of public addresses that are provided by the cluster and which physical node is currently serving this ip. By default this command will ONLY show those public addresses that are known to the node itself. To see the full list of all public ips across the cluster you must use "ctdb ip -n all".
+      </para>
+      <refsect3>
+       <title>Example</title>
+       <screen format="linespecific">
+# ctdb ip
+Public IPs on node 0
+172.31.91.82 node[1] active[] available[eth2,eth3] configured[eth2,eth3]
+172.31.91.83 node[0] active[eth3] available[eth2,eth3] configured[eth2,eth3]
+172.31.91.84 node[1] active[] available[eth2,eth3] configured[eth2,eth3]
+172.31.91.85 node[0] active[eth2] available[eth2,eth3] configured[eth2,eth3]
+172.31.92.82 node[1] active[] available[eth5] configured[eth4,eth5]
+172.31.92.83 node[0] active[eth5] available[eth5] configured[eth4,eth5]
+172.31.92.84 node[1] active[] available[eth5] configured[eth4,eth5]
+172.31.92.85 node[0] active[eth5] available[eth5] configured[eth4,eth5]
+
+# ctdb ip -Y
+:Public IP:Node:ActiveInterface:AvailableInterfaces:ConfiguredInterfaces:
+:172.31.91.82:1::eth2,eth3:eth2,eth3:
+:172.31.91.83:0:eth3:eth2,eth3:eth2,eth3:
+:172.31.91.84:1::eth2,eth3:eth2,eth3:
+:172.31.91.85:0:eth2:eth2,eth3:eth2,eth3:
+:172.31.92.82:1::eth5:eth4,eth5:
+:172.31.92.83:0:eth5:eth5:eth4,eth5:
+:172.31.92.84:1::eth5:eth4,eth5:
+:172.31.92.85:0:eth5:eth5:eth4,eth5:
+       </screen>
+      </refsect3>
+    </refsect2>
+
+    <refsect2>
+      <title>ipinfo <parameter>IP</parameter></title>
+      <para>
+       This command will display details about the specified public addresses.
+      </para>
+      <refsect3>
+       <title>Example</title>
+       <screen format="linespecific">
+# ctdb ipinfo 172.31.92.85
+Public IP[172.31.92.85] info on node 0
+IP:172.31.92.85
+CurrentNode:0
+NumInterfaces:2
+Interface[1]: Name:eth4 Link:down References:0
+Interface[2]: Name:eth5 Link:up References:2 (active)
+       </screen>
+      </refsect3>
+    </refsect2>
+
+    <refsect2>
+      <title>scriptstatus</title>
+      <para>
+       This command displays which scripts were run in the previous monitoring cycle and the result of each script. If a script failed with an error, causing the node to become unhealthy, the output from that script is also shown.
+      </para>
+      <refsect3>
+       <title>Example</title>
+       <screen format="linespecific">
+# ctdb scriptstatus
+7 scripts were executed last monitoring cycle
+00.ctdb              Status:OK    Duration:0.056 Tue Mar 24 18:56:57 2009
+10.interface         Status:OK    Duration:0.077 Tue Mar 24 18:56:57 2009
+11.natgw             Status:OK    Duration:0.039 Tue Mar 24 18:56:57 2009
+20.multipathd        Status:OK    Duration:0.038 Tue Mar 24 18:56:57 2009
+31.clamd             Status:DISABLED
+40.vsftpd            Status:OK    Duration:0.045 Tue Mar 24 18:56:57 2009
+41.httpd             Status:OK    Duration:0.039 Tue Mar 24 18:56:57 2009
+50.samba             Status:ERROR    Duration:0.082 Tue Mar 24 18:56:57 2009
+OUTPUT:ERROR: Samba tcp port 445 is not responding
+      </screen>
+      </refsect3>
+    </refsect2>
+
+    <refsect2>
+      <title>disablescript <parameter>SCRIPT</parameter></title>
+      <para>
+       This command is used to disable an eventscript.
+      </para>
+      <para>
+       This will take effect the next time the eventscripts are being executed so it can take a short while until this is reflected in 'scriptstatus'.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>enablescript <parameter>SCRIPT</parameter></title>
+      <para>
+       This command is used to enable an eventscript.
+      </para>
+      <para>
+       This will take effect the next time the eventscripts are being executed so it can take a short while until this is reflected in 'scriptstatus'.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>listvars</title>
+      <para>
+       List all tunable variables, except the values of the obsolete tunables
+       like VacuumMinInterval. The obsolete tunables can be retrieved only
+       explicitly with the "ctdb getvar" command.
+      </para>
+      <refsect3>
+       <title>Example</title>
+       <screen format="linespecific">
+# ctdb listvars
+MaxRedirectCount        = 3
+SeqnumInterval          = 1000
+ControlTimeout          = 60
+TraverseTimeout         = 20
+KeepaliveInterval       = 5
+KeepaliveLimit          = 5
+RecoverTimeout          = 20
+RecoverInterval         = 1
+ElectionTimeout         = 3
+TakeoverTimeout         = 9
+MonitorInterval         = 15
+TickleUpdateInterval    = 20
+EventScriptTimeout      = 30
+EventScriptTimeoutCount = 1
+RecoveryGracePeriod     = 120
+RecoveryBanPeriod       = 300
+DatabaseHashSize        = 100001
+DatabaseMaxDead         = 5
+RerecoveryTimeout       = 10
+EnableBans              = 1
+DeterministicIPs        = 0
+LCP2PublicIPs           = 1
+ReclockPingPeriod       = 60
+NoIPFailback            = 0
+DisableIPFailover       = 0
+VerboseMemoryNames      = 0
+RecdPingTimeout         = 60
+RecdFailCount           = 10
+LogLatencyMs            = 0
+RecLockLatencyMs        = 1000
+RecoveryDropAllIPs      = 120
+VerifyRecoveryLock      = 1
+VacuumInterval          = 10
+VacuumMaxRunTime        = 30
+RepackLimit             = 10000
+VacuumLimit             = 5000
+VacuumFastPathCount     = 60
+MaxQueueDropMsg         = 1000000
+UseStatusEvents         = 0
+AllowUnhealthyDBRead    = 0
+StatHistoryInterval     = 1
+DeferredAttachTO        = 120
+AllowClientDBAttach     = 1
+RecoverPDBBySeqNum      = 0
+       </screen>
+      </refsect3>
+    </refsect2>
+
+    <refsect2>
+      <title>getvar <parameter>NAME</parameter></title>
+      <para>
+       Get the runtime value of a tunable variable.
+      </para>
+      <refsect3>
+       <title>Example</title>
+       <screen format="linespecific">
+# ctdb getvar MaxRedirectCount
+MaxRedirectCount    = 3
+       </screen>
+      </refsect3>
+    </refsect2>
+
+    <refsect2>
+      <title>setvar <parameter>NAME</parameter> <parameter>VALUE</parameter></title>
+      <para>
+       Set the runtime value of a tunable variable.
+      </para>
+      <para>
+       Example: ctdb setvar MaxRedirectCount 5
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>lvsmaster</title>
+      <para>
+       This command shows which node is currently the LVSMASTER. The
+       LVSMASTER is the node in the cluster which drives the LVS system and
+       which receives all incoming traffic from clients.
+      </para>
+      <para>
+       LVS is the mode where the entire CTDB/Samba cluster uses a single
+       ip address for the entire cluster. In this mode all clients connect to
+       one specific node which will then multiplex/loadbalance the clients
+       evenly onto the other nodes in the cluster. This is an alternative to using
+       public ip addresses. See the manpage for ctdbd for more information
+       about LVS.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>lvs</title>
+      <para>
+       This command shows which nodes in the cluster are currently active in the
+       LVS configuration. I.e. which nodes we are currently loadbalancing
+       the single ip address across.
+      </para>
+
+      <para>
+       LVS will by default only loadbalance across those nodes that are both
+       LVS capable and also HEALTHY. Except if all nodes are UNHEALTHY in which
+       case LVS will loadbalance across all UNHEALTHY nodes as well.
+       LVS will never use nodes that are DISCONNECTED, STOPPED, BANNED or
+       DISABLED.
+      </para>
+
+      <para>
+       Example output:
+      </para>
+      <screen format="linespecific">
+2:10.0.0.13
+3:10.0.0.14
+      </screen>
+
+    </refsect2>
+
+
+    <refsect2>
+      <title>getcapabilities</title>
+
+      <para>
+       This command shows the capabilities of the current node.  See
+       the <citetitle>CAPABILITIES</citetitle> section in
+       <citerefentry><refentrytitle>ctdb</refentrytitle>
+       <manvolnum>7</manvolnum></citerefentry> for more details.
+      </para>
+
+      <para>
+       Example output:
+      </para>
+      <screen format="linespecific">
+RECMASTER: YES
+LMASTER: YES
+LVS: NO
+NATGW: YES
+      </screen>
+
+    </refsect2>
+
+    <refsect2>
+      <title>statistics</title>
+      <para>
+       Collect statistics from the CTDB daemon about how many calls it has served.
+      </para>
+      <refsect3>
+       <title>Example</title>
+       <screen format="linespecific">
+# ctdb statistics
+CTDB version 1
+num_clients                        3
+frozen                             0
+recovering                         0
+client_packets_sent           360489
+client_packets_recv           360466
+node_packets_sent             480931
+node_packets_recv             240120
+keepalive_packets_sent             4
+keepalive_packets_recv             3
+node
+req_call                       2
+reply_call                     2
+req_dmaster                    0
+reply_dmaster                  0
+reply_error                    0
+req_message                   42
+req_control               120408
+reply_control             360439
+client
+req_call                       2
+req_message                   24
+req_control               360440
+timeouts
+call                           0
+control                        0
+traverse                       0
+total_calls                        2
+pending_calls                      0
+lockwait_calls                     0
+pending_lockwait_calls             0
+memory_used                     5040
+max_hop_count                      0
+max_call_latency                   4.948321 sec
+max_lockwait_latency               0.000000 sec
+       </screen>
+      </refsect3>
+    </refsect2>
+
+    <refsect2>
+      <title>statisticsreset</title>
+      <para>
+       This command is used to clear all statistics counters in a node.
+      </para>
+      <para>
+       Example: ctdb statisticsreset
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>dbstatistics <parameter>DBNAME</parameter>|<parameter>HASH</parameter></title>
+      <para>
+       Display statistics about the specified database.
+      </para>
+      <refsect3>
+       <title>Example</title>
+       <screen format="linespecific">
+# ctdb dbstatistics locking.tdb
+DB Statistics: locking.tdb
+ ro_delegations                     0
+ ro_revokes                         0
+ locks
+     total                      14356
+     failed                         0
+     current                        0
+     pending                        0
+ hop_count_buckets: 28087 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0
+ lock_buckets: 0 14188 38 76 32 19 3 0 0 0 0 0 0 0 0 0
+ locks_latency      MIN/AVG/MAX     0.001066/0.012686/4.202292 sec out of 14356
+ Num Hot Keys:     1
+     Count:8 Key:ff5bd7cb3ee3822edc1f0000000000000000000000000000
+       </screen>
+      </refsect3>
+    </refsect2>
+
+    <refsect2>
+      <title>getreclock</title>
+      <para>
+       This command is used to show the filename of the reclock file that is used.
+      </para>
+
+      <para>
+       Example output:
+      </para>
+      <screen format="linespecific">
+       Reclock file:/gpfs/.ctdb/shared
+      </screen>
+
+    </refsect2>
+
+    <refsect2>
+      <title>setreclock [filename]</title>
+      <para>
+       This command is used to modify, or clear, the file that is used as the reclock file at runtime. When this command is used, the reclock file checks are disabled. To re-enable the checks the administrator needs to activate the "VerifyRecoveryLock" tunable using "ctdb setvar".
+      </para>
+
+      <para>
+       If run with no parameter this will remove the reclock file completely. If run with a parameter the parameter specifies the new filename to use for the recovery lock.
+      </para>
+
+      <para>
+       This command only affects the runtime settings of a ctdb node and will be lost when ctdb is restarted. For persistent changes to the reclock file setting you must edit /etc/sysconfig/ctdb.
+      </para>
+    </refsect2>
+
+
+
+    <refsect2>
+      <title>getdebug</title>
+      <para>
+       Get the current debug level for the node. The debug level controls what information is written to the log file.
+      </para>
+      <para>
+       The debug levels are mapped to the corresponding syslog levels.
+       When a debug level is set, only those messages at that level and higher
+       levels will be printed.
+      </para>
+      <para>
+       The debug levels, from highest to lowest, are:
+      </para>
+      <para>
+       EMERG ALERT CRIT ERR WARNING NOTICE INFO DEBUG
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>setdebug <parameter>DEBUGLEVEL</parameter></title>
+      <para>
+       Set the debug level of a node. This controls what information will be logged.
+      </para>
+      <para>
+       The debuglevel is one of EMERG ALERT CRIT ERR WARNING NOTICE INFO DEBUG
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>getpid</title>
+      <para>
+       This command will return the process id of the ctdb daemon.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>disable</title>
+      <para>
+       This command is used to administratively disable a node in the cluster.
+       A disabled node will still participate in the cluster and host
+       clustered TDB records but its public ip address has been taken over by
+       a different node and it no longer hosts any services.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>enable</title>
+      <para>
+       Re-enable a node that has been administratively disabled.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>stop</title>
+      <para>
+       This command is used to administratively STOP a node in the cluster.
+       A STOPPED node is connected to the cluster but will not host any
+       public ip addresses, nor does it participate in the VNNMAP.
+       The difference between a DISABLED node and a STOPPED node is that
+       a STOPPED node does not host any parts of the database which means
+       that a recovery is required to stop/continue nodes.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>continue</title>
+      <para>
+       Re-start a node that has been administratively stopped.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>addip <parameter>IPADDR</parameter>/<parameter>mask</parameter> <parameter>IFACE</parameter></title>
+      <para>
+       This command is used to add a new public ip to a node during runtime.
+       This allows public addresses to be added to a cluster without having
+       to restart the ctdb daemons.
+      </para>
+      <para>
+       Note that this only updates the runtime instance of ctdb. Any changes will be lost next time ctdb is restarted and the public addresses file is re-read.
+       If you want this change to be permanent you must also update the public addresses file manually.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>delip <parameter>IPADDR</parameter></title>
+      <para>
+       This command is used to remove a public ip from a node during runtime.
+       If this public ip is currently hosted by the node it is being removed from, the ip will first be failed over to another node, if possible, before it is removed.
+      </para>
+      <para>
+       Note that this only updates the runtime instance of ctdb. Any changes will be lost next time ctdb is restarted and the public addresses file is re-read.
+       If you want this change to be permanent you must also update the public addresses file manually.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>moveip <parameter>IPADDR</parameter> <parameter>PNN</parameter></title>
+      <para>
+       This command can be used to manually fail a public ip address to a
+       specific node.
+      </para>
+      <para>
+       In order to manually override the "automatic" distribution of public 
+       ip addresses that ctdb normally provides, this command only works
+       when you have changed the tunables for the daemon to:
+      </para>
+      <para>
+       DeterministicIPs = 0
+      </para>
+      <para>
+       NoIPFailback = 1
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>shutdown</title>
+      <para>
+       This command will shutdown a specific CTDB daemon.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>setlmasterrole on|off</title>
+      <para>
+       This command is used to enable/disable the LMASTER capability for a node at runtime. This capability determines whether or not a node can be used as an LMASTER for records in the database. A node that does not have the LMASTER capability will not show up in the vnnmap.
+      </para>
+
+      <para>
+       Nodes will by default have this capability, but it can be stripped off nodes by the setting in the sysconfig file or by using this command.
+      </para>
+      <para>
+       Once this setting has been enabled/disabled, you need to perform a recovery for it to take effect.
+      </para>
+      <para>
+       See also "ctdb getcapabilities"
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>setrecmasterrole on|off</title>
+      <para>
+       This command is used to enable/disable the RECMASTER capability for a node at runtime. This capability determines whether or not a node can be used as a RECMASTER for the cluster. A node that does not have the RECMASTER capability can not win a recmaster election. A node that already is the recmaster for the cluster when the capability is stripped off the node will remain the recmaster until the next cluster election.
+      </para>
+
+      <para>
+       Nodes will by default have this capability, but it can be stripped off nodes by the setting in the sysconfig file or by using this command.
+      </para>
+      <para>
+       See also "ctdb getcapabilities"
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>reloadnodes</title>
+      <para>
+       This command is used when adding new nodes, or removing existing nodes from an existing cluster.
+      </para>
+      <para>
+       Procedure to add a node:
+      </para>
+      <para>
+       1, To expand an existing cluster, first ensure with 'ctdb status' that
+       all nodes are up and running and that they are all healthy.
+       Do not try to expand a cluster unless it is completely healthy!
+      </para>
+      <para>
+       2, On all nodes, edit /etc/ctdb/nodes and add the new node as the last
+       entry to the file. The new node MUST be added to the end of this file!
+      </para>
+      <para>
+       3, Verify that all the nodes have identical /etc/ctdb/nodes files after you edited them and added the new node!
+      </para>
+      <para>
+       4, Run 'ctdb reloadnodes' to force all nodes to reload the nodesfile.
+      </para>
+      <para>
+       5, Use 'ctdb status' on all nodes and verify that they now show the additional node.
+      </para>
+      <para>
+       6, Install and configure the new node and bring it online.
+      </para>
+      <para>
+       Procedure to remove a node:
+      </para>
+      <para>
+       1, To remove a node from an existing cluster, first ensure with 'ctdb status' that
+       all nodes, except the node to be deleted, are up and running and that they are all healthy.
+       Do not try to remove a node from a cluster unless the cluster is completely healthy!
+      </para>
+      <para>
+       2, Shut down and power off the node to be removed.
+      </para>
+      <para>
+       3, On all other nodes, edit the /etc/ctdb/nodes file and comment out the node to be removed. Do not delete the line for that node, just comment it out by adding a '#' at the beginning of the line.
+      </para>
+      <para>
+       4, Run 'ctdb reloadnodes' to force all nodes to reload the nodesfile.
+      </para>
+      <para>
+       5, Use 'ctdb status' on all nodes and verify that the deleted node no longer shows up in the list.
+      </para>
+      <para>
+      </para>
+
+    </refsect2>
+
+    <refsect2>
+      <title>
+       reloadips
+       <optional><parameter>PNN-LIST</parameter></optional>
+      </title>
+      <para>
+       This command reloads the public addresses configuration file
+       on the specified nodes.  When it completes addresses will be
+       reconfigured and reassigned across the cluster as necessary.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>getdbmap</title>
+      <para>
+       This command lists all clustered TDB databases that the CTDB daemon has attached to. Some databases are flagged as PERSISTENT, this means that the database stores data persistently and the data will remain across reboots. One example of such a database is secrets.tdb where information about how the cluster was joined to the domain is stored.
+      </para>
+      <para>
+       If a PERSISTENT database is not in a healthy state the database is
+       flagged as UNHEALTHY. If there's at least one completely healthy node running in
+       the cluster, it's possible that the content is restored by a recovery
+       run automatically. Otherwise an administrator needs to analyze the
+       problem.
+      </para>
+      <para>
+       See also "ctdb getdbstatus", "ctdb backupdb", "ctdb restoredb",
+       "ctdb dumpdbbackup", "ctdb wipedb", "ctdb setvar AllowUnhealthyDBRead 1"
+       and (if samba or tdb-utils are installed) "tdbtool check".
+      </para>
+      <para>
+       Most databases are not persistent and only store the state information that the currently running samba daemons need. These databases are always wiped when ctdb/samba starts and when a node is rebooted.
+      </para>
+
+      <refsect3>
+       <title>Example</title>
+       <screen format="linespecific">
+# ctdb getdbmap
+Number of databases:10
+dbid:0x435d3410 name:notify.tdb path:/var/ctdb/notify.tdb.0 
+dbid:0x42fe72c5 name:locking.tdb path:/var/ctdb/locking.tdb.0
+dbid:0x1421fb78 name:brlock.tdb path:/var/ctdb/brlock.tdb.0 
+dbid:0x17055d90 name:connections.tdb path:/var/ctdb/connections.tdb.0 
+dbid:0xc0bdde6a name:sessionid.tdb path:/var/ctdb/sessionid.tdb.0 
+dbid:0x122224da name:test.tdb path:/var/ctdb/test.tdb.0 
+dbid:0x2672a57f name:idmap2.tdb path:/var/ctdb/persistent/idmap2.tdb.0 PERSISTENT
+dbid:0xb775fff6 name:secrets.tdb path:/var/ctdb/persistent/secrets.tdb.0 PERSISTENT
+dbid:0xe98e08b6 name:group_mapping.tdb path:/var/ctdb/persistent/group_mapping.tdb.0 PERSISTENT
+dbid:0x7bbbd26c name:passdb.tdb path:/var/ctdb/persistent/passdb.tdb.0 PERSISTENT
+
+# ctdb getdbmap  # example for unhealthy database
+Number of databases:1
+dbid:0xb775fff6 name:secrets.tdb path:/var/ctdb/persistent/secrets.tdb.0 PERSISTENT UNHEALTHY
+
+# ctdb -Y getdbmap
+:ID:Name:Path:Persistent:Unhealthy:
+:0x7bbbd26c:passdb.tdb:/var/ctdb/persistent/passdb.tdb.0:1:0:
+       </screen>
+      </refsect3>
+    </refsect2>
+
+    <refsect2>
+      <title>backupdb <parameter>DBNAME</parameter> <parameter>FILE</parameter></title>
+      <para>
+       This command can be used to copy the entire content of a database out to a file. This file can later be read back into ctdb using the restoredb command.
+       This is mainly useful for backing up persistent databases such as secrets.tdb and similar.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>restoredb <parameter>FILE</parameter> [<parameter>DBNAME</parameter>]</title>
+      <para>
+       This command restores a persistent database that was previously backed up using backupdb.
+       By default the data will be restored back into the same database as
+       it was created from. By specifying dbname you can restore the data
+       into a different database.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>getlog [<parameter>LEVEL</parameter>] [recoverd]</title>
+      <para>
+       In addition to the normal logging to a log file, CTDB also
+       keeps an in-memory ringbuffer containing the most recent log
+       entries for all log levels (except DEBUG).
+      </para>
+      <para>
+       This is useful since it allows for keeping continuous logs to a file
+       at a reasonable non-verbose level, but shortly after an incident has
+       occurred, a much more detailed log can be pulled from memory. This
+       can allow you to avoid having to reproduce an issue due to the
+       on-disk logs being of insufficient detail.
+      </para>
+      <para>
+       This command extracts all messages of the specified or lower
+       log level from memory and prints them to the screen.  If the
+       level is not specified it defaults to NOTICE.
+      </para>
+      <para>
+       By default, logs are extracted from the main CTDB daemon.  If
+       the recoverd option is given then logs are extracted from the
+       recovery daemon.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>clearlog [recoverd]</title>
+      <para>
+       This command clears the in-memory logging ringbuffer.
+      </para>
+      <para>
+       By default, logs are cleared in the main CTDB daemon.  If the
+       recoverd option is given then logs are cleared in the recovery
+       daemon.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>setdbreadonly <parameter>DBNAME</parameter>|<parameter>HASH</parameter></title>
+      <para>
+       This command will enable the read-only record support for a
+       database.  This is an experimental feature to improve
+       performance for contended records primarily in locking.tdb and
+       brlock.tdb.  When enabling this feature you must set it on all
+       nodes in the cluster.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>setdbsticky <parameter>DBNAME</parameter>|<parameter>HASH</parameter></title>
+      <para>
+       This command will enable the sticky record support for the
+       specified database.  This is an experimental feature to
+       improve performance for contended records primarily in
+       locking.tdb and brlock.tdb.  When enabling this feature you
+       must set it on all nodes in the cluster.
+      </para>
+    </refsect2>
+
+  </refsect1>
+
+  <refsect1>
+    <title>Internal commands</title>
+
+    <para>
+      Internal commands are used by CTDB's scripts and are not
+      required for managing a CTDB cluster.  Their parameters and
+      behaviour are subject to change.
+    </para>
+
+    <refsect2>
+      <title>runstate [setup|first_recovery|startup|running]</title>
+      <para>
+       Print the runstate of the specified node.  Runstates are used
+       to serialise important state transitions in CTDB, particularly
+       during startup.
+      </para>
+      <para>
+       If one or more optional runstate arguments are specified then
+       the node must be in one of these runstates for the command to
+       succeed.
+      </para>
+      <refsect3>
+       <title>Example</title>
+       <screen format="linespecific">
+# ctdb runstate
+RUNNING
+       </screen>
+      </refsect3>
+    </refsect2>
+
+    <refsect2>
+      <title>setifacelink <parameter>IFACE</parameter> <parameter>STATUS</parameter></title>
+      <para>
+       This command will set the status of a network interface.
+       The status needs to be "up" or "down". This is typically
+       used in the 10.interfaces script in the "monitor" event.
+      </para>
+      <para>
+       Example: ctdb setifacelink eth0 up
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>killtcp</title>
+      <para>
+       This command reads a list of TCP connections, one per line,
+       from standard input and terminates each connection.  A connection
+       is specified as:
+      </para>
+      <synopsis>
+       <parameter>SRC-IPADDR</parameter>:<parameter>SRC-PORT</parameter> <parameter>DST-IPADDR</parameter>:<parameter>DST-PORT</parameter>
+      </synopsis>
+      <para>
+       A connection is terminated by issuing a TCP RST to the
+       SRC-IPADDR:SRC-PORT endpoint.
+      </para>
+      <para>
+       A single connection can be specified on the command-line
+       rather than on standard input.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>gratiousarp <parameter>IPADDR</parameter> <parameter>INTERFACE</parameter></title>
+      <para>
+       This command will send out a gratuitous ARP for the specified IP
+       address through the specified interface. This command is mainly used by the
+       ctdb eventscripts.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>tickle <parameter>SRC-IPADDR</parameter>:<parameter>SRC-PORT</parameter> <parameter>DST-IPADDR</parameter>:<parameter>DST-PORT</parameter></title>
+      <para>
+       This command will send a TCP tickle to the source host for the
+       specified TCP connection.
+       A TCP tickle is a TCP ACK packet with an invalid sequence and 
+       acknowledge number and will when received by the source host result
+       in it sending an immediate correct ACK back to the other end.
+      </para>
+      <para>
+       TCP tickles are useful to "tickle" clients after an IP failover has
+       occurred since this will make the client immediately recognize the
+       TCP connection has been disrupted and that the client will need
+       to reestablish. This greatly speeds up the time it takes for a client
+       to detect and reestablish after an IP failover in the ctdb cluster.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>gettickles <parameter>IPADDR</parameter></title>
+      <para>
+       This command is used to show which TCP connections are registered with
+       CTDB to be "tickled" if there is a failover.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>version</title>
+      <para>
+       Displays the CTDB version.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>setnatgwstate on|off</title>
+      <para>
+       Enable or disable the NAT gateway master capability on a node.
+      </para>
+    </refsect2>
+
+  </refsect1>
+
+  <refsect1>
+    <title>Debugging Commands</title>
+    <para>
+      These commands are primarily used for CTDB development and testing and
+      should not be used for normal administration.
+    </para>
+
+
+    <refsect2>
+      <title>OPTIONS</title>
+
+      <variablelist>
+       <varlistentry><term>--print-emptyrecords</term>
+       <listitem>
+         <para>
+           This enables printing of empty records when dumping databases
+           with the catdb, cattdb and dumpdbbackup commands. Records with
+           empty data segment are considered deleted by ctdb and cleaned
+           by the vacuuming mechanism, so this switch can come in handy for
+           debugging the vacuuming behaviour.
+         </para>
+       </listitem>
+       </varlistentry>
+
+       <varlistentry><term>--print-datasize</term>
+       <listitem>
+         <para>
+           This lets database dumps (catdb, cattdb, dumpdbbackup) print the
+           size of the record data instead of dumping the data contents.
+         </para>
+       </listitem>
+       </varlistentry>
+
+       <varlistentry><term>--print-lmaster</term>
+       <listitem>
+         <para>
+           This lets catdb print the lmaster for each record.
+         </para>
+       </listitem>
+       </varlistentry>
+
+       <varlistentry><term>--print-hash</term>
+       <listitem>
+         <para>
+           This lets database dumps (catdb, cattdb, dumpdbbackup) print the
+           hash for each record.
+         </para>
+       </listitem>
+       </varlistentry>
+
+       <varlistentry><term>--print-recordflags</term>
+       <listitem>
+         <para>
+           This lets catdb and dumpdbbackup print the
+           record flags for each record. Note that cattdb always
+           prints the flags.
+         </para>
+       </listitem>
+       </varlistentry>
+
+      </variablelist>
+    </refsect2>
+
+    <refsect2>
+      <title>process-exists <parameter>PID</parameter></title>
+      <para>
+       This command checks if a specific process exists on the CTDB host. This is mainly used by Samba to check if remote instances of samba are still running or not.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>getdbstatus <parameter>DBNAME</parameter></title>
+      <para>
+       This command displays more details about a database.
+      </para>
+      <refsect3>
+       <title>Example</title>
+       <screen format="linespecific">
+# ctdb getdbstatus test.tdb.0
+dbid: 0x122224da
+name: test.tdb
+path: /var/ctdb/test.tdb.0
+PERSISTENT: no
+HEALTH: OK
+
+# ctdb getdbstatus registry.tdb  # with a corrupted TDB
+dbid: 0xf2a58948
+name: registry.tdb
+path: /var/ctdb/persistent/registry.tdb.0
+PERSISTENT: yes
+HEALTH: NO-HEALTHY-NODES - ERROR - Backup of corrupted TDB in '/var/ctdb/persistent/registry.tdb.0.corrupted.20091208091949.0Z'
+       </screen>
+      </refsect3>
+    </refsect2>
+
+    <refsect2>
+      <title>catdb <parameter>DBNAME</parameter></title>
+      <para>
+       This command will dump a clustered TDB database to the screen. This is a debugging command.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>cattdb <parameter>DBNAME</parameter></title>
+      <para>
+       This command will dump the content of the local TDB database to the screen. This is a debugging command.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>dumpdbbackup <parameter>BACKUP-FILE</parameter></title>
+      <para>
+       This command will dump the content of a database backup to the screen
+       (similar to ctdb catdb). This is a debugging command.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>wipedb <parameter>DBNAME</parameter></title>
+      <para>
+       This command can be used to remove all content of a database.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>recover</title>
+      <para>
+       This command will trigger the recovery daemon to do a cluster
+       recovery.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>ipreallocate, sync</title>
+      <para>
+       This command will force the recovery master to perform a full ip reallocation process and redistribute all ip addresses. This is useful to "reset" the allocations back to their default state if they have been changed using the "moveip" command. While a "recover" will also perform this reallocation, a recovery is much more heavyweight since it will also rebuild all the databases.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>getmonmode</title>
+      <para>
+       This command returns the monitoring mode of a node. The monitoring mode is either ACTIVE or DISABLED. Normally a node will continuously monitor that all other nodes that are expected are in fact connected and that they respond to commands.
+      </para>
+      <para>
+       ACTIVE - This is the normal mode. The node is actively monitoring all other nodes, both that the transport is connected and also that the node responds to commands. If a node becomes unavailable, it will be marked as DISCONNECTED and a recovery is initiated to restore the cluster.
+      </para>
+      <para>
+       DISABLED - This node is not monitoring that other nodes are available. In this mode a node failure will not be detected and no recovery will be performed. This mode is useful when for debugging purposes one wants to attach GDB to a ctdb process but wants to prevent the rest of the cluster from marking this node as DISCONNECTED and do a recovery.
+      </para>
+    </refsect2>
+
+
+    <refsect2>
+      <title>setmonmode 0|1</title>
+      <para>
+       This command can be used to explicitly disable/enable monitoring mode on a node. The main purpose is if one wants to attach GDB to a running ctdb daemon but wants to prevent the other nodes from marking it as DISCONNECTED and issuing a recovery. To do this, set monitoring mode to 0 on all nodes before attaching with GDB. Remember to set monitoring mode back to 1 afterwards.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>attach <parameter>DBNAME</parameter> [persistent]</title>
+      <para>
+       This is a debugging command. This command will make the CTDB daemon create a new CTDB database and attach to it.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>dumpmemory</title>
+      <para>
+       This is a debugging command. This command will make the ctdb
+       daemon write a full memory allocation map to standard output.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>rddumpmemory</title>
+      <para>
+       This is a debugging command. This command will dump the talloc memory
+       allocation tree for the recovery daemon to standard output.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>thaw</title>
+      <para>
+       Thaw a previously frozen node.
+      </para>
+    </refsect2>
+
+
+    <refsect2>
+      <title>eventscript <parameter>ARGUMENTS</parameter></title>
+      <para>
+       This is a debugging command. This command can be used to manually
+       invoke and run the eventscripts with arbitrary arguments.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>ban <parameter>BANTIME</parameter></title>
+      <para>
+       Administratively ban a node for BANTIME seconds.  The node
+       will be unbanned after BANTIME seconds have elapsed.
+      </para>
+      <para>
+       A banned node does not participate in the cluster.  It does
+       not host any records for the clustered TDB and does not host
+       any public IP addresses.
+      </para>
+      <para>
+       Nodes are automatically banned if they misbehave.  For
+       example, a node may be banned if it causes too many cluster
+       recoveries.
+      </para>
+      <para>
+       To administratively exclude a node from a cluster use the
+       <command>stop</command> command.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>unban</title>
+      <para>
+       This command is used to unban a node that has either been
+       administratively banned using the ban command or has been
+       automatically banned.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>
+       rebalancenode
+       <optional><parameter>PNN-LIST</parameter></optional>
+      </title>
+      <para>
+       This command marks the given nodes as rebalance targets in the
+       LCP2 IP allocation algorithm.  The
+       <command>reloadips</command> command will do this as necessary
+       so this command should not be needed.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>check_srvids <parameter>SRVID</parameter> ...</title>
+      <para>
+       This command checks whether a set of srvid message ports are
+       registered on the node or not. The command takes a list of
+       values to check.
+      </para>
+      <refsect3>
+       <title>Example</title>
+       <screen format="linespecific">
+# ctdb check_srvids 1 2 3 14765
+Server id 0:1 does not exist
+Server id 0:2 does not exist
+Server id 0:3 does not exist
+Server id 0:14765 exists
+       </screen>
+      </refsect3>
+    </refsect2>
+
+    <refsect2>
+      <title>vacuum [<parameter>max-records</parameter>]</title>
+      <para>
+       Over time CTDB databases will fill up with empty deleted
+       records which will lead to a progressive slow down of CTDB
+       database access.  This command is used to prune all databases
+       and delete all empty records from the cluster.
+      </para>
+
+      <para>
+       By default, vacuum will delete all empty records from all databases.
+       If [max_records] is specified, the command will only delete the first
+       [max_records] empty records for each database.
+      </para>
+
+      <para>
+       Vacuum only deletes records where the local node is the
+       lmaster.  To delete all records from the entire cluster you
+       need to run a vacuum from each node.
+
+       This command is not disruptive. Samba is unaffected and will still be able to read/write records normally while the database is being vacuumed.
+      </para>
+
+      <para>
+       Example: ctdb vacuum
+      </para>
+
+      <para>
+       By default, this operation is issued from the 00.ctdb event script every 5 minutes.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>repack [max_freelist]</title>
+      <para>
+       Over time, when records are created and deleted in a TDB, the TDB list of free space will become fragmented. This can lead to a slowdown in accessing TDB records.
+       This command is used to defragment a TDB database and prune the freelist.
+      </para>
+
+      <para>
+       If [max_freelist] is specified, then a database will only be repacked if it has more than this number of entries in the freelist.
+      </para>
+      <para>
+       During repacking of the database, the entire TDB database will be locked to prevent writes. If samba tries to write to a record in the database during a repack operation, samba will block until the repacking has completed.
+      </para>
+
+      <para>
+       This command can be disruptive and can cause samba to block for the duration of the repack operation. In general, a repack operation will take less than one second to complete.
+      </para>
+
+      <para>
+       A repack operation will only defragment the local TDB copy of the CTDB database. You need to run this command on all of the nodes to repack a CTDB database completely.
+      </para>
+
+      <para>
+       Example: ctdb repack 1000
+      </para>
+
+      <para>
+       By default, this operation is issued from the 00.ctdb event script every 5 minutes.
+      </para>
+
+    </refsect2>
+
+  </refsect1>
+
+  <!-- UNDOCUMENTED: showban stats disablemonitor enablemonitor
+       isnotrecmaster addtickle deltickle regsrvid unregsrvid chksrvid
+       getsrvids rebalanceip setdbprio getdbprio msglisten msgsend
+       pfetch pstore pdelete tfetch tstore readkey writekey
+       checktcpport getdbseqnum ipiface
+  -->
+
+
+  <refsect1>
+    <title>SEE ALSO</title>
+    <para>
+      <citerefentry><refentrytitle>ctdbd</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>onnode</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>ctdb</refentrytitle>
+      <manvolnum>7</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>ctdb-tunables</refentrytitle>
+      <manvolnum>7</manvolnum></citerefentry>,
+
+      <ulink url="http://ctdb.samba.org/"/>
+    </para>
+  </refsect1>
+
+  <refentryinfo>
+    <author>
+      <contrib>
+       This documentation was written by
+       Ronnie Sahlberg,
+       Amitay Isaacs,
+       Martin Schwenke
+      </contrib>
+    </author>
+
+    <copyright>
+      <year>2007</year>
+      <holder>Andrew Tridgell</holder>
+      <holder>Ronnie Sahlberg</holder>
+    </copyright>
+    <legalnotice>
+      <para>
+       This program is free software; you can redistribute it and/or
+       modify it under the terms of the GNU General Public License as
+       published by the Free Software Foundation; either version 3 of
+       the License, or (at your option) any later version.
+      </para>
+      <para>
+       This program is distributed in the hope that it will be
+       useful, but WITHOUT ANY WARRANTY; without even the implied
+       warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+       PURPOSE.  See the GNU General Public License for more details.
+      </para>
+      <para>
+       You should have received a copy of the GNU General Public
+       License along with this program; if not, see
+       <ulink url="http://www.gnu.org/licenses"/>.
+      </para>
+    </legalnotice>
+  </refentryinfo>
+
+</refentry>
diff --git a/ctdb/doc/ctdb.7.xml b/ctdb/doc/ctdb.7.xml
new file mode 100644 (file)
index 0000000..989a280
--- /dev/null
@@ -0,0 +1,1001 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<!DOCTYPE refentry
+       PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
+       "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
+<refentry id="ctdb.7">
+
+<refmeta>
+       <refentrytitle>ctdb</refentrytitle>
+       <manvolnum>7</manvolnum>
+       <refmiscinfo class="source">ctdb</refmiscinfo>
+       <refmiscinfo class="manual">CTDB - clustered TDB database</refmiscinfo>
+</refmeta>
+
+
+<refnamediv>
+       <refname>ctdb</refname>
+        <refpurpose>Clustered TDB</refpurpose>
+</refnamediv>
+
+<refsect1>
+  <title>DESCRIPTION</title>
+
+  <para>
+    CTDB is a clustered database component in clustered Samba that
+    provides a high-availability load-sharing CIFS server cluster.
+  </para>
+
+  <para>
+    The main functions of CTDB are:
+  </para>
+
+  <itemizedlist>
+    <listitem>
+      <para>
+       Provide a clustered version of the TDB database with automatic
+       rebuild/recovery of the databases upon node failures.
+      </para>
+    </listitem>
+
+    <listitem>
+      <para>
+      Monitor nodes in the cluster and services running on each node.
+      </para>
+    </listitem>
+
+    <listitem>
+      <para>
+       Manage a pool of public IP addresses that are used to provide
+       services to clients.  Alternatively, CTDB can be used with
+       LVS.
+      </para>
+    </listitem>
+  </itemizedlist>
+
+  <para>
+    Combined with a cluster filesystem CTDB provides a full
+    high-availability (HA) environment for services such as clustered
+    Samba, NFS and other services.
+  </para>
+</refsect1>
+
+<refsect1>
+  <title>ANATOMY OF A CTDB CLUSTER</title>
+
+  <para>
+    A CTDB cluster is a collection of nodes with 2 or more network
+    interfaces.  All nodes provide network (usually file/NAS) services
+    to clients.  Data served by file services is stored on shared
+    storage (usually a cluster filesystem) that is accessible by all
+    nodes.
+  </para>
+  <para>
+    CTDB provides an "all active" cluster, where services are load
+    balanced across all nodes.
+  </para>
+</refsect1>
+
+  <refsect1>
+    <title>Private vs Public addresses</title>
+
+    <para>
+      Each node in a CTDB cluster has multiple IP addresses assigned
+      to it:
+
+      <itemizedlist>
+       <listitem>
+         <para>
+           A single private IP address that is used for communication
+           between nodes.
+         </para>
+       </listitem>
+       <listitem>
+         <para>
+           One or more public IP addresses that are used to provide
+           NAS or other services.
+         </para>
+       </listitem>
+      </itemizedlist>
+    </para>
+
+    <refsect2>
+      <title>Private address</title>
+
+      <para>
+        Each node is configured with a unique, permanently assigned
+        private address.  This address is configured by the operating
+        system.  This address uniquely identifies a physical node in
+        the cluster and is the address that CTDB daemons will use to
+        communicate with the CTDB daemons on other nodes.
+      </para>
+      <para>
+        Private addresses are listed in the file specified by the
+        <varname>CTDB_NODES</varname> configuration variable (see
+        <citerefentry><refentrytitle>ctdbd.conf</refentrytitle>
+        <manvolnum>5</manvolnum></citerefentry>, default
+        <filename>/etc/ctdb/nodes</filename>).  This file contains the
+        list of private addresses for all nodes in the cluster, one
+        per line. This file must be the same on all nodes in the
+        cluster.
+      </para>
+      <para>
+       Private addresses should not be used by clients to connect to
+       services provided by the cluster.
+      </para>
+      <para>
+        It is strongly recommended that the private addresses are
+        configured on a private network that is separate from client
+        networks.
+      </para>
+
+      <para>
+       Example <filename>/etc/ctdb/nodes</filename> for a four node
+       cluster:
+      </para>
+      <screen format="linespecific">
+192.168.1.1
+192.168.1.2
+192.168.1.3
+192.168.1.4
+      </screen>
+    </refsect2>
+
+    <refsect2>
+      <title>Public addresses</title>
+
+      <para>
+       Public addresses are used to provide services to clients.
+       Public addresses are not configured at the operating system
+       level and are not permanently associated with a particular
+       node.  Instead, they are managed by CTDB and are assigned to
+       interfaces on physical nodes at runtime.
+      </para>
+      <para>
+        The CTDB cluster will assign/reassign these public addresses
+        across the available healthy nodes in the cluster. When one
+        node fails, its public addresses will be taken over by one or
+        more other nodes in the cluster.  This ensures that services
+        provided by all public addresses are always available to
+        clients, as long as there are nodes available capable of
+        hosting these addresses.
+      </para>
+      <para>
+       The public address configuration is stored in a file on each
+       node specified by the <varname>CTDB_PUBLIC_ADDRESSES</varname>
+       configuration variable (see
+       <citerefentry><refentrytitle>ctdbd.conf</refentrytitle>
+       <manvolnum>5</manvolnum></citerefentry>, recommended
+       <filename>/etc/ctdb/public_addresses</filename>).  This file
+       contains a list of the public addresses that the node is
+       capable of hosting, one per line.  Each entry also contains
+       the netmask and the interface to which the address should be
+       assigned.
+      </para>
+
+      <para>
+       Example <filename>/etc/ctdb/public_addresses</filename> for a
+       node that can host 4 public addresses, on 2 different
+       interfaces:
+      </para>
+      <screen format="linespecific">
+10.1.1.1/24 eth1
+10.1.1.2/24 eth1
+10.1.2.1/24 eth2
+10.1.2.2/24 eth2
+      </screen>
+
+      <para>
+       In many cases the public addresses file will be the same on
+       all nodes.  However, it is possible to use different public
+       address configurations on different nodes.
+      </para>
+
+      <para>
+       Example: 4 nodes partitioned into two subgroups:
+      </para>
+      <screen format="linespecific">
+Node 0:/etc/ctdb/public_addresses
+       10.1.1.1/24 eth1
+       10.1.1.2/24 eth1
+
+Node 1:/etc/ctdb/public_addresses
+       10.1.1.1/24 eth1
+       10.1.1.2/24 eth1
+
+Node 2:/etc/ctdb/public_addresses
+       10.1.2.1/24 eth2
+       10.1.2.2/24 eth2
+
+Node 3:/etc/ctdb/public_addresses
+       10.1.2.1/24 eth2
+       10.1.2.2/24 eth2
+      </screen>
+      <para>
+       In this example nodes 0 and 1 host two public addresses on the
+       10.1.1.x network while nodes 2 and 3 host two public addresses
+       for the 10.1.2.x network.
+      </para>
+      <para>
+       Public address 10.1.1.1 can be hosted by either of nodes 0 or
+       1 and will be available to clients as long as at least one of
+       these two nodes is available.
+      </para>
+      <para>
+       If both nodes 0 and 1 become unavailable then public address
+       10.1.1.1 also becomes unavailable. 10.1.1.1 can not be failed
+       over to nodes 2 or 3 since these nodes do not have this public
+       address configured.
+      </para>
+      <para>
+        The <command>ctdb ip</command> command can be used to view the
+        current assignment of public addresses to physical nodes.
+      </para>
+    </refsect2>
+  </refsect1>
+
+
+  <refsect1>
+    <title>Node status</title>
+
+    <para>
+      The current status of each node in the cluster can be viewed by the 
+      <command>ctdb status</command> command.
+    </para>
+
+    <para>
+      A node can be in one of the following states:
+    </para>
+
+    <variablelist>
+      <varlistentry>
+       <term>OK</term>
+       <listitem>
+         <para>
+           This node is healthy and fully functional.  It hosts public
+           addresses to provide services.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>DISCONNECTED</term>
+       <listitem>
+         <para>
+           This node is not reachable by other nodes via the private
+           network.  It is not currently participating in the cluster.
+           It <emphasis>does not</emphasis> host public addresses to
+           provide services.  It might be shut down.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>DISABLED</term>
+       <listitem>
+         <para>
+           This node has been administratively disabled. This node is
+           partially functional and participates in the cluster.
+           However, it <emphasis>does not</emphasis> host public
+           addresses to provide services.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>UNHEALTHY</term>
+       <listitem>
+         <para>
+           A service provided by this node has failed a health check
+           and should be investigated.  This node is partially
+           functional and participates in the cluster.  However, it
+           <emphasis>does not</emphasis> host public addresses to
+           provide services.  Unhealthy nodes should be investigated
+           and may require an administrative action to rectify.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>BANNED</term>
+       <listitem>
+         <para>
+           CTDB is not behaving as designed on this node.  For example,
+           it may have failed too many recovery attempts.  Such nodes
+           are banned from participating in the cluster for a
+           configurable time period before they attempt to rejoin the
+           cluster.  A banned node <emphasis>does not</emphasis> host
+           public addresses to provide services.  All banned nodes
+           should be investigated and may require an administrative
+           action to rectify.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>STOPPED</term>
+       <listitem>
+         <para>
+           This node has been administratively excluded from the
+           cluster.  A stopped node does not participate in the cluster
+           and <emphasis>does not</emphasis> host public addresses to
+           provide services.  This state can be used while performing
+           maintenance on a node.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>PARTIALLYONLINE</term>
+       <listitem>
+         <para>
+           A node that is partially online participates in a cluster
+           like a healthy (OK) node.  Some interfaces to serve public
+           addresses are down, but at least one interface is up.  See
+           also <command>ctdb ifaces</command>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>CAPABILITIES</title>
+
+    <para>
+      Cluster nodes can have several different capabilities enabled.
+      These are listed below.
+    </para>
+
+    <variablelist>
+
+      <varlistentry>
+       <term>RECMASTER</term>
+       <listitem>
+         <para>
+           Indicates that a node can become the CTDB cluster recovery
+           master.  The current recovery master is decided via an
+           election held by all active nodes with this capability.
+         </para>
+         <para>
+           Default is YES.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>LMASTER</term>
+       <listitem>
+         <para>
+           Indicates that a node can be the location master (LMASTER)
+           for database records.  The LMASTER always knows which node
+           has the latest copy of a record in a volatile database.
+         </para>
+         <para>
+           Default is YES.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>LVS</term>
+       <listitem>
+         <para>
+           Indicates that a node is configured in Linux Virtual Server
+           (LVS) mode.  In this mode the entire CTDB cluster uses one
+           single public address for the entire cluster instead of
+           using multiple public addresses in failover mode.  This is
+           an alternative to using a load-balancing layer-4 switch.
+           See the <citetitle>LVS</citetitle> section for more
+           details.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>NATGW</term>
+       <listitem>
+         <para>
+           Indicates that this node is configured to become the NAT
+           gateway master in a NAT gateway group.  See the
+           <citetitle>NAT GATEWAY</citetitle> section for more
+           details.
+         </para>
+       </listitem>
+      </varlistentry>
+
+    </variablelist>
+
+    <para>
+      The RECMASTER and LMASTER capabilities can be disabled when CTDB
+      is used to create a cluster spanning across WAN links. In this
+      case CTDB acts as a WAN accelerator.
+    </para>
+
+  </refsect1>
+
+  <refsect1>
+    <title>LVS</title>
+
+    <para>
+      LVS is a mode where CTDB presents one single IP address for the
+      entire cluster. This is an alternative to using public IP
+      addresses and round-robin DNS to loadbalance clients across the
+      cluster.
+    </para>
+
+    <para>
+      This is similar to using a layer-4 loadbalancing switch but with
+      some restrictions.
+    </para>
+
+    <para>
+      In this mode the cluster selects a set of nodes in the cluster
+      and loadbalances all client access to the LVS address across this
+      set of nodes. This set consists of all LVS capable nodes that
+      are HEALTHY, or if no HEALTHY nodes exist all LVS capable nodes
+      regardless of health status.  LVS will however never loadbalance
+      traffic to nodes that are BANNED, STOPPED, DISABLED or
+      DISCONNECTED. The <command>ctdb lvs</command> command is used to
+      show which nodes are currently load-balanced across.
+    </para>
+
+    <para>
+      One of these nodes is elected as the LVSMASTER. This node
+      receives all traffic from clients coming in to the LVS address
+      and multiplexes it across the internal network to one of the
+      nodes that LVS is using.  When responding to the client, that
+      node will send the data back directly to the client, bypassing
+      the LVSMASTER node.  The command <command>ctdb
+      lvsmaster</command> will show which node is the current
+      LVSMASTER.
+    </para>
+
+    <para>
+      The path used for a client I/O is:
+      <orderedlist>
+       <listitem>
+         <para>
+           Client sends request packet to LVSMASTER.
+         </para>
+       </listitem>
+       <listitem>
+         <para>
+           LVSMASTER passes the request on to one node across the
+           internal network.
+         </para>
+       </listitem>
+       <listitem>
+         <para>
+           Selected node processes the request.
+         </para>
+       </listitem>
+       <listitem>
+         <para>
+           Node responds back to client.
+         </para>
+       </listitem>
+      </orderedlist>
+    </para>
+
+    <para> 
+      This means that all incoming traffic to the cluster will pass
+      through one physical node, which limits scalability. You can not
+      send more data to the LVS address than one physical node can
+      multiplex. This means that you should not use LVS if your I/O
+      pattern is write-intensive since you will be limited in the
+      available network bandwidth that node can handle.  LVS does work
+      very well for read-intensive workloads where only smallish READ
+      requests are going through the LVSMASTER bottleneck and the
+      majority of the traffic volume (the data in the read replies)
+      goes straight from the processing node back to the clients. For
+      read-intensive i/o patterns you can achieve very high throughput
+      rates in this mode.
+    </para>
+
+    <para>
+      Note: you can use LVS and public addresses at the same time.
+    </para>
+
+    <para>
+      If you use LVS, you must have a permanent address configured for
+      the public interface on each node. This address must be routable
+      and the cluster nodes must be configured so that all traffic
+      back to client hosts is routed through this interface. This is
+      also required in order to allow samba/winbind on the node to
+      talk to the domain controller.  This LVS IP address can not be
+      used to initiate outgoing traffic.
+    </para>
+    <para>
+      Make sure that the domain controller and the clients are
+      reachable from a node <emphasis>before</emphasis> you enable
+      LVS.  Also ensure that outgoing traffic to these hosts is routed
+      out through the configured public interface.
+    </para>
+
+    <refsect2>
+      <title>Configuration</title>
+
+      <para>
+       To activate LVS on a CTDB node you must specify the
+       <varname>CTDB_PUBLIC_INTERFACE</varname> and
+       <varname>CTDB_LVS_PUBLIC_IP</varname> configuration variables.
+       Setting the latter variable also enables the LVS capability on
+       the node at startup.
+      </para>
+       
+      <para>
+       Example:
+       <screen format="linespecific">
+CTDB_PUBLIC_INTERFACE=eth1
+CTDB_LVS_PUBLIC_IP=10.1.1.237
+       </screen>
+      </para>
+
+    </refsect2>
+  </refsect1>
+    
+  <refsect1>
+    <title>NAT GATEWAY</title>
+
+    <para>
+      NAT gateway (NATGW) is an optional feature that is used to
+      configure fallback routing for nodes.  This allows cluster nodes
+      to connect to external services (e.g. DNS, AD, NIS and LDAP)
+      when they do not host any public addresses (e.g. when they are
+      unhealthy).
+    </para>
+    <para>
+      This also applies to node startup because CTDB marks nodes as
+      UNHEALTHY until they have passed a "monitor" event.  In this
+      context, NAT gateway helps to avoid a "chicken and egg"
+      situation where a node needs to access an external service to
+      become healthy.
+    </para>
+    <para>
+      Another way of solving this type of problem is to assign an
+      extra static IP address to a public interface on every node.
+      This is simpler but it uses an extra IP address per node, while
+      NAT gateway generally uses only one extra IP address.
+    </para>
+
+    <refsect2>
+      <title>Operation</title>
+
+      <para>
+       One extra NATGW public address is assigned on the public
+       network to each NATGW group.  Each NATGW group is a set of
+       nodes in the cluster that shares the same NATGW address to
+       talk to the outside world.  Normally there would only be one
+       NATGW group spanning an entire cluster, but in situations
+       where one CTDB cluster spans multiple physical sites it might
+       be useful to have one NATGW group for each site.
+      </para>
+      <para>
+       There can be multiple NATGW groups in a cluster but each node
+       can only be a member of one NATGW group.
+      </para>
+      <para>
+       In each NATGW group, one of the nodes is selected by CTDB to
+       be the NATGW master and the other nodes are considered to be
+       NATGW slaves.  NATGW slaves establish a fallback default route
+       to the NATGW master via the private network.  When a NATGW
+       slave hosts no public IP addresses then it will use this route
+       for outbound connections.  The NATGW master hosts the NATGW
+       public IP address and routes outgoing connections from
+       slave nodes via this IP address.  It also establishes a
+       fallback default route.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>Configuration</title>
+
+      <para>
+       NATGW is usually configured similar to the following example configuration:
+      </para>
+      <screen format="linespecific">
+CTDB_NATGW_NODES=/etc/ctdb/natgw_nodes
+CTDB_NATGW_PRIVATE_NETWORK=192.168.1.0/24
+CTDB_NATGW_PUBLIC_IP=10.0.0.227/24
+CTDB_NATGW_PUBLIC_IFACE=eth0
+CTDB_NATGW_DEFAULT_GATEWAY=10.0.0.1
+      </screen>
+
+      <para>
+       Normally any node in a NATGW group can act as the NATGW
+       master.  Some configurations may have special nodes that lack
+       connectivity to a public network.  In such cases,
+       <varname>CTDB_NATGW_SLAVE_ONLY</varname> can be used to limit the
+       NATGW functionality of those nodes.
+      </para>
+
+      <para>
+       See the <citetitle>NAT GATEWAY</citetitle> section in
+       <citerefentry><refentrytitle>ctdbd.conf</refentrytitle>
+       <manvolnum>5</manvolnum></citerefentry> for more details of
+       NATGW configuration.
+      </para>
+    </refsect2>
+
+
+    <refsect2>
+      <title>Implementation details</title>
+
+      <para>
+       When the NATGW functionality is used, one of the nodes is
+       selected to act as a NAT gateway for all the other nodes in
+       the group when they need to communicate with the external
+       services.  The NATGW master is selected to be a node that is
+       most likely to have usable networks.
+      </para>
+
+      <para>
+       The NATGW master hosts the NATGW public IP address
+       <varname>CTDB_NATGW_PUBLIC_IP</varname> on the configured public
+       interface <varname>CTDB_NATGW_PUBLIC_IFACE</varname> and acts as
+       a router, masquerading outgoing connections from slave nodes
+       via this IP address.  It also establishes a fallback default
+       route to the configured default gateway
+       <varname>CTDB_NATGW_DEFAULT_GATEWAY</varname> with a metric of 10.
+       A metric 10 route is used so it can co-exist with other
+       default routes that may be available.
+      </para>
+
+      <para>
+       A NATGW slave establishes its fallback default route to the
+       NATGW master via the private network
+       <varname>CTDB_NATGW_PRIVATE_NETWORK</varname> with a metric of 10.
+       This route is used for outbound connections when no other
+       default route is available because the node hosts no public
+       addresses.  A metric 10 route is used so that it can co-exist
+       with other default routes that may be available when the node
+       is hosting public addresses.
+      </para>
+
+      <para>
+       This is implemented in the <filename>11.natgw</filename>
+       eventscript. Please see the eventscript file for the finer
+       details.
+      </para>
+
+    </refsect2>
+  </refsect1>
+
+  <refsect1>
+    <title>POLICY ROUTING</title>
+
+    <para>
+      Policy routing is an optional CTDB feature to support complex
+      network topologies.  Public addresses may be spread across
+      several different networks (or VLANs) and it may not be possible
+      to route packets from these public addresses via the system's
+      default route.  Therefore, CTDB has support for policy routing
+      via the <filename>13.per_ip_routing</filename> eventscript.
+      This allows routing to be specified for packets sourced from
+      each public address.  The routes are added and removed as CTDB
+      moves public addresses between nodes.
+    </para>
+
+    <refsect2>
+      <title>Configuration variables</title>
+
+      <para>
+       There are 4 configuration variables related to policy routing:
+       <varname>CTDB_PER_IP_ROUTING_CONF</varname>,
+       <varname>CTDB_PER_IP_ROUTING_RULE_PREF</varname>,
+       <varname>CTDB_PER_IP_ROUTING_TABLE_ID_LOW</varname>,
+       <varname>CTDB_PER_IP_ROUTING_TABLE_ID_HIGH</varname>.  See the
+       <citetitle>POLICY ROUTING</citetitle> section in
+       <citerefentry><refentrytitle>ctdbd.conf</refentrytitle>
+       <manvolnum>5</manvolnum></citerefentry> for more details.
+      </para>
+    </refsect2>
+
+    <refsect2>
+      <title>Configuration</title>
+
+      <para>
+       The format of each line of
+       <varname>CTDB_PER_IP_ROUTING_CONF</varname> is:
+      </para>
+      
+      <screen>
+&lt;public_address&gt; &lt;network&gt; [ &lt;gateway&gt; ]
+      </screen>
+
+      <para>
+       Leading whitespace is ignored and arbitrary whitespace may be
+       used as a separator.  Lines that have a "public address" item
+       that doesn't match an actual public address are ignored.  This
+       means that comment lines can be added using a leading
+       character such as '#', since this will never match an IP
+       address.
+      </para>
+
+      <para>
+       A line without a gateway indicates a link local route.
+      </para>
+
+      <para>
+       For example, consider the configuration line:
+      </para>
+
+      <screen>
+  192.168.1.99 192.168.1.0/24
+      </screen>
+
+      <para>
+       If the corresponding public_addresses line is:
+      </para>
+
+      <screen>
+  192.168.1.99/24     eth2,eth3
+      </screen>
+
+      <para>
+       <varname>CTDB_PER_IP_ROUTING_RULE_PREF</varname> is 100, and
+       CTDB adds the address to eth2 then the following routing
+       information is added:
+      </para>
+
+      <screen>
+  ip rule add from 192.168.1.99 pref 100 table ctdb.192.168.1.99
+  ip route add 192.168.1.0/24 dev eth2 table ctdb.192.168.1.99
+      </screen>
+
+      <para>  
+       This causes traffic from 192.168.1.99 to 192.168.1.0/24 to go via
+       eth2.
+      </para>
+
+      <para>
+       The <command>ip rule</command> command will show (something
+       like - depending on other public addresses and other routes on
+       the system):
+      </para>
+
+      <screen>
+  0:           from all lookup local 
+  100:         from 192.168.1.99 lookup ctdb.192.168.1.99
+  32766:       from all lookup main 
+  32767:       from all lookup default 
+      </screen>
+
+      <para>
+       <command>ip route show table ctdb.192.168.1.99</command> will show:
+      </para>
+
+      <screen>
+  192.168.1.0/24 dev eth2 scope link
+      </screen>
+
+      <para>
+       The usual use for a line containing a gateway is to add a
+       default route corresponding to a particular source address.
+       Consider this line of configuration:
+      </para>
+
+      <screen>
+  192.168.1.99 0.0.0.0/0       192.168.1.1
+      </screen>
+
+      <para>
+       In the situation described above this will cause an extra
+       routing command to be executed:
+      </para>
+
+      <screen>
+  ip route add 0.0.0.0/0 via 192.168.1.1 dev eth2 table ctdb.192.168.1.99
+      </screen>
+
+      <para>
+       With both configuration lines, <command>ip route show table
+       ctdb.192.168.1.99</command> will show:
+      </para>
+
+      <screen>
+  192.168.1.0/24 dev eth2 scope link 
+  default via 192.168.1.1 dev eth2 
+      </screen>
+    </refsect2>
+
+    <refsect2>
+      <title>Sample configuration</title>
+
+      <para>
+       Here is a more complete example configuration.
+      </para>
+
+      <screen>
+/etc/ctdb/public_addresses:
+
+  192.168.1.98 eth2,eth3
+  192.168.1.99 eth2,eth3
+
+/etc/ctdb/policy_routing:
+
+  192.168.1.98 192.168.1.0/24
+  192.168.1.98 192.168.200.0/24        192.168.1.254
+  192.168.1.98 0.0.0.0/0       192.168.1.1
+  192.168.1.99 192.168.1.0/24
+  192.168.1.99 192.168.200.0/24        192.168.1.254
+  192.168.1.99 0.0.0.0/0       192.168.1.1
+      </screen>
+
+      <para>
+       This routes local packets as expected, the default route is as
+       previously discussed, but packets to 192.168.200.0/24 are
+       routed via the alternate gateway 192.168.1.254.
+      </para>
+
+    </refsect2>
+  </refsect1>
+
+  <refsect1>
+    <title>NOTIFICATION SCRIPT</title>
+
+    <para>
+      When certain state changes occur in CTDB, it can be configured
+      to perform arbitrary actions via a notification script.  For
+      example, sending SNMP traps or emails when a node becomes
+      unhealthy or similar.
+    </para>
+    <para>
+      This is activated by setting the
+      <varname>CTDB_NOTIFY_SCRIPT</varname> configuration variable.
+      The specified script must be executable.  
+    </para>
+    <para>
+      Use of the provided <filename>/etc/ctdb/notify.sh</filename>
+      script is recommended.  It executes files in
+      <filename>/etc/ctdb/notify.d/</filename>.
+    </para>
+    <para>
+      CTDB currently generates notifications after CTDB changes to
+      these states:
+    </para>
+
+    <simplelist>
+      <member>init</member>
+      <member>setup</member>
+      <member>startup</member>
+      <member>healthy</member>
+      <member>unhealthy</member>
+    </simplelist>
+
+  </refsect1>
+
+  <refsect1>
+    <title>DEBUG LEVELS</title>
+
+    <para>
+      Valid values for DEBUGLEVEL are:
+    </para>
+
+    <simplelist>
+      <member>EMERG (-3)</member>
+      <member>ALERT (-2)</member>
+      <member>CRIT (-1)</member>
+      <member>ERR (0)</member>
+      <member>WARNING (1)</member>
+      <member>NOTICE (2)</member>
+      <member>INFO (3)</member>
+      <member>DEBUG (4)</member>
+    </simplelist>
+  </refsect1>
+
+
+  <refsect1>
+    <title>REMOTE CLUSTER NODES</title>
+    <para>
+It is possible to have a CTDB cluster that spans across a WAN link. 
+For example where you have a CTDB cluster in your datacentre but you also
+want to have one additional CTDB node located at a remote branch site.
+This is similar to how a WAN accelerator works but with the difference 
+that while a WAN-accelerator often acts as a Proxy or a MitM, in 
+the ctdb remote cluster node configuration the Samba instance at the remote site
+IS the genuine server, not a proxy and not a MitM, and thus provides 100%
+correct CIFS semantics to clients.
+    </para>
+
+    <para>
+       See the cluster as one single multihomed samba server where one of
+       the NICs (the remote node) is very far away.
+    </para>
+
+    <para>
+       NOTE: This does require that the cluster filesystem you use can cope
+       with WAN-link latencies. Not all cluster filesystems can handle
+       WAN-link latencies! Whether this will provide very good WAN-accelerator
+       performance or it will perform very poorly depends entirely
+       on how optimized your cluster filesystem is in handling high latency
+       for data and metadata operations.
+    </para>
+
+    <para>
+       To activate a node as being a remote cluster node you need to set
+       the following two parameters in /etc/sysconfig/ctdb  for the remote node:
+        <screen format="linespecific">
+CTDB_CAPABILITY_LMASTER=no
+CTDB_CAPABILITY_RECMASTER=no
+       </screen>
+    </para>
+
+    <para>
+       Verify with the command "ctdb getcapabilities" that that node no longer
+       has the recmaster or the lmaster capabilities.
+    </para>
+
+  </refsect1>
+
+
+  <refsect1>
+    <title>SEE ALSO</title>
+
+    <para>
+      <citerefentry><refentrytitle>ctdb</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>ctdbd</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>ctdbd_wrapper</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>ltdbtool</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>onnode</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>ping_pong</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>ctdbd.conf</refentrytitle>
+      <manvolnum>5</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>ctdb-tunables</refentrytitle>
+      <manvolnum>7</manvolnum></citerefentry>,
+
+      <ulink url="http://ctdb.samba.org/"/>
+    </para>
+  </refsect1>
+
+  <refentryinfo>
+    <author>
+      <contrib>
+       This documentation was written by
+       Ronnie Sahlberg,
+       Amitay Isaacs,
+       Martin Schwenke
+      </contrib>
+    </author>
+
+    <copyright>
+      <year>2007</year>
+      <holder>Andrew Tridgell</holder>
+      <holder>Ronnie Sahlberg</holder>
+    </copyright>
+    <legalnotice>
+      <para>
+       This program is free software; you can redistribute it and/or
+       modify it under the terms of the GNU General Public License as
+       published by the Free Software Foundation; either version 3 of
+       the License, or (at your option) any later version.
+      </para>
+      <para>
+       This program is distributed in the hope that it will be
+       useful, but WITHOUT ANY WARRANTY; without even the implied
+       warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+       PURPOSE.  See the GNU General Public License for more details.
+      </para>
+      <para>
+       You should have received a copy of the GNU General Public
+       License along with this program; if not, see
+       <ulink url="http://www.gnu.org/licenses"/>.
+      </para>
+    </legalnotice>
+  </refentryinfo>
+
+</refentry>
diff --git a/ctdb/doc/ctdbd.1.xml b/ctdb/doc/ctdbd.1.xml
new file mode 100644 (file)
index 0000000..402f083
--- /dev/null
@@ -0,0 +1,623 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<!DOCTYPE refentry
+       PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
+       "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
+
+<refentry id="ctdbd.1">
+
+  <refmeta>
+    <refentrytitle>ctdbd</refentrytitle>
+    <manvolnum>1</manvolnum>
+    <refmiscinfo class="source">ctdb</refmiscinfo>
+    <refmiscinfo class="manual">CTDB - clustered TDB database</refmiscinfo>
+  </refmeta>
+
+  <refnamediv>
+    <refname>ctdbd</refname>
+    <refpurpose>The CTDB cluster daemon</refpurpose>
+  </refnamediv>
+
+  <refsynopsisdiv>
+    <cmdsynopsis>
+      <command>ctdbd</command>
+      <arg rep="repeat"><replaceable>OPTION</replaceable></arg>
+    </cmdsynopsis>
+  </refsynopsisdiv>
+
+  <refsect1>
+    <title>DESCRIPTION</title>
+    <para>
+      ctdbd is the main CTDB daemon.
+    </para>
+
+    <para>
+      Note that ctdbd is not usually invoked directly.  It is invoked
+      via <citerefentry><refentrytitle>ctdbd_wrapper</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry> or via the initscript.
+    </para>
+
+    <para>
+      See <citerefentry><refentrytitle>ctdb</refentrytitle>
+      <manvolnum>7</manvolnum></citerefentry> for an overview of CTDB.
+    </para>
+  </refsect1>
+
+  <refsect1>
+    <title>GENERAL OPTIONS</title>
+
+    <variablelist>
+      <varlistentry>
+       <term>-d, --debug=<parameter>DEBUGLEVEL</parameter></term>
+       <listitem>
+         <para>
+           This option sets the debug level to DEBUGLEVEL, which
+           controls what will be written to the logfile. The default is
+           0 which will only log important events and errors. A larger
+           number will provide additional logging.
+         </para>
+         <para>
+           See the <citetitle>DEBUG LEVELS</citetitle> section in
+           <citerefentry><refentrytitle>ctdb</refentrytitle>
+           <manvolnum>7</manvolnum></citerefentry> for more
+           information.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--dbdir=<parameter>DIRECTORY</parameter></term>
+       <listitem>
+         <para>
+           DIRECTORY on local storage where ctdbd keeps a local copy of
+           TDB databases.  This directory is local for each node and
+           should not be stored on the shared cluster filesystem.
+         </para>
+         <para>
+           This directory would usually be <filename>/var/ctdb</filename>
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--dbdir-persistent=<parameter>DIRECTORY</parameter></term>
+       <listitem>
+         <para>
+           DIRECTORY on local storage where ctdbd keeps a local copy of
+           persistent TDB databases.  This directory is local for each
+           node and should not be stored on the shared cluster
+           filesystem.
+         </para>
+         <para>
+           This directory would usually be
+           <filename>/etc/ctdb/persistent</filename>
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--dbdir-state=<parameter>DIRECTORY</parameter></term>
+       <listitem>
+         <para>
+           DIRECTORY on local storage where ctdbd keeps internal state
+           TDB files.  This directory is local for each node and
+           should not be stored on the shared cluster filesystem.
+         </para>
+         <para>
+           This directory would usually be
+           <filename>/var/ctdb/state</filename>
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--event-script-dir=<parameter>DIRECTORY</parameter></term>
+       <listitem>
+         <para>
+           DIRECTORY where the CTDB event scripts are stored.  See the
+           <citetitle>EVENT SCRIPTS</citetitle> section in
+           <citerefentry><refentrytitle>ctdb</refentrytitle>
+           <manvolnum>7</manvolnum></citerefentry> for more information.
+         </para>
+         <para>
+           Default is <envar>CTDB_BASE</envar>/events.d, so usually
+           <filename>/etc/ctdb/events.d</filename>, which is part of
+           the CTDB installation.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--logfile=<parameter>FILENAME</parameter></term>
+       <listitem>
+         <para>
+           FILENAME where ctdbd will write its log. This is usually
+           <filename>/var/log/log.ctdb</filename>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--log-ringbuf-size=<parameter>NUM</parameter></term>
+       <listitem>
+         <para>
+           Set the size of the log ringbuffer to NUM entries.
+         </para>
+         <para>
+           CTDB uses an in-memory ringbuffer containing NUM most
+           recent log entries for all log levels (except DEBUG).  The
+           ringbuffer can be useful for extracting detailed logs even
+           if some entries are not logged to the regular logs.
+         </para>
+         <para>
+           Use the <command>ctdb getlog</command> command to retrieve
+           log entries from the ringbuffer.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--lvs</term>
+       <listitem>
+         <para>
+           This option is used to activate the LVS capability on a CTDB
+           node.  Please see the <citetitle>LVS</citetitle> section in
+           <citerefentry><refentrytitle>ctdb</refentrytitle>
+           <manvolnum>7</manvolnum></citerefentry> for more
+           information.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--max-persistent-check-errors=<parameter>NUM</parameter></term>
+       <listitem>
+         <para>
+           NUM specifies the maximum number of health check failures
+           allowed for persistent databases during startup.
+         </para>
+         <para>
+           The default value is 0.  Setting this to non-zero allows a
+           node with unhealthy persistent databases to start up and
+           join the cluster as long as there is another node with
+           healthy persistent databases.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--nlist=<parameter>FILENAME</parameter></term>
+       <listitem>
+         <para>
+           FILENAME containing a list of the private IP addresses, one
+           per line, for each node in the cluster.  This file
+           <emphasis>must be the same on each node</emphasis> in the
+           cluster.
+         </para>
+         <para>
+           Default is <envar>CTDB_BASE</envar>/nodes, so usually
+           <filename>/etc/ctdb/nodes</filename>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--no-lmaster</term>
+       <listitem>
+         <para>
+           This argument specifies that this node can NOT become an lmaster
+           for records in the database. This means that it will never show up
+           in the vnnmap. This feature is primarily used for making a cluster
+           span across a WAN link and use CTDB as a WAN-accelerator.
+         </para>
+         <para>
+           Please see the <citetitle>REMOTE CLUSTER NODES</citetitle>
+           section in <citerefentry><refentrytitle>ctdb</refentrytitle>
+           <manvolnum>7</manvolnum></citerefentry> for more
+           information.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--no-recmaster</term>
+       <listitem>
+         <para>
+           This argument specifies that this node can NOT become a recmaster
+           for the database. This feature is primarily used for making a cluster
+           span across a WAN link and use CTDB as a WAN-accelerator.
+         </para>
+         <para>
+           Please see the <citetitle>REMOTE CLUSTER NODES</citetitle>
+           section in <citerefentry><refentrytitle>ctdb</refentrytitle>
+           <manvolnum>7</manvolnum></citerefentry> for more
+           information.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--notification-script=<parameter>FILENAME</parameter></term>
+       <listitem>
+         <para>
+           FILENAME specifying a script to be invoked by ctdbd when
+           certain state changes occur.
+         </para>
+         <para>
+           This file is usually
+           <filename>/etc/ctdb/notify.sh</filename>.
+         </para>
+         <para>
+           Please see the <citetitle>NOTIFICATION SCRIPT</citetitle>
+           section in <citerefentry><refentrytitle>ctdb</refentrytitle>
+           <manvolnum>7</manvolnum></citerefentry> for more
+           information.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--pidfile=<parameter>FILENAME</parameter></term>
+       <listitem>
+         <para>
+           FILENAME for file containing process ID of main CTDB
+           daemon.  This file is automatically created and removed by
+           CTDB.
+         </para>
+         <para>
+           The default is to not create a PID file.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--public-addresses=<parameter>FILENAME</parameter></term>
+       <listitem>
+         <para>
+           FILENAME specifying a file containing the public IP
+           addresses to use on the cluster when CTDB should use IP
+           takeover. This file contains a list of IP addresses,
+           netmasks and interfaces.  CTDB will distribute these public
+           IP addresses appropriately across the available nodes.
+         </para>
+         <para>
+           The IP addresses specified in this file can differ across
+           nodes.
+         </para>
+         <para>
+           This is usually the file
+           <filename>/etc/ctdb/public_addresses</filename>
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--public-interface=<parameter>INTERFACE</parameter></term>
+       <listitem>
+         <para>
+           INTERFACE on which to attach public IP addresses or on which
+           to attach the single-public-ip when used.
+         </para>
+         <para>
+           When using public IP addresses, this is only required if
+           interfaces are not explicitly specified in the public
+           addresses file.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--reclock=<parameter>FILENAME</parameter></term>
+       <listitem>
+         <para>
+           FILENAME is the name of the recovery lock file stored in
+           <emphasis>shared storage</emphasis> that ctdbd uses to
+           prevent split brains from occurring.
+         </para>
+         <para>
+           It is possible to run CTDB without a recovery lock file, but
+           then there will be no protection against split brain if the
+           cluster/network becomes partitioned. Using CTDB without a
+           reclock file is strongly discouraged.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--single-public-ip=<parameter>IPADDR</parameter></term>
+       <listitem>
+         <para>
+           IPADDR specifies the single IP that CTDB will use in
+           conjunction with LVS.
+         </para>
+         <para>
+           Please see the <citetitle>LVS</citetitle> section in
+           <citerefentry><refentrytitle>ctdb</refentrytitle>
+           <manvolnum>7</manvolnum></citerefentry> for more
+           information.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--start-as-disabled</term>
+       <listitem>
+         <para>
+           This makes ctdbd start in the DISABLED state.
+         </para>
+         <para>
+           To allow the node to host public IP addresses and
+           services, it must be manually enabled using the
+           <command>ctdb enable</command> command.
+         </para>
+         <para>
+           Please see the <citetitle>NODE STATES</citetitle> section
+           in <citerefentry><refentrytitle>ctdb</refentrytitle>
+           <manvolnum>7</manvolnum></citerefentry> for more
+           information about the DISABLED state.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--start-as-stopped</term>
+       <listitem>
+         <para>
+           This makes ctdbd start in the STOPPED state.
+         </para>
+         <para>
+           To allow the node to take part in the cluster it must be
+           manually continued with the <command>ctdb
+           continue</command> command.
+         </para>
+         <para>
+           Please see the <citetitle>NODE STATES</citetitle> section
+           in <citerefentry><refentrytitle>ctdb</refentrytitle>
+           <manvolnum>7</manvolnum></citerefentry> for more
+           information about the STOPPED state.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--syslog</term>
+       <listitem>
+         <para>
+           Send log messages to syslog instead of the CTDB logfile.
+           This option overrides --logfile.  The default is to log to
+           a file.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--transport=tcp|infiniband</term>
+       <listitem>
+         <para>
+           This option specifies which transport to use for ctdbd
+           internode communications. The default is "tcp".
+         </para>
+         <para>
+           The "infiniband" support is not regularly tested.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>-?, --help</term>
+       <listitem>
+         <para>
+           Display a summary of options.
+         </para>
+       </listitem>
+      </varlistentry>
+
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>DEBUGGING OPTIONS</title>
+
+    <variablelist>
+
+      <varlistentry>
+       <term>-i, --interactive</term>
+       <listitem>
+         <para>
+           Enable interactive mode.  This will make ctdbd run in the
+           foreground and not detach from the terminal.  By default
+           ctdbd will detach itself and run in the background as a
+           daemon.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--listen=<parameter>IPADDR</parameter></term>
+       <listitem>
+         <para>
+           This specifies the IP address that ctdbd will bind to.
+         </para>
+         <para>
+           By default ctdbd will bind to the first address it finds in
+           the <filename>/etc/ctdb/nodes</filename> file that is also
+           present on the local system.
+         </para>
+         <para>
+           This option is only required when you want to run multiple
+           ctdbd daemons/nodes on the same physical host in which case
+           there would be multiple entries in
+           <filename>/etc/ctdb/nodes</filename> that would match a
+           local interface.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--nopublicipcheck</term>
+       <listitem>
+         <para>
+           This option is used when testing with multiple local
+           daemons on a single machine.  It disables checks related
+           to public IP addresses.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--nosetsched</term>
+       <listitem>
+         <para>
+           This is a debugging option. This option is only used when
+           debugging ctdbd.
+         </para>
+         <para>
+           Normally ctdbd will change its scheduler to run as a
+           real-time process. This is the default mode for a normal
+           ctdbd operation to guarantee that ctdbd always gets the CPU
+           cycles that it needs.
+         </para>
+         <para>
+           This option is used to tell ctdbd to
+           <emphasis>not</emphasis> run as a real-time process and
+           instead run ctdbd as a normal userspace process.  This is
+           useful for debugging and when you want to run ctdbd under
+           valgrind or gdb. (You don't want to attach valgrind or gdb
+           to a real-time process.)
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--socket=<parameter>FILENAME</parameter></term>
+       <listitem>
+         <para>
+           FILENAME specifies the name of the Unix domain socket that
+           ctdbd will create. This socket is used by local clients to
+           communicate with ctdbd.
+         </para>
+         <para>
+           The default is <filename>/tmp/ctdb.socket</filename>.  You
+           only need to use this option if you plan to run multiple
+           ctdbd daemons on the same physical host, usually for
+           testing.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--script-log-level=<parameter>DEBUGLEVEL</parameter></term>
+       <listitem>
+         <para>
+           This option sets the debug level of event script output to
+           DEBUGLEVEL.  The default is ERR (0).
+         </para>
+         <para>
+           See the <citetitle>DEBUG LEVELS</citetitle> section in
+           <citerefentry><refentrytitle>ctdb</refentrytitle>
+           <manvolnum>7</manvolnum></citerefentry> for more
+           information.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--sloppy-start</term>
+       <listitem>
+         <para>
+           This is a debugging option.  This speeds up the initial
+           recovery during startup at the expense of some consistency
+           checking.  <emphasis>Don't use this option in
+           production</emphasis>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--torture</term>
+       <listitem>
+         <para>
+           This option is only used for development and testing of
+           CTDB.  It adds artificial errors and failures to the
+           common codepaths in ctdbd to verify that ctdbd can recover
+           correctly from failures.
+         </para>
+         <para>
+           <emphasis>Do not use this option</emphasis> unless you are
+           developing and testing new functionality in CTDB.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>--valgrinding</term>
+       <listitem>
+         <para>
+           This is a debugging option. This option is only used when
+           debugging ctdbd.  This enables additional debugging
+           capabilities and implies --nosetsched.
+         </para>
+       </listitem>
+      </varlistentry>
+
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>SEE ALSO</title>
+    <para>
+      <citerefentry><refentrytitle>ctdb</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>ctdbd_wrapper</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>onnode</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>ctdb</refentrytitle>
+      <manvolnum>7</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>ctdb-tunables</refentrytitle>
+      <manvolnum>7</manvolnum></citerefentry>,
+
+      <ulink url="http://ctdb.samba.org/"/>
+    </para>
+  </refsect1>
+
+  <refentryinfo>
+    <author>
+      <contrib>
+       This documentation was written by
+       Ronnie Sahlberg,
+       Amitay Isaacs,
+       Martin Schwenke
+      </contrib>
+    </author>
+
+    <copyright>
+      <year>2007</year>
+      <holder>Andrew Tridgell</holder>
+      <holder>Ronnie Sahlberg</holder>
+    </copyright>
+    <legalnotice>
+      <para>
+       This program is free software; you can redistribute it and/or
+       modify it under the terms of the GNU General Public License as
+       published by the Free Software Foundation; either version 3 of
+       the License, or (at your option) any later version.
+      </para>
+      <para>
+       This program is distributed in the hope that it will be
+       useful, but WITHOUT ANY WARRANTY; without even the implied
+       warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+       PURPOSE.  See the GNU General Public License for more details.
+      </para>
+      <para>
+       You should have received a copy of the GNU General Public
+       License along with this program; if not, see
+       <ulink url="http://www.gnu.org/licenses"/>.
+      </para>
+    </legalnotice>
+  </refentryinfo>
+
+</refentry>
diff --git a/ctdb/doc/ctdbd.conf.5.xml b/ctdb/doc/ctdbd.conf.5.xml
new file mode 100644 (file)
index 0000000..a1f6db5
--- /dev/null
@@ -0,0 +1,1598 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<!DOCTYPE refentry
+       PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
+       "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
+
+<refentry id="ctdbd.conf.5">
+
+  <refmeta>
+    <refentrytitle>ctdbd.conf</refentrytitle>
+    <manvolnum>5</manvolnum>
+    <refmiscinfo class="source">ctdb</refmiscinfo>
+    <refmiscinfo class="manual">CTDB - clustered TDB database</refmiscinfo>
+  </refmeta>
+
+  <refnamediv>
+    <refname>ctdbd.conf</refname>
+    <refpurpose>CTDB daemon configuration file</refpurpose>
+  </refnamediv>
+
+  <refsect1>
+    <title>DESCRIPTION</title>
+
+    <para>
+      This file contains CTDB configuration variables that affect
+      the operation of CTDB.  The default location of this file is
+      <filename>/etc/ctdb/ctdbd.conf</filename>.
+    </para>
+
+    <para>
+      This file is a shell script (see
+      <citerefentry><refentrytitle>sh</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>) but is usually limited
+      to simple variable assignments and shell-style comments.
+    </para>
+
+    <para>
+      CTDB configuration variables are grouped into several categories below.
+    </para>
+
+    <para>
+      Variables defined in this document can also be set in a
+      distribution-specific configuration file such as
+      <filename>/etc/sysconfig/ctdb</filename> (Red Hat) or
+      <filename>/etc/default/ctdb</filename> (Debian).  However, these
+      files should be reserved for variables used by the initscript.
+      A historical alternative is
+      <filename>/etc/ctdb/sysconfig/ctdb</filename> - this is
+      deprecated.
+    </para>
+
+  </refsect1>
+
+  <refsect1>
+    <title>
+      INITSCRIPT CONFIGURATION
+    </title>
+
+    <para>
+      Some options must be available to the initscript so they need to
+      be set in the distribution-specific initscript configuration,
+      such as <filename>/etc/sysconfig/ctdb</filename> or
+      <filename>/etc/default/ctdb</filename>.
+    </para>
+
+    <variablelist>
+
+      <varlistentry>
+       <term>CTDB_PIDFILE=<parameter>FILENAME</parameter></term>
+       <listitem>
+         <para>
+           FILENAME is the name of the file used to contain the
+           process ID (PID) of the main CTDB daemon when it is
+           running.  This is passed from the initscript to
+           <citerefentry><refentrytitle>ctdbd_wrapper</refentrytitle>
+           <manvolnum>1</manvolnum></citerefentry>.
+         </para>
+
+         <para>
+           Default is <filename>/var/run/ctdb/ctdbd.pid</filename>.
+           Corresponds to <option>--pidfile</option>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>
+      GLOBAL CONFIGURATION
+    </title>
+
+    <para>
+      These options may be used in the initscripts, daemon and
+      scripts.
+    </para>
+
+    <variablelist>
+
+      <varlistentry>
+       <term>CTDB_BASE=<parameter>DIRECTORY</parameter></term>
+       <listitem>
+         <para>
+           DIRECTORY containing CTDB scripts and configuration files.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_VARDIR=<parameter>DIRECTORY</parameter></term>
+       <listitem>
+         <para>
+           DIRECTORY containing CTDB files that are modified at
+           runtime.
+         </para>
+         <para>
+           Defaults to <filename>/var/ctdb</filename>, unless
+           <filename>/var/lib/ctdb</filename> already exists in which
+           case it is used.
+         </para>
+       </listitem>
+      </varlistentry>
+
+
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>
+      DAEMON CONFIGURATION
+    </title>
+
+    <para>
+      Variables in this section are processed by
+      <citerefentry><refentrytitle>ctdbd_wrapper</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry> and are converted into
+      command-line arguments to
+      <citerefentry><refentrytitle>ctdbd</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>.  Correspondence with
+      <citerefentry><refentrytitle>ctdbd</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry> options is shown for
+      each variable.  See the documentation for the relevant options
+      for more details.
+    </para>
+
+    <para>
+      Many of these variables are also used by event scripts.
+    </para>
+
+    <variablelist>
+
+      <varlistentry>
+       <term>CTDB_CAPABILITY_LMASTER=yes|no</term>
+       <listitem>
+         <para>
+           Defaults to yes.  Corresponds to <option>--no-lmaster</option>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_CAPABILITY_RECMASTER=yes|no</term>
+       <listitem>
+         <para>
+           Defaults to yes.  Corresponds to
+           <option>--no-recmaster</option>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_DBDIR=<parameter>DIRECTORY</parameter></term>
+       <listitem>
+         <para>
+           Defaults to <varname>CTDB_VARDIR</varname>.  Corresponds to
+           <option>--dbdir</option>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_DBDIR_PERSISTENT=<parameter>DIRECTORY</parameter></term>
+       <listitem>
+         <para>
+           Defaults to <varname>CTDB_VARDIR</varname>/persistent.
+           Corresponds to <option>--dbdir-persistent</option>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_DBDIR_STATE=<parameter>DIRECTORY</parameter></term>
+       <listitem>
+         <para>
+           Defaults to <varname>CTDB_VARDIR</varname>/state.
+           Corresponds to <option>--dbdir-state</option>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_DEBUGLEVEL=<parameter>DEBUGLEVEL</parameter></term>
+       <listitem>
+         <para>
+           Default is ERR (0).  Corresponds to <option>-d</option> or
+           <option>--debug</option>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_EVENT_SCRIPT_DIR=<parameter>DIRECTORY</parameter></term>
+       <listitem>
+         <para>
+           Default is <varname>CTDB_BASE</varname>/events.d, so usually
+           <filename>/etc/ctdb/events.d</filename>.  Corresponds to
+           <option>--event-script-dir</option>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_LOGFILE=<parameter>FILENAME</parameter></term>
+       <listitem>
+         <para>
+           Defaults to <filename>/var/log/log.ctdb</filename>.
+           Corresponds to <option>--logfile</option>.  See also
+           <citetitle>CTDB_SYSLOG</citetitle>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_LOG_RINGBUF_SIZE=<parameter>NUM</parameter></term>
+       <listitem>
+         <para>
+           Default is 0.  Corresponds to
+           <option>--log-ringbuf-size</option>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_LVS_PUBLIC_IP=<parameter>IPADDR</parameter></term>
+       <listitem>
+         <para>
+           No default.  Corresponds to "<option>--lvs</option>
+           <option>--single-public-ip IPADDR</option>".
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_NODES=<parameter>FILENAME</parameter></term>
+       <listitem>
+         <para>
+           Default is <varname>CTDB_BASE</varname>/nodes, so usually
+           <filename>/etc/ctdb/nodes</filename>.  Corresponds to
+           <option>--nlist</option>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_NOTIFY_SCRIPT=<parameter>FILENAME</parameter></term>
+       <listitem>
+         <para>
+           No default, usually
+           <filename>/etc/ctdb/notify.sh</filename>.  Corresponds to
+           <option>--notification-script</option>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_MAX_PERSISTENT_CHECK_ERRORS=<parameter>NUM</parameter></term>
+       <listitem>
+         <para>
+           Default 0.  Corresponds to
+           <option>--max-persistent-check-errors</option>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_PUBLIC_ADDRESSES=<parameter>FILENAME</parameter></term>
+       <listitem>
+         <para>
+           No default, usually
+           <filename>/etc/ctdb/public_addresses</filename>.
+           Corresponds to <option>--public-addresses</option>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_PUBLIC_INTERFACE=<parameter>INTERFACE</parameter></term>
+       <listitem>
+         <para>
+           No default.  Corresponds to
+           <option>--public-interface</option>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_RECOVERY_LOCK=<parameter>FILENAME</parameter></term>
+       <listitem>
+         <para>
+           Defaults to
+           <filename>/some/place/on/shared/storage</filename>, which
+           should be changed to a useful value.  Corresponds to
+           <option>--reclock</option>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_SCRIPT_LOG_LEVEL=<parameter>DEBUGLEVEL</parameter></term>
+       <listitem>
+         <para>
+           Defaults to ERR (0).  Corresponds to
+           <option>--script-log-level</option>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_SOCKET=<parameter>FILENAME</parameter></term>
+       <listitem>
+         <para>
+           Defaults to <filename>/tmp/ctdb.socket</filename>.
+           Corresponds to <option>--socket</option>.
+         </para>
+         <para>
+           If you change this then you probably want to set this in
+           root's environment (perhaps in a file in
+           <filename>/etc/profile.d</filename>) so that you can use
+           the <citerefentry><refentrytitle>ctdb</refentrytitle>
+           <manvolnum>1</manvolnum></citerefentry> command in a
+           straightforward manner.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_START_AS_DISABLED=yes|no</term>
+       <listitem>
+         <para>
+           Default is no.  Corresponds to
+           <option>--start-as-disabled</option>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_START_AS_STOPPED=yes|no</term>
+       <listitem>
+         <para>
+           Default is no.  Corresponds to
+           <option>--start-as-stopped</option>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_SYSLOG=yes|no</term>
+       <listitem>
+         <para>
+           Default is no.  Corresponds to <option>--syslog</option>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_TRANSPORT=tcp|infiniband</term>
+       <listitem>
+         <para>
+           Defaults to tcp.  Corresponds to
+           <option>--transport</option>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+    </variablelist>
+
+    <para>
+      While the following variables do not translate into daemon
+      options they are used by
+      <citerefentry><refentrytitle>ctdbd_wrapper</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry> when starting and
+      stopping <citerefentry><refentrytitle>ctdbd</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>.
+    </para>
+
+    <variablelist>
+
+      <varlistentry>
+       <term>CTDB_SHUTDOWN_TIMEOUT=<parameter>NUM</parameter></term>
+       <listitem>
+         <para>
+           NUM is the number of seconds to wait for
+           <citerefentry><refentrytitle>ctdbd</refentrytitle>
+           <manvolnum>1</manvolnum></citerefentry> to shut down
+           gracefully before giving up and killing it.
+         </para>
+
+         <para>
+           Default is 30.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_STARTUP_TIMEOUT=<parameter>NUM</parameter></term>
+       <listitem>
+         <para>
+           NUM is the number of seconds to wait for
+           <citerefentry><refentrytitle>ctdbd</refentrytitle>
+           <manvolnum>1</manvolnum></citerefentry> to complete early
+           initialisation up to a point where it is unlikely to
+           abort.  If <command>ctdbd</command> doesn't complete the
+           "setup" event before this timeout then it is killed.
+         </para>
+
+         <para>
+           Default is 10.
+         </para>
+       </listitem>
+      </varlistentry>
+
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>NETWORK CONFIGURATION</title>
+
+    <refsect2>
+      <title>NAT GATEWAY</title>
+
+      <para>
+       NAT gateway is used to configure fallback routing for nodes
+       when they do not host any public IP addresses.  For example,
+       it allows unhealthy nodes to reliably communicate with
+       external infrastructure.  One node in a NAT gateway group will
+       be designated as the NAT gateway master node and other (slave)
+       nodes will be configured with fallback routes via the NAT
+       gateway master node.  For more information, see the
+       <citetitle>NAT GATEWAY</citetitle> section in
+       <citerefentry><refentrytitle>ctdb</refentrytitle>
+       <manvolnum>7</manvolnum></citerefentry>.
+      </para>
+
+      <variablelist>
+
+       <varlistentry>
+         <term>CTDB_NATGW_DEFAULT_GATEWAY=<parameter>IPADDR</parameter></term>
+         <listitem>
+           <para>
+             IPADDR is an alternate network gateway to use on the NAT
+             gateway master node.  A fallback default route is added
+             via this network gateway.
+           </para>
+           <para>
+             No default.
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_NATGW_NODES=<parameter>FILENAME</parameter></term>
+         <listitem>
+           <para>
+             FILENAME contains the list of nodes that belong to the
+             same NAT gateway group.
+           </para>
+           <para>
+             File format:
+             <screen>
+<parameter>IPADDR</parameter>
+             </screen>
+           </para>
+           <para>
+             No default, usually
+             <filename>/etc/ctdb/natgw_nodes</filename> when enabled.
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_NATGW_PRIVATE_NETWORK=<parameter>IPADDR/MASK</parameter></term>
+         <listitem>
+           <para>
+             IPADDR/MASK is the private sub-network that is
+             internally routed via the NAT gateway master node.  This
+             is usually the private network that is used for node
+             addresses.
+           </para>
+           <para>
+             No default.
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_NATGW_PUBLIC_IFACE=<parameter>IFACE</parameter></term>
+         <listitem>
+           <para>
+             IFACE is the network interface on which the
+             CTDB_NATGW_PUBLIC_IP will be configured.
+           </para>
+           <para>
+             No default.
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_NATGW_PUBLIC_IP=<parameter>IPADDR/MASK</parameter></term>
+         <listitem>
+           <para>
+             IPADDR/MASK indicates the IP address that is used for
+             outgoing traffic (originating from
+             CTDB_NATGW_PRIVATE_NETWORK) on the NAT gateway master
+             node.  This <emphasis>must not</emphasis> be a
+             configured public IP address.
+           </para>
+           <para>
+             No default.
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_NATGW_SLAVE_ONLY=yes|no</term>
+         <listitem>
+           <para>
+             When set to "yes" a node can not be a NAT gateway master node.
+           </para>
+           <para>
+             Default is no.
+           </para>
+         </listitem>
+       </varlistentry>
+
+      </variablelist>
+
+      <refsect3>
+       <title>Example</title>
+       <screen>
+CTDB_NATGW_NODES=/etc/ctdb/natgw_nodes
+CTDB_NATGW_PRIVATE_NETWORK=192.168.1.0/24
+CTDB_NATGW_DEFAULT_GATEWAY=10.0.0.1
+CTDB_NATGW_PUBLIC_IP=10.0.0.227/24
+CTDB_NATGW_PUBLIC_IFACE=eth0
+       </screen>
+      </refsect3>
+
+    </refsect2>
+
+    <refsect2>
+      <title>POLICY ROUTING</title>
+
+      <para>
+       A node running CTDB may be a component of a complex network
+       topology.  In particular, public addresses may be spread
+       across several different networks (or VLANs) and it may not be
+       possible to route packets from these public addresses via the
+       system's default route.  Therefore, CTDB has support for
+       policy routing via the <filename>13.per_ip_routing</filename>
+       eventscript.  This allows routing to be specified for packets
+       sourced from each public address.  The routes are added and
+       removed as CTDB moves public addresses between nodes.
+      </para>
+
+      <para>
+       For more information, see the <citetitle>POLICY
+       ROUTING</citetitle> section in
+       <citerefentry><refentrytitle>ctdb</refentrytitle>
+       <manvolnum>7</manvolnum></citerefentry>.
+      </para>
+
+      <variablelist>
+       <varlistentry>
+         <term>CTDB_PER_IP_ROUTING_CONF=<parameter>FILENAME</parameter></term>
+         <listitem>
+           <para>
+             FILENAME contains elements for constructing the desired
+             routes for each source address.
+           </para>
+
+           <para>
+             The special FILENAME value
+             <constant>__auto_link_local__</constant> indicates that no
+             configuration file is provided and that CTDB should
+             generate reasonable link-local routes for each public IP
+             address.
+           </para>
+
+           <para>
+             File format:
+             <screen>
+<parameter>IPADDR</parameter> <parameter>DEST-IPADDR/MASK</parameter> <optional><parameter>GATEWAY-IPADDR</parameter></optional>
+             </screen>
+           </para>
+
+           <para>
+             No default, usually
+             <filename>/etc/ctdb/policy_routing</filename> when enabled.
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_PER_IP_ROUTING_RULE_PREF=<parameter>NUM</parameter></term>
+       <listitem>
+         <para>
+           NUM sets the priority (or preference) for the routing
+           rules that are added by CTDB.
+         </para>
+
+         <para>
+           This should be (strictly) greater than 0 and (strictly)
+           less than 32766.  A priority of 100 is recommended, unless
+           this conflicts with a priority already in use on the
+           system.  See
+           <citerefentry><refentrytitle>ip</refentrytitle>
+           <manvolnum>8</manvolnum></citerefentry>, for more details.
+         </para>
+       </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>
+           CTDB_PER_IP_ROUTING_TABLE_ID_LOW=<parameter>LOW-NUM</parameter>,
+           CTDB_PER_IP_ROUTING_TABLE_ID_HIGH=<parameter>HIGH-NUM</parameter>
+         </term>
+         <listitem>
+           <para>
+             CTDB determines a unique routing table number to use for
+             the routing related to each public address.  LOW-NUM and
+             HIGH-NUM indicate the minimum and maximum routing table
+             numbers that are used.
+           </para>
+
+           <para>
+             <citerefentry><refentrytitle>ip</refentrytitle>
+             <manvolnum>8</manvolnum></citerefentry> uses some
+             reserved routing table numbers below 255.  Therefore,
+             CTDB_PER_IP_ROUTING_TABLE_ID_LOW should be (strictly)
+             greater than 255.
+           </para>
+
+           <para>
+             CTDB uses the standard file
+             <filename>/etc/iproute2/rt_tables</filename> to maintain
+             a mapping between the routing table numbers and labels.
+             The label for a public address
+             <replaceable>ADDR</replaceable> will look like
+             ctdb.<replaceable>addr</replaceable>.  This means that
+             the associated rules and routes are easy to read (and
+             manipulate).
+           </para>
+
+           <para>
+             No default, usually 1000 and 9000.
+           </para>
+         </listitem>
+       </varlistentry>
+      </variablelist>
+
+      <refsect3>
+       <title>Example</title>
+       <screen>
+CTDB_PER_IP_ROUTING_CONF=/etc/ctdb/policy_routing
+CTDB_PER_IP_ROUTING_RULE_PREF=100
+CTDB_PER_IP_ROUTING_TABLE_ID_LOW=1000
+CTDB_PER_IP_ROUTING_TABLE_ID_HIGH=9000
+       </screen>
+      </refsect3>
+
+    </refsect2>
+
+    <refsect2>
+      <title>MISCELLANEOUS NETWORK CONFIGURATION</title>
+
+      <variablelist>
+
+       <varlistentry>
+         <term>CTDB_PARTIALLY_ONLINE_INTERFACES=yes|no</term>
+         <listitem>
+           <para>
+             Whether one or more offline interfaces should cause a
+             monitor event to fail if there are other interfaces that
+             are up.  If this is "yes" and a node has some interfaces
+             that are down then <command>ctdb status</command> will
+             display the node as "PARTIALLYONLINE".
+           </para>
+
+           <para>
+             Default is "no".
+           </para>
+         </listitem>
+       </varlistentry>
+
+      </variablelist>
+    </refsect2>
+
+  </refsect1>
+
+  <refsect1>
+    <title>SERVICE CONFIGURATION</title>
+
+    <para>
+      CTDB can be configured to manage and/or monitor various NAS (and
+      other) services via its eventscripts.
+    </para>
+
+    <para>
+      In the simplest case CTDB will manage a service.  This means the
+      service will be started and stopped along with CTDB, CTDB will
+      monitor the service and CTDB will do any required
+      reconfiguration of the service when public IP addresses are
+      failed over.
+    </para>
+
+    <refsect2>
+      <title>SAMBA</title>
+
+      <refsect3>
+       <title>Eventscripts</title>
+
+       <simplelist>
+         <member><filename>49.winbind</filename></member>
+         <member><filename>50.samba</filename></member>
+       </simplelist>
+      </refsect3>
+
+      <variablelist>
+
+       <varlistentry>
+         <term>CTDB_MANAGES_SAMBA=yes|no</term>
+         <listitem>
+           <para>
+             Should CTDB manage Samba?
+           </para>
+           <para>
+             Default is no.
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_MANAGES_WINBIND=yes|no</term>
+         <listitem>
+           <para>
+             Should CTDB manage Winbind?
+           </para>
+           <para>
+             Default is no.
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_SAMBA_CHECK_PORTS=<parameter>PORT-LIST</parameter></term>
+         <listitem>
+           <para>
+             When monitoring Samba, check TCP ports in
+             space-separated PORT-LIST.
+           </para>
+           <para>
+             Default is to monitor ports that Samba is configured to listen on.
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_SAMBA_SKIP_SHARE_CHECK=yes|no</term>
+         <listitem>
+           <para>
+             As part of monitoring, should CTDB skip the check for
+             the existence of each directory configured as a share in
+             Samba.  This may be desirable if there is a large number
+             of shares.
+           </para>
+           <para>
+             Default is no.
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_SERVICE_NMB=<parameter>SERVICE</parameter></term>
+         <listitem>
+           <para>
+             Distribution specific SERVICE for managing nmbd.
+           </para>
+           <para>
+             Default is distribution-dependent.
+           </para>
+         </listitem>
+       </varlistentry>
+       <varlistentry>
+         <term>CTDB_SERVICE_SMB=<parameter>SERVICE</parameter></term>
+         <listitem>
+           <para>
+             Distribution specific SERVICE for managing smbd.
+           </para>
+           <para>
+             Default is distribution-dependent.
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_SERVICE_WINBIND=<parameter>SERVICE</parameter></term>
+         <listitem>
+           <para>
+             Distribution specific SERVICE for managing winbindd.
+           </para>
+           <para>
+             Default is "winbind".
+           </para>
+         </listitem>
+       </varlistentry>
+
+      </variablelist>
+
+    </refsect2>
+
+    <refsect2>
+      <title>NFS</title>
+
+      <para>
+       This includes parameters for the kernel NFS server and the
+       user-space
+       <ulink url="https://github.com/nfs-ganesha/nfs-ganesha/wiki">NFS-Ganesha</ulink>
+       server.
+      </para>
+
+      <refsect3>
+       <title>Eventscripts</title>
+
+       <simplelist>
+         <member><filename>60.nfs</filename></member>
+         <member><filename>60.ganesha</filename></member>
+       </simplelist>
+      </refsect3>
+
+      <variablelist>
+
+       <varlistentry>
+         <term>CTDB_CLUSTER_FILESYSTEM_TYPE=gpfs</term>
+         <listitem>
+           <para>
+             The type of cluster filesystem to use with NFS-Ganesha.
+             Currently only "gpfs" is supported.
+           </para>
+           <para>
+             Default is "gpfs".
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_MANAGES_NFS=yes|no</term>
+         <listitem>
+           <para>
+             Should CTDB manage NFS?
+           </para>
+           <para>
+             Default is no.
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_MONITOR_NFS_THREAD_COUNT=yes|no</term>
+         <listitem>
+           <para>
+             Whether to monitor the NFS kernel server thread count.
+           </para>
+           <para>
+             This works around a limitation in some NFS initscripts
+             where some threads can be stuck in host filesystem calls
+             (perhaps due to slow storage), a restart occurs, some
+             threads don't exit, the start only adds the missing
+             number of threads, the stuck threads exit, and the
+             result is a lower than expected thread count.  Note that
+             you must also set <varname>RPCNFSDCOUNT</varname>
+             (RedHat/Debian) or <varname>USE_KERNEL_NFSD_NUMBER</varname>
+             (SUSE) in your NFS configuration so the monitoring code
+             knows how many threads there should be - if neither of
+             these are set then this option will be ignored.
+           </para>
+           <para>
+             Default is no.
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_NFS_DUMP_STUCK_THREADS=<parameter>NUM</parameter></term>
+         <listitem>
+           <para>
+             NUM is the number of NFS kernel server threads to dump
+             stack traces for if some are still alive after stopping
+             NFS during a restart.
+           </para>
+           <para>
+             Default is 0.
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_NFS_SERVER_MODE=kernel|ganesha</term>
+         <listitem>
+           <para>
+             Selects which NFS server is to be managed.
+           </para>
+           <para>
+             This replaces the deprecated variable
+             <varname>NFS_SERVER_MODE</varname>.
+           </para>
+           <para>
+             Default is "kernel".
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_NFS_SKIP_KNFSD_ALIVE_CHECK=yes|no</term>
+         <listitem>
+           <para>
+             During monitoring, should CTDB skip the
+             <command>rpcinfo</command> check that is used to see if
+             the NFS kernel server is functional.
+           </para>
+           <para>
+             Default is no.
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_NFS_SKIP_SHARE_CHECK=yes|no</term>
+         <listitem>
+           <para>
+             As part of monitoring, should CTDB skip the check for
+             the existence of each directory exported via NFS.  This
+             may be desirable if there is a large number of exports.
+           </para>
+           <para>
+             Default is no.
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_RPCINFO_LOCALHOST=<parameter>IPADDR</parameter>|<parameter>HOSTNAME</parameter></term>
+         <listitem>
+           <para>
+             IPADDR or HOSTNAME indicates the address that
+             <command>rpcinfo</command> should connect to when doing an
+             <command>rpcinfo</command> check on an RPC service during
+             monitoring.  Optimally this would be "localhost".
+             However, this can add some performance overheads.
+           </para>
+           <para>
+             Default is "127.0.0.1".
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_SKIP_GANESHA_NFSD_CHECK=yes|no</term>
+         <listitem>
+           <para>
+             As part of monitoring, should CTDB skip the check for
+             the existence of each directory exported via
+             NFS-Ganesha.  This may be desirable if there is a large
+             number of exports.
+           </para>
+           <para>
+             Default is no.
+           </para>
+         </listitem>
+       </varlistentry>
+
+      </variablelist>
+
+    </refsect2>
+
+    <refsect2>
+      <title>APACHE HTTPD</title>
+
+      <para>
+       CTDB can manage the Apache web server.
+      </para>
+
+      <refsect3>
+       <title>Eventscript</title>
+
+       <simplelist>
+         <member><filename>41.httpd</filename></member>
+       </simplelist>
+      </refsect3>
+
+      <variablelist>
+       <varlistentry>
+         <term>CTDB_MANAGES_HTTPD=yes|no</term>
+         <listitem>
+           <para>
+             Should CTDB manage the Apache web server?
+           </para>
+           <para>
+             Default is no.
+           </para>
+         </listitem>
+       </varlistentry>
+      </variablelist>
+    </refsect2>
+
+    <refsect2>
+      <title>CLAMAV</title>
+
+      <para>
+       CTDB has support to manage the popular anti-virus daemon
+       ClamAV.
+      </para>
+
+      <refsect3>
+       <title>Eventscript</title>
+
+       <simplelist>
+         <member><filename>31.clamd</filename></member>
+       </simplelist>
+
+       <para>
+         This eventscript is not enabled by default.  Use
+         <command>ctdb enablescript</command> to enable it.
+       </para>
+
+      </refsect3>
+
+      <variablelist>
+
+       <varlistentry>
+         <term>CTDB_MANAGES_CLAMD=yes|no</term>
+         <listitem>
+           <para>
+             Should CTDB manage ClamAV?
+           </para>
+           <para>
+             Default is no.
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_CLAMD_SOCKET=<parameter>FILENAME</parameter></term>
+         <listitem>
+           <para>
+             FILENAME is the socket to monitor ClamAV.
+           </para>
+           <para>
+             No default.
+           </para>
+         </listitem>
+       </varlistentry>
+
+      </variablelist>
+
+    </refsect2>
+
+    <refsect2>
+      <title>ISCSI</title>
+
+      <para>
+       CTDB has support for managing the Linux iSCSI tgtd service.
+      </para>
+
+      <refsect3>
+       <title>Eventscript</title>
+
+       <simplelist>
+         <member><filename>70.iscsi</filename></member>
+       </simplelist>
+      </refsect3>
+
+      <variablelist>
+
+       <varlistentry>
+         <term>CTDB_MANAGES_ISCSI=yes|no</term>
+         <listitem>
+           <para>
+             Should CTDB manage iSCSI tgtd?
+           </para>
+           <para>
+             Default is no.
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_START_ISCSI_SCRIPTS=<parameter>DIRECTORY</parameter></term>
+         <listitem>
+           <para>
+             DIRECTORY on shared storage containing scripts to start
+             tgtd for each public IP address.
+           </para>
+           <para>
+             No default.
+           </para>
+         </listitem>
+       </varlistentry>
+      </variablelist>
+    </refsect2>
+
+    <refsect2>
+      <title>MULTIPATHD</title>
+
+      <para>
+       CTDB can monitor multipath devices to ensure that active paths
+       are available.
+      </para>
+
+      <refsect3>
+       <title>Eventscript</title>
+
+       <simplelist>
+         <member><filename>20.multipathd</filename></member>
+       </simplelist>
+
+       <para>
+         This eventscript is not enabled by default.  Use
+         <command>ctdb enablescript</command> to enable it.
+       </para>
+      </refsect3>
+
+      <variablelist>
+       <varlistentry>
+         <term>CTDB_MONITOR_MPDEVICES=<parameter>MP-DEVICE-LIST</parameter></term>
+         <listitem>
+           <para>
+             MP-DEVICE-LIST is a list of multipath devices for CTDB to monitor.
+           </para>
+           <para>
+             No default.
+           </para>
+         </listitem>
+       </varlistentry>
+      </variablelist>
+    </refsect2>
+
+    <refsect2>
+      <title>VSFTPD</title>
+
+      <para>
+       CTDB can manage the vsftpd FTP server.
+      </para>
+
+      <refsect3>
+       <title>Eventscript</title>
+
+       <simplelist>
+         <member><filename>40.vsftpd</filename></member>
+       </simplelist>
+      </refsect3>
+
+      <variablelist>
+       <varlistentry>
+         <term>CTDB_MANAGES_VSFTPD=yes|no</term>
+         <listitem>
+           <para>
+             Should CTDB manage the vsftpd FTP server?
+           </para>
+           <para>
+             Default is no.
+           </para>
+         </listitem>
+       </varlistentry>
+      </variablelist>
+    </refsect2>
+
+    <refsect2>
+      <title>
+       SYSTEM RESOURCE MONITORING CONFIGURATION
+      </title>
+
+      <para>
+       CTDB can experience seemingly random (performance and other)
+       issues if system resources become too constrained.  Options in
+       this section can be enabled to allow certain system resources to
+       be checked.
+      </para>
+
+      <refsect3>
+       <title>Eventscripts</title>
+
+       <simplelist>
+         <member><filename>00.ctdb</filename></member>
+         <member><filename>40.fs_use</filename></member>
+       </simplelist>
+
+       <para>
+         Filesystem usage monitoring is in
+         <filename>40.fs_use</filename>.  This eventscript is not
+         enabled by default.  Use <command>ctdb
+         enablescript</command> to enable it.
+       </para>
+      </refsect3>
+
+      <variablelist>
+
+       <varlistentry>
+         <term>CTDB_CHECK_FS_USE=<parameter>FS-LIMIT-LIST</parameter></term>
+         <listitem>
+           <para>
+             FS-LIMIT-LIST is a space-separated list of
+             <parameter>FILESYSTEM</parameter>:<parameter>LIMIT</parameter>
+             pairs indicating that a node should be flagged unhealthy
+             if the space used on FILESYSTEM reaches LIMIT%.
+           </para>
+
+           <para>
+             No default.
+           </para>
+
+           <para>
+             Note that this feature uses the
+             <filename>40.fs_use</filename> eventscript, which is not
+             enabled by default.  Use <command>ctdb
+             enablescript</command> to enable it.
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_CHECK_SWAP_IS_NOT_USED=yes|no</term>
+         <listitem>
+           <para>
+             Should a warning be logged if swap space is in use?
+           </para>
+           <para>
+             Default is no.
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_MONITOR_FREE_MEMORY=<parameter>NUM</parameter></term>
+         <listitem>
+           <para>
+             NUM is a lower limit on available system memory, expressed
+             in megabytes.  If this is set and the amount of available
+             memory falls below this limit then some debug information
+             will be logged, the node will be disabled and then CTDB
+             will be shut down.
+           </para>
+           <para>
+             No default.
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_MONITOR_FREE_MEMORY_WARN=<parameter>NUM</parameter></term>
+         <listitem>
+           <para>
+             NUM is a lower limit on available system memory, expressed
+             in megabytes.  If this is set and the amount of available
+             memory falls below this limit then a warning will be
+             logged.
+           </para>
+           <para>
+             No default.
+           </para>
+         </listitem>
+       </varlistentry>
+
+      </variablelist>
+    </refsect2>
+
+    <refsect2>
+      <title>MISCELLANEOUS SERVICE-RELATED CONFIGURATION</title>
+
+      <variablelist>
+
+       <varlistentry>
+         <term>CTDB_MANAGED_SERVICES=<parameter>SERVICE-LIST</parameter></term>
+         <listitem>
+           <para>
+             SERVICE-LIST is a space-separated list of SERVICEs that
+             CTDB should manage.  This can be used as an alternative
+             to the
+             <varname>CTDB_MANAGES_<replaceable>SERVICE</replaceable></varname>
+             variables.
+           </para>
+           <para>
+             No default.
+           </para>
+         </listitem>
+       </varlistentry>
+
+       <varlistentry>
+         <term>CTDB_SERVICE_AUTOSTARTSTOP=yes|no</term>
+         <listitem>
+           <para>
+             Whether CTDB should start and stop services if they become
+             managed or unmanaged.
+           </para>
+           <para>
+             Default is no.
+           </para>
+         </listitem>
+       </varlistentry>
+
+      </variablelist>
+
+    </refsect2>
+
+  </refsect1>
+
+  <refsect1>
+    <title>
+      TUNABLES CONFIGURATION
+    </title>
+
+    <para>
+      CTDB tunables (see
+      <citerefentry><refentrytitle>ctdbd-tunables</refentrytitle>
+      <manvolnum>7</manvolnum></citerefentry>) can be set from the
+      configuration file.  They are set as follows:
+
+      <literallayout>
+CTDB_SET_<replaceable>TUNABLE</replaceable>=<replaceable>VALUE</replaceable>
+      </literallayout>
+    </para>
+
+    <para>
+      For example:
+
+      <screen format="linespecific">
+CTDB_SET_MonitorInterval=20
+      </screen>
+    </para>
+  </refsect1>
+
+  <refsect1>
+    <title>
+      DEBUG AND TEST
+    </title>
+
+    <para>
+      Variables in this section are for debugging and testing CTDB.
+      They should not generally be needed.
+    </para>
+
+    <variablelist>
+
+      <varlistentry>
+       <term>CTDB_DEBUG_HUNG_SCRIPT=<parameter>FILENAME</parameter></term>
+       <listitem>
+         <para>
+           FILENAME is a script to run to log debug information when
+           an event script times out.
+         </para>
+         <para>
+           Default is <filename><varname>CTDB_BASE</varname>/debug-hung-script.sh</filename>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_DEBUG_LOCKS=<parameter>FILENAME</parameter></term>
+       <listitem>
+         <para>
+           FILENAME is a script to run to log debug information when
+           CTDB fails to freeze databases during recovery.
+         </para>
+         <para>
+           No default, usually
+           <filename><varname>CTDB_BASE</varname>/debug_locks.sh</filename>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_ETCDIR=<parameter>DIRECTORY</parameter></term>
+       <listitem>
+         <para>
+           DIRECTORY containing system configuration files.  This is
+           used to provide alternate configuration when testing and
+           should not need to be changed from the default.
+         </para>
+         <para>
+           Default is <filename>/etc</filename>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_INIT_STYLE=debian|redhat|suse</term>
+       <listitem>
+         <para>
+           This is the init style used by the Linux distribution (or
+           other operating system) being used.  This is usually
+           determined dynamically by checking the system.  This
+           variable is used by the initscript to determine which init
+           system primitives to use.  It is also used by some
+           eventscripts to choose the name of initscripts for certain
+           services, since these can vary between distributions.
+         </para>
+         <para>
+           No fixed default.
+         </para>
+         <para>
+           If this option needs to be changed from the calculated
+           default for the initscript to function properly, then it
+           must be set in the distribution-specific initscript
+           configuration, such as
+           <filename>/etc/sysconfig/ctdb</filename>.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_MAX_CORRUPT_DB_BACKUPS=<parameter>NUM</parameter></term>
+       <listitem>
+         <para>
+           NUM is the maximum number of volatile TDB database backups
+           to be kept (for each database) when a corrupt database is
+           found during startup.  Volatile TDBs are zeroed during
+           startup so backups are needed to debug any corruption that
+           occurs before a restart.
+         </para>
+         <para>
+           Default is 10.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_RC_LOCAL=<parameter>FILENAME</parameter></term>
+       <listitem>
+         <para>
+           FILENAME is a script fragment to be sourced by the
+           <filename>functions</filename> file that is sourced by scripts.
+           One example use would be to override function definitions
+           in unit tests.  As a sanity check, this file must be
+           executable for it to be used.
+         </para>
+         <para>
+           No default.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_RUN_TIMEOUT_MONITOR=yes|no</term>
+       <listitem>
+         <para>
+           Whether CTDB should simulate timing out monitor events.
+           This uses the <filename>99.timeout</filename> eventscript.
+         </para>
+         <para>
+           Default is no.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_SCRIPT_DEBUGLEVEL=<parameter>NUM</parameter></term>
+       <listitem>
+         <para>
+           NUM is the level of debugging messages printed by CTDB
+           scripts.  Setting this to a higher number (e.g. 4) will
+           cause some scripts to log more messages.
+         </para>
+         <para>
+           Default is 2.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_SUPPRESS_COREFILE=yes|no</term>
+       <listitem>
+         <para>
+           Whether CTDB core files should be suppressed.
+         </para>
+         <para>
+           Default is no.
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>CTDB_VALGRIND=yes|no|<parameter>COMMAND</parameter></term>
+       <listitem>
+         <para>
+           If "yes", this causes
+           <citerefentry><refentrytitle>ctdbd</refentrytitle>
+           <manvolnum>1</manvolnum></citerefentry> to be run under
+           <citerefentry><refentrytitle>valgrind</refentrytitle>
+           <manvolnum>1</manvolnum></citerefentry> with logs going to
+           <filename>/var/log/ctdb_valgrind</filename>.  If neither
+           "yes" nor "no" then the value is assumed to be a COMMAND
+           (e.g. a <command>valgrind</command> variation, a
+           <citerefentry><refentrytitle>gdb</refentrytitle>
+           <manvolnum>1</manvolnum></citerefentry> command) that is
+           used in place of the default <command>valgrind</command>
+           command.  In either case, the <option>--valgrind</option>
+           option is passed to <command>ctdbd</command>.
+         </para>
+         <para>
+           Default is no.
+         </para>
+       </listitem>
+      </varlistentry>
+
+    </variablelist>
+
+  </refsect1>
+
+
+  <refsect1>
+    <title>FILES</title>
+
+    <simplelist>
+      <member><filename>/etc/ctdb/ctdbd.conf</filename></member>
+      <member><filename>/etc/sysconfig/ctdb</filename></member>
+      <member><filename>/etc/default/ctdb</filename></member>
+      <member><filename>/etc/ctdb/sysconfig/ctdb</filename></member>
+    </simplelist>
+  </refsect1>
+
+  <refsect1>
+    <title>SEE ALSO</title>
+    <para>
+      <citerefentry><refentrytitle>ctdbd</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>ctdbd_wrapper</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>onnode</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>ctdb</refentrytitle>
+      <manvolnum>7</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>ctdb-tunables</refentrytitle>
+      <manvolnum>7</manvolnum></citerefentry>,
+
+      <ulink url="http://ctdb.samba.org/"/>
+    </para>
+  </refsect1>
+
+  <refentryinfo>
+    <author>
+      <contrib>
+       This documentation was written by
+       Amitay Isaacs,
+       Martin Schwenke
+      </contrib>
+    </author>
+
+    <copyright>
+      <year>2007</year>
+      <holder>Andrew Tridgell</holder>
+      <holder>Ronnie Sahlberg</holder>
+    </copyright>
+    <legalnotice>
+      <para>
+       This program is free software; you can redistribute it and/or
+       modify it under the terms of the GNU General Public License as
+       published by the Free Software Foundation; either version 3 of
+       the License, or (at your option) any later version.
+      </para>
+      <para>
+       This program is distributed in the hope that it will be
+       useful, but WITHOUT ANY WARRANTY; without even the implied
+       warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+       PURPOSE.  See the GNU General Public License for more details.
+      </para>
+      <para>
+       You should have received a copy of the GNU General Public
+       License along with this program; if not, see
+       <ulink url="http://www.gnu.org/licenses"/>.
+      </para>
+    </legalnotice>
+  </refentryinfo>
+
+</refentry>
diff --git a/ctdb/doc/ctdbd_wrapper.1.xml b/ctdb/doc/ctdbd_wrapper.1.xml
new file mode 100644 (file)
index 0000000..b119681
--- /dev/null
@@ -0,0 +1,106 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<!DOCTYPE refentry
+       PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
+       "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
+
+<refentry id="ctdbd_wrapper.1">
+
+  <refmeta>
+    <refentrytitle>ctdbd_wrapper</refentrytitle>
+    <manvolnum>1</manvolnum>
+    <refmiscinfo class="source">ctdb</refmiscinfo>
+    <refmiscinfo class="manual">CTDB - clustered TDB database</refmiscinfo>
+  </refmeta>
+
+  <refnamediv>
+    <refname>ctdbd_wrapper</refname>
+    <refpurpose>Wrapper for ctdbd</refpurpose>
+  </refnamediv>
+
+  <refsynopsisdiv>
+    <cmdsynopsis>
+      <command>ctdbd_wrapper</command>
+      <arg choice="req"><replaceable>PIDFILE</replaceable></arg>
+      <group choice="req">
+       <arg choice="plain">start</arg>
+       <arg choice="plain">stop</arg>
+      </group>
+    </cmdsynopsis>
+  </refsynopsisdiv>
+
+  <refsect1>
+    <title>DESCRIPTION</title>
+    <para>
+      ctdbd_wrapper is used to start or stop the main CTDB daemon.
+    </para>
+
+    <para>
+      <replaceable>PIDFILE</replaceable> specifies the location of the
+      file containing the PID of the main CTDB daemon.
+    </para>
+
+    <para>
+      ctdbd_wrapper constructs command-line options for ctdbd from
+      configuration variables specified in
+      <citerefentry><refentrytitle>ctdbd.conf</refentrytitle>
+      <manvolnum>5</manvolnum></citerefentry>.
+    </para>
+
+    <para>
+      See <citerefentry><refentrytitle>ctdb</refentrytitle>
+      <manvolnum>7</manvolnum></citerefentry> for an overview of CTDB.
+    </para>
+  </refsect1>
+
+  <refsect1>
+    <title>SEE ALSO</title>
+    <para>
+      <citerefentry><refentrytitle>ctdbd</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>ctdbd.conf</refentrytitle>
+      <manvolnum>5</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>ctdb</refentrytitle>
+      <manvolnum>7</manvolnum></citerefentry>,
+
+      <ulink url="http://ctdb.samba.org/"/>
+    </para>
+  </refsect1>
+
+  <refentryinfo>
+    <author>
+      <contrib>
+       This documentation was written by
+       Amitay Isaacs,
+       Martin Schwenke
+      </contrib>
+    </author>
+
+    <copyright>
+      <year>2007</year>
+      <holder>Andrew Tridgell</holder>
+      <holder>Ronnie Sahlberg</holder>
+    </copyright>
+    <legalnotice>
+      <para>
+       This program is free software; you can redistribute it and/or
+       modify it under the terms of the GNU General Public License as
+       published by the Free Software Foundation; either version 3 of
+       the License, or (at your option) any later version.
+      </para>
+      <para>
+       This program is distributed in the hope that it will be
+       useful, but WITHOUT ANY WARRANTY; without even the implied
+       warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+       PURPOSE.  See the GNU General Public License for more details.
+      </para>
+      <para>
+       You should have received a copy of the GNU General Public
+       License along with this program; if not, see
+       <ulink url="http://www.gnu.org/licenses"/>.
+      </para>
+    </legalnotice>
+  </refentryinfo>
+
+</refentry>
diff --git a/ctdb/doc/examples/README b/ctdb/doc/examples/README
new file mode 100644 (file)
index 0000000..71337b7
--- /dev/null
@@ -0,0 +1,4 @@
+This directory includes sample CTDB cluster configurations.
+
+  o cluster.conf - Basic cluster setup
+  o natgw.conf   - Basic cluster setup with NAT gateway feature
diff --git a/ctdb/doc/examples/cluster.conf b/ctdb/doc/examples/cluster.conf
new file mode 100644 (file)
index 0000000..871468e
--- /dev/null
@@ -0,0 +1,92 @@
+#
+# CTDB configuration for simple cluster
+#
+# This is the sample configuration for a 3-node CTDB cluster providing file
+# services via Samba and NFS.
+#
+# Cluster details:
+#
+#                internal network (192.168.1.0/24)
+#   -------+----------------------+-----------------------+----------
+#          |                      |                       |
+#          |                      |                       |
+#     eth0 | 192.168.1.1     eth0 | 192.168.1.2      eth0 | 192.168.1.3
+#    +-----+-----+          +-----+-----+           +-----+-----+
+#    |           |          |           |           |           |
+#    |  Node 1   |          |  Node 2   |           |  Node 3   |
+#    |           |          |           |           |           |
+#    +-----+-----+          +-----+-----+           +-----+-----+
+#     eth1 | 10.1.1.1        eth1 | 10.1.1.2         eth1 | 10.1.1.3
+#          |                      |                       |
+#          |                      |                       |
+#   -------+----------------------+-----------------------+----------
+#                  public network (10.1.1.0/24)
+#
+#
+# Storage details:
+#
+#  Each node has access to shared storage - /shared
+#
+#
+# Service details:
+#
+#  Cluster provides file services on the following IP addresses
+#
+#     10.1.1.101 - 10.1.1.106
+#
+#  Each node also has a fixed IP address on public network.  This is used to
+#  communicate to network infrastructure (e.g. DNS, Active Directory, ...).
+#  Make sure that file services are not available on these fixed IP addresses
+#  (e.g. network filtering, using cluster hostname instead of IPs)
+
+
+CTDB_RECOVERY_LOCK=/shared/recovery.lock
+
+#
+# Nodes configuration
+#
+# ---------- /etc/ctdb/nodes ----------
+# 192.168.1.1
+# 192.168.1.2
+# 192.168.1.3
+# ---------- /etc/ctdb/nodes ----------
+#
+CTDB_NODES=/etc/ctdb/nodes
+
+#
+# Public addresses configuration
+#
+# ---------- /etc/ctdb/public_addresses ----------
+# 10.1.1.101/24 eth1
+# 10.1.1.102/24 eth1
+# 10.1.1.103/24 eth1
+# 10.1.1.104/24 eth1
+# 10.1.1.105/24 eth1
+# 10.1.1.106/24 eth1
+# ---------- /etc/ctdb/public_addresses ----------
+#
+CTDB_PUBLIC_ADDRESSES=/etc/ctdb/public_addresses
+
+# Enable logging to syslog
+CTDB_SYSLOG=yes
+
+# Default log level
+CTDB_DEBUGLEVEL=NOTICE
+
+# Auto start/stop managed services
+CTDB_AUTO_STARTSTOP=yes
+
+#
+# Samba configuration
+#
+CTDB_MANAGES_SAMBA=yes
+# CTDB_SAMBA_SKIP_SHARE_CHECK=yes
+
+CTDB_MANAGES_WINBIND=yes
+
+#
+# NFS configuration
+#
+CTDB_MANAGES_NFS=yes
+CTDB_RPCINFO_LOCALHOST="127.0.0.1"
+# CTDB_MONITOR_NFS_THREAD_COUNT=yes
diff --git a/ctdb/doc/examples/natgw.conf b/ctdb/doc/examples/natgw.conf
new file mode 100644 (file)
index 0000000..2e3a3ea
--- /dev/null
@@ -0,0 +1,109 @@
+#
+# CTDB configuration for simple cluster with NAT gateway
+#
+# This is the sample configuration for a 3-node CTDB cluster providing file
+# services via Samba and NFS.
+#
+# Cluster details:
+#
+#                internal network (192.168.1.0/24)
+#   -------+----------------------+-----------------------+----------
+#          |                      |                       |
+#          |                      |                       |
+#     eth0 | 192.168.1.1     eth0 | 192.168.1.2      eth0 | 192.168.1.3
+#    +-----+-----+          +-----+-----+           +-----+-----+
+#    |           |          |           |           |           |
+#    |  Node 1   |          |  Node 2   |           |  Node 3   |
+#    |           |          |           |           |           |
+#    +-----+-----+          +-----+-----+           +-----+-----+
+#     eth1 |                 eth1 |                  eth1 |
+#          |                      |                       |
+#          |                      |                       |
+#   -------+----------------------+-----------------------+-----+----
+#                  public network (10.1.1.0/24)                 |
+#                                                               | 10.1.1.254
+#                                                               o (router)
+#
+# Storage details:
+#
+#  Each node has access to shared storage - /shared
+#
+#
+# Service details:
+#
+#  Cluster provides file services on the following IP addresses
+#
+#     10.1.1.101 - 10.1.1.106
+#
+#  When a node is not hosting any IPs, it cannot connect to network
+#  infrastructure (e.g. DNS, Active Directory, ...).
+#
+#  Using NAT gateway feature of CTDB allows a node not hosting IPs to connect
+#  to network infrastructure.
+
+
+CTDB_RECOVERY_LOCK=/shared/recovery.lock
+
+#
+# Nodes configuration
+#
+# ---------- /etc/ctdb/nodes ----------
+# 192.168.1.1
+# 192.168.1.2
+# 192.168.1.3
+# ---------- /etc/ctdb/nodes ----------
+#
+CTDB_NODES=/etc/ctdb/nodes
+
+#
+# Public addresses configuration
+#
+# ---------- /etc/ctdb/public_addresses ----------
+# 10.1.1.101/24 eth1
+# 10.1.1.102/24 eth1
+# 10.1.1.103/24 eth1
+# 10.1.1.104/24 eth1
+# 10.1.1.105/24 eth1
+# 10.1.1.106/24 eth1
+# ---------- /etc/ctdb/public_addresses ----------
+#
+CTDB_PUBLIC_ADDRESSES=/etc/ctdb/public_addresses
+
+# Enable logging to syslog
+CTDB_SYSLOG=yes
+
+# Default log level
+CTDB_DEBUGLEVEL=NOTICE
+
+# Auto start/stop managed services
+CTDB_AUTO_STARTSTOP=yes
+
+#
+# Samba configuration
+#
+CTDB_MANAGES_SAMBA=yes
+# CTDB_SAMBA_SKIP_SHARE_CHECK=yes
+
+CTDB_MANAGES_WINBIND=yes
+
+#
+# NFS configuration
+#
+CTDB_MANAGES_NFS=yes
+CTDB_RPCINFO_LOCALHOST="127.0.0.1"
+# CTDB_MONITOR_NFS_THREAD_COUNT=yes
+
+#
+# NAT gateway configuration
+#
+# ---------- /etc/ctdb/natgw_nodes ----------
+# 192.168.1.1
+# 192.168.1.2
+# 192.168.1.3
+# ---------- /etc/ctdb/natgw_nodes ----------
+#
+CTDB_NATGW_PUBLIC_IP=10.1.1.121/24
+CTDB_NATGW_PUBLIC_IFACE=eth1
+CTDB_NATGW_DEFAULT_GATEWAY=10.1.1.254
+CTDB_NATGW_PRIVATE_NETWORK=192.168.1.0/24
+CTDB_NATGW_NODES=/etc/ctdb/natgw_nodes
diff --git a/ctdb/doc/ltdbtool.1.xml b/ctdb/doc/ltdbtool.1.xml
new file mode 100644 (file)
index 0000000..790db0c
--- /dev/null
@@ -0,0 +1,300 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<!DOCTYPE refentry
+       PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
+       "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
+<refentry id="ltdbtool.1">
+
+  <refmeta>
+    <refentrytitle>ltdbtool</refentrytitle>
+    <manvolnum>1</manvolnum>
+    <refmiscinfo class="source">ctdb</refmiscinfo>
+    <refmiscinfo class="manual">CTDB - clustered TDB database</refmiscinfo>
+  </refmeta>
+
+  <refnamediv>
+    <refname>ltdbtool</refname>
+    <refpurpose>manipulate CTDB's local TDB files</refpurpose>
+  </refnamediv>
+
+  <refsynopsisdiv>
+    <cmdsynopsis>
+      <command>ltdbtool</command>
+      <arg rep="repeat"><replaceable>OPTION</replaceable></arg>
+      <arg choice="req"><replaceable>COMMAND</replaceable></arg>
+      <arg><replaceable>COMMAND-ARGS</replaceable></arg>
+    </cmdsynopsis>
+</refsynopsisdiv>
+
+  <refsect1>
+    <title>DESCRIPTION</title>
+
+    <para>
+      ltdbtool is a utility to manipulate CTDB's local TDB databases
+      (LTDBs) without connecting to a CTDB daemon.
+    </para>
+
+    <para>
+      It can be used to:
+    </para>
+
+    <itemizedlist>
+      <listitem>
+       <para>
+         dump the contents of an LTDB, optionally printing the CTDB
+           record header information,
+       </para>
+      </listitem>
+      <listitem>
+       <para>
+         convert between an LTDB and a non-clustered tdb
+         by adding or removing CTDB headers and
+       </para>
+      </listitem>
+      <listitem>
+         <para>convert between 64 and 32 bit LTDBs where the CTDB record
+         headers differ by 4 bytes of padding.
+         </para>
+      </listitem>
+    </itemizedlist>
+  </refsect1>
+
+  <refsect1>
+    <title>OPTIONS</title>
+
+    <variablelist>
+      <varlistentry><term>-e</term>
+        <listitem>
+          <para>
+           Dump empty records.  These are normally excluded.
+         </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry><term>-p</term>
+        <listitem>
+          <para>
+           Dump with header information, similar to "ctdb catdb".
+         </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>
+         -s
+         <group choice="req">
+           <arg choice="plain">0</arg>
+           <arg choice="plain">32</arg>
+           <arg choice="plain">64</arg>
+         </group>
+       </term>
+        <listitem>
+         <para>
+           Specify how to determine the CTDB record header size
+           for the input database:
+           <variablelist spacing="normal">
+             <varlistentry><term>0</term>
+             <listitem>
+               <para>no CTDB header</para>
+             </listitem>
+             </varlistentry>
+             <varlistentry><term>32</term>
+             <listitem>
+               <para>CTDB header size of a 32 bit system (20 bytes)</para>
+             </listitem>
+             </varlistentry>
+             <varlistentry><term>64</term>
+             <listitem>
+               <para>CTDB header size of a 64 bit system (24 bytes)</para>
+             </listitem>
+             </varlistentry>
+           </variablelist>
+           The default is 32 or 64 depending on the system architecture.
+         </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>
+         -o
+         <group choice="req">
+           <arg choice="plain">0</arg>
+           <arg choice="plain">32</arg>
+           <arg choice="plain">64</arg>
+         </group>
+       </term>
+        <listitem>
+          <para>
+           Specify how to determine the CTDB record header size
+           for the output database, see -s.
+         </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry><term>-S <parameter>SIZE</parameter></term>
+        <listitem>
+          <para>
+           Explicitly specify the CTDB record header SIZE of the
+           input database in bytes.
+         </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry><term>-O <parameter>SIZE</parameter></term>
+        <listitem>
+          <para>
+           Explicitly specify the CTDB record header SIZE for the
+           output database in bytes.
+         </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry><term>-h</term>
+        <listitem>
+          <para>
+            Print help text.
+         </para>
+        </listitem>
+      </varlistentry>
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>COMMANDS</title>
+
+    <variablelist>
+      <varlistentry><term>help</term>
+        <listitem>
+          <para>
+           Print help text.
+         </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry><term>dump <parameter>IDB</parameter></term>
+        <listitem>
+          <para>
+           Dump the contents of an LTDB input file IDB to standard
+           output in a human-readable format.
+         </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>
+         convert <parameter>IDB</parameter> <parameter>ODB</parameter>
+       </term>
+        <listitem>
+          <para>
+           Copy an LTDB input file IDB to output file ODB, optionally
+           adding or removing CTDB headers.
+         </para>
+        </listitem>
+      </varlistentry>
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>EXAMPLES</title>
+
+    <para>
+      Print a local tdb in "tdbdump" style:
+    </para>
+    <screen format="linespecific">
+      ltdbtool dump idmap2.tdb.0
+    </screen>
+
+    <para>
+      Print a local tdb with header information similar to "ctdb catdb":
+    </para>
+    <screen format="linespecific">
+      ltdbtool dump -p idmap2.tdb.0
+    </screen>
+
+    <para>
+      Strip the CTDB headers from records:
+    </para>
+    <screen format="linespecific">
+      ltdbtool convert -o0 idmap2.tdb.0 idmap.tdb
+    </screen>
+
+    <para>
+      Strip 64 bit CTDB headers from records, running on i386:
+    </para>
+    <screen format="linespecific">
+      ltdbtool convert -s64 -o0 idmap2.tdb.0 idmap.tdb
+    </screen>
+
+    <para>
+      Strip the CTDB headers from records by piping through tdbrestore:
+    </para>
+    <screen format="linespecific">
+      ltdbtool dump idmap2.tdb.0 | tdbrestore idmap.tdb
+    </screen>
+
+    <para>
+      Convert a local tdb from a 64 bit system for usage on a 32 bit system:
+    </para>
+    <screen format="linespecific">
+      ltdbtool convert -s64 -o32 idmap2.tdb.0 idmap2.tdb.1
+    </screen>
+
+    <para>
+      Add a default header:
+    </para>
+    <screen format="linespecific">
+      ltdbtool convert -s0 idmap.tdb idmap2.tdb.0
+    </screen>
+  </refsect1>
+
+  <refsect1><title>SEE ALSO</title>
+    <para>
+      <citerefentry><refentrytitle>ctdb</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>tdbdump</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>tdbrestore</refentrytitle>
+      <manvolnum>1</manvolnum></citerefentry>,
+
+      <citerefentry><refentrytitle>ctdb</refentrytitle>
+      <manvolnum>7</manvolnum></citerefentry>,
+
+      <ulink url="http://ctdb.samba.org/"/>
+    </para>
+  </refsect1>
+
+  <refentryinfo>
+    <author>
+      <contrib>
+       This documentation was written by Gregor Beck
+      </contrib>
+    </author>
+
+    <copyright>
+      <year>2011</year>
+      <holder>Gregor Beck</holder>
+      <holder>Michael Adam</holder>
+    </copyright>
+    <legalnotice>
+      <para>
+       This program is free software; you can redistribute it and/or
+       modify it under the terms of the GNU General Public License as
+       published by the Free Software Foundation; either version 3 of
+       the License, or (at your option) any later version.
+      </para>
+      <para>
+       This program is distributed in the hope that it will be
+       useful, but WITHOUT ANY WARRANTY; without even the implied
+       warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+       PURPOSE.  See the GNU General Public License for more details.
+      </para>
+      <para>
+       You should have received a copy of the GNU General Public
+       License along with this program; if not, see
+       <ulink url="http://www.gnu.org/licenses"/>.
+      </para>
+    </legalnotice>
+  </refentryinfo>
+
+</refentry>
diff --git a/ctdb/doc/onnode.1.xml b/ctdb/doc/onnode.1.xml
new file mode 100644 (file)
index 0000000..561764c
--- /dev/null
@@ -0,0 +1,359 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<!DOCTYPE refentry
+       PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
+       "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
+<refentry id="onnode.1">
+
+  <refmeta>
+    <refentrytitle>onnode</refentrytitle>
+    <manvolnum>1</manvolnum>
+    <refmiscinfo class="source">ctdb</refmiscinfo>
+    <refmiscinfo class="manual">CTDB - clustered TDB database</refmiscinfo>
+  </refmeta>
+
+  <refnamediv>
+    <refname>onnode</refname>
+    <refpurpose>run commands on CTDB cluster nodes</refpurpose>
+  </refnamediv>
+
+  <refsynopsisdiv>
+    <cmdsynopsis>
+      <command>onnode</command>
+      <arg rep="repeat"><replaceable>OPTION</replaceable></arg>
+      <arg choice="req"><replaceable>NODES</replaceable></arg>
+      <arg choice="req"><replaceable>COMMAND</replaceable></arg>
+    </cmdsynopsis>
+  </refsynopsisdiv>
+
+  <refsect1>
+    <title>DESCRIPTION</title>
+    <para>
+      onnode is a utility to run commands on a specific node of a CTDB
+      cluster, or on all nodes.
+    </para>
+    <para>
+      <replaceable>NODES</replaceable> specifies which node(s) to run
+      a command on.  See section <citetitle>NODES
+      SPECIFICATION</citetitle> for details.
+    </para>
+    <para>
+      <replaceable>COMMAND</replaceable> can be any shell command. The
+      onnode utility uses ssh or rsh to connect to the remote nodes
+      and run the command.
+    </para>
+  </refsect1>
+
+  <refsect1>
+    <title>OPTIONS</title>
+
+    <variablelist>
+      <varlistentry><term>-c</term>
+        <listitem>
+          <para>
+            Execute COMMAND in the current working directory on the
+            specified nodes.
+         </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry><term>-f <parameter>FILENAME</parameter></term>
+        <listitem>
+          <para>
+            Specify an alternative nodes FILENAME to use instead of
+            the default.  This option overrides the CTDB_NODES_FILE
+            environment variable.  See the discussion of
+            <filename>/etc/ctdb/nodes</filename> in the FILES section
+            for more details.
+         </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry><term>-n</term>
+        <listitem>
+          <para>
+            Allow nodes to be specified by name rather than node
+            numbers.  These nodes don't need to be listed in the nodes
+            file.  You can avoid the nodes file entirely by combining
+            this with <code>-f /dev/null</code>.
+         </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry><term>-o <parameter>PREFIX</parameter></term>
+        <listitem>
+          <para>
+           Causes standard output from each node to be saved into a
+           file with name PREFIX.<replaceable>IP</replaceable>.
+         </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry><term>-p</term>
+        <listitem>
+          <para>
+            Run COMMAND in parallel on the specified nodes.  The
+            default is to run COMMAND sequentially on each node.
+         </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry><term>-P</term>
+        <listitem>
+          <para>
+            Push files to nodes.  Names of files to push are specified
+            rather than the usual command.  Quoting is fragile/broken
+            - filenames with whitespace in them are not supported.
+         </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry><term>-q</term>
+        <listitem>
+          <para>
+            Do not print node addresses.  Normally, onnode prints
+            informational node addresses if more than one node is
+            specified.  This overrides -v.
+         </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry><term>-v</term>
+        <listitem>
+          <para>
+            Print node addresses even if only one node is specified.
+            Normally, onnode prints informational node addresses when
+            more than one node is specified.
+         </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry><term>-h, --help</term>
+        <listitem>
+          <para>
+            Show a short usage guide.
+         </para>
+        </listitem>
+      </varlistentry>
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>NODES SPECIFICATION</title>
+
+    <para>
+      Nodes can be specified via numeric node numbers (from 0 to N-1)
+      or mnemonics.  Multiple nodes are specified using lists of
+      nodes, separated by commas, and ranges of numeric node numbers,
+      separated by dashes.  If nodes are specified multiple times then
+      the command will be executed multiple times on those nodes.  The
+      order of nodes is significant.
+    </para>
+
+    <para>
+      The following mnemonics are available:
+    </para>
+
+    <variablelist>
+      <varlistentry><term>all</term>
+        <listitem>
+          <para>
+            All nodes.
+         </para>
+        </listitem>
+      </varlistentry>
+      <varlistentry><term>any</term>
+        <listitem>
+          <para>
+             A node where ctdbd is running.  This is semi-random but
+             there is a bias towards choosing a low numbered node.
+         </para>
+        </listitem>
+      </varlistentry>
+      <varlistentry><term>ok | healthy</term>
+        <listitem>
+          <para>
+            All nodes that are not disconnected, banned, disabled or
+            unhealthy.
+         </para>
+        </listitem>
+      </varlistentry>
+      <varlistentry><term>con | connected</term>
+        <listitem>
+          <para>
+            All nodes that are not disconnected.
+         </para>
+        </listitem>
+      </varlistentry>
+      <varlistentry><term>lvs | lvsmaster</term>
+        <listitem>
+          <para>
+            The current LVS master.
+         </para>
+        </listitem>
+      </varlistentry>
+      <varlistentry><term>natgw | natgwlist</term>
+        <listitem>
+          <para>
+            The current NAT gateway.
+         </para>
+        </listitem>
+      </varlistentry>
+      <varlistentry><term>rm | recmaster</term>
+        <listitem>
+          <para>
+            The current recovery master.
+         </para>
+        </listitem>
+      </varlistentry>
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>EXAMPLES</title>
+
+    <para>
+      The following command would show the process ID of ctdbd on all nodes
+    </para>
+    <screen format="linespecific">
+      onnode all ctdb getpid
+    </screen>
+
+    <para>
+      The following command would show the last 5 lines of log on each
+      node, preceded by the node's hostname
+    </para>
+    <screen format="linespecific">
+      onnode all "hostname; tail -5 /var/log/log.ctdb"
+    </screen>
+
+    <para>
+      The following command would restart the ctdb service on all
+      nodes, in parallel.
+    </para>
+    <screen format="linespecific">
+      onnode -p all service ctdb restart
+    </screen>
+
+    <para>
+      The following command would run ./foo in the current working
+      directory, in parallel, on nodes 0, 2, 3 and 4.
+    </para>
+    <screen format="linespecific">
+      onnode -c -p 0,2-4 ./foo
+    </screen>
+  </refsect1>
+
+  <refsect1>
+    <title>ENVIRONMENT</title>
+
+    <variablelist>
+      <varlistentry><term><envar>CTDB_BASE</envar></term>
+        <listitem>
+          <para>
+           Directory containing CTDB configuration files.  The
+           default is <filename>/etc/ctdb</filename>.
+         </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry><term><envar>CTDB_NODES_FILE</envar></term>
+        <listitem>
+          <para>
+           Name of alternative nodes file to use instead of the
+           default.  See the <citetitle>FILES</citetitle> section for
+           more details.
+         </para>
+        </listitem>
+      </varlistentry>
+
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>FILES</title>
+
+    <variablelist>
+      <varlistentry><term><filename>/etc/ctdb/nodes</filename></term>
+        <listitem>
+          <para>
+            Default file containing a list of each node's IP address
+            or hostname.
+         </para>
+         <para>
+           Actually, the default is
+           <filename>$CTDB_BASE/nodes</filename>, where
+           <envar>CTDB_BASE</envar> defaults to
+           <filename>/etc/ctdb</filename>.  If a relative path is
+           given (via the -f option or <envar>CTDB_NODES_FILE</envar>) and
+           no corresponding file exists relative to the current
+           directory then the file is also searched for in the
+           <filename>$CTDB_BASE</filename> directory.
+         </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry><term><filename>/etc/ctdb/onnode.conf</filename></term>
+        <listitem>
+          <para>
+            If this file exists it is sourced by onnode.  The main
+            purpose is to allow the administrator to set
+            <envar>SSH</envar> to something other than "ssh".  In this
+            case the -t option is ignored.  For example, the
+            administrator may choose to use rsh instead of ssh.
+         </para>
+        </listitem>
+      </varlistentry>
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>SEE ALSO</title>
+
+    <para>
+      <citerefentry><refentrytitle>ctdb</refentrytitle>
+      <manvolnum>7</manvolnum></citerefentry>,
+
+      <ulink url="http://ctdb.samba.org/"/>
+    </para>
+  </refsect1>
+
+  <refentryinfo>
+    <author>
+      <contrib>
+       This documentation was written by
+       Andrew Tridgell,
+       Martin Schwenke
+      </contrib>
+    </author>
+
+    <copyright>
+      <year>2007</year>
+      <holder>Andrew Tridgell</holder>
+      <holder>Ronnie Sahlberg</holder>
+    </copyright>
+    <copyright>
+      <year>2008</year>
+      <holder>Martin Schwenke</holder>
+    </copyright>
+    <legalnotice>
+      <para>
+       This program is free software; you can redistribute it and/or
+       modify it under the terms of the GNU General Public License as
+       published by the Free Software Foundation; either version 3 of
+       the License, or (at your option) any later version.
+      </para>
+      <para>
+       This program is distributed in the hope that it will be
+       useful, but WITHOUT ANY WARRANTY; without even the implied
+       warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+       PURPOSE.  See the GNU General Public License for more details.
+      </para>
+      <para>
+       You should have received a copy of the GNU General Public
+       License along with this program; if not, see
+       <ulink url="http://www.gnu.org/licenses"/>.
+      </para>
+    </legalnotice>
+  </refentryinfo>
+
+</refentry>
diff --git a/ctdb/doc/ping_pong.1.xml b/ctdb/doc/ping_pong.1.xml
new file mode 100644 (file)
index 0000000..47e90e8
--- /dev/null
@@ -0,0 +1,164 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<!DOCTYPE refentry
+       PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN"
+       "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
+<refentry id="ping_pong.1">
+
+  <refmeta>
+    <refentrytitle>ping_pong</refentrytitle>
+    <manvolnum>1</manvolnum>
+    <refmiscinfo class="source">ctdb</refmiscinfo>
+    <refmiscinfo class="manual">CTDB - clustered TDB database</refmiscinfo>
+  </refmeta>
+
+  <refnamediv>
+    <refname>ping_pong</refname>
+    <refpurpose>measures the ping-pong byte range lock latency</refpurpose>
+  </refnamediv>
+
+  <refsynopsisdiv>
+    <cmdsynopsis>
+      <command>ping_pong</command>
+      <group choice="req">
+       <arg choice="plain">-r</arg>
+       <arg choice="plain">-w</arg>
+       <arg choice="plain">-rw</arg>
+      </group>
+      <arg>-m</arg>
+      <arg>-c</arg>
+      <arg choice="req"><replaceable>FILENAME</replaceable></arg>
+      <arg choice="req"><replaceable>NUM-LOCKS</replaceable></arg>
+    </cmdsynopsis>
+  </refsynopsisdiv>
+
+  <refsect1>
+    <title>DESCRIPTION</title>
+    <para>
+      ping_pong measures the byte range lock latency. It is especially
+      useful on a cluster of nodes sharing a common lock manager as it
+      will give some indication of the lock manager's performance
+      under stress.
+    </para>
+
+    <para>
+      FILENAME is a file on shared storage to use for byte range
+      locking tests.
+    </para>
+
+    <para>
+      NUM-LOCKS is the number of byte range locks, so needs to be
+      (strictly) greater than the number of nodes in the cluster.
+    </para>
+  </refsect1>
+
+  <refsect1>
+    <title>OPTIONS</title>
+
+    <variablelist>
+      <varlistentry>
+       <term>-r</term>
+       <listitem>
+         <para>
+           test read performance
+         </para>
+       </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>-w</term>
+        <listitem>
+          <para>
+           test write performance
+         </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>-m</term>
+        <listitem>
+          <para>
+           use mmap
+         </para>
+        </listitem>
+      </varlistentry>
+
+      <varlistentry>
+       <term>-c</term>
+       <listitem>
+          <para>
+           validate the locks
+         </para>
+        </listitem>
+      </varlistentry>
+
+    </variablelist>
+  </refsect1>
+
+  <refsect1>
+    <title>EXAMPLES</title>
+    <para>
+      Testing lock coherence
+    </para>
+    <screen format="linespecific">
+      ping_pong test.dat N
+    </screen>
+
+    <para>
+      Testing lock coherence with lock validation
+    </para>
+    <screen format="linespecific">
+      ping_pong -c test.dat N
+    </screen>
+
+    <para>
+      Testing IO coherence
+    </para>
+    <screen format="linespecific">
+      ping_pong -rw test.dat N
+    </screen>
+  </refsect1>
+
+  <refsect1>
+    <title>SEE ALSO</title>
+    <para>
+      <citerefentry><refentrytitle>ctdb</refentrytitle>
+      <manvolnum>7</manvolnum></citerefentry>,
+
+      <ulink url="https://wiki.samba.org/index.php/Ping_pong"/>
+
+    </para>
+  </refsect1>
+
+  <refentryinfo>
+    <author>
+      <contrib>
+       This documentation was written by Mathieu Parent
+      </contrib>
+    </author>
+
+    <copyright>
+      <year>2002</year>
+      <holder>Andrew Tridgell</holder>
+    </copyright>
+    <legalnotice>
+      <para>
+       This program is free software; you can redistribute it and/or
+       modify it under the terms of the GNU General Public License as
+       published by the Free Software Foundation; either version 3 of
+       the License, or (at your option) any later version.
+      </para>
+      <para>
+       This program is distributed in the hope that it will be
+       useful, but WITHOUT ANY WARRANTY; without even the implied
+       warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+       PURPOSE.  See the GNU General Public License for more details.
+      </para>
+      <para>
+       You should have received a copy of the GNU General Public
+       License along with this program; if not, see
+       <ulink url="http://www.gnu.org/licenses"/>.
+      </para>
+    </legalnotice>
+  </refentryinfo>
+
+</refentry>
diff --git a/ctdb/doc/readonlyrecords.txt b/ctdb/doc/readonlyrecords.txt
new file mode 100644 (file)
index 0000000..d108ffd
--- /dev/null
@@ -0,0 +1,343 @@
+Read-Only locks in CTDB
+=======================
+
+Problem
+=======
+CTDB currently only supports exclusive Read-Write locks for clients(samba) accessing the
+TDB databases.
+This mostly works well but when very many clients are accessing the same file,
+at the same time, this causes the exclusive lock as well as the record itself to
+rapidly bounce between nodes and acts as a scalability limitation.
+
+This primarily affects locking.tdb and brlock.tdb, two databases where record access is 
+read-mostly and where writes are semi-rare.
+
+For the common case, if CTDB provided shared non-exclusive Read-Only lock semantics
+this would greatly improve scaling for these workloads.
+
+
+Desired properties
+==================
+We can not make backward incompatible changes to the ctdb_ltdb header for the records.
+
+A Read-Only lock enabled ctdb daemon must be able to interoperate with a non-Read-Only
+lock enabled daemon.
+
+Getting a Read-Only lock should not be slower than getting a Read-Write lock.
+
+When revoking Read-Only locks for a record, this should involve only those nodes that
+currently hold a Read-Only lock and should avoid broadcasting opportunistic revocations.
+(must track which nodes are delegated to)
+
+When a Read-Write lock is requested, if there are Read-Only locks delegated to other
+nodes, the DMASTER will defer the record migration until all read-only locks are first
+revoked (synchronous revoke).
+
+Due to the cost that revoking Read-Only locks has on getting a Read-Write lock, the
+implementation should try to avoid creating Read-Only locks unless it has indication
+that there is contention. This may mean that even if client requests a Read-Only lock
+we might still provide a full Read-Write lock in order to avoid the cost of revoking
+the locks in some cases.
+
+Read-Only locks require additional state to be stored in a separate database, containing
+information about which nodes have been delegated Read-Only locks.
+This database should be kept at minimal size.
+
+Read-Only locks should not significantly complicate the normal record
+create/migration/deletion cycle for normal records.
+
+Read-Only locks should not complicate the recovery process.
+
+Read-Only locks should not complicate the vacuuming process.
+
+We should avoid forking new child processes as far as possible from the main daemon.
+
+Client-side implementation, samba, libctdb, others, should have minimal impact when
+Read-Only locks are implemented.
+Client-side implementation must be possible with only minor conditionals added to the
+existing lock-check-fetch-unlock loop that clients use today for Read-Write locks. So
+that clients only need one single loop that can handle both Read-Write locking as well
+as Read-Only locking. Clients should not need two nearly identical loops.
+
+
+Implementation
+==============
+
+Four new flags are allocated in the ctdb_ltdb record header.
+HAVE_DELEGATIONS, HAVE_READONLY_LOCK, REVOKING_READONLY and REVOKE_COMPLETE
+
+HAVE_DELEGATIONS is a flag that can only be set on the node that is currently the
+DMASTER for the record. When set, this flag indicates that there are Read-Only locks
+delegated to other nodes in the cluster for this record.
+
+HAVE_READONLY_LOCK is a flag that is only set on nodes that are NOT the DMASTER for the
+record. If set this flag indicates that this record contains an up-to-date Read-Only
+version of this record. A client that only needs to read, but not to write, the record
+can safely use the content of this record as is regardless of the value of the DMASTER
+field of the record.
+
+REVOKING_READONLY is a flag that is used while a set of read only delegations are being
+revoked.
+This flag is only set when HAVE_DELEGATIONS is also set, and is cleared at the same time
+as HAVE_DELEGATIONS is cleared.
+Normal operations is that first the HAVE_DELEGATIONS flag is set when the first
+delegation is generated. When the delegations are about to be revoked, the
+REVOKING_READONLY flag is set too.
+Once all delegations are revoked, both flags are cleared at the same time.
+While REVOKING_READONLY is set, any requests for the record, either normal request or
+request for readonly will be deferred.
+Deferred requests are linked on a list for deferred requests until the time that the
+revocation is completed.
+This flag is set by the main ctdb daemon when it starts revoking this record.
+
+REVOKE_COMPLETE
+The actual revoke of records is done by a child process, spawned from the main ctdb
+daemon when it starts the process to revoke the records.
+Once the child process has finished revoking all delegations it will set the flag
+REVOKE_COMPLETE for this record to signal to the main daemon that the record has been
+successfully revoked.
+At this stage the child process will also trigger an event in the main daemon that
+revoke is complete and that the main daemon should start re-processing all deferred
+requests.
+
+
+
+Once the revoke process is completed there will be at least one deferred request to
+access this record. That is the initial call for an exclusive fetch_lock() that
+triggered the revoke process to be started.
+In addition to this deferred request there may also be additional requests that have
+also become deferred while the revoke was in process. These can be either exclusive
+fetch_locks() or they can be readonly lock requests.
+Once the revoke is completed the main daemon will reprocess all exclusive fetch_lock()
+requests immediately and respond to these clients.
+Any requests for readonly locks will be deferred for an additional period of
+time before they are re-processed.
+This is to allow the client that needs a fetch_lock() to update the record to get some
+time to access and work on the record without having to compete with the possibly
+very many readonly requests.
+
+
+
+
+
+The ctdb_db structure is expanded so that it contains one extra TDB database for each
+normal, non-persistent database.
+This new database is used for tracking delegations for the records.
+A record in the normal database that has "HAVE_DELEGATIONS" set will always have a
+corresponding record at the same key in the tracking database. This record contains the
+set of all nodes that the record is delegated to.
+This tracking database is lockless, using TDB_NOLOCK, and is only ever accessed by
+the main ctdbd daemon.
+The lockless nature and the fact that no other process ever access this TDB means we
+are guaranteed non-blocking access to records in the tracking database.
+
+The ctdb_call PDU is allocated with a new flag WANT_READONLY and possibly also a new
+callid: CTDB_FETCH_WITH_HEADER_FUNC.
+This new function returns not only the record, as CTDB_FETCH_FUNC does, but also
+returns the full ctdb_ltdb record HEADER prepended to the record.
+This function is optional, clients that do not care what the header is can continue
+using just CTDB_FETCH_FUNC
+
+
+The WANT_READONLY flag is used to request a read-only record from the DMASTER/LMASTER.
+If the record does not yet exist, this is returned as an error to the client and the
+client will retry the request loop.
+
+A new control is added to make remote nodes remove the HAVE_READONLY_LOCK from a record
+and to invalidate any deferred readonly copies from the databases.
+
+
+
+Client implementation
+=====================
+Clients today use a loop for record fetch lock that looks like this
+    try_again:
+        lock record in tdb
+
+        if record does not exist in tdb,
+            unlock record
+            ask ctdb to migrate record onto the node
+            goto try_again
+
+        if record dmaster != this node pnn
+            unlock record
+            ask ctdb to migrate record onto the node
+            goto try_again
+
+    finished:
+
+where we basically spin, until the record is migrated onto the node and we have managed
+to pin it down.
+
+This will instead change to something like
+
+    try_again:
+        lock record in tdb
+
+        if record does not exist in tdb,
+            unlock record
+            ask ctdb to migrate record onto the node
+            goto try_again
+
+        if record dmaster == current node pnn
+            goto finished
+
+        if read-only lock
+            if HAVE_READONLY_LOCK or HAVE_DELEGATIONS is set
+                goto finished
+            else
+                unlock record 
+                ask ctdb for read-only copy (WANT_READONLY[|WITH_HEADER])
+                if failed to get read-only copy (*A)
+                    ask ctdb to migrate the record onto the node
+                    goto try_again
+                lock record in tdb
+                goto finished
+
+        unlock record
+        ask ctdb to migrate record onto the node
+        goto try_again
+
+    finished:
+
+If the record does not yet exist in the local TDB, we always perform a full fetch for a
+Read-Write lock even if only a Read-Only lock was requested.
+This means that for first access we always grab a Read-Write lock and thus upgrade any
+requests for Read-Only locks into a Read-Write request.
+This creates the record, migrates it onto the node and makes the local node become
+the DMASTER for the record.
+
+Future reference to this same record by the local samba daemons will still access/lock
+the record locally without triggering a Read-Only delegation to be created since the
+record is already hosted on the local node as DMASTER.
+
+Only if the record is contended, i.e. it has been created and migrated onto the node but
+we are no longer the DMASTER for this record, only for this case will we create a
+Read-Only delegation.
+This heuristic provides a mechanism where we will not create Read-Only delegations until
+we have some indication that the record may be contended.
+
+This avoids creating and revoking Read-Only delegations when only a single client is
+repeatedly accessing the same set of records.
+This also aims to limit the size of the tracking tdb.
+
+
+Server implementation
+=====================
+When receiving a ctdb_call with the WANT_READONLY flag:
+
+If this is the LMASTER for the record and the record does not yet exist, LMASTER will
+return an error back to the client (*A above) and the client will try to recover.
+In particular, LMASTER will not create a new record for this case.
+
+If this is the LMASTER for the record and the record exists, the PDU will be forwarded to
+the DMASTER for the record.
+
+If this node is not the DMASTER for this record, we forward the PDU back to the
+LMASTER. Just as we always do today.
+
+If this is the DMASTER for the record, we need to create a Read-Only delegation.
+This is done by
+     lock record
+     increase the RSN by one for this record
+     set the HAVE_DELEGATIONS flag for the record
+     write the updated record to the TDB
+     create/update the tracking TDB and add this new node to the set of delegations
+     send a modified copy of the record back to the requesting client.
+         modifications are that RSN is decremented by one, so delegated records are "older" than on the DMASTER,
+         it has HAVE_DELEGATIONS flag stripped off, and has HAVE_READONLY_LOCK added.
+     unlock record
+
+Important to note is that this does not trigger a record migration.
+
+
+When receiving a ctdb_call without the WANT_READONLY flag:
+
+If this is the DMASTER for the record, this might trigger a migration. If there exist
+delegations we must first revoke these before allowing the Read-Write request from
+proceeding. So,
+IF the record has HAVE_DELEGATIONS set, we create a child process and defer processing
+of this PDU until the child process has completed.
+
+From the child process we will call out to all nodes that have delegations for this
+record and tell them to invalidate this record by clearing the HAVE_READONLY_LOCK from
+the record.
+Once all delegated nodes respond back, the child process signals back to the main daemon
+the revoke has completed. (child process may not access the tracking tdb since it is
+lockless)
+
+Main process is triggered to re-process the PDU once the child process has finished.
+Main daemon deletes the corresponding record in the tracking database, clears the
+HAVE_DELEGATIONS flag for the record and then proceeds to perform the migration as usual.
+
+When receiving a ctdb_call without the flag we want all delegations to be revoked,
+so we must take care that the delegations are revoked unconditionally before we even
+check if we are already the DMASTER (in which case this ctdb_call would normally just
+be a no-op (*B below))
+
+
+
+Recovery process changes
+========================
+A recovery implicitly clears/revokes any read only records and delegations from all
+databases.
+
+During delegations of Read-Only locks, this is done in such way that delegated records
+will have a RSN smaller than the DMASTER. This guarantees that read-only copies always
+have a RSN that is smaller than the DMASTER.
+
+During recoveries we do not need to take any special action other than always picking
+the copy of the record that has the highest RSN, which is what we already do today.
+
+During the recovery process, we strip all flags off all records while writing the new
+content of the database during the PUSH_DB control. 
+
+During processing of the PUSH_DB control and once the new database has been written we
+then also wipe the tracking database.
+
+This makes changes to the recovery process minimal and nonintrusive.
+
+
+
+Vacuuming process
+=================
+Vacuuming needs only minimal changes.
+
+
+When vacuuming runs, it will do a fetch_lock to migrate any remote records back onto the
+LMASTER before the record can be purged. This will automatically force all delegations
+for that record to be revoked before the migration is copied back onto the LMASTER.
+This handles the case where LMASTER is not the DMASTER for the record that will be
+purged.
+The migration in this case does force any delegations to be revoked before the
+vacuuming takes place.
+
+Missing is the case when delegations exist and the LMASTER is also the DMASTER.
+For this case we need to change the vacuuming to unconditionally always try to do a
+fetch_lock when HAVE_DELEGATIONS is set, even if the record is already stored locally.
+(*B)
+This fetch lock will not cause any migrations by the ctdb daemon, but since it does
+not have the WANT_READONLY this will still force the delegations to be revoked but no
+migration will trigger.
+
+
+Traversal process
+=================
+Traversal process is changed to ignore any records with the HAVE_READONLY_LOCK
+
+
+Forward/Backward Compatibility
+==============================
+Non-readonly locking daemons must be able to interoperate with readonly locking enabled daemons.
+
+Non-readonly enabled daemons fetching records from Readonly enabled daemons:
+Non-readonly enabled daemons do not know, and never set the WANT_READONLY flag so these daemons will always request a full migration for a full fetch-lock for all records. Thus a request from a non-readonly enabled daemon will always cause any existing delegations to be immediately revoked. Access will work but performance may be harmed since there will be a lot of revoking of delegations.
+
+Readonly enabled daemons fetching records with WANT_READONLY from non-readonly enabled daemons:
+Non-readonly enabled daemons ignore the WANT_READONLY flag and never return delegations. They always return a full record migration.
+Full record migration is allowed by the protocol, even if the originator only requests the 'hint' WANT_READONLY,
+so this access also interoperates between daemons with different capabilities.
+
+
+
+
diff --git a/ctdb/doc/recovery-process.txt b/ctdb/doc/recovery-process.txt
new file mode 100644 (file)
index 0000000..333eeb2
--- /dev/null
@@ -0,0 +1,436 @@
+Valid as of 1.0.66, may/will change in the future
+
+
+RECMASTER
+=========
+Recovery Master, this is one of the nodes in the cluster that has been designated to
+be the "recovery master".
+The recovery master is responsible for performing full checks of cluster and cluster node consistency and is also responsible for performing the actual database recovery procedure.
+
+Only one node at a time can be the recovery master.
+This is ensured by CTDB using a lock on a single file in the shared gpfs filesystem:
+  /etc/sysconfig/ctdb :
+  ...
+  # Options to ctdbd. This is read by /etc/init.d/ctdb
+  # you must specify the location of a shared lock file across all the
+  # nodes. This must be on shared storage
+  # there is no default here
+  CTDB_RECOVERY_LOCK=/gpfs/.ctdb/shared
+  ...
+
+In order to prevent two nodes from becoming recovery master at the same time (==split brain)
+CTDB relies on GPFS to guarantee coherent locking across the cluster.
+Thus CTDB relies on GPFS allowing only one ctdb process on one node to take out and
+hold this lock.
+
+The recovery master is designated through an election process. 
+
+
+VNNMAP
+======
+The VNNMAP is a list of all nodes in the cluster that are currently part of the cluster
+and participate in hosting the cluster databases.
+All nodes that are CONNECTED but not BANNED will be present in the VNNMAP.
+
+The VNNMAP is the list of LMASTERS for the cluster as reported by 'ctdb status':
+  ...
+  Size:3
+  hash:0 lmaster:0
+  hash:1 lmaster:1
+  hash:2 lmaster:2
+  ...
+
+
+CLUSTER MONITORING
+==================
+Every node in the cluster monitors its own health and its own consistency with regard to the
+recovery master. How and what the nodes monitor for differs between the node which is
+the recovery master and normal nodes.
+This monitoring is to ensure that the cluster is healthy and consistent.
+This is not related to monitoring of individual node health, a.k.a. eventscript monitoring.
+
+At the end of each step in the process are listed some of the most common and important
+error messages that can be generated during that step.
+
+
+NORMAL NODE CLUSTER MONITORING
+------------------------------
+Monitoring is performed in the dedicated recovery daemon process.
+The implementation can be found in server/ctdb_recoverd.c:monitor_cluster()
+This is an overview of the more important tasks during monitoring.
+These tests are to verify that the local node is consistent with the recovery master.
+
+Once every second the following monitoring loop is performed :
+
+1, Verify that the parent ctdb daemon on the local node is still running.
+   If it is not, the recovery daemon logs an error and terminates.
+   "CTDB daemon is no longer available. Shutting down recovery daemon"
+
+2, Check if any of the nodes has been recorded to have misbehaved too many times.
+   If so we ban the node and log a message :
+   "Node %u has caused %u failures in %.0f seconds - banning it for %u seconds"
+
+3, Check that there is a recovery master.
+   If not we initiate a clusterwide election process and log :
+   "Initial recovery master set - forcing election"
+   and we restart monitoring from 1.
+
+4, Verify that recovery daemon and the local ctdb daemon agreed on all the
+   node BANNING flags.
+   If the recovery daemon and the local ctdb daemon disagree on these flags we update
+   the local ctdb daemon, log one of two messages and restart monitoring from 1 again.
+   "Local ctdb daemon on recmaster does not think this node is BANNED but the recovery master disagrees. Unbanning the node"
+   "Local ctdb daemon on non-recmaster does not think this node is BANNED but the recovery master disagrees. Re-banning the node"
+
+5, Verify that the node designated to be recovery master exists in the local list of all nodes.
+   If the recovery master is not in the list of all cluster nodes a new recovery master
+   election is triggered and monitoring restarts from 1.
+   "Recmaster node %u not in list. Force reelection"
+
+6, Check if the recovery master has become disconnected.
+   If it has, log an error message, force a new election and restart monitoring from 1.
+   "Recmaster node %u is disconnected. Force reelection"
+
+7, Read the node flags off the recovery master and verify that it has not become banned.
+   If it has, log an error message, force a new election and restart monitoring from 1.
+   "Recmaster node %u no longer available. Force reelection"
+
+8, Verify that the recmaster and the local node agrees on the flags (BANNED/DISABLED/...)
+   for the local node. 
+   If there is an inconsistency, push the flags for the local node out to all other nodes.
+   "Recmaster disagrees on our flags flags:0x%x recmaster_flags:0x%x  Broadcasting out flags."
+
+9, Verify that the local node hosts all public ip addresses it should host and that it does
+   NOT host any public addresses it should not host.
+   If there is an inconsistency we log an error, trigger a recovery to occur and restart
+   monitoring from 1 again.
+   "Public address '%s' is missing and we should serve this ip"
+   "We are still serving a public address '%s' that we should not be serving."
+
+These are all the checks we perform during monitoring for a normal node.
+These tests are performed on all nodes in the cluster which is why it is optimized to perform
+as few network calls to other nodes as possible.
+Each node only performs 1 call to the recovery master in each loop and to no other nodes.
+
+RECOVERY MASTER CLUSTER MONITORING
+-----------------------------------
+The recovery master performs a much more extensive test. In addition to tests 1-9 above
+the recovery master also performs the following tests:
+
+10, Read the list of nodes and flags from all other CONNECTED nodes in the cluster.
+    If there is a failure to read this list from one of the nodes, then log an 
+    error, mark this node as a candidate to become BANNED and restart monitoring from 1.
+    "Unable to get nodemap from remote node %u"
+
+11, Verify that the local recovery master and the remote node agrees on the flags
+    for the remote node. If there is an inconsistency for the BANNING flag,
+    log an error, trigger a new recmaster election and restart monitoring from 1.
+    "Remote node %u had different BANNED flags 0x%x, local had 0x%x - trigger a re-election"
+    "Remote node %u had flags 0x%x, local had 0x%x - updating local"
+
+12, Verify that the local recovery master and the remote node agrees on the flags
+    for the remote node. If one of the flags other than the BANNING flag was inconsistent,
+    just update the set of flags for the local recovery daemon, log an information message
+    and continue monitoring.
+    "Remote node %u had flags 0x%x, local had 0x%x - updating local"
+
+13, Read the list of public ip addresses from all of the CONNECTED nodes and merge into a
+    single clusterwide list.
+    If we fail to read the list of ips from a node, log an error and restart monitoring from 1.
+    "Failed to read public ips from node : %u"
+
+14, Verify that all other nodes agree that this node is the recovery master.
+    If one of the other nodes disagrees that this is the recovery master, log an error,
+    force a new election and restart monitoring from 1.
+    "Node %d does not agree we are the recmaster. Need a new recmaster election"
+
+15, Check if the previous attempt to run a recovery failed, and if it did, try a new recovery.
+    After the recovery, restart monitoring from 1.
+    "Starting do_recovery"
+
+16, Verify that all CONNECTED nodes in the cluster are in recovery mode NORMAL.
+    If one of the nodes were in recovery mode ACTIVE, force a new recovery and restart
+    monitoring from 1.
+    "Node:%u was in recovery mode. Start recovery process"
+
+17, Verify that the filehandle to the recovery lock file is valid.
+    If it is not, this may mean a split brain and is a critical error.
+    Try a new recovery and restart monitoring from 1.
+    "recovery master doesn't have the recovery lock"
+
+18, Verify that GPFS allows us to read from the recovery lock file.
+    If not there is a critical GPFS issue and we may have a split brain.
+    Try forcing a new recovery and restart monitoring from 1.
+    "failed read from recovery_lock_fd - %s"
+
+19, Read the list of all nodes and flags from all CONNECTED nodes in the cluster.
+    If fail to read the nodemap from one of the remote nodes, log an error and restart
+    monitoring from 1.
+    "Unable to get nodemap from remote node %u"
+
+20, If the nodemap differs between the local node and the remote node, log an error
+    and force a recovery.
+    This would happen if the /etc/ctdb/nodes file differs across nodes in the cluster.
+    It is unlikely that the recovery will rectify the situation.
+    This is a critical error, it is most likely the entire cluster will be unavailable
+    until the files are fixed or the offending nodes have become banned.
+    "Remote node:%u has different node count. %u vs %u of the local node"
+
+21, If a remote node disagrees on the content of the nodes list, try a recovery and restart
+    monitoring from 1.
+    It is unlikely that the recovery will rectify the situation.
+    This is a critical error, it is most likely the entire cluster will be unavailable
+    until the files are fixed or the offending nodes have become banned.
+    "Remote node:%u has different nodemap pnn for %d (%u vs %u)."
+
+22, If a remote node disagrees on the node flags in the list, try a recovery to re-sync
+    the flags and restart monitoring from 1.
+    "Remote node:%u has different nodemap flag for %d (0x%x vs 0x%x)"
+
+23, Verify that all active nodes are part of the VNNMAP.
+    If not, this would be a new node that has become CONNECTED but does not yet participate
+    in the cluster.
+    Perform a recovery to merge the new node to the cluster and restart monitoring from 1.
+    "The vnnmap count is different from the number of active nodes. %u vs %u"
+    or
+    "Node %u is active in the nodemap but did not exist in the vnnmap"
+
+24, Read the VNNMAP from all CONNECTED nodes.
+    Verify that all nodes have the same VNNMAP content and that all nodes are in the same
+    generation instance of the databases.
+    If not, force a recovery to re-synchronize the vnnmap and the databases across the cluster
+    and restart monitoring from 1.
+    "Remote node %u has different generation of vnnmap. %u vs %u (ours)"
+    "Remote node %u has different size of vnnmap. %u vs %u (ours)"
+    "Remote node %u has different vnnmap."
+
+25, If there have been changes to the cluster that require a reallocation of public ip
+    addresses, run the "startrecovery" event on all nodes. Run "releaseip" and "takeip"
+    events to reassign the ips across the cluster and finally run the "recovered" event.
+
+Finished monitoring, continue monitoring from 1.
+
+
+CLUSTER RECOVERY
+================
+Recoveries are driven by the recovery daemon on the node that is currently the recovery
+master.
+Most of the logging that is performed during recovery is only logged on the node that
+is the recovery master.
+Make sure to find which node is the recovery master and check the log for that node.
+
+Example log entries that start in column 1 are expected to be present in the
+log. Example log entries that are indented 3 columns are optional and might
+only be present if an error occurred.
+
+
+1, Log that recovery has been initiated.
+"Starting do_recovery"
+
+   It might log an informational message :
+"New recovery culprit %u".
+   This is only semi-accurate and may not mean that there is any problem
+   at all with the node indicated.
+
+
+2, Check if a node has caused too many failed recoveries and if so ban it from
+   the cluster, giving the other nodes in the cluster a chance to recover
+   operation.
+   "Node %u has caused %u recoveries in %.0f seconds - banning it for %u seconds"
+
+
+3, Verify that the recovery daemon can lock the recovery lock file.
+   At this stage this node should be the recovery master.
+   If this operation fails it means we have a split brain and have to abort recovery.
+   "ctdb_recovery_lock: Unable to open %s - (%s)"
+   "ctdb_recovery_lock: Failed to get recovery lock on '%s'"
+   "Unable to get recovery lock - aborting recovery"
+"ctdb_recovery_lock: Got recovery lock on '%s'"
+
+
+4, Log which node caused the recovery to be initiated.
+   This is a semi-accurate information message only.
+   This line does NOT mean that there has to be something wrong with the node listed.
+"Recovery initiated due to problem with node %u"
+
+
+5, Pull the names of all databases from all nodes and verify that these databases also
+   exist locally.
+   If a database is missing locally, just create it.
+   It is not an error if a database is missing locally. Databases are created on demand and
+   this could happen if it was one database which samba has never tried to access on the
+   local node.
+
+
+6, Check the list of databases on each remote node and create any databases that may be missing
+   on the remote node.
+"Recovery - created remote databases"
+
+
+7, Set recovery mode to ACTIVE on all remote nodes.
+
+
+8, Run the "startrecovery" eventscript on all nodes.
+
+   At this stage you will also get a few additional log entries, these are not
+   from the recovery daemon but from the main ctdb daemon due to running
+   the eventscript :
+"startrecovery eventscript has been invoked"
+"Monitoring has been disabled"
+"Executing event script ...
+...
+
+
+9, Create a new generation id and update the generation id and the VNNMAP on the local node
+   only.
+   This guarantees that the generation id will now be inconsistent across the cluster and
+   that if recovery fails a new recovery is attempted in the next iteration of the monitoring
+   loop.
+
+
+10, Start a TDB TRANSACTION on all nodes for all databases.
+   This is to ensure that if recovery is aborted or fails that we do not
+   modify any databases on only some of the nodes.
+"started transactions on all nodes"
+
+
+11, For each database, pull the content from all CONNECTED nodes and merge it into 
+    the TDB databases on the local node.
+    This merges the records from the remote nodes based on their serial numbers so we
+    only keep the most recent record found.
+"Recovery - pulled remote database 0x%x"
+
+
+12, For each database, perform a fast TDB WIPE operation to delete the entire TDB under the
+    transaction started above.
+
+
+13, For each database, drop all empty records.
+    Force the DMASTER field of all records to point to the recovery master.
+    Push the database out to all other nodes.
+
+    The PUSH process lists some additional log entries for each database of the
+    form :
+"Recovery - pulled remote database 0x..."
+"Recovery - pushed remote database 0x... of size ..."
+
+
+14, Commit all changes to all TDB databases.
+"Recovery - starting database commits"
+"Recovery - committed databases"
+
+
+15, Create a new VNNMAP of all CONNECTED nodes, create a new generation number
+    and push this new VNNMAP out to all nodes.
+"Recovery - updated vnnmap"
+
+
+16, Update all nodes that the local node is the recovery master.
+"Recovery - updated recmaster"
+
+
+17, Synchronize node flags across the cluster.
+"Recovery - updated flags"
+
+18, Change recovery mode back to NORMAL.
+"Recovery - disabled recovery mode"
+
+
+19, Re-allocate all public ip addresses across the cluster.
+"Deterministic IPs enabled. Resetting all ip allocations"
+
+    If the IP address allocation on the local node changes you might get
+    "Takeover of IP 10.0.0.201/24 on interface eth0"
+    "Release of IP 10.0.0.204/24 on interface eth0"
+
+"Recovery - takeip finished"
+
+
+20, Run the "recovered" eventscript on all nodes.
+"Recovery - finished the recovered event"
+
+    You will also get an entry from the local ctdb daemon itself that it has 
+    switched back to recovery mode NORMAL.
+"Recovery has finished"
+
+
+21, Broadcast a message to all samba daemons in the cluster that the databases have been
+    recovered. Samba will now do some additional checking/cleanup of the content in the stored
+    records.
+
+"Recovery complete"
+
+
+22, Finished. At this stage a 10 second timeout (ctdb listvars : rerecoverytimeout) is
+    initiated. The cluster will not allow a new recovery to be performed until this timeout
+    has expired.
+
+"New recoveries supressed for the rerecovery timeout"
+"Rerecovery timeout elapsed. Recovery reactivated."
+
+
+
+
+
+
+
+Example : RECOVERY LOG ON RECMASTER
+====================================
+2008/12/01 09:57:28.110732 [ 4933]: 10.0.0.21:4379: node 10.0.0.24:4379 is dead: 2 connected
+2008/12/01 09:57:28.110838 [ 4933]: Tearing down connection to dead node :3
+2008/12/01 09:57:28.967297 [ 4935]: server/ctdb_recoverd.c:2682 The vnnmap count is different from the number of active nodes. 4 vs 3
+2008/12/01 09:57:28.967297 [ 4935]: server/ctdb_recoverd.c:1327 Starting do_recovery
+2008/12/01 09:57:28.967297 [ 4935]: ctdb_recovery_lock: Got recovery lock on '/gpfs/.ctdb/shared'
+2008/12/01 09:57:28.967297 [ 4935]: server/ctdb_recoverd.c:1355 Recovery initiated due to problem with node 0
+2008/12/01 09:57:28.967297 [ 4935]: server/ctdb_recoverd.c:1381 Recovery - created remote databases
+2008/12/01 09:57:28.973543 [ 4933]: server/ctdb_recover.c:589 Recovery mode set to ACTIVE
+2008/12/01 09:57:28.974823 [ 4933]: server/ctdb_recover.c:904 startrecovery eventscript has been invoked
+2008/12/01 09:57:29.187264 [ 4935]: server/ctdb_recoverd.c:1431 started transactions on all nodes
+2008/12/01 09:57:29.187264 [ 4935]: server/ctdb_recoverd.c:1268 Recovery - pulled remote database 0x42fe72c5
+2008/12/01 09:57:29.187264 [ 4935]: server/ctdb_recoverd.c:1230 Recovery - pushed remote database 0x42fe72c5 of size 0
+2008/12/01 09:57:29.187264 [ 4935]: server/ctdb_recoverd.c:1268 Recovery - pulled remote database 0x1421fb78
+2008/12/01 09:57:29.197262 [ 4935]: server/ctdb_recoverd.c:1230 Recovery - pushed remote database 0x1421fb78 of size 0
+2008/12/01 09:57:29.197262 [ 4935]: server/ctdb_recoverd.c:1268 Recovery - pulled remote database 0xc0bdde6a
+2008/12/01 09:57:29.197262 [ 4935]: server/ctdb_recoverd.c:1230 Recovery - pushed remote database 0xc0bdde6a of size 0
+2008/12/01 09:57:29.197262 [ 4935]: server/ctdb_recoverd.c:1268 Recovery - pulled remote database 0x17055d90
+2008/12/01 09:57:29.207261 [ 4935]: server/ctdb_recoverd.c:1230 Recovery - pushed remote database 0x17055d90 of size 8
+2008/12/01 09:57:29.207261 [ 4935]: server/ctdb_recoverd.c:1268 Recovery - pulled remote database 0x7bbbd26c
+2008/12/01 09:57:29.207261 [ 4935]: server/ctdb_recoverd.c:1230 Recovery - pushed remote database 0x7bbbd26c of size 1
+2008/12/01 09:57:29.207261 [ 4935]: server/ctdb_recoverd.c:1268 Recovery - pulled remote database 0xf2a58948
+2008/12/01 09:57:29.217259 [ 4935]: server/ctdb_recoverd.c:1230 Recovery - pushed remote database 0xf2a58948 of size 51
+2008/12/01 09:57:29.217259 [ 4935]: server/ctdb_recoverd.c:1268 Recovery - pulled remote database 0x92380e87
+2008/12/01 09:57:29.217259 [ 4935]: server/ctdb_recoverd.c:1230 Recovery - pushed remote database 0x92380e87 of size 17
+2008/12/01 09:57:29.227258 [ 4935]: server/ctdb_recoverd.c:1268 Recovery - pulled remote database 0x63501287
+2008/12/01 09:57:29.227258 [ 4935]: server/ctdb_recoverd.c:1230 Recovery - pushed remote database 0x63501287 of size 1
+2008/12/01 09:57:29.227258 [ 4935]: server/ctdb_recoverd.c:1268 Recovery - pulled remote database 0xe98e08b6
+2008/12/01 09:57:29.227258 [ 4935]: server/ctdb_recoverd.c:1230 Recovery - pushed remote database 0xe98e08b6 of size 4
+2008/12/01 09:57:29.237256 [ 4935]: server/ctdb_recoverd.c:1268 Recovery - pulled remote database 0x2672a57f
+2008/12/01 09:57:29.237256 [ 4935]: server/ctdb_recoverd.c:1230 Recovery - pushed remote database 0x2672a57f of size 28
+2008/12/01 09:57:29.237256 [ 4935]: server/ctdb_recoverd.c:1268 Recovery - pulled remote database 0xb775fff6
+2008/12/01 09:57:29.237256 [ 4935]: server/ctdb_recoverd.c:1230 Recovery - pushed remote database 0xb775fff6 of size 6
+2008/12/01 09:57:29.237256 [ 4935]: server/ctdb_recoverd.c:1440 Recovery - starting database commits
+2008/12/01 09:57:29.297247 [ 4935]: server/ctdb_recoverd.c:1452 Recovery - committed databases
+2008/12/01 09:57:29.297247 [ 4935]: server/ctdb_recoverd.c:1502 Recovery - updated vnnmap
+2008/12/01 09:57:29.297247 [ 4935]: server/ctdb_recoverd.c:1511 Recovery - updated recmaster
+2008/12/01 09:57:29.297247 [ 4935]: server/ctdb_recoverd.c:1522 Recovery - updated flags
+2008/12/01 09:57:29.305235 [ 4933]: server/ctdb_recover.c:589 Recovery mode set to NORMAL
+2008/12/01 09:57:29.307245 [ 4935]: server/ctdb_recoverd.c:1531 Recovery - disabled recovery mode
+2008/12/01 09:57:29.307245 [ 4935]: Deterministic IPs enabled. Resetting all ip allocations
+2008/12/01 09:57:29.311071 [ 4933]: takeoverip called for an ip '10.0.0.201' that is not a public address
+2008/12/01 09:57:29.311186 [ 4933]: takeoverip called for an ip '10.0.0.202' that is not a public address
+2008/12/01 09:57:29.311204 [ 4933]: takeoverip called for an ip '10.0.0.203' that is not a public address
+2008/12/01 09:57:29.311299 [ 4933]: takeoverip called for an ip '10.0.0.204' that is not a public address
+2008/12/01 09:57:29.537210 [ 4935]: server/ctdb_recoverd.c:1542 Recovery - takeip finished
+2008/12/01 09:57:29.545404 [ 4933]: Recovery has finished
+2008/12/01 09:57:29.807169 [ 4935]: server/ctdb_recoverd.c:1551 Recovery - finished the recovered event
+2008/12/01 09:57:29.807169 [ 4935]: server/ctdb_recoverd.c:1557 Recovery complete
+2008/12/01 09:57:29.807169 [ 4935]: server/ctdb_recoverd.c:1565 New recoveries supressed for the rerecovery timeout
+2008/12/01 09:57:39.815648 [ 4935]: server/ctdb_recoverd.c:1567 Rerecovery timeout elapsed. Recovery reactivated.
+
+
+
+
+
+
+
+
diff --git a/ctdb/ib/README.txt b/ctdb/ib/README.txt
new file mode 100644 (file)
index 0000000..4041982
--- /dev/null
@@ -0,0 +1,19 @@
+Compilation
+===========
+
+For the configure script, please set the OFED include & library path by e.g.:
+
+export CFLAGS="-I/usr/local/ofed/include -L/usr/local/ofed/lib"
+
+Then run:
+
+./configure --enable-infiniband
+
+Example for testing
+===================
+bin/ctdb_test --transport ib --nlist ../2nodes_rm.txt --listen 10.0.0.1
+bin/ctdb_test --transport ib --nlist ../2nodes_rm.txt --listen 10.0.0.2
+
+where 2nodes_rm.txt:
+10.0.0.1
+10.0.0.2
diff --git a/ctdb/ib/config.m4 b/ctdb/ib/config.m4
new file mode 100644 (file)
index 0000000..47d6fac
--- /dev/null
@@ -0,0 +1,31 @@
+dnl Optional infiniband transport.
+dnl The first argument of AC_ARG_ENABLE is the feature name; configure sets
+dnl $enable_infiniband from --enable-infiniband / --disable-infiniband.
+AC_ARG_ENABLE(infiniband,
+[  --enable-infiniband         Turn on infiniband support (default=no)])
+
+HAVE_INFINIBAND=no
+
+if test "x$enable_infiniband" = xyes; then
+       AC_DEFINE(USE_INFINIBAND,1,[Use infiniband])
+       HAVE_INFINIBAND=yes
+
+       INFINIBAND_WRAPPER_OBJ="ib/ibwrapper.o ib/ibw_ctdb.o ib/ibw_ctdb_init.o"
+       INFINIBAND_LIBS="-lrdmacm -libverbs"
+       INFINIBAND_BINS="tests/bin/ibwrapper_test"
+
+       dnl Abort configure with a proper diagnostic when a prerequisite is
+       dnl missing (AC_MSG_ERROR prints to stderr and exits non-zero).
+       AC_CHECK_HEADERS(infiniband/verbs.h, [], [
+               AC_MSG_ERROR([you need infiniband/verbs.h when ib enabled!])])
+       AC_CHECK_HEADERS(rdma/rdma_cma.h, [], [
+               AC_MSG_ERROR([you need rdma/rdma_cma.h when ib enabled!])])
+       AC_CHECK_LIB(ibverbs, ibv_create_qp, [], [
+               AC_MSG_ERROR([you need libibverbs when ib enabled!])])
+       AC_CHECK_LIB(rdmacm, rdma_connect, [], [
+               AC_MSG_ERROR([you need librdmacm when ib enabled!])])
+fi
+
+AC_SUBST(HAVE_INFINIBAND)
+AC_SUBST(INFINIBAND_WRAPPER_OBJ)
+AC_SUBST(INFINIBAND_LIBS)
+AC_SUBST(INFINIBAND_BINS)
diff --git a/ctdb/ib/ibw_ctdb.c b/ctdb/ib/ibw_ctdb.c
new file mode 100644 (file)
index 0000000..2e70d1d
--- /dev/null
@@ -0,0 +1,175 @@
+/*
+ * Unix SMB/CIFS implementation.
+ * Join infiniband wrapper and ctdb.
+ *
+ * Copyright (C) Sven Oehme <oehmes@de.ibm.com> 2006
+ *
+ * Major code contributions by Peter Somogyi <psomogyi@gamax.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "includes.h"
+#include <system/network.h>
+#include <assert.h>
+#include "ctdb_private.h"
+#include "ibwrapper.h"
+#include "ibw_ctdb.h"
+
+/*
+ * Resolve an IPv4 address string or a host name into a struct in_addr.
+ *
+ * The string is first parsed as a numeric IPv4 address with inet_pton();
+ * if that does not succeed it is looked up with gethostbyname().
+ *
+ * ctdb:    passed to ctdb_set_error() when resolution fails
+ * address: numeric IPv4 address or host name
+ * addr:    receives the resolved address
+ *
+ * Returns 0 on success, -1 if the lookup failed or the resolved address
+ * does not fit into *addr.
+ */
+int ctdb_ibw_get_address(struct ctdb_context *ctdb,
+       const char *address, struct in_addr *addr)
+{
+       if (inet_pton(AF_INET, address, addr) <= 0) {
+               struct hostent *he = gethostbyname(address);
+               /* h_length is a signed int: make the size check explicit
+                * instead of relying on implicit unsigned conversion */
+               if (he == NULL || he->h_length < 0 ||
+                   (size_t)he->h_length > sizeof(*addr)) {
+                       ctdb_set_error(ctdb, "invalid network address '%s'\n",
+                                      address);
+                       return -1;
+               }
+               memcpy(addr, he->h_addr, he->h_length);
+       }
+       return 0;
+}
+
+/*
+ * Start an outgoing infiniband connection to a node.
+ *
+ * The node's address is resolved and ibw_connect() is called on the
+ * connection stored in the node's ctdb_ibw_node. If ibw_connect() fails,
+ * a timed event is registered to call this function again after one second.
+ *
+ * Returns 0 when the connect was started or a retry was scheduled,
+ * -1 if the address could not be resolved or the retry could not be
+ * scheduled.
+ */
+int ctdb_ibw_node_connect(struct ctdb_node *node)
+{
+       struct ctdb_ibw_node *cn = talloc_get_type(node->private_data, struct ctdb_ibw_node);
+       struct sockaddr_in sock_out;
+       int     rc;
+
+       assert(cn!=NULL);
+       assert(cn->conn!=NULL);
+
+       memset(&sock_out, 0, sizeof(struct sockaddr_in));
+       sock_out.sin_port = htons(node->address.port);
+       sock_out.sin_family = PF_INET;
+       if (ctdb_ibw_get_address(node->ctdb, node->address.address, &sock_out.sin_addr)) {
+               DEBUG(DEBUG_ERR, ("ctdb_ibw_node_connect failed\n"));
+               return -1;
+       }
+
+       rc = ibw_connect(cn->conn, &sock_out, node);
+       if (rc) {
+               DEBUG(DEBUG_ERR, ("ctdb_ibw_node_connect/ibw_connect failed - retrying...\n"));
+               /* try again once a second */
+               if (event_add_timed(node->ctdb->ev, node, timeval_current_ofs(1, 0),
+                                   ctdb_ibw_node_connect_event, node) == NULL) {
+                       /* without the timer nobody would ever retry */
+                       DEBUG(DEBUG_ERR, ("ctdb_ibw_node_connect: unable to schedule reconnect\n"));
+                       return -1;
+               }
+       }
+
+       /* continues at ibw_ctdb.c/IBWC_CONNECTED in good case */
+       return 0;
+}
+
+/*
+ * Timed-event callback used for connection retries: private_data is the
+ * ctdb_node to reconnect. The event, timer and timestamp arguments are
+ * not used.
+ */
+void ctdb_ibw_node_connect_event(struct event_context *ev, struct timed_event *te,
+       struct timeval t, void *private_data)
+{
+       struct ctdb_node *target;
+
+       target = talloc_get_type(private_data, struct ctdb_node);
+       ctdb_ibw_node_connect(target);
+}
+
+/*
+ * State-change callback handed to ibw_init() by ctdb_ibw_init().
+ *
+ * ctx:  when non-NULL, ctx->state is examined (listener/context state)
+ * conn: when non-NULL, conn->state is examined (single connection state)
+ *
+ * Returns -1 only when ibw_accept() fails for an incoming connection
+ * request; returns 0 in every other case. Unknown states trigger assert(0).
+ */
+int ctdb_ibw_connstate_handler(struct ibw_ctx *ctx, struct ibw_conn *conn)
+{
+       if (ctx!=NULL) {
+               /* ctx->state changed */
+               switch(ctx->state) {
+               case IBWS_INIT: /* ctx start - after ibw_init */
+                       break;
+               case IBWS_READY: /* after ibw_bind & ibw_listen */
+                       break;
+               case IBWS_CONNECT_REQUEST: /* after [IBWS_READY + incoming request] */
+                               /* => [(ibw_accept)IBWS_READY | (ibw_disconnect)STOPPED | ERROR] */
+                       /* accepted connections carry no userdata (NULL), which is
+                        * how IBWC_CONNECTED below tells them from outgoing ones */
+                       if (ibw_accept(ctx, conn, NULL)) {
+                               DEBUG(DEBUG_ERR, ("connstate_handler/ibw_accept failed\n"));
+                               return -1;
+                       } /* else continue in IBWC_CONNECTED */
+                       break;
+               case IBWS_STOPPED: /* normal stop <= ibw_disconnect+(IBWS_READY | IBWS_CONNECT_REQUEST) */
+                       /* TODO: have a CTDB upcall for which CTDB should wait in a (final) loop */
+                       break;
+               case IBWS_ERROR: /* abnormal state; ibw_stop must be called after this */
+                       break;
+               default:
+                       assert(0);
+                       break;
+               }
+       }
+
+       if (conn!=NULL) {
+               /* conn->state changed */
+               switch(conn->state) {
+               case IBWC_INIT: /* conn start - internal state */
+                       break;
+               case IBWC_CONNECTED: { /* after ibw_accept or ibw_connect */
+                       struct ctdb_node *node = talloc_get_type(conn->conn_userdata, struct ctdb_node);
+                       if (node!=NULL) { /* after ibw_connect */
+                               struct ctdb_ibw_node *cn = talloc_get_type(node->private_data, struct ctdb_ibw_node);
+
+                               /* tell ctdb the node is reachable, then send
+                                * whatever was queued while it was not */
+                               node->ctdb->upcalls->node_connected(node);
+                               ctdb_flush_cn_queue(cn);
+                       } else { /* after ibw_accept */
+                               /* NOP in CTDB case */
+                       }
+               } break;
+               case IBWC_DISCONNECTED: { /* after ibw_disconnect */
+                       struct ctdb_node *node = talloc_get_type(conn->conn_userdata, struct ctdb_node);
+                       if (node!=NULL)
+                               node->ctdb->upcalls->node_dead(node);
+                       /* NOTE(review): conn is freed here but the owning
+                        * ctdb_ibw_node's conn pointer is not reset in this
+                        * function - presumably it still refers to the freed
+                        * connection; confirm against ibwrapper.c */
+                       talloc_free(conn);
+                       /* normal + intended disconnect => not reconnecting in this layer */
+               } break;
+               case IBWC_ERROR: {
+                       struct ctdb_node *node = talloc_get_type(conn->conn_userdata, struct ctdb_node);
+                       if (node!=NULL) {
+                               struct ctdb_ibw_node *cn = talloc_get_type(node->private_data, struct ctdb_ibw_node);
+                               struct ibw_ctx *ictx = cn->conn->ctx;
+
+                               DEBUG(DEBUG_DEBUG, ("IBWC_ERROR, reconnecting...\n"));
+                               talloc_free(cn->conn); /* internal queue content is destroyed */
+                               /* NOTE(review): the result of ibw_conn_new() is not
+                                * checked, while ctdb_ibw_node_connect() asserts that
+                                * cn->conn is non-NULL when the timer below fires */
+                               cn->conn = (void *)ibw_conn_new(ictx, node);
+                               event_add_timed(node->ctdb->ev, node, timeval_current_ofs(1, 0),
+                                       ctdb_ibw_node_connect_event, node);
+                       }
+               } break;
+               default:
+                       assert(0);
+                       break;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Receive callback handed to ibw_init() by ctdb_ibw_init().
+ *
+ * conn: connection the data arrived on; must be in state IBWC_CONNECTED
+ * buf:  received bytes (see the comment in the body about its lifetime)
+ * n:    number of bytes in buf
+ *
+ * The data is copied into a new talloc buffer parented to conn and handed
+ * to ctdb's recv_pkt upcall.
+ *
+ * Returns 0 on success, -1 (through CTDB_NO_MEMORY) if the copy could not
+ * be allocated.
+ */
+int ctdb_ibw_receive_handler(struct ibw_conn *conn, void *buf, int n)
+{
+       struct ctdb_context *ctdb;
+       void    *buf2; /* future TODO: a solution for removal of this */
+
+       /* check conn before it is dereferenced to find the ctdb context */
+       assert(conn!=NULL);
+       assert(conn->state==IBWC_CONNECTED);
+       assert(buf!=NULL);
+
+       ctdb = talloc_get_type(conn->ctx->ctx_userdata, struct ctdb_context);
+       assert(ctdb!=NULL);
+
+       /* so far "buf" is an ib-registered memory area
+        * and being reused for next receive
+        * noticed that HL requires talloc-ed memory to be stolen */
+       buf2 = talloc_zero_size(conn, n);
+       CTDB_NO_MEMORY(ctdb, buf2);
+
+       memcpy(buf2, buf, n);
+
+       ctdb->upcalls->recv_pkt(ctdb, (uint8_t *)buf2, (uint32_t)n);
+
+       return 0;
+}
diff --git a/ctdb/ib/ibw_ctdb.h b/ctdb/ib/ibw_ctdb.h
new file mode 100644 (file)
index 0000000..98ea102
--- /dev/null
@@ -0,0 +1,50 @@
+/*
+ * Unix SMB/CIFS implementation.
+ * Join infiniband wrapper and ctdb.
+ *
+ * Copyright (C) Sven Oehme <oehmes@de.ibm.com> 2006
+ *
+ * Major code contributions by Peter Somogyi <psomogyi@gamax.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * One packet waiting in a node's send queue (see struct ctdb_ibw_node).
+ * ctdb_ibw_queue_pkt() creates these while the connection is not in the
+ * connected state; ctdb_flush_cn_queue() sends and frees them.
+ */
+struct ctdb_ibw_msg {
+       /* copy of the packet, allocated as a talloc child of this message */
+       uint8_t *data;
+       /* number of bytes in data */
+       uint32_t length;
+       /* list linkage, maintained through the DLIST_* macros */
+       struct ctdb_ibw_msg *prev;
+       struct ctdb_ibw_msg *next;
+};
+
+/*
+ * Infiniband-specific state of a ctdb node; stored in the node's
+ * private_data by ctdb_ibw_add_node().
+ */
+struct ctdb_ibw_node {
+       /* connection object obtained from ibw_conn_new() */
+       struct ibw_conn *conn;
+
+       /* head of the list of packets queued while not connected */
+       struct ctdb_ibw_msg *queue;
+       /* most recently queued message; new messages are added after it */
+       struct ctdb_ibw_msg *queue_last;
+       /* number of messages currently in the queue */
+       int     qcnt;
+};
+
+int ctdb_ibw_get_address(struct ctdb_context *ctdb,
+       const char *address, struct in_addr *addr);
+
+int ctdb_ibw_connstate_handler(struct ibw_ctx *ctx, struct ibw_conn *conn);
+int ctdb_ibw_receive_handler(struct ibw_conn *conn, void *buf, int n);
+
+int ctdb_ibw_node_connect(struct ctdb_node *node);
+void ctdb_ibw_node_connect_event(struct event_context *ev, struct timed_event *te, 
+       struct timeval t, void *private_data);
+
+int ctdb_flush_cn_queue(struct ctdb_ibw_node *cn);
+
+int ctdb_ibw_init(struct ctdb_context *ctdb);
diff --git a/ctdb/ib/ibw_ctdb_init.c b/ctdb/ib/ibw_ctdb_init.c
new file mode 100644 (file)
index 0000000..066814d
--- /dev/null
@@ -0,0 +1,247 @@
+/*
+ * Unix SMB/CIFS implementation.
+ * Join infiniband wrapper and ctdb.
+ *
+ * Copyright (C) Sven Oehme <oehmes@de.ibm.com> 2006
+ *
+ * Major code contributions by Peter Somogyi <psomogyi@gamax.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "includes.h"
+#include <system/network.h>
+#include <assert.h>
+#include "ctdb_private.h"
+#include "ibwrapper.h"
+#include "ibw_ctdb.h"
+
+/*
+ * Bind the ibw context kept in ctdb->private_data to ctdb's own address
+ * and start listening on it.
+ *
+ * backlog is passed through to ibw_listen().
+ * Returns 0 on success, -1 if address resolution, bind or listen fails.
+ */
+static int ctdb_ibw_listen(struct ctdb_context *ctdb, int backlog)
+{
+       struct sockaddr_in local;
+       struct ibw_ctx *ibctx;
+
+       ibctx = talloc_get_type(ctdb->private_data, struct ibw_ctx);
+       assert(ibctx!=NULL);
+
+       memset(&local, 0, sizeof(local));
+       local.sin_family = PF_INET;
+       local.sin_port = htons(ctdb->address.port);
+       if (ctdb_ibw_get_address(ctdb, ctdb->address.address, &local.sin_addr) != 0) {
+               return -1;
+       }
+
+       if (ibw_bind(ibctx, &local) != 0) {
+               DEBUG(DEBUG_CRIT, ("ctdb_ibw_listen: ibw_bind failed\n"));
+               return -1;
+       }
+
+       if (ibw_listen(ibctx, backlog) != 0) {
+               DEBUG(DEBUG_CRIT, ("ctdb_ibw_listen: ibw_listen failed\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * initialise ibw portion of a ctdb node
+ *
+ * Allocates the per-node ctdb_ibw_node (as a talloc child of node), creates
+ * its connection object with ibw_conn_new() and stores the structure in
+ * node->private_data.
+ *
+ * Returns 0 on success, -1 if either allocation fails.
+ */
+static int ctdb_ibw_add_node(struct ctdb_node *node)
+{
+       struct ibw_ctx *ictx = talloc_get_type(node->ctdb->private_data, struct ibw_ctx);
+       struct ctdb_ibw_node *cn = talloc_zero(node, struct ctdb_ibw_node);
+
+       /* report out-of-memory instead of relying on assert(), which is
+        * compiled out with NDEBUG and would lead to a NULL dereference */
+       CTDB_NO_MEMORY(node->ctdb, cn);
+
+       cn->conn = ibw_conn_new(ictx, node);
+       node->private_data = (void *)cn;
+
+       if (cn->conn == NULL) {
+               DEBUG(DEBUG_ERR, ("ctdb_ibw_add_node: ibw_conn_new failed\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * initialise infiniband
+ *
+ * Sets up the ibw context, attaches ibw state to every node in
+ * ctdb->nodes and finally listens on ctdb's own address.
+ * Returns 0 on success; the result of ctdb_ibw_init() if that fails,
+ * otherwise -1 on failure.
+ */
+static int ctdb_ibw_initialise(struct ctdb_context *ctdb)
+{
+       int idx;
+       int rc;
+
+       rc = ctdb_ibw_init(ctdb);
+       if (rc != 0) {
+               return rc;
+       }
+
+       for (idx = 0; idx < ctdb->num_nodes; idx++) {
+               if (ctdb_ibw_add_node(ctdb->nodes[idx]) == 0) {
+                       continue;
+               }
+               DEBUG(DEBUG_CRIT, ("methods->add_node failed at %d\n", idx));
+               return -1;
+       }
+
+       /* listen on our own address */
+       /* TODO: backlog as param */
+       if (ctdb_ibw_listen(ctdb, 10) != 0) {
+               return -1;
+       }
+
+       return 0;
+}
+
+
+/*
+ * Start infiniband
+ *
+ * Starts an outgoing connection to every node whose address differs from
+ * ctdb's own address. The result of each connect attempt is not checked
+ * here (failed ibw_connect calls are retried by ctdb_ibw_node_connect).
+ * Always returns 0.
+ */
+static int ctdb_ibw_start(struct ctdb_context *ctdb)
+{
+       int i;
+
+       /* everything async here */
+       for (i=0;i<ctdb->num_nodes;i++) {
+               struct ctdb_node *node = ctdb->nodes[i];
+               if (!ctdb_same_address(&ctdb->address, &node->address)) {
+                       ctdb_ibw_node_connect(node);
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Copy one packet into a send buffer obtained from ibw_alloc_send_buf()
+ * and submit it with ibw_send().
+ * Returns -1 if no send buffer could be obtained, otherwise the result of
+ * ibw_send().
+ */
+static int ctdb_ibw_send_pkt(struct ibw_conn *conn, uint8_t *data, uint32_t length)
+{
+       void    *payload;
+       void    *wr_key;
+       int     rc;
+
+       if (ibw_alloc_send_buf(conn, &payload, &wr_key, length)) {
+               DEBUG(DEBUG_ERR, ("queue_pkt/ibw_alloc_send_buf failed\n"));
+               return -1;
+       }
+
+       memcpy(payload, data, length);
+       rc = ibw_send(conn, payload, wr_key, length);
+
+       return rc;
+}
+
+/*
+ * Send every packet queued on cn, oldest first, removing and freeing each
+ * one after it has been handed to ctdb_ibw_send_pkt().
+ * Returns 0 when the queue has been emptied, -1 as soon as a send fails
+ * (the failing packet and everything behind it stay queued).
+ */
+int ctdb_flush_cn_queue(struct ctdb_ibw_node *cn)
+{
+       struct ctdb_ibw_msg *msg;
+
+       while ((msg = cn->queue) != NULL) {
+               if (ctdb_ibw_send_pkt(cn->conn, msg->data, msg->length) != 0) {
+                       /* will be retried later when conn is up */
+                       return -1;
+               }
+
+               DLIST_REMOVE(cn->queue, msg);
+               cn->qcnt--;
+               /* it will talloc_free msg->data as well */
+               talloc_free(msg);
+       }
+
+       assert(cn->qcnt==0);
+       /* cn->queue_last = NULL is not needed - see DLIST_ADD_AFTER */
+
+       return 0;
+}
+
+/*
+ * Transport "queue_pkt" method: send a packet to a node, or queue it.
+ *
+ * If the node's connection is in state IBWC_CONNECTED the packet is sent
+ * immediately; otherwise a copy is appended to the node's queue, which
+ * ctdb_flush_cn_queue() drains later.
+ *
+ * Returns the result of ctdb_ibw_send_pkt() when sending directly, 0 when
+ * the packet was queued, -1 if the node has no connection object or memory
+ * for the queued copy could not be allocated.
+ */
+static int ctdb_ibw_queue_pkt(struct ctdb_node *node, uint8_t *data, uint32_t length)
+{
+       struct ctdb_ibw_node *cn = talloc_get_type(node->private_data, struct ctdb_ibw_node);
+       int     rc;
+
+       assert(length>=sizeof(uint32_t));
+       assert(cn!=NULL);
+
+       if (cn->conn==NULL) {
+               DEBUG(DEBUG_ERR, ("ctdb_ibw_queue_pkt: conn is NULL\n"));
+               return -1;
+       }
+
+       if (cn->conn->state==IBWC_CONNECTED) {
+               rc = ctdb_ibw_send_pkt(cn->conn, data, length);
+       } else {
+               struct ctdb_ibw_msg *p = talloc_zero(cn, struct ctdb_ibw_msg);
+               uint8_t *data_copy;
+
+               CTDB_NO_MEMORY(node->ctdb, p);
+
+               data_copy = talloc_memdup(p, data, length);
+               if (data_copy == NULL) {
+                       /* do not leave the half-built message hanging
+                        * off cn until the node itself is freed */
+                       talloc_free(p);
+               }
+               CTDB_NO_MEMORY(node->ctdb, data_copy);
+               p->data = data_copy;
+
+               p->length = length;
+
+               DLIST_ADD_AFTER(cn->queue, p, cn->queue_last);
+               cn->queue_last = p;
+               cn->qcnt++;
+
+               rc = 0;
+       }
+
+       return rc;
+}
+
+/*
+ * Transport "restart" method. Not implemented for infiniband: the node
+ * argument is ignored and only a warning is logged.
+ */
+static void ctdb_ibw_restart(struct ctdb_node *node)
+{
+       /* TODO: implement this method for IB */
+       DEBUG(DEBUG_ALERT,("WARNING: method restart is not yet implemented for IB\n"));
+       return;
+}
+
+/*
+ * Transport "allocate_pkt" method: gives the transport control over packet
+ * memory. For now this is a plain talloc allocation of size bytes on
+ * mem_ctx.
+ */
+static void *ctdb_ibw_allocate_pkt(TALLOC_CTX *mem_ctx, size_t size)
+{
+       void *pkt;
+
+       /* TODO: use ibw_alloc_send_buf instead... */
+       pkt = talloc_size(mem_ctx, size);
+
+       return pkt;
+}
+
+#ifdef __NOTDEF__
+
+/*
+ * Stop the ibw context stored in cctx->private_data.
+ * Returns the result of ibw_stop().
+ */
+static int ctdb_ibw_stop(struct ctdb_context *cctx)
+{
+       struct ibw_ctx *ibctx;
+
+       ibctx = talloc_get_type(cctx->private_data, struct ibw_ctx);
+       assert(ibctx!=NULL);
+
+       return ibw_stop(ibctx);
+}
+
+#endif /* __NOTDEF__ */
+
+/*
+ * Transport method table for infiniband; installed into ctdb->methods by
+ * ctdb_ibw_init(). The stop method is left out because ctdb_ibw_stop() is
+ * compiled out (see the __NOTDEF__ block).
+ */
+static const struct ctdb_methods ctdb_ibw_methods = {
+       .initialise= ctdb_ibw_initialise,
+       .start     = ctdb_ibw_start,
+       .queue_pkt = ctdb_ibw_queue_pkt,
+       .add_node = ctdb_ibw_add_node,
+       .allocate_pkt = ctdb_ibw_allocate_pkt,
+       .restart      = ctdb_ibw_restart,
+
+//     .stop = ctdb_ibw_stop
+};
+
+/*
+ * initialise ibw portion of ctdb
+ *
+ * Creates the ibw context with ctdb as its userdata and the ctdb
+ * connection-state and receive handlers as callbacks, then installs the
+ * infiniband method table and stores the context in ctdb->private_data.
+ * Returns 0 on success, -1 if ibw_init() fails.
+ */
+int ctdb_ibw_init(struct ctdb_context *ctdb)
+{
+       struct ibw_ctx *ibctx;
+
+       DEBUG(DEBUG_DEBUG, ("ctdb_ibw_init invoked...\n"));
+
+       /* TODO: pass real init attributes instead of (NULL, 0) */
+       ibctx = ibw_init(NULL, 0, ctdb,
+                        ctdb_ibw_connstate_handler,
+                        ctdb_ibw_receive_handler,
+                        ctdb->ev);
+       if (ibctx != NULL) {
+               ctdb->methods = &ctdb_ibw_methods;
+               ctdb->private_data = ibctx;
+
+               DEBUG(DEBUG_DEBUG, ("ctdb_ibw_init succeeded.\n"));
+               return 0;
+       }
+
+       DEBUG(DEBUG_CRIT, ("ctdb_ibw_init: ibw_init failed\n"));
+       return -1;
+}
diff --git a/ctdb/ib/ibwrapper.c b/ctdb/ib/ibwrapper.c
new file mode 100644 (file)
index 0000000..f6e7168
--- /dev/null
@@ -0,0 +1,1365 @@
+/*
+ * Unix SMB/CIFS implementation.
+ * Wrap Infiniband calls.
+ *
+ * Copyright (C) Sven Oehme <oehmes@de.ibm.com> 2006
+ *
+ * Major code contributions by Peter Somogyi <psomogyi@gamax.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <malloc.h>
+#include <assert.h>
+#include <unistd.h>
+
+#include "includes.h"
+#include "lib/events/events.h"
+#include "ibwrapper.h"
+
+#include <infiniband/kern-abi.h>
+#include <rdma/rdma_cma_abi.h>
+#include <rdma/rdma_cma.h>
+
+#include "ibwrapper_internal.h"
+#include "lib/util/dlinklist.h"
+
+#define IBW_LASTERR_BUFSIZE 512
+static char ibw_lasterr[IBW_LASTERR_BUFSIZE];
+
+#define IBW_MAX_SEND_WR 256
+#define IBW_MAX_RECV_WR 1024
+#define IBW_RECV_BUFSIZE 256
+#define IBW_RECV_THRESHOLD (1 * 1024 * 1024)
+
+static void ibw_event_handler_verbs(struct event_context *ev,
+       struct fd_event *fde, uint16_t flags, void *private_data);
+static int ibw_fill_cq(struct ibw_conn *conn);
+static int ibw_wc_recv(struct ibw_conn *conn, struct ibv_wc *wc);
+static int ibw_wc_send(struct ibw_conn *conn, struct ibv_wc *wc);
+static int ibw_send_packet(struct ibw_conn *conn, void *buf, struct ibw_wr *p, uint32_t len);
+
+/*
+ * allocate an n byte buffer aligned to pctx->pagesize and register it as a
+ * memory region in pconn->pd with local write access
+ *
+ * Returns the buffer and stores the region in *ppmr. On failure returns
+ * NULL with ibw_lasterr set; a buffer that could not be registered is freed.
+ */
+static void *ibw_alloc_mr(struct ibw_ctx_priv *pctx, struct ibw_conn_priv *pconn,
+       uint32_t n, struct ibv_mr **ppmr)
+{
+       void *mem;
+       struct ibv_mr *mr;
+
+       DEBUG(DEBUG_DEBUG, ("ibw_alloc_mr(cmid=%p, n=%u)\n", pconn->cm_id, n));
+
+       mem = memalign(pctx->pagesize, n);
+       if (mem == NULL) {
+               sprintf(ibw_lasterr, "couldn't allocate memory\n");
+               return NULL;
+       }
+
+       mr = ibv_reg_mr(pconn->pd, mem, n, IBV_ACCESS_LOCAL_WRITE);
+       *ppmr = mr;
+       if (mr != NULL) {
+               return mem;
+       }
+
+       sprintf(ibw_lasterr, "couldn't allocate mr\n");
+       free(mem);
+       return NULL;
+}
+
+/*
+ * release a buffer / memory region pair: deregister *ppmr, free() *ppbuf
+ * and reset both to NULL; a member that is already NULL is skipped
+ */
+static void ibw_free_mr(char **ppbuf, struct ibv_mr **ppmr)
+{
+       struct ibv_mr *mr = *ppmr;
+       char *buf = *ppbuf;
+
+       DEBUG(DEBUG_DEBUG, ("ibw_free_mr(%p %p)\n", buf, mr));
+
+       if (mr != NULL) {
+               ibv_dereg_mr(mr);
+               *ppmr = NULL;
+       }
+
+       if (buf != NULL) {
+               free(buf);
+               *ppbuf = NULL;
+       }
+}
+
+/*
+ * allocate and register the send and receive buffers of a connection and
+ * build its pool of send work requests
+ *
+ * The send buffer holds max_send_wr slots and the receive buffer
+ * max_recv_wr slots of recv_bufsize bytes each. Every send slot gets a
+ * struct ibw_wr that is reachable through pconn->wr_index[] and linked into
+ * pconn->wr_list_avail. All talloc allocations are children of pconn.
+ *
+ * returns 0 on success, -1 on failure with ibw_lasterr set
+ */
+static int ibw_init_memory(struct ibw_conn *conn)
+{
+       struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv);
+       struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
+       struct ibw_opts *opts = &pctx->opts;
+       int     i;
+       struct ibw_wr   *p;
+
+       DEBUG(DEBUG_DEBUG, ("ibw_init_memory(cmid: %p)\n", pconn->cm_id));
+       pconn->buf_send = ibw_alloc_mr(pctx, pconn,
+               opts->max_send_wr * opts->recv_bufsize, &pconn->mr_send);
+       if (!pconn->buf_send) {
+               sprintf(ibw_lasterr, "couldn't allocate work send buf\n");
+               return -1;
+       }
+
+       pconn->buf_recv = ibw_alloc_mr(pctx, pconn,
+               opts->max_recv_wr * opts->recv_bufsize, &pconn->mr_recv);
+       if (!pconn->buf_recv) {
+               sprintf(ibw_lasterr, "couldn't allocate work recv buf\n");
+               return -1;
+       }
+
+       /*
+        * zeroed and overflow-checked index; report failure instead of
+        * relying on assert(), which is compiled out with NDEBUG
+        */
+       pconn->wr_index = talloc_zero_array(pconn, struct ibw_wr *, opts->max_send_wr);
+       if (pconn->wr_index == NULL) {
+               sprintf(ibw_lasterr, "couldn't allocate wr_index\n");
+               return -1;
+       }
+
+       for(i=0; i<opts->max_send_wr; i++) {
+               p = pconn->wr_index[i] = talloc_zero(pconn, struct ibw_wr);
+               if (p == NULL) {
+                       sprintf(ibw_lasterr, "couldn't allocate wr #%d\n", i);
+                       return -1;
+               }
+               /* slot i of the registered send buffer belongs to wr i */
+               p->buf = pconn->buf_send + (i * opts->recv_bufsize);
+               p->wr_id = i;
+
+               DLIST_ADD(pconn->wr_list_avail, p);
+       }
+
+       return 0;
+}
+
+/*
+ * talloc destructor of the private ibw context: releases the connection
+ * manager resources held in pctx
+ *
+ * Order matters here:
+ *  - the fd event (presumably watching the cm channel's fd - confirm at
+ *    the place it is registered) is removed first, so the event loop never
+ *    keeps a reference to an fd that has already been closed;
+ *  - the context's cm_id is destroyed before the event channel, because
+ *    rdma_destroy_event_channel() requires every rdma_cm_id associated
+ *    with the channel to be destroyed beforehand.
+ *
+ * always returns 0, so talloc goes on freeing pctx
+ */
+static int ibw_ctx_priv_destruct(struct ibw_ctx_priv *pctx)
+{
+       DEBUG(DEBUG_DEBUG, ("ibw_ctx_priv_destruct(%p)\n", pctx));
+
+       /* the fd event must go before the fd it watches is closed */
+       if (pctx->cm_channel_event) {
+               talloc_free(pctx->cm_channel_event);
+               pctx->cm_channel_event = NULL;
+       }
+
+       /* destroy cm: id first, then the channel it reports on */
+       if (pctx->cm_id) {
+               rdma_destroy_id(pctx->cm_id);
+               pctx->cm_id = NULL;
+       }
+       if (pctx->cm_channel) {
+               rdma_destroy_event_channel(pctx->cm_channel);
+               pctx->cm_channel = NULL;
+       }
+
+       return 0;
+}
+
+/*
+ * talloc destructor of struct ibw_ctx: only traces the call and returns 0
+ */
+static int ibw_ctx_destruct(struct ibw_ctx *ctx)
+{
+       DEBUG(DEBUG_DEBUG,
+             ("ibw_ctx_destruct(%p)\n", ctx));
+
+       return 0;
+}
+
+/*
+ * talloc destructor of the private connection data: releases everything
+ * that talloc cannot free by itself
+ *
+ * Teardown order: fd event of the completion channel, queue pair,
+ * completion queue, completion channel, memory regions, protection domain
+ * and finally the cm_id. The fd event is registered on
+ * pconn->verbs_channel->fd with a NULL talloc parent, so it has to be freed
+ * explicitly, and it has to be freed before the channel (and with it the
+ * fd) is destroyed.
+ *
+ * always returns 0, so talloc goes on freeing pconn
+ */
+static int ibw_conn_priv_destruct(struct ibw_conn_priv *pconn)
+{
+       DEBUG(DEBUG_DEBUG, ("ibw_conn_priv_destruct(%p, cmid: %p)\n",
+               pconn, pconn->cm_id));
+
+       /* pconn->wr_index is freed by talloc */
+       /* pconn->wr_index[i] are freed by talloc */
+
+       /* the fd event must be removed before its fd is closed below */
+       if (pconn->verbs_channel_event) {
+               talloc_free(pconn->verbs_channel_event);
+               pconn->verbs_channel_event = NULL;
+       }
+
+       /* destroy verbs */
+       if (pconn->cm_id!=NULL && pconn->cm_id->qp!=NULL) {
+               rdma_destroy_qp(pconn->cm_id);
+               pconn->cm_id->qp = NULL;
+       }
+
+       if (pconn->cq!=NULL) {
+               ibv_destroy_cq(pconn->cq);
+               pconn->cq = NULL;
+       }
+
+       if (pconn->verbs_channel!=NULL) {
+               ibv_destroy_comp_channel(pconn->verbs_channel);
+               pconn->verbs_channel = NULL;
+       }
+
+       /* free memory regions */
+       ibw_free_mr(&pconn->buf_send, &pconn->mr_send);
+       ibw_free_mr(&pconn->buf_recv, &pconn->mr_recv);
+
+       if (pconn->pd) {
+               ibv_dealloc_pd(pconn->pd);
+               pconn->pd = NULL;
+               DEBUG(DEBUG_DEBUG, ("pconn=%p pd deallocated\n", pconn));
+       }
+
+       if (pconn->cm_id) {
+               rdma_destroy_id(pconn->cm_id);
+               pconn->cm_id = NULL;
+               DEBUG(DEBUG_DEBUG, ("pconn=%p cm_id destroyed\n", pconn));
+       }
+
+       return 0;
+}
+
+/*
+ * destructor of a work request: releases the large buffer and its memory
+ * region if one is attached; always returns 0
+ */
+static int ibw_wr_destruct(struct ibw_wr *wr)
+{
+       if (wr->buf_large == NULL) {
+               return 0;
+       }
+
+       ibw_free_mr(&wr->buf_large, &wr->mr_large);
+       return 0;
+}
+
+/*
+ * talloc destructor of a connection: unlinks it from the connection list
+ * of its context; always returns 0
+ */
+static int ibw_conn_destruct(struct ibw_conn *conn)
+{
+       struct ibw_ctx *ctx;
+
+       DEBUG(DEBUG_DEBUG, ("ibw_conn_destruct(%p)\n", conn));
+
+       /* important here: ctx is a talloc _parent_ */
+       ctx = conn->ctx;
+       DLIST_REMOVE(ctx->conn_list, conn);
+
+       return 0;
+}
+
+/*
+ * create a new, zeroed connection object as a talloc child of mem_ctx
+ *
+ * The private part (struct ibw_conn_priv) is allocated as a child of the
+ * connection, both get their destructors, and the connection is linked
+ * into ctx->conn_list. Allocation failures are only caught by assert().
+ */
+struct ibw_conn *ibw_conn_new(struct ibw_ctx *ctx, TALLOC_CTX *mem_ctx)
+{
+       struct ibw_conn_priv *pconn;
+       struct ibw_conn *conn;
+
+       assert(ctx!=NULL);
+
+       /* public part */
+       conn = talloc_zero(mem_ctx, struct ibw_conn);
+       assert(conn!=NULL);
+       talloc_set_destructor(conn, ibw_conn_destruct);
+
+       /* private part, owned by the public one */
+       pconn = talloc_zero(conn, struct ibw_conn_priv);
+       assert(pconn!=NULL);
+       talloc_set_destructor(pconn, ibw_conn_priv_destruct);
+
+       conn->internal = (void *)pconn;
+       conn->ctx = ctx;
+
+       DLIST_ADD(ctx->conn_list, conn);
+
+       return conn;
+}
+
+/*
+ * create the verbs resources of a connection whose cm_id is already set
+ *
+ * In this order: completion channel plus the fd event that feeds
+ * ibw_event_handler_verbs, protection domain, registered send/recv memory
+ * (ibw_init_memory), completion queue sized for max_recv_wr + max_send_wr
+ * entries with notification armed, and an RC queue pair using that one cq
+ * for both directions. Finally all receive work requests are posted with
+ * ibw_fill_cq().
+ *
+ * returns 0 on success, non-zero on failure with ibw_lasterr set.
+ * Nothing is rolled back here; resources created so far stay attached to
+ * pconn and are released by its destructor.
+ */
+static int ibw_setup_cq_qp(struct ibw_conn *conn)
+{
+       struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv);
+       struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
+       struct ibv_qp_init_attr init_attr;
+       struct ibv_qp_attr attr;
+       int rc;
+
+       DEBUG(DEBUG_DEBUG, ("ibw_setup_cq_qp(cmid: %p)\n", pconn->cm_id));
+
+       /* init verbs */
+       pconn->verbs_channel = ibv_create_comp_channel(pconn->cm_id->verbs);
+       if (!pconn->verbs_channel) {
+               sprintf(ibw_lasterr, "ibv_create_comp_channel failed %d\n", errno);
+               return -1;
+       }
+       DEBUG(DEBUG_DEBUG, ("created channel %p\n", pconn->verbs_channel));
+
+       pconn->verbs_channel_event = event_add_fd(pctx->ectx, NULL, /* not pconn or conn */
+               pconn->verbs_channel->fd, EVENT_FD_READ, ibw_event_handler_verbs, conn);
+       if (!pconn->verbs_channel_event) {
+               /* without the fd event no completion would ever be seen */
+               sprintf(ibw_lasterr, "event_add_fd failed for verbs channel\n");
+               return -1;
+       }
+
+       pconn->pd = ibv_alloc_pd(pconn->cm_id->verbs);
+       if (!pconn->pd) {
+               sprintf(ibw_lasterr, "ibv_alloc_pd failed %d\n", errno);
+               return -1;
+       }
+       DEBUG(DEBUG_DEBUG, ("created pd %p\n", pconn->pd));
+
+       /* init mr */
+       if (ibw_init_memory(conn))
+               return -1;
+
+       /* init cq */
+       pconn->cq = ibv_create_cq(pconn->cm_id->verbs,
+               pctx->opts.max_recv_wr + pctx->opts.max_send_wr,
+               conn, pconn->verbs_channel, 0);
+       if (pconn->cq==NULL) {
+               sprintf(ibw_lasterr, "ibv_create_cq failed\n");
+               return -1;
+       }
+
+       rc = ibv_req_notify_cq(pconn->cq, 0);
+       if (rc) {
+               sprintf(ibw_lasterr, "ibv_req_notify_cq failed with %d\n", rc);
+               return rc;
+       }
+
+       /* init qp */
+       memset(&init_attr, 0, sizeof(init_attr));
+       init_attr.cap.max_send_wr = pctx->opts.max_send_wr;
+       init_attr.cap.max_recv_wr = pctx->opts.max_recv_wr;
+       init_attr.cap.max_recv_sge = 1;
+       init_attr.cap.max_send_sge = 1;
+       init_attr.qp_type = IBV_QPT_RC;
+       init_attr.send_cq = pconn->cq;
+       init_attr.recv_cq = pconn->cq;
+
+       rc = rdma_create_qp(pconn->cm_id, pconn->pd, &init_attr);
+       if (rc) {
+               sprintf(ibw_lasterr, "rdma_create_qp failed with %d\n", rc);
+               return rc;
+       }
+       /* else result is in pconn->cm_id->qp */
+
+       rc = ibv_query_qp(pconn->cm_id->qp, &attr, IBV_QP_PATH_MTU, &init_attr);
+       if (rc) {
+               sprintf(ibw_lasterr, "ibv_query_qp failed with %d\n", rc);
+               return rc;
+       }
+
+       return ibw_fill_cq(conn);
+}
+
+/*
+ * post one receive work request for the slot at pconn->recv_index
+ *
+ * The slot is recv_bufsize bytes inside the registered receive buffer; its
+ * index doubles as the wr_id. recv_index is advanced afterwards and wraps
+ * at max_recv_wr.
+ *
+ * returns 0 on success, -2 if ibv_post_recv fails (ibw_lasterr is set and
+ * logged)
+ */
+static int ibw_refill_cq_recv(struct ibw_conn *conn)
+{
+       struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv);
+       struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
+       int     rc;
+       struct ibv_sge list = {
+               .addr   = (uintptr_t) NULL, /* filled below */
+               .length = pctx->opts.recv_bufsize,
+               .lkey   = pconn->mr_recv->lkey /* always the same */
+       };
+       struct ibv_recv_wr wr = {
+               .wr_id      = 0, /* filled below */
+               .sg_list    = &list,
+               .num_sge    = 1,
+       };
+       struct ibv_recv_wr *bad_wr;
+
+       DEBUG(DEBUG_DEBUG, ("ibw_refill_cq_recv(cmid: %p)\n", pconn->cm_id));
+
+       list.addr = (uintptr_t) pconn->buf_recv + pctx->opts.recv_bufsize * pconn->recv_index;
+       wr.wr_id = pconn->recv_index;
+       pconn->recv_index = (pconn->recv_index + 1) % pctx->opts.max_recv_wr;
+
+       rc = ibv_post_recv(pconn->cm_id->qp, &wr, &bad_wr);
+       if (rc) {
+               sprintf(ibw_lasterr, "refill/ibv_post_recv failed with %d\n", rc);
+               /* never use the message itself as the format string */
+               DEBUG(DEBUG_ERR, ("%s", ibw_lasterr));
+               return -2;
+       }
+
+       return 0;
+}
+
+/*
+ * post max_recv_wr receive work requests, one per slot of the registered
+ * receive buffer
+ *
+ * Each request covers recv_bufsize bytes and carries the slot index as its
+ * wr_id; pconn->recv_index is advanced (wrapping at max_recv_wr) for every
+ * request posted.
+ *
+ * returns 0 on success, -2 as soon as one ibv_post_recv fails (ibw_lasterr
+ * is set and logged)
+ */
+static int ibw_fill_cq(struct ibw_conn *conn)
+{
+       struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv);
+       struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
+       int     i, rc;
+       struct ibv_sge list = {
+               .addr   = (uintptr_t) NULL, /* filled below */
+               .length = pctx->opts.recv_bufsize,
+               .lkey   = pconn->mr_recv->lkey /* always the same */
+       };
+       struct ibv_recv_wr wr = {
+               .wr_id      = 0, /* filled below */
+               .sg_list    = &list,
+               .num_sge    = 1,
+       };
+       struct ibv_recv_wr *bad_wr;
+
+       DEBUG(DEBUG_DEBUG, ("ibw_fill_cq(cmid: %p)\n", pconn->cm_id));
+
+       for(i = pctx->opts.max_recv_wr; i!=0; i--) {
+               list.addr = (uintptr_t) pconn->buf_recv + pctx->opts.recv_bufsize * pconn->recv_index;
+               wr.wr_id = pconn->recv_index;
+               pconn->recv_index = (pconn->recv_index + 1) % pctx->opts.max_recv_wr;
+
+               rc = ibv_post_recv(pconn->cm_id->qp, &wr, &bad_wr);
+               if (rc) {
+                       sprintf(ibw_lasterr, "fill/ibv_post_recv failed with %d\n", rc);
+                       /* never use the message itself as the format string */
+                       DEBUG(DEBUG_ERR, ("%s", ibw_lasterr));
+                       return -2;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * client side connect step: set up the verbs resources of the connection
+ * and issue rdma_connect() on its cm_id
+ *
+ * returns 0 on success, -1 if ibw_setup_cq_qp() fails, otherwise the
+ * non-zero result of rdma_connect() (ibw_lasterr is set in that case)
+ */
+static int ibw_manage_connect(struct ibw_conn *conn)
+{
+       struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
+       struct rdma_conn_param conn_param;
+       int     rc;
+
+       DEBUG(DEBUG_DEBUG, ("ibw_manage_connect(cmid: %p)\n", pconn->cm_id));
+
+       rc = ibw_setup_cq_qp(conn);
+       if (rc != 0) {
+               return -1;
+       }
+
+       /* cm connect */
+       memset(&conn_param, 0, sizeof(conn_param));
+       conn_param.retry_count = 10;
+       conn_param.initiator_depth = 1;
+       conn_param.responder_resources = 1;
+
+       rc = rdma_connect(pconn->cm_id, &conn_param);
+       if (rc == 0) {
+               return 0;
+       }
+
+       sprintf(ibw_lasterr, "rdma_connect error %d\n", rc);
+       return rc;
+}
+
+/*
+ * fd event handler of the connection manager (cm) event channel
+ *
+ * Fetches one event from pctx->cm_channel, dispatches on its type and acks
+ * it. private_data is the struct ibw_ctx owning the channel; state changes
+ * are reported to the upper layer through pctx->connstate_func.
+ *
+ * On failure the handler jumps to "error": the connection the event
+ * belongs to - or the context itself if the event carries the context's
+ * own cm_id - is put into its error state, the upper layer is notified and
+ * the event is still acked.
+ */
+static void ibw_event_handler_cm(struct event_context *ev,
+       struct fd_event *fde, uint16_t flags, void *private_data)
+{
+       int     rc;
+       struct ibw_ctx  *ctx = talloc_get_type(private_data, struct ibw_ctx);
+       struct ibw_ctx_priv *pctx = talloc_get_type(ctx->internal, struct ibw_ctx_priv);
+       struct ibw_conn *conn = NULL;
+       struct ibw_conn_priv *pconn = NULL;
+       struct rdma_cm_id *cma_id = NULL;
+       struct rdma_cm_event *event = NULL;
+
+       assert(ctx!=NULL);
+
+       rc = rdma_get_cm_event(pctx->cm_channel, &event);
+       if (rc) {
+               ctx->state = IBWS_ERROR;
+               event = NULL;
+               sprintf(ibw_lasterr, "rdma_get_cm_event error %d\n", rc);
+               goto error;
+       }
+       cma_id = event->id;
+
+       DEBUG(DEBUG_DEBUG, ("cma_event type %d cma_id %p (%s)\n", event->event, cma_id,
+                 (cma_id == pctx->cm_id) ? "parent" : "child"));
+
+       switch (event->event) {
+       case RDMA_CM_EVENT_ADDR_RESOLVED:
+               DEBUG(DEBUG_DEBUG, ("RDMA_CM_EVENT_ADDR_RESOLVED\n"));
+               /* continuing from ibw_connect ... */
+               rc = rdma_resolve_route(cma_id, 2000);
+               if (rc) {
+                       sprintf(ibw_lasterr, "rdma_resolve_route error %d\n", rc);
+                       goto error;
+               }
+               /* continued at RDMA_CM_EVENT_ROUTE_RESOLVED */
+               break;
+
+       case RDMA_CM_EVENT_ROUTE_RESOLVED:
+               DEBUG(DEBUG_DEBUG, ("RDMA_CM_EVENT_ROUTE_RESOLVED\n"));
+               /* after RDMA_CM_EVENT_ADDR_RESOLVED: */
+               assert(cma_id->context!=NULL);
+               conn = talloc_get_type(cma_id->context, struct ibw_conn);
+
+               rc = ibw_manage_connect(conn);
+               if (rc)
+                       goto error;
+
+               break;
+
+       case RDMA_CM_EVENT_CONNECT_REQUEST:
+               DEBUG(DEBUG_DEBUG, ("RDMA_CM_EVENT_CONNECT_REQUEST\n"));
+               ctx->state = IBWS_CONNECT_REQUEST;
+               conn = ibw_conn_new(ctx, ctx);
+               pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
+               pconn->cm_id = cma_id; /* !!! event will be freed but id not */
+               cma_id->context = (void *)conn;
+               DEBUG(DEBUG_DEBUG, ("pconn->cm_id %p\n", pconn->cm_id));
+
+               if (ibw_setup_cq_qp(conn))
+                       goto error;
+
+               conn->state = IBWC_INIT;
+               pctx->connstate_func(ctx, conn);
+
+               /* continued at ibw_accept when invoked by the func above */
+               if (!pconn->is_accepted) {
+                       rc = rdma_reject(cma_id, NULL, 0);
+                       if (rc)
+                               DEBUG(DEBUG_ERR, ("rdma_reject failed with rc=%d\n", rc));
+                       /* log first: pconn is a talloc child of conn and dies with it */
+                       DEBUG(DEBUG_DEBUG, ("pconn->cm_id %p wasn't accepted\n", pconn->cm_id));
+                       talloc_free(conn);
+               }
+
+               /* TODO: clarify whether if it's needed by upper layer: */
+               ctx->state = IBWS_READY;
+               pctx->connstate_func(ctx, NULL);
+
+               /* NOTE: more requests can arrive until RDMA_CM_EVENT_ESTABLISHED ! */
+               break;
+
+       case RDMA_CM_EVENT_ESTABLISHED:
+               /* expected after ibw_accept and ibw_connect[not directly] */
+               DEBUG(DEBUG_INFO, ("ESTABLISHED (conn: %p)\n", cma_id->context));
+               conn = talloc_get_type(cma_id->context, struct ibw_conn);
+               assert(conn!=NULL); /* important assumption */
+
+               DEBUG(DEBUG_DEBUG, ("ibw_setup_cq_qp succeeded (cmid=%p)\n", cma_id));
+
+               /* client conn is up */
+               conn->state = IBWC_CONNECTED;
+
+               /* both ctx and conn have changed */
+               pctx->connstate_func(ctx, conn);
+               break;
+
+       /*
+        * every error event records its own message and bails out; falling
+        * through would overwrite it with the message of the last case
+        */
+       case RDMA_CM_EVENT_ADDR_ERROR:
+               sprintf(ibw_lasterr, "RDMA_CM_EVENT_ADDR_ERROR, error %d\n", event->status);
+               goto error;
+       case RDMA_CM_EVENT_ROUTE_ERROR:
+               sprintf(ibw_lasterr, "RDMA_CM_EVENT_ROUTE_ERROR, error %d\n", event->status);
+               goto error;
+       case RDMA_CM_EVENT_CONNECT_ERROR:
+               sprintf(ibw_lasterr, "RDMA_CM_EVENT_CONNECT_ERROR, error %d\n", event->status);
+               goto error;
+       case RDMA_CM_EVENT_UNREACHABLE:
+               sprintf(ibw_lasterr, "RDMA_CM_EVENT_UNREACHABLE, error %d\n", event->status);
+               goto error;
+       case RDMA_CM_EVENT_REJECTED:
+               sprintf(ibw_lasterr, "RDMA_CM_EVENT_REJECTED, error %d\n", event->status);
+               DEBUG(DEBUG_INFO, ("cm event handler: %s", ibw_lasterr));
+               conn = talloc_get_type(cma_id->context, struct ibw_conn);
+               if (conn) {
+                       /* must be done BEFORE connstate */
+                       if ((rc=rdma_ack_cm_event(event)))
+                               DEBUG(DEBUG_ERR, ("reject/rdma_ack_cm_event failed with %d\n", rc));
+                       event = NULL; /* not to touch cma_id or conn */
+                       conn->state = IBWC_ERROR;
+                       /* it should free the conn */
+                       pctx->connstate_func(NULL, conn);
+               }
+               break; /* this is not strictly an error */
+
+       case RDMA_CM_EVENT_DISCONNECTED:
+               DEBUG(DEBUG_DEBUG, ("RDMA_CM_EVENT_DISCONNECTED\n"));
+               if ((rc=rdma_ack_cm_event(event)))
+                       DEBUG(DEBUG_ERR, ("disc/rdma_ack_cm_event failed with %d\n", rc));
+               event = NULL; /* don't ack more */
+
+               if (cma_id!=pctx->cm_id) {
+                       DEBUG(DEBUG_ERR, ("client DISCONNECT event cm_id=%p\n", cma_id));
+                       conn = talloc_get_type(cma_id->context, struct ibw_conn);
+                       /* an id without a connection attached has nobody to notify */
+                       if (conn) {
+                               conn->state = IBWC_DISCONNECTED;
+                               pctx->connstate_func(NULL, conn);
+                       }
+               }
+               break;
+
+       case RDMA_CM_EVENT_DEVICE_REMOVAL:
+               sprintf(ibw_lasterr, "cma detected device removal!\n");
+               goto error;
+
+       default:
+               sprintf(ibw_lasterr, "unknown event %d\n", event->event);
+               goto error;
+       }
+
+       if (event!=NULL && (rc=rdma_ack_cm_event(event))) {
+               sprintf(ibw_lasterr, "rdma_ack_cm_event failed with %d\n", rc);
+               goto error;
+       }
+
+       return;
+error:
+       DEBUG(DEBUG_ERR, ("cm event handler: %s", ibw_lasterr));
+
+       if (event!=NULL) {
+               if (cma_id!=NULL && cma_id!=pctx->cm_id) {
+                       conn = talloc_get_type(cma_id->context, struct ibw_conn);
+                       if (conn) {
+                               conn->state = IBWC_ERROR;
+                               pctx->connstate_func(NULL, conn);
+                       }
+               } else {
+                       ctx->state = IBWS_ERROR;
+                       pctx->connstate_func(ctx, NULL);
+               }
+
+               if ((rc=rdma_ack_cm_event(event))!=0) {
+                       DEBUG(DEBUG_ERR, ("rdma_ack_cm_event failed with %d\n", rc));
+               }
+       }
+
+       return;
+}
+
+/*
+ * fd event handler of a connection's completion channel
+ *
+ * private_data is the struct ibw_conn. The handler takes one completion
+ * event from pconn->verbs_channel, re-arms notification on the cq and then
+ * drains the cq one work completion at a time, handing send completions to
+ * ibw_wc_send() and receive completions to ibw_wc_recv().
+ *
+ * A completion event is acked exactly once, and only if one was actually
+ * delivered by ibv_get_cq_event(). On any failure ibw_lasterr is logged
+ * and, unless the connection is already in IBWC_ERROR, it is put into that
+ * state and the upper layer is notified through pctx->connstate_func.
+ */
+static void ibw_event_handler_verbs(struct event_context *ev,
+       struct fd_event *fde, uint16_t flags, void *private_data)
+{
+       struct ibw_conn *conn = talloc_get_type(private_data, struct ibw_conn);
+       struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
+       struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv);
+
+       struct ibv_wc wc;
+       int rc;
+       struct ibv_cq *ev_cq = NULL; /* stays NULL until an event was received */
+       void          *ev_ctx;
+
+       DEBUG(DEBUG_DEBUG, ("ibw_event_handler_verbs(%u)\n", (uint32_t)flags));
+
+       /* TODO: check whether if it's good to have more channels here... */
+       rc = ibv_get_cq_event(pconn->verbs_channel, &ev_cq, &ev_ctx);
+       if (rc) {
+               ev_cq = NULL; /* no event delivered, so there is nothing to ack */
+               sprintf(ibw_lasterr, "Failed to get cq_event with %d\n", rc);
+               goto error;
+       }
+       if (ev_cq != pconn->cq) {
+               sprintf(ibw_lasterr, "ev_cq(%p) != pconn->cq(%p)\n", ev_cq, pconn->cq);
+               goto error;
+       }
+       rc = ibv_req_notify_cq(pconn->cq, 0);
+       if (rc) {
+               sprintf(ibw_lasterr, "Couldn't request CQ notification (%d)\n", rc);
+               goto error;
+       }
+
+       while((rc=ibv_poll_cq(pconn->cq, 1, &wc))==1) {
+               if (wc.status) {
+                       sprintf(ibw_lasterr, "cq completion failed status=%d, opcode=%d, rc=%d\n",
+                               wc.status, wc.opcode, rc);
+                       goto error;
+               }
+
+               switch(wc.opcode) {
+               case IBV_WC_SEND:
+                       DEBUG(DEBUG_DEBUG, ("send completion\n"));
+                       if (ibw_wc_send(conn, &wc))
+                               goto error;
+                       break;
+
+               case IBV_WC_RDMA_WRITE:
+                       DEBUG(DEBUG_DEBUG, ("rdma write completion\n"));
+                       break;
+
+               case IBV_WC_RDMA_READ:
+                       DEBUG(DEBUG_DEBUG, ("rdma read completion\n"));
+                       break;
+
+               case IBV_WC_RECV:
+                       DEBUG(DEBUG_DEBUG, ("recv completion\n"));
+                       if (ibw_wc_recv(conn, &wc))
+                               goto error;
+                       break;
+
+               default:
+                       sprintf(ibw_lasterr, "unknown completion %d\n", wc.opcode);
+                       goto error;
+               }
+       }
+       if (rc!=0) {
+               sprintf(ibw_lasterr, "ibv_poll_cq error %d\n", rc);
+               goto error;
+       }
+
+       ibv_ack_cq_events(pconn->cq, 1);
+
+       return;
+error:
+       /* ack the cq the event really came from, and only if there was one */
+       if (ev_cq != NULL) {
+               ibv_ack_cq_events(ev_cq, 1);
+       }
+
+       /* never use the message itself as the format string */
+       DEBUG(DEBUG_ERR, ("%s", ibw_lasterr));
+
+       if (conn->state!=IBWC_ERROR) {
+               conn->state = IBWC_ERROR;
+               pctx->connstate_func(NULL, conn);
+       }
+}
+
+/*
+ * send the next fragment of the message at the head of pconn->queue
+ *
+ * Does nothing (returns 0) when the queue is empty. Otherwise one fragment
+ * is passed to ibw_send_packet(): recv_bufsize bytes while more fragments
+ * remain, queued_rlen bytes for the last one. The entry is removed from the
+ * queue once its last fragment has been handed over.
+ *
+ * returns the result of ibw_send_packet(); note that the queue position is
+ * advanced regardless of that result
+ */
+static int ibw_process_queue(struct ibw_conn *conn)
+{
+       struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
+       struct ibw_ctx_priv *pctx;
+       struct ibw_wr   *p;
+       int     rc;
+       uint32_t        msg_size;
+
+       if (pconn->queue==NULL)
+               return 0; /* NOP */
+
+       p = pconn->queue;
+
+       /* we must have at least 1 fragment to send */
+       assert(p->queued_ref_cnt>0);
+       p->queued_ref_cnt--;
+
+       pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv);
+       /* after the decrement: non-zero means further fragments follow this one */
+       msg_size = (p->queued_ref_cnt) ? pctx->opts.recv_bufsize : p->queued_rlen;
+
+       assert(p->queued_msg!=NULL);
+       assert(msg_size!=0);
+
+       DEBUG(DEBUG_DEBUG, ("ibw_process_queue refcnt=%d msgsize=%u\n",
+               p->queued_ref_cnt, msg_size));
+
+       rc = ibw_send_packet(conn, p->queued_msg, p, msg_size);
+
+       /* was this the last fragment? */
+       if (p->queued_ref_cnt) {
+               /* no: step to the start of the next fragment */
+               p->queued_msg += pctx->opts.recv_bufsize;
+       } else {
+               /* yes: the entry leaves the queue (linked through qprev/qnext) */
+               DLIST_REMOVE2(pconn->queue, p, qprev, qnext);
+               p->queued_msg = NULL;
+       }
+
+       return rc;
+}
+
+/*
+ * handle a send work completion
+ *
+ * The completed work request is found through wc->wr_id, which is the
+ * request's own wr_id plus max_recv_wr:
+ *  - an index below max_send_wr addresses a preallocated request in
+ *    pconn->wr_index[];
+ *  - anything above is an "extra" request, looked up by a linear search
+ *    of pconn->extra_sent.
+ * A request with a non-zero ref_cnt only has the count decremented; once it
+ * has reached zero, the large buffer (if any) is released and the request
+ * moves back from the used/sent list to the matching avail list.
+ * Finally the next queued fragment, if any, is sent via ibw_process_queue().
+ *
+ * returns -1 with ibw_lasterr set if an extra request cannot be found,
+ * otherwise the result of ibw_process_queue()
+ */
+static int ibw_wc_send(struct ibw_conn *conn, struct ibv_wc *wc)
+{
+       struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv);
+       struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
+       struct ibw_wr   *p;
+       int     send_index;
+
+       DEBUG(DEBUG_DEBUG, ("ibw_wc_send(cmid: %p, wr_id: %u, bl: %u)\n",
+               pconn->cm_id, (uint32_t)wc->wr_id, (uint32_t)wc->byte_len));
+
+       assert(pconn->cm_id->qp->qp_num==wc->qp_num);
+       /* send wr_ids start at max_recv_wr; lower ids are asserted not to occur here */
+       assert(wc->wr_id >= pctx->opts.max_recv_wr);
+       send_index = wc->wr_id - pctx->opts.max_recv_wr;
+       pconn->wr_sent--;
+
+       if (send_index < pctx->opts.max_send_wr) {
+               DEBUG(DEBUG_DEBUG, ("ibw_wc_send#1 %u\n", (int)wc->wr_id));
+               p = pconn->wr_index[send_index];
+               if (p->buf_large!=NULL) {
+                       if (p->ref_cnt) {
+                               /* awaiting more of it... */
+                               p->ref_cnt--;
+                       } else {
+                               /* last completion for the large buffer: release and recycle */
+                               ibw_free_mr(&p->buf_large, &p->mr_large);
+                               DLIST_REMOVE(pconn->wr_list_used, p);
+                               DLIST_ADD(pconn->wr_list_avail, p);
+                       }
+               } else { /* nasty - but necessary */
+                       DLIST_REMOVE(pconn->wr_list_used, p);
+                       DLIST_ADD(pconn->wr_list_avail, p);
+               }
+       } else { /* "extra" request - not optimized */
+               DEBUG(DEBUG_DEBUG, ("ibw_wc_send#2 %u\n", (int)wc->wr_id));
+               /* linear search for the request whose shifted wr_id matches */
+               for(p=pconn->extra_sent; p!=NULL; p=p->next)
+                       if ((p->wr_id + pctx->opts.max_recv_wr)==(int)wc->wr_id)
+                               break;
+               if (p==NULL) {
+                       sprintf(ibw_lasterr, "failed to find wr_id %d\n", (int)wc->wr_id);
+                               return -1;
+               }
+               if (p->ref_cnt) {
+                       p->ref_cnt--;
+               } else {
+                       ibw_free_mr(&p->buf_large, &p->mr_large);
+                       DLIST_REMOVE(pconn->extra_sent, p);
+                       DLIST_ADD(pconn->extra_avail, p);
+               }
+       }
+
+       /* a send slot may have become free: push out the next queued fragment */
+       return ibw_process_queue(conn);
+}
+
+/*
+ * append add_len bytes from *pp to the partial message buffer
+ *
+ * The buffer is a talloc child of pconn and only ever grows here: if
+ * len + add_len exceeds bufsize it is (re)allocated to exactly that size.
+ * On success *pp is advanced past the consumed bytes, part->len grows and
+ * part->to_read shrinks by add_len.
+ *
+ * "info" is only a call site tag that shows up in the log and error text.
+ *
+ * returns 0 on success, -1 with ibw_lasterr set if the allocation fails;
+ * in that case part (including an already existing buffer) is unchanged
+ */
+static int ibw_append_to_part(struct ibw_conn_priv *pconn,
+       struct ibw_part *part, char **pp, uint32_t add_len, int info)
+{
+       DEBUG(DEBUG_DEBUG, ("ibw_append_to_part: cmid=%p, (bs=%u, len=%u, tr=%u), al=%u, i=%u\n",
+               pconn->cm_id, part->bufsize, part->len, part->to_read, add_len, info));
+
+       /* allocate more if necessary - it's an "evergrowing" buffer... */
+       if (part->len + add_len > part->bufsize) {
+               void *newbuf;
+
+               if (part->buf==NULL) {
+                       assert(part->len==0);
+                       newbuf = talloc_size(pconn, add_len);
+                       if (newbuf==NULL) {
+                               sprintf(ibw_lasterr, "recv talloc_size error (%u) #%d\n",
+                                       add_len, info);
+                               return -1;
+                       }
+               } else {
+                       /*
+                        * go through a temporary: a failed realloc leaves the
+                        * old buffer intact, so part->buf must not be
+                        * overwritten with NULL while len/bufsize still
+                        * describe it
+                        */
+                       newbuf = talloc_realloc_size(pconn,
+                               part->buf, part->len + add_len);
+                       if (newbuf==NULL) {
+                               sprintf(ibw_lasterr, "recv realloc error (%u + %u) #%d\n",
+                                       part->len, add_len, info);
+                               return -1;
+                       }
+               }
+               part->buf = newbuf;
+               part->bufsize = part->len + add_len;
+       }
+
+       /* consume pp */
+       memcpy(part->buf + part->len, *pp, add_len);
+       *pp += add_len;
+       part->len += add_len;
+       part->to_read -= add_len;
+
+       return 0;
+}
+
+/*
+ * shrink the partial message buffer back to "threshold" bytes if it has
+ * grown beyond that
+ *
+ * The old buffer is freed and a fresh one allocated, so its contents are
+ * lost.
+ *
+ * returns 0 on success (or if nothing had to be done), -1 with ibw_lasterr
+ * set if the new buffer cannot be allocated; part->buf is NULL and
+ * part->bufsize 0 in that case
+ */
+static int ibw_wc_mem_threshold(struct ibw_conn_priv *pconn,
+       struct ibw_part *part, uint32_t threshold)
+{
+       DEBUG(DEBUG_DEBUG, ("ibw_wc_mem_threshold: cmid=%p, (bs=%u, len=%u, tr=%u), thr=%u\n",
+               pconn->cm_id, part->bufsize, part->len, part->to_read, threshold));
+
+       if (part->bufsize > threshold) {
+               DEBUG(DEBUG_DEBUG, ("ibw_wc_mem_threshold: cmid=%p, %u > %u\n",
+                       pconn->cm_id, part->bufsize, threshold));
+               talloc_free(part->buf);
+               part->buf = talloc_size(pconn, threshold);
+               if (part->buf==NULL) {
+                       /*
+                        * keep the size in sync with the (now missing) buffer,
+                        * otherwise a later append would skip allocation and
+                        * copy into NULL
+                        */
+                       part->bufsize = 0;
+                       sprintf(ibw_lasterr, "talloc_size failed\n");
+                       return -1;
+               }
+               part->bufsize = threshold;
+       }
+       return 0;
+}
+
+static int ibw_wc_recv(struct ibw_conn *conn, struct ibv_wc *wc)
+{
+       struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv);
+       struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
+       struct ibw_part *part = &pconn->part;
+       char    *p;
+       uint32_t        remain = wc->byte_len;
+
+       DEBUG(DEBUG_DEBUG, ("ibw_wc_recv: cmid=%p, wr_id: %u, bl: %u\n",
+               pconn->cm_id, (uint32_t)wc->wr_id, remain));
+
+       assert(pconn->cm_id->qp->qp_num==wc->qp_num);
+       assert((int)wc->wr_id < pctx->opts.max_recv_wr);
+       assert(wc->byte_len <= pctx->opts.recv_bufsize);
+
+       p = pconn->buf_recv + ((int)wc->wr_id * pctx->opts.recv_bufsize);
+
+       while(remain) {
+               /* here always true: (part->len!=0 && part->to_read!=0) ||
+                       (part->len==0 && part->to_read==0) */
+               if (part->len) { /* is there a partial msg to be continued? */
+                       int read_len = (part->to_read<=remain) ? part->to_read : remain;
+                       if (ibw_append_to_part(pconn, part, &p, read_len, 421))
+                               goto error;
+                       remain -= read_len;
+
+                       if (part->len<=sizeof(uint32_t) && part->to_read==0) {
+                               assert(part->len==sizeof(uint32_t));
+                               /* set it again now... */
+                               part->to_read = *((uint32_t *)(part->buf)); /* TODO: ntohl */
+                               if (part->to_read<sizeof(uint32_t)) {
+                                       sprintf(ibw_lasterr, "got msglen=%u #2\n", part->to_read);
+                                       goto error;
+                               }
+                               part->to_read -= sizeof(uint32_t); /* it's already read */
+                       }
+
+                       if (part->to_read==0) {
+                               if (pctx->receive_func(conn, part->buf, part->len) != 0) {
+                                       goto error;
+                               }
+                               part->len = 0; /* tells not having partial data (any more) */
+                               if (ibw_wc_mem_threshold(pconn, part, pctx->opts.recv_threshold))
+                                       goto error;
+                       }
+               } else {
+                       if (remain>=sizeof(uint32_t)) {
+                               uint32_t msglen = *(uint32_t *)p; /* TODO: ntohl */
+                               if (msglen<sizeof(uint32_t)) {
+                                       sprintf(ibw_lasterr, "got msglen=%u\n", msglen);
+                                       goto error;
+                               }
+
+                               /* mostly awaited case: */
+                               if (msglen<=remain) {
+                                       if (pctx->receive_func(conn, p, msglen) != 0) {
+                                               goto error;
+                                       }
+                                       p += msglen;
+                                       remain -= msglen;
+                               } else {
+                                       part->to_read = msglen;
+                                       /* part->len is already 0 */
+                                       if (ibw_append_to_part(pconn, part, &p, remain, 422))
+                                               goto error;
+                                       remain = 0; /* to be continued ... */
+                                       /* part->to_read > 0 here */
+                               }
+                       } else { /* edge case: */
+                               part->to_read = sizeof(uint32_t);
+                               /* part->len is already 0 */
+                               if (ibw_append_to_part(pconn, part, &p, remain, 423))
+                                       goto error;
+                               remain = 0;
+                               /* part->to_read > 0 here */
+                       }
+               }
+       } /* <remain> is always decreased at least by 1 */
+
+       if (ibw_refill_cq_recv(conn))
+               goto error;
+
+       return 0;
+
+error:
+       DEBUG(DEBUG_ERR, ("ibw_wc_recv error: %s", ibw_lasterr));
+       return -1;
+}
+
+/*
+ * Fill <opts> with the built-in defaults and then apply the <nattr>
+ * (name, value) overrides found in <attr>.
+ *
+ * Recognised names: max_send_wr, max_recv_wr, recv_bufsize, recv_threshold.
+ * Values are converted with atoi().
+ *
+ * Returns 0 on success, -1 (with ibw_lasterr set) on an unknown name.
+ */
+static int ibw_process_init_attrs(struct ibw_initattr *attr, int nattr, struct ibw_opts *opts)
+{
+       int     i;
+       const char *name, *value;
+
+       DEBUG(DEBUG_DEBUG, ("ibw_process_init_attrs: nattr: %d\n", nattr));
+
+       opts->max_send_wr = IBW_MAX_SEND_WR;
+       opts->max_recv_wr = IBW_MAX_RECV_WR;
+       opts->recv_bufsize = IBW_RECV_BUFSIZE;
+       opts->recv_threshold = IBW_RECV_THRESHOLD;
+
+       for(i=0; i<nattr; i++) {
+               name = attr[i].name;
+               value = attr[i].value;
+
+               assert(name!=NULL && value!=NULL);
+               if (strcmp(name, "max_send_wr")==0)
+                       opts->max_send_wr = atoi(value);
+               else if (strcmp(name, "max_recv_wr")==0)
+                       opts->max_recv_wr = atoi(value);
+               else if (strcmp(name, "recv_bufsize")==0)
+                       opts->recv_bufsize = atoi(value);
+               else if (strcmp(name, "recv_threshold")==0)
+                       opts->recv_threshold = atoi(value);
+               else {
+                       /* <name> is caller supplied and of arbitrary length:
+                        * bound the write so it cannot overflow ibw_lasterr */
+                       snprintf(ibw_lasterr, IBW_LASTERR_BUFSIZE,
+                               "ibw_init: unknown name %s\n", name);
+                       return -1;
+               }
+       }
+       return 0;
+}
+
+struct ibw_ctx *ibw_init(struct ibw_initattr *attr, int nattr,
+       void *ctx_userdata,
+       ibw_connstate_fn_t ibw_connstate,
+       ibw_receive_fn_t ibw_receive,
+       struct event_context *ectx)
+{
+       struct ibw_ctx *ctx = talloc_zero(NULL, struct ibw_ctx);
+       struct ibw_ctx_priv *pctx;
+       int     rc;
+
+       DEBUG(DEBUG_DEBUG, ("ibw_init(ctx_userdata: %p, ectx: %p)\n", ctx_userdata, ectx));
+
+       /* initialize basic data structures */
+       memset(ibw_lasterr, 0, IBW_LASTERR_BUFSIZE);
+
+       assert(ctx!=NULL);
+       ibw_lasterr[0] = '\0';
+       talloc_set_destructor(ctx, ibw_ctx_destruct);
+       ctx->ctx_userdata = ctx_userdata;
+
+       pctx = talloc_zero(ctx, struct ibw_ctx_priv);
+       talloc_set_destructor(pctx, ibw_ctx_priv_destruct);
+       ctx->internal = (void *)pctx;
+       assert(pctx!=NULL);
+
+       pctx->connstate_func = ibw_connstate;
+       pctx->receive_func = ibw_receive;
+
+       pctx->ectx = ectx;
+
+       /* process attributes */
+       if (ibw_process_init_attrs(attr, nattr, &pctx->opts))
+               goto cleanup;
+
+       /* init cm */
+       pctx->cm_channel = rdma_create_event_channel();
+       if (!pctx->cm_channel) {
+               sprintf(ibw_lasterr, "rdma_create_event_channel error %d\n", errno);
+               goto cleanup;
+       }
+
+       pctx->cm_channel_event = event_add_fd(pctx->ectx, pctx,
+               pctx->cm_channel->fd, EVENT_FD_READ, ibw_event_handler_cm, ctx);
+
+#if RDMA_USER_CM_MAX_ABI_VERSION >= 2
+       rc = rdma_create_id(pctx->cm_channel, &pctx->cm_id, ctx, RDMA_PS_TCP);
+#else
+       rc = rdma_create_id(pctx->cm_channel, &pctx->cm_id, ctx);
+#endif
+       if (rc) {
+               rc = errno;
+               sprintf(ibw_lasterr, "rdma_create_id error %d\n", rc);
+               goto cleanup;
+       }
+       DEBUG(DEBUG_DEBUG, ("created cm_id %p\n", pctx->cm_id));
+
+       pctx->pagesize = sysconf(_SC_PAGESIZE);
+
+       return ctx;
+       /* don't put code here */
+cleanup:
+       DEBUG(DEBUG_ERR, (ibw_lasterr));
+
+       if (ctx)
+               talloc_free(ctx);
+
+       return NULL;
+}
+
+/*
+ * Stop the context: call ibw_disconnect for every connection in
+ * ctx->conn_list that is in IBWC_ERROR or IBWC_CONNECTED, then set
+ * ctx->state to IBWS_STOPPED and notify the user via connstate_func(ctx, NULL).
+ *
+ * Returns 0 on success, -1 as soon as one ibw_disconnect fails (in that
+ * case ctx->state is left untouched and no notification is sent).
+ */
+int ibw_stop(struct ibw_ctx *ctx)
+{
+       struct ibw_ctx_priv *pctx = (struct ibw_ctx_priv *)ctx->internal;
+       struct ibw_conn *p;
+
+       DEBUG(DEBUG_DEBUG, ("ibw_stop\n"));
+
+       for(p=ctx->conn_list; p!=NULL; p=p->next) {
+               /* IBWC_* are connection states: test the connection, not the
+                * context (whose state is an enum ibw_state_ctx value) */
+               if (p->state==IBWC_ERROR || p->state==IBWC_CONNECTED) {
+                       if (ibw_disconnect(p))
+                               return -1;
+               }
+       }
+
+       ctx->state = IBWS_STOPPED;
+       pctx->connstate_func(ctx, NULL);
+
+       return 0;
+}
+
+/*
+ * Bind the server cm_id of <ctx> to the local address <my_addr>.
+ *
+ * Returns 0 on success; otherwise the nonzero rdma_bind_addr result, after
+ * storing and logging the error text in ibw_lasterr.
+ */
+int ibw_bind(struct ibw_ctx *ctx, struct sockaddr_in *my_addr)
+{
+       struct ibw_ctx_priv *pctx = (struct ibw_ctx_priv *)ctx->internal;
+       int     rc;
+
+       DEBUG(DEBUG_DEBUG, ("ibw_bind: addr=%s, port=%u\n",
+               inet_ntoa(my_addr->sin_addr), ntohs(my_addr->sin_port)));
+       rc = rdma_bind_addr(pctx->cm_id, (struct sockaddr *) my_addr);
+       if (rc) {
+               snprintf(ibw_lasterr, IBW_LASTERR_BUFSIZE, "rdma_bind_addr error %d\n", rc);
+               /* log the text as data, never as the format string */
+               DEBUG(DEBUG_ERR, ("%s", ibw_lasterr));
+               return rc;
+       }
+       DEBUG(DEBUG_DEBUG, ("rdma_bind_addr successful\n"));
+
+       return 0;
+}
+
+/*
+ * Start listening for incoming connection requests on the server cm_id
+ * of <ctx>, with the given <backlog>.
+ *
+ * Returns 0 on success; otherwise the nonzero rdma_listen result, after
+ * storing and logging the error text in ibw_lasterr.
+ */
+int ibw_listen(struct ibw_ctx *ctx, int backlog)
+{
+       struct ibw_ctx_priv *pctx = talloc_get_type(ctx->internal, struct ibw_ctx_priv);
+       int     rc;
+
+       DEBUG(DEBUG_DEBUG, ("ibw_listen\n"));
+       rc = rdma_listen(pctx->cm_id, backlog);
+       if (rc) {
+               snprintf(ibw_lasterr, IBW_LASTERR_BUFSIZE, "rdma_listen failed: %d\n", rc);
+               /* log the text as data, never as the format string */
+               DEBUG(DEBUG_ERR, ("%s", ibw_lasterr));
+               return rc;
+       }
+
+       return 0;
+}
+
+/*
+ * Accept the pending connection request represented by <conn> and attach
+ * <conn_userdata> to it. The connection is requested with
+ * responder_resources = initiator_depth = 1.
+ *
+ * Returns 0 when rdma_accept succeeded (processing continues at
+ * RDMA_CM_EVENT_ESTABLISHED), -1 otherwise with ibw_lasterr set and logged.
+ */
+int ibw_accept(struct ibw_ctx *ctx, struct ibw_conn *conn, void *conn_userdata)
+{
+       struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
+       struct rdma_conn_param  conn_param;
+       int     rc;
+
+       DEBUG(DEBUG_DEBUG, ("ibw_accept: cmid=%p\n", pconn->cm_id));
+       conn->conn_userdata = conn_userdata;
+
+       memset(&conn_param, 0, sizeof(struct rdma_conn_param));
+       conn_param.responder_resources = 1;
+       conn_param.initiator_depth = 1;
+       rc = rdma_accept(pconn->cm_id, &conn_param);
+       if (rc) {
+               snprintf(ibw_lasterr, IBW_LASTERR_BUFSIZE, "rdma_accept failed %d\n", rc);
+               /* log the text as data, never as the format string */
+               DEBUG(DEBUG_ERR, ("%s", ibw_lasterr));
+               return -1;
+       }
+
+       pconn->is_accepted = 1;
+
+       /* continued at RDMA_CM_EVENT_ESTABLISHED */
+
+       return 0;
+}
+
+int ibw_connect(struct ibw_conn *conn, struct sockaddr_in *serv_addr, void *conn_userdata)
+{
+       struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv);
+       struct ibw_conn_priv *pconn = NULL;
+       int     rc;
+
+       assert(conn!=NULL);
+
+       conn->conn_userdata = conn_userdata;
+       pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
+       DEBUG(DEBUG_DEBUG, ("ibw_connect: addr=%s, port=%u\n", inet_ntoa(serv_addr->sin_addr),
+               ntohs(serv_addr->sin_port)));
+
+       /* clean previous - probably half - initialization */
+       if (ibw_conn_priv_destruct(pconn)) {
+               DEBUG(DEBUG_ERR, ("ibw_connect/ibw_pconn_destruct failed for cm_id=%p\n", pconn->cm_id));
+               return -1;
+       }
+
+       /* init cm */
+#if RDMA_USER_CM_MAX_ABI_VERSION >= 2
+       rc = rdma_create_id(pctx->cm_channel, &pconn->cm_id, conn, RDMA_PS_TCP);
+#else
+       rc = rdma_create_id(pctx->cm_channel, &pconn->cm_id, conn);
+#endif
+       if (rc) {
+               rc = errno;
+               sprintf(ibw_lasterr, "ibw_connect/rdma_create_id error %d\n", rc);
+               talloc_free(conn);
+               return -1;
+       }
+       DEBUG(DEBUG_DEBUG, ("ibw_connect: rdma_create_id succeeded, cm_id=%p\n", pconn->cm_id));
+
+       rc = rdma_resolve_addr(pconn->cm_id, NULL, (struct sockaddr *) serv_addr, 2000);
+       if (rc) {
+               sprintf(ibw_lasterr, "rdma_resolve_addr error %d\n", rc);
+               DEBUG(DEBUG_ERR, (ibw_lasterr));
+               talloc_free(conn);
+               return -1;
+       }
+
+       /* continued at RDMA_CM_EVENT_ADDR_RESOLVED */
+
+       return 0;
+}
+
+/*
+ * Disconnect <conn> according to its current state:
+ *   IBWC_ERROR     - the private connection data is destructed right away
+ *   IBWC_CONNECTED - rdma_disconnect is issued on the connection's cm_id
+ *   anything else  - nothing is done (only a debug message is logged)
+ *
+ * Returns 0, or the nonzero rdma_disconnect result after logging it.
+ */
+int ibw_disconnect(struct ibw_conn *conn)
+{
+       int     rc;
+       struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
+
+       /* check before pconn is dereferenced by the debug message below */
+       assert(pconn!=NULL);
+
+       DEBUG(DEBUG_DEBUG, ("ibw_disconnect: cmid=%p\n", pconn->cm_id));
+
+       switch(conn->state) {
+       case IBWC_ERROR:
+               ibw_conn_priv_destruct(pconn); /* do this here right now */
+               break;
+       case IBWC_CONNECTED:
+               rc = rdma_disconnect(pconn->cm_id);
+               if (rc) {
+                       snprintf(ibw_lasterr, IBW_LASTERR_BUFSIZE,
+                               "ibw_disconnect failed with %d\n", rc);
+                       /* log the text as data, never as the format string */
+                       DEBUG(DEBUG_ERR, ("%s", ibw_lasterr));
+                       return rc;
+               }
+               break;
+       default:
+               DEBUG(DEBUG_DEBUG, ("invalid state for disconnect: %d\n", conn->state));
+               break;
+       }
+
+       return 0;
+}
+
+/*
+ * Reserve a send work request on <conn> and hand out the memory to fill.
+ *
+ * On success *buf points to at least <len> bytes and *key is the
+ * struct ibw_wr to pass (together with *buf) to ibw_send or
+ * ibw_cancel_send_buf.
+ *
+ * If a preallocated wr is available it is moved from wr_list_avail to
+ * wr_list_used; its own buffer is used when len <= recv_bufsize, otherwise
+ * a "large" buffer is obtained from ibw_alloc_mr. If none is available an
+ * "extra" wr (always with a large buffer) is used and put on extra_sent.
+ *
+ * Returns 0 on success, -1 on failure (ibw_lasterr is set and logged).
+ */
+int ibw_alloc_send_buf(struct ibw_conn *conn, void **buf, void **key, uint32_t len)
+{
+       struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv);
+       struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
+       struct ibw_wr *p = pconn->wr_list_avail;
+
+       if (p!=NULL) {
+               DEBUG(DEBUG_DEBUG, ("ibw_alloc_send_buf#1: cmid=%p, len=%d\n", pconn->cm_id, len));
+
+               DLIST_REMOVE(pconn->wr_list_avail, p);
+               DLIST_ADD(pconn->wr_list_used, p);
+
+               if (len <= pctx->opts.recv_bufsize) {
+                       *buf = (void *)p->buf;
+               } else {
+                       p->buf_large = ibw_alloc_mr(pctx, pconn, len, &p->mr_large);
+                       if (p->buf_large==NULL) {
+                               /* give the wr back, otherwise it would stay in
+                                * wr_list_used without any owner */
+                               DLIST_REMOVE(pconn->wr_list_used, p);
+                               DLIST_ADD(pconn->wr_list_avail, p);
+                               sprintf(ibw_lasterr, "ibw_alloc_mr#1 failed\n");
+                               goto error;
+                       }
+                       *buf = (void *)p->buf_large;
+               }
+               /* p->wr_id is already filled in ibw_init_memory */
+       } else {
+               DEBUG(DEBUG_DEBUG, ("ibw_alloc_send_buf#2: cmid=%p, len=%d\n", pconn->cm_id, len));
+               /* not optimized */
+               p = pconn->extra_avail;
+               if (!p) {
+                       p = pconn->extra_avail = talloc_zero(pconn, struct ibw_wr);
+                       /* the allocation must be checked before the
+                        * destructor is attached to it */
+                       if (p==NULL) {
+                               sprintf(ibw_lasterr, "talloc_zero failed (emax: %u)\n", pconn->extra_max);
+                               goto error;
+                       }
+                       talloc_set_destructor(p, ibw_wr_destruct);
+                       /* extra wr ids continue after the preallocated ones */
+                       p->wr_id = pctx->opts.max_send_wr + pconn->extra_max;
+                       pconn->extra_max++;
+                       switch(pconn->extra_max) {
+                               case 1: DEBUG(DEBUG_INFO, ("warning: queue performed\n")); break;
+                               case 10: DEBUG(DEBUG_INFO, ("warning: queue reached 10\n")); break;
+                               case 100: DEBUG(DEBUG_INFO, ("warning: queue reached 100\n")); break;
+                               case 1000: DEBUG(DEBUG_INFO, ("warning: queue reached 1000\n")); break;
+                               default: break;
+                       }
+               }
+
+               p->buf_large = ibw_alloc_mr(pctx, pconn, len, &p->mr_large);
+               if (p->buf_large==NULL) {
+                       /* p is still in extra_avail here, nothing to undo */
+                       sprintf(ibw_lasterr, "ibw_alloc_mr#2 failed\n");
+                       goto error;
+               }
+               *buf = (void *)p->buf_large;
+
+               DLIST_REMOVE(pconn->extra_avail, p);
+               /* we don't have prepared index for this, so that
+                * we will have to find this by wr_id later on */
+               DLIST_ADD(pconn->extra_sent, p);
+       }
+
+       *key = (void *)p;
+
+       return 0;
+error:
+       DEBUG(DEBUG_ERR, ("ibw_alloc_send_buf error: %s", ibw_lasterr));
+       return -1;
+}
+
+
+/*
+ * Post one packet (<len> bytes at <buf>) that belongs to work request <p>.
+ *
+ * While fewer than max_send_wr sends are outstanding (pconn->wr_sent) the
+ * packet is posted immediately with ibv_post_send; the verbs wr_id is
+ * p->wr_id + max_recv_wr. The lkey of the connection's send region is used,
+ * or that of p->mr_large when <p> carries a large buffer.
+ *
+ * Otherwise the request is queued on pconn->queue (once per wr, counted by
+ * queued_ref_cnt) to be sent later by ibw_wc_send.
+ *
+ * Returns 0 when posted or queued, -1 when ibv_post_send failed.
+ */
+static int ibw_send_packet(struct ibw_conn *conn, void *buf, struct ibw_wr *p, uint32_t len)
+{
+       struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv);
+       struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
+       int     rc;
+
+       /* can we send it right now? */
+       if (pconn->wr_sent<pctx->opts.max_send_wr) {
+               struct ibv_send_wr *bad_wr;
+               struct ibv_sge list = {
+                       .addr   = (uintptr_t)buf,
+                       .length = len,
+                       .lkey   = pconn->mr_send->lkey
+               };
+               struct ibv_send_wr wr = {
+                       .wr_id      = p->wr_id + pctx->opts.max_recv_wr,
+                       .sg_list    = &list,
+                       .num_sge    = 1,
+                       .opcode     = IBV_WR_SEND,
+                       .send_flags = IBV_SEND_SIGNALED,
+               };
+
+               if (p->buf_large==NULL) {
+                       DEBUG(DEBUG_DEBUG, ("ibw_send#normal(cmid: %p, wrid: %u, n: %d)\n",
+                               pconn->cm_id, (uint32_t)wr.wr_id, len));
+               } else {
+                       DEBUG(DEBUG_DEBUG, ("ibw_send#large(cmid: %p, wrid: %u, n: %d)\n",
+                               pconn->cm_id, (uint32_t)wr.wr_id, len));
+                       /* large buffers live in their own memory region */
+                       list.lkey = p->mr_large->lkey;
+               }
+
+               rc = ibv_post_send(pconn->cm_id->qp, &wr, &bad_wr);
+               if (rc) {
+                       snprintf(ibw_lasterr, IBW_LASTERR_BUFSIZE,
+                               "ibv_post_send error %d (%d)\n",
+                               rc, pconn->wr_sent);
+                       goto error;
+               }
+
+               pconn->wr_sent++;
+
+               return rc;
+       } /* else put the request into our own queue: */
+
+       DEBUG(DEBUG_DEBUG, ("ibw_send#queued(cmid: %p, len: %u)\n", pconn->cm_id, len));
+
+       /* TODO: clarify how to continue when state==IBWC_STOPPED */
+
+       /* to be sent by ibw_wc_send */
+       /* regardless "normal" or [a part of] "large" packet */
+       if (!p->queued_ref_cnt) {
+               DLIST_ADD_END2(pconn->queue, p, struct ibw_wr *,
+                       qprev, qnext); /* TODO: optimize */
+               p->queued_msg = buf;
+       }
+       p->queued_ref_cnt++;
+       p->queued_rlen = len; /* last wins; see ibw_wc_send */
+
+       return 0;
+error:
+       /* log the text as data, never as the format string */
+       DEBUG(DEBUG_ERR, ("%s", ibw_lasterr));
+       return -1;
+}
+
+int ibw_send(struct ibw_conn *conn, void *buf, void *key, uint32_t len)
+{ /* Send the <len> byte message in <buf>; <key> is expected to be the
+   * struct ibw_wr handed out together with <buf> by ibw_alloc_send_buf.
+   * Returns the ibw_send_packet result: 0 when every packet was posted or
+   * queued, nonzero for the first packet that failed. */
+       struct ibw_ctx_priv *pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv);
+       struct ibw_wr *p = talloc_get_type(key, struct ibw_wr);
+       int     rc;
+
+       assert(len>=sizeof(uint32_t));
+       assert((*((uint32_t *)buf)==len)); /* message must start with its own length (host byte order) - TODO: htonl */
+
+       if (len > pctx->opts.recv_bufsize) { /* too big for one receive buffer: fragment it */
+               struct ibw_conn_priv *pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
+               int     rlen = len; /* bytes still to be handed to ibw_send_packet */
+               char    *packet = (char *)buf; /* start of the next fragment */
+               uint32_t        recv_bufsize = pctx->opts.recv_bufsize;
+
+               DEBUG(DEBUG_DEBUG, ("ibw_send#frag(cmid: %p, buf: %p, len: %u)\n",
+                       pconn->cm_id, buf, len));
+
+               /* single threaded => no race here: */
+               assert(p->ref_cnt==0);
+               while(rlen > recv_bufsize) { /* all full sized fragments */
+                       rc = ibw_send_packet(conn, packet, p, recv_bufsize);
+                       if (rc)
+                               return rc; /* NOTE(review): ref_cnt keeps the count of the fragments sent so far */
+                       packet += recv_bufsize;
+                       rlen -= recv_bufsize;
+                       p->ref_cnt++; /* not good to have it in ibw_send_packet */
+               }
+               if (rlen) { /* the remaining, last fragment */
+                       rc = ibw_send_packet(conn, packet, p, rlen);
+                       p->ref_cnt++; /* not good to have it in ibw_send_packet */
+               }
+               p->ref_cnt--; /* for the same handling: ref_cnt ends up as (number of fragments - 1) */
+       } else { /* the whole message fits into one packet */
+               assert(p->ref_cnt==0);
+               assert(p->queued_ref_cnt==0);
+
+               rc = ibw_send_packet(conn, buf, p, len);
+       }
+       return rc;
+}
+
+/*
+ * Give back a (buf, key) pair obtained from ibw_alloc_send_buf that will
+ * not be passed to ibw_send.
+ *
+ * A large buffer attached to the wr is released with ibw_free_mr. The wr
+ * itself is moved back to the list it was taken from: wr_list_avail for a
+ * preallocated wr (wr_id < max_send_wr), extra_avail for an "extra" one.
+ *
+ * Always returns 0.
+ */
+int ibw_cancel_send_buf(struct ibw_conn *conn, void *buf, void *key)
+{
+       struct ibw_ctx_priv *pctx;
+       struct ibw_conn_priv *pconn;
+       struct ibw_wr *p;
+
+       /* validate the arguments before anything is read through them */
+       assert(conn!=NULL);
+       assert(buf!=NULL);
+
+       pctx = talloc_get_type(conn->ctx->internal, struct ibw_ctx_priv);
+       pconn = talloc_get_type(conn->internal, struct ibw_conn_priv);
+       p = talloc_get_type(key, struct ibw_wr);
+       assert(p!=NULL);
+
+       if (p->buf_large!=NULL)
+               ibw_free_mr(&p->buf_large, &p->mr_large);
+
+       /* parallel case */
+       if (p->wr_id < pctx->opts.max_send_wr) {
+               DEBUG(DEBUG_DEBUG, ("ibw_cancel_send_buf#1 %u", (int)p->wr_id));
+               DLIST_REMOVE(pconn->wr_list_used, p);
+               DLIST_ADD(pconn->wr_list_avail, p);
+       } else { /* "extra" packet */
+               DEBUG(DEBUG_DEBUG, ("ibw_cancel_send_buf#2 %u", (int)p->wr_id));
+               DLIST_REMOVE(pconn->extra_sent, p);
+               DLIST_ADD(pconn->extra_avail, p);
+       }
+
+       return 0;
+}
+
+/*
+ * Return the text of the most recent error recorded by this wrapper.
+ * The result points to the shared ibw_lasterr buffer.
+ */
+const char *ibw_getLastError(void)
+{
+       const char *last_error = ibw_lasterr;
+
+       return last_error;
+}
diff --git a/ctdb/ib/ibwrapper.h b/ctdb/ib/ibwrapper.h
new file mode 100644 (file)
index 0000000..0b880b3
--- /dev/null
@@ -0,0 +1,218 @@
+/*
+ * Unix SMB/CIFS implementation.
+ * Wrap Infiniband calls.
+ *
+ * Copyright (C) Sven Oehme <oehmes@de.ibm.com> 2006
+ *
+ * Major code contributions by Peter Somogyi <psomogyi@gamax.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* Server communication state (the value kept in ibw_ctx.state) */
+enum ibw_state_ctx {
+       IBWS_INIT = 0, /* ctx start - after ibw_init */
+       IBWS_READY, /* after ibw_bind & ibw_listen */
+       IBWS_CONNECT_REQUEST, /* after [IBWS_READY + incoming request] */
+               /* => [(ibw_accept)IBWS_READY | (ibw_disconnect)STOPPED | ERROR] */
+       IBWS_STOPPED, /* normal stop <= ibw_disconnect+(IBWS_READY | IBWS_CONNECT_REQUEST); also set by ibw_stop */
+       IBWS_ERROR /* abnormal state; ibw_stop must be called after this */
+};
+
+/* Wrapper context of a node, created by ibw_init */
+struct ibw_ctx {
+       void *ctx_userdata; /* see ibw_init */
+
+       enum ibw_state_ctx state; /* one of the IBWS_* values */
+       void *internal; /* private part (struct ibw_ctx_priv), not for the user */
+
+       struct ibw_conn *conn_list; /* 1st elem of double linked list */
+};
+
+enum ibw_state_conn { /* state of one connection (the value kept in ibw_conn.state) */
+       IBWC_INIT = 0, /* conn start - internal state */
+       IBWC_CONNECTED, /* after ibw_accept or ibw_connect */
+       IBWC_DISCONNECTED, /* after ibw_disconnect */
+       IBWC_ERROR /* ibw_disconnect destructs the private connection data right away in this state */
+};
+
+struct ibw_conn { /* one connection belonging to an ibw_ctx */
+       struct ibw_ctx *ctx; /* owning context */
+       enum ibw_state_conn state; /* one of the IBWC_* values */
+
+       void *conn_userdata; /* see ibw_connect and ibw_accept */
+       void *internal; /* private part (struct ibw_conn_priv), not for the user */
+
+       struct ibw_conn *prev, *next; /* links in ctx->conn_list */
+};
+
+/*
+ * (name, value) pair for array param of ibw_init
+ * Both members must be non-NULL; the value is parsed with atoi().
+ */
+struct ibw_initattr {
+       const char *name;
+       const char *value;
+};
+
+/*
+ * Callback function definition which should inform you about
+ * connection state change
+ * This callback is invoked whenever server or client connection changes.
+ * Both <conn> and <ctx> can be NULL if their state didn't change
+ * (e.g. ibw_stop calls it with <conn> = NULL).
+ * Return nonzero on error.
+ */
+typedef int (*ibw_connstate_fn_t)(struct ibw_ctx *ctx, struct ibw_conn *conn);
+
+/*
+ * Callback function definition which should process incoming packets
+ * This callback is invoked whenever any message arrives.
+ * <buf> holds one complete message and <n> is its length in bytes,
+ * including the leading uint32_t length field.
+ * Return nonzero on error.
+ *
+ * Important: you mustn't store buf pointer for later use.
+ * Process its contents before returning.
+ */
+typedef int (*ibw_receive_fn_t)(struct ibw_conn *conn, void *buf, int n);
+
+/*
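+ * settings: array of (name, value) pairs
+ * where name is one of:
+ *      max_send_wr [default is 256]
+ *      max_recv_wr [default is 1024]
+ *      recv_bufsize
+ *      recv_threshold
+ * Values are parsed as decimal integers; any other name makes ibw_init fail.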
+ *
+ * Must be called _ONCE_ for each node.
+ *
+ * max_msg_size is the maximum size of a message
+ * (max_send_wr + max_recv_wr) * max_msg_size bytes allocated per connection
+ *
+ * returns non-NULL on success
+ *
+ * talloc_free must be called for the result in IBWS_STOPPED;
+ *    it will close resources by destructor
+ *    connections(ibw_conn *) must have been closed prior talloc_free
+ */
+struct ibw_ctx *ibw_init(struct ibw_initattr *attr, int nattr,
+       void *ctx_userdata,
+       ibw_connstate_fn_t ibw_connstate,
+       ibw_receive_fn_t ibw_receive,
+       struct event_context *ectx);
+
+/*
+ * Must be called in states of (IBWS_ERROR, IBWS_READY, IBWS_CONNECT_REQUEST)
+ *
+ * It will send out disconnect requests and free up ibw_conn structures.
+ * The ctx->state will transit to IBWS_STOPPED after every conn are disconnected.
+ * During that time, you mustn't send/recv/disconnect any more.
+ * Only after ctx->state=IBWS_STOPPED you can talloc_free the ctx.
+ */
+int ibw_stop(struct ibw_ctx *ctx);
+
+/*************** connection initiation - like stream sockets *****/
+
+/*
+ * works like socket bind
+ * needs a normal internet address here
+ *
+ * return 0 on success
+ */
+int ibw_bind(struct ibw_ctx *ctx, struct sockaddr_in *my_addr);
+
+/*
+ * works like socket listen
+ * non-blocking
+ * enables accepting incoming connections (after IBWS_READY)
+ * (it doesn't touch ctx->state by itself)
+ *
+ * returns 0 on success
+ */
+int ibw_listen(struct ibw_ctx *ctx, int backlog);
+
+/*
+ * works like socket accept
+ * initializes a connection to a client
+ * must be called when state=IBWS_CONNECT_REQUEST
+ *
+ * returns 0 on success
+ *
+ * You have +1 waiting here: you will get ibw_conn (having the
+ * same <conn_userdata> member) structure in ibw_connstate_fn_t.
+ *
+ * Important: you won't get remote IP address (only internal conn info)
+ */
+int ibw_accept(struct ibw_ctx *ctx, struct ibw_conn *conn, void *conn_userdata);
+
+/*
+ * Create a new connection structure
+ * available for queueing ibw_send
+ *
+ * <mem_ctx> (the talloc parent) is needed to be notified by talloc destruct action.
+ */
+struct ibw_conn *ibw_conn_new(struct ibw_ctx *ctx, TALLOC_CTX *mem_ctx);
+
+/*
+ * Needs a normal internet address here
+ * can be called within IBWS_READY|IBWS_CONNECT_REQUEST
+ *
+ * returns 0 on success, -1 on failure
+ * (if rdma_create_id or rdma_resolve_addr fails, <conn> has already been talloc_free'd)
+ *
+ * You have +1 waiting here: you will get ibw_conn (having the
+ * same <conn_userdata> member) structure in ibw_connstate_fn_t.
+ */
+int ibw_connect(struct ibw_conn *conn, struct sockaddr_in *serv_addr, void *conn_userdata);
+
+/*
+ * Sends out a disconnect request.
+ * You should process fds after calling this function
+ * and then process it with ibw_process_event normally
+ * until you get conn->state = IBWC_DISCONNECTED
+ *
+ * You mustn't talloc_free <conn> yet right after this,
+ * first wait for IBWC_DISCONNECTED.
+ */
+int ibw_disconnect(struct ibw_conn *conn);
+
+/************ Infiniband specific event loop wrapping ******************/
+
+/*
+ * You have to use this buf to fill in before send.
+ * It's just to avoid memcpy in ibw_send.
+ * Use the same (buf, key) pair with ibw_send.
+ * Don't use more space than maxsize (see ibw_init).
+ *
+ * Returns 0 on success.
+ */
+int ibw_alloc_send_buf(struct ibw_conn *conn, void **buf, void **key, uint32_t len);
+
+/*
+ * Send the message in one
+ * Can be invoked any times (should fit into buffers) and at any time
+ * (in conn->state=IBWC_CONNECTED)
+ * len must be less than or equal to max_msg_size (see ibw_init)
+ * The first sizeof(uint32_t) bytes of buf must contain len itself (this is asserted).
+ *
+ * You mustn't use (buf, key) any more for sending.
+ */
+int ibw_send(struct ibw_conn *conn, void *buf, void *key, uint32_t len);
+
+/*
+ * Call this after ibw_alloc_send_buf
+ * when you won't call ibw_send for (buf, key)
+ * You mustn't use (buf, key) any more.
+ */
+int ibw_cancel_send_buf(struct ibw_conn *conn, void *buf, void *key);
+
+/*
+ * Retrieves the last error
+ * result: always non-NULL, mustn't be freed (static)
+ */
+const char *ibw_getLastError(void);
diff --git a/ctdb/ib/ibwrapper_internal.h b/ctdb/ib/ibwrapper_internal.h
new file mode 100644 (file)
index 0000000..20aef7f
--- /dev/null
@@ -0,0 +1,126 @@
+/*
+ * Unix SMB/CIFS implementation.
+ * Wrap Infiniband calls.
+ *
+ * Copyright (C) Sven Oehme <oehmes@de.ibm.com> 2006
+ *
+ * Major code contributions by Peter Somogyi <psomogyi@gamax.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+struct ibw_opts { /* tunables, filled by ibw_process_init_attrs */
+       uint32_t        max_send_wr; /* sends posted at once; beyond this ibw_send_packet queues */
+       uint32_t        max_recv_wr; /* receive wr ids are below this; send wr ids are offset by it */
+       uint32_t        recv_bufsize; /* size of one receive buffer; larger messages are sent in fragments */
+       uint32_t        recv_threshold; /* passed to ibw_wc_mem_threshold after a partial message is completed */
+};
+
+struct ibw_wr { /* one send work request; handed to the user as the opaque <key> */
+       char    *buf; /* initialized in ibw_init_memory once per connection */
+       int     wr_id; /* position in wr_index list; also used as wr id */
+
+       char    *buf_large; /* allocated specially for "large" message */
+       struct ibv_mr *mr_large; /* memory region belonging to buf_large */
+       int     ref_cnt; /* reference count for ibw_wc_send to know when to release */
+
+       char    *queued_msg; /* set at ibw_send - can be different than above */
+       int     queued_ref_cnt; /* instead of adding the same to the queue again */
+       uint32_t        queued_rlen; /* last wins when queued_ref_cnt>0; or simple msg size */
+
+       struct ibw_wr *next, *prev; /* in wr_list_avail or wr_list_used */
+                               /* or extra_sent or extra_avail */
+       struct ibw_wr *qnext, *qprev; /* in queue */
+};
+
+struct ibw_ctx_priv { /* private part of struct ibw_ctx (ibw_ctx.internal) */
+       struct event_context *ectx; /* event context given to ibw_init */
+
+       struct ibw_opts opts;
+
+       struct rdma_cm_id       *cm_id; /* server cm id */
+
+       struct rdma_event_channel *cm_channel; /* created in ibw_init */
+       struct fd_event *cm_channel_event; /* read event on cm_channel->fd, handled by ibw_event_handler_cm */
+
+       ibw_connstate_fn_t connstate_func; /* see ibw_init */
+       ibw_receive_fn_t receive_func; /* see ibw_init */
+
+       long    pagesize; /* sysconf result for memalign */
+};
+
+struct ibw_part { /* reassembly state of a message that spans several receive buffers */
+       char *buf; /* talloced memory buffer */
+       uint32_t bufsize; /* allocated size of buf - always grows */
+       uint32_t len; /* message part length; 0 means there is no partial message */
+       uint32_t to_read; /* 4 or *((uint32_t)buf) if len>=sizeof(uint32_t) */
+};
+
+struct ibw_conn_priv { /* private part of struct ibw_conn (ibw_conn.internal) */
+       struct ibv_comp_channel *verbs_channel;
+       struct fd_event *verbs_channel_event;
+
+       struct rdma_cm_id *cm_id; /* client's cm id */
+       struct ibv_pd   *pd;
+       int     is_accepted; /* set to 1 by ibw_accept after a successful rdma_accept */
+
+       struct ibv_cq   *cq; /* qp is in cm_id */
+
+       char *buf_send; /* max_send_wr * avg_send_size */
+       struct ibv_mr *mr_send; /* its lkey is used for sends from the normal buffers */
+       struct ibw_wr *wr_list_avail; /* preallocated wrs free for ibw_alloc_send_buf */
+       struct ibw_wr *wr_list_used; /* preallocated wrs handed out by ibw_alloc_send_buf */
+       struct ibw_wr **wr_index; /* array[0..(qsize-1)] of (ibw_wr *) */
+       int     wr_sent; /* # of send wrs in the CQ */
+
+       struct ibw_wr *extra_sent; /* "extra" wrs handed out when wr_list_avail was empty */
+       struct ibw_wr *extra_avail; /* "extra" wrs available for reuse */
+       int     extra_max; /* max wr_id in the queue */
+
+       struct ibw_wr *queue; /* wrs waiting to be sent, linked via qprev/qnext */
+
+       /* buf_recv is a ring buffer */
+       char *buf_recv; /* max_recv_wr * avg_recv_size */
+       struct ibv_mr *mr_recv;
+       int recv_index; /* index of the next recv buffer when refilling */
+       struct ibw_part part; /* partial incoming message, see ibw_wc_recv */
+};
+
+/* remove an element from a list - element doesn't have to be in list.
+ * <prev> and <next> are the names of the link members to use, so a struct
+ * can be kept in a second list through another pair of members
+ * (e.g. qprev/qnext of struct ibw_wr).
+ * <list> is updated when <p> is its head; the links of <p> are reset to NULL. */
+#define DLIST_REMOVE2(list, p, prev, next) \
+do { \
+       if ((p) == (list)) { \
+               (list) = (p)->next; \
+               if (list) (list)->prev = NULL; \
+       } else { \
+               if ((p)->prev) (p)->prev->next = (p)->next; \
+               if ((p)->next) (p)->next->prev = (p)->prev; \
+       } \
+       if ((p) != (list)) (p)->next = (p)->prev = NULL; \
+} while (0)
+
+/* hook into the end of the list - needs a tmp pointer
+ * <type> is the pointer type of the list elements (used for the tmp pointer);
+ * <prev> and <next> are the names of the link members to use.
+ * The list is walked from its head to find the last element. */
+#define DLIST_ADD_END2(list, p, type, prev, next) \
+do { \
+               if (!(list)) { \
+                       (list) = (p); \
+                       (p)->next = (p)->prev = NULL; \
+               } else { \
+                       type tmp; \
+                       for (tmp = (list); tmp->next; tmp = tmp->next) ; \
+                       tmp->next = (p); \
+                       (p)->next = NULL; \
+                       (p)->prev = tmp; \
+               } \
+} while (0)
diff --git a/ctdb/ib/ibwrapper_test.c b/ctdb/ib/ibwrapper_test.c
new file mode 100644 (file)
index 0000000..ba54286
--- /dev/null
@@ -0,0 +1,660 @@
+/*
+ * Unix SMB/CIFS implementation.
+ * Test the infiniband wrapper.
+ *
+ * Copyright (C) Sven Oehme <oehmes@de.ibm.com> 2006
+ *
+ * Major code contributions by Peter Somogyi <psomogyi@gamax.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <malloc.h>
+#include <assert.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/time.h>
+#include <time.h>
+
+#include "includes.h"
+#include "ib/ibwrapper.h"
+
+/* State of one ibwrapper test process (server or client). */
+struct ibwtest_ctx {
+       int     is_server; /* set by -s: bind and listen instead of connecting */
+       char    *id; /* my id */
+
+       struct ibw_initattr *attrs; /* parsed from the -o string, handed to ibw_init() */
+       int     nattrs; /* number of entries in attrs */
+       char    *opts; /* option string */
+
+       struct sockaddr_in *addrs; /* dynamic array of dest addrs */
+       int     naddrs; /* number of entries in addrs */
+
+       unsigned int    nsec; /* delta times between messages in nanosec */
+       unsigned int    sleep_usec; /* microsecs to sleep in the main loop to emulate overloading */
+       uint32_t        maxsize; /* maximum variable message size */
+
+       int     cnt; /* sequence number put into the periodic "hello world" text */
+       int     nsent; /* incremented after every successful ibw_send() */
+
+       int     nmsg; /* number of messages to send (client) */
+
+       int     kill_me; /* non-zero: the main loop exits */
+       int     stopping; /* set by the client and SIGINT paths after they call ibw_stop() */
+       int     error; /* non-zero result of a failed send; also ends the main loop */
+       struct ibw_ctx  *ibwctx; /* wrapper context returned by ibw_init() */
+
+       /* taken on IBWC_CONNECTED and after the main loop, for the usec report */
+       struct timeval  start_time, end_time;
+};
+
+/* Per-connection user data of the test program. */
+struct ibwtest_conn {
+       char    *id; /* peer id, set when a TESTOP_SEND_ID message is received */
+};
+
+/* Opcode stored in the byte that follows the uint32_t length prefix. */
+enum testopcode {
+       TESTOP_SEND_ID = 1, /* payload: id string of the sender */
+       TESTOP_SEND_TEXT = 2, /* payload: text message */
+       TESTOP_SEND_RND = 3 /* payload: random bytes followed by a one byte sum */
+};
+
+/*
+ * Client side: start a connection to every address in tcx->addrs.
+ *
+ * Returns 0 once all connect requests were issued, -1 on the first failure.
+ *
+ * NOTE(review): a single ibwtest_conn is shared as userdata (and used as
+ * talloc parent) by all connections, exactly as before - confirm that this
+ * is intended when more than one address is given.
+ */
+int ibwtest_connect_everybody(struct ibwtest_ctx *tcx)
+{
+       struct ibw_conn         *conn;
+       struct ibwtest_conn     *tconn = talloc_zero(tcx, struct ibwtest_conn);
+       int     i;
+
+       if (tconn == NULL) {
+               fprintf(stderr, "ibwtest_connect_everybody: out of memory\n");
+               return -1;
+       }
+
+       for(i=0; i<tcx->naddrs; i++) {
+               conn = ibw_conn_new(tcx->ibwctx, tconn);
+               if (conn == NULL) {
+                       /* do not hand a NULL connection to ibw_connect() */
+                       fprintf(stderr, "ibw_conn_new error at %d\n", i);
+                       return -1;
+               }
+               if (ibw_connect(conn, &tcx->addrs[i], tconn)) {
+                       fprintf(stderr, "ibw_connect error at %d\n", i);
+                       return -1;
+               }
+       }
+       DEBUG(DEBUG_DEBUG, ("sent %d connect request...\n", tcx->naddrs));
+
+       return 0;
+}
+
+/*
+ * Send the id of this process to the peer as a TESTOP_SEND_ID message.
+ * Layout: uint32_t total length, one opcode byte, NUL terminated id.
+ * Returns 0 on success, -1 if the buffer allocation or the send fails.
+ */
+int ibwtest_send_id(struct ibw_conn *conn)
+{
+       struct ibwtest_ctx *tcx;
+       char *frame;
+       void *key;
+       uint32_t        total;
+
+       tcx = talloc_get_type(conn->ctx->ctx_userdata, struct ibwtest_ctx);
+
+       DEBUG(DEBUG_DEBUG, ("ibwtest_send_id\n"));
+       /* length prefix + opcode byte + id + terminating NUL */
+       total = sizeof(uint32_t)+strlen(tcx->id)+2;
+       if (ibw_alloc_send_buf(conn, (void **)&frame, &key, total) != 0) {
+               DEBUG(DEBUG_ERR, ("send_id: ibw_alloc_send_buf failed\n"));
+               return -1;
+       }
+
+       /* the frame starts with its own length, then the opcode, then the id */
+       *((uint32_t *)frame) = total;
+       frame[sizeof(uint32_t)] = (char)TESTOP_SEND_ID;
+       strcpy(&frame[sizeof(uint32_t)+1], tcx->id);
+
+       if (ibw_send(conn, frame, key, total) != 0) {
+               DEBUG(DEBUG_ERR, ("send_id: ibw_send error\n"));
+               return -1;
+       }
+
+       tcx->nsent++;
+       return 0;
+}
+
+/*
+ * Send msg to the peer as a TESTOP_SEND_TEXT message.
+ * Nothing is sent (and 0 is returned) while the connection is not in
+ * IBWC_CONNECTED state.  Returns -1 if the allocation or the send fails.
+ */
+int ibwtest_send_test_msg(struct ibwtest_ctx *tcx, struct ibw_conn *conn, const char *msg)
+{
+       char *frame;
+       char *payload;
+       void *key;
+       uint32_t total;
+
+       /* not yet up */
+       if (conn->state!=IBWC_CONNECTED) {
+               return 0;
+       }
+
+       /* length prefix + opcode byte + text + terminating NUL */
+       total = sizeof(uint32_t) + 2 + strlen(msg);
+       if (ibw_alloc_send_buf(conn, (void **)&frame, &key, total) != 0) {
+               fprintf(stderr, "send_test_msg: ibw_alloc_send_buf failed\n");
+               return -1;
+       }
+
+       *((uint32_t *)frame) = total;
+       frame[sizeof(uint32_t)] = (char)TESTOP_SEND_TEXT;
+       payload = frame + sizeof(uint32_t) + 1;
+       strcpy(payload, msg);
+
+       if (ibw_send(conn, frame, key, total) != 0) {
+               DEBUG(DEBUG_ERR, ("send_test_msg: ibw_send error\n"));
+               return -1;
+       }
+
+       tcx->nsent++;
+       return 0;
+}
+
+/*
+ * Fill buf[0..size-1] with values derived from rand() and return the sum
+ * of the stored bytes (modulo 256).  The buffer is filled from its last
+ * byte down to its first, one rand() call per byte.
+ */
+unsigned char ibwtest_fill_random(unsigned char *buf, uint32_t size)
+{
+       unsigned char   checksum = 0;
+       uint32_t        pos;
+
+       for (pos = size; pos > 0; pos--) {
+               unsigned char   byte = (unsigned char)(256.0 * (rand() / (RAND_MAX + 1.0)));
+
+               buf[pos - 1] = byte;
+               checksum += byte;
+       }
+
+       return checksum;
+}
+
+/*
+ * Return the sum (modulo 256) of the first size bytes of buf.
+ */
+unsigned char ibwtest_get_sum(unsigned char *buf, uint32_t size)
+{
+       unsigned char   total = 0;
+       uint32_t        idx;
+
+       /* the addition wraps in unsigned char, so the order is irrelevant */
+       for (idx = 0; idx < size; idx++) {
+               total += buf[idx];
+       }
+
+       return total;
+}
+
+/*
+ * Send one TESTOP_SEND_RND message with size random payload bytes.
+ * Layout: uint32_t total length, opcode byte, payload, one byte sum.
+ * Returns 0 on success, -1 if the allocation or the send fails.
+ */
+int ibwtest_do_varsize_scenario_conn_size(struct ibwtest_ctx *tcx, struct ibw_conn *conn, uint32_t size)
+{
+       const size_t    payload_off = sizeof(uint32_t) + 1;
+       unsigned char *frame;
+       void    *key;
+       uint32_t        total;
+       unsigned char   checksum;
+
+       /* header + payload + trailing sum byte */
+       total = payload_off + size + 1;
+       if (ibw_alloc_send_buf(conn, (void **)&frame, &key, total) != 0) {
+               DEBUG(DEBUG_ERR, ("varsize/ibw_alloc_send_buf failed\n"));
+               return -1;
+       }
+
+       *((uint32_t *)frame) = total;
+       frame[sizeof(uint32_t)] = TESTOP_SEND_RND;
+       checksum = ibwtest_fill_random(frame + payload_off, size);
+       frame[payload_off + size] = checksum;
+
+       if (ibw_send(conn, frame, key, total) != 0) {
+               DEBUG(DEBUG_ERR, ("varsize/ibw_send failed\n"));
+               return -1;
+       }
+
+       tcx->nsent++;
+       return 0;
+}
+
+/*
+ * Send tcx->nmsg random messages on conn whose payload size grows
+ * linearly up to tcx->maxsize.  Stops and returns -1 on the first failure,
+ * returns 0 otherwise.
+ */
+int ibwtest_do_varsize_scenario_conn(struct ibwtest_ctx *tcx, struct ibw_conn *conn)
+{
+       int     sent = 0;
+
+       while (sent < tcx->nmsg) {
+               /* message k of nmsg carries maxsize * k / nmsg bytes
+                * (a random size was the earlier, now disabled, alternative) */
+               uint32_t        size = (uint32_t)((float)(tcx->maxsize) * ((float)(sent+1)/(float)tcx->nmsg));
+
+               if (ibwtest_do_varsize_scenario_conn_size(tcx, conn, size) != 0) {
+                       return -1;
+               }
+               sent++;
+       }
+
+       return 0;
+}
+
+/*int ibwtest_do_varsize_scenario(ibwtest_ctx *tcx)
+{
+       int     rc;
+       struct ibw_conn *conn;
+
+       for(conn=tcx->ibwctx->conn_list; conn!=NULL; conn=conn->next) {
+               if (conn->state==IBWC_CONNECTED) {
+                       rc = ibwtest_do_varsize_scenario_conn(tcx, conn);
+                       if (rc)
+                               tcx->error = rc;
+               }
+       }
+}*/
+
+/*
+ * State change callback registered with ibw_init().
+ *
+ * ctx  - wrapper context whose state is reported, may be NULL
+ * conn - connection whose state is reported, may be NULL
+ *
+ * A connect request is accepted with a fresh ibwtest_conn as userdata.
+ * Once a connection is up the start time is recorded and our id is sent;
+ * a failed send is recorded in tcx->error.
+ *
+ * Returns 0, or -1 if the test context cannot be found or gettimeofday()
+ * fails.
+ */
+int ibwtest_connstate_handler(struct ibw_ctx *ctx, struct ibw_conn *conn)
+{
+       struct ibwtest_ctx      *tcx = NULL; /* userdata */
+       struct ibwtest_conn     *tconn = NULL; /* userdata */
+       int     rc;
+
+       if (ctx) {
+               tcx = talloc_get_type(ctx->ctx_userdata, struct ibwtest_ctx);
+
+               switch(ctx->state) {
+               case IBWS_INIT:
+                       DEBUG(DEBUG_DEBUG, ("test IBWS_INIT\n"));
+                       break;
+               case IBWS_READY:
+                       DEBUG(DEBUG_DEBUG, ("test IBWS_READY\n"));
+                       break;
+               case IBWS_CONNECT_REQUEST:
+                       DEBUG(DEBUG_DEBUG, ("test IBWS_CONNECT_REQUEST\n"));
+                       tconn = talloc_zero(conn, struct ibwtest_conn);
+                       if (tconn == NULL) {
+                               /* the receive handler needs valid userdata */
+                               DEBUG(DEBUG_ERR, ("out of memory accepting the connect request\n"));
+                               break;
+                       }
+                       if (ibw_accept(ctx, conn, tconn)) {
+                               DEBUG(DEBUG_ERR, ("error accepting the connect request\n"));
+                       }
+                       break;
+               case IBWS_STOPPED:
+                       DEBUG(DEBUG_DEBUG, ("test IBWS_STOPPED\n"));
+                       tcx->kill_me = 1; /* main loop can exit */
+                       break;
+               case IBWS_ERROR:
+                       DEBUG(DEBUG_DEBUG, ("test IBWS_ERROR\n"));
+                       ibw_stop(tcx->ibwctx);
+                       break;
+               default:
+                       assert(0);
+                       break;
+               }
+       }
+
+       if (conn) {
+               if (tcx == NULL && conn->ctx != NULL) {
+                       /* called without ctx: reach the test context through
+                        * the connection, as ibwtest_send_id() does */
+                       tcx = talloc_get_type(conn->ctx->ctx_userdata, struct ibwtest_ctx);
+               }
+               tconn = talloc_get_type(conn->conn_userdata, struct ibwtest_conn);
+               switch(conn->state) {
+               case IBWC_INIT:
+                       DEBUG(DEBUG_DEBUG, ("test IBWC_INIT\n"));
+                       break;
+               case IBWC_CONNECTED:
+                       DEBUG(DEBUG_DEBUG, ("test IBWC_CONNECTED\n"));
+                       if (tcx == NULL) {
+                               /* start_time lives in tcx, never write through NULL */
+                               DEBUG(DEBUG_ERR, ("IBWC_CONNECTED without test context\n"));
+                               return -1;
+                       }
+                       if (gettimeofday(&tcx->start_time, NULL)) {
+                               DEBUG(DEBUG_ERR, ("gettimeofday error %d\n", errno));
+                               return -1;
+                       }
+                       rc = ibwtest_send_id(conn);
+                       if (rc)
+                               tcx->error = rc;
+                       break;
+               case IBWC_DISCONNECTED:
+                       DEBUG(DEBUG_DEBUG, ("test IBWC_DISCONNECTED\n"));
+                       talloc_free(conn);
+                       break;
+               case IBWC_ERROR:
+                       DEBUG(DEBUG_DEBUG, ("test IBWC_ERROR %s\n", ibw_getLastError()));
+                       break;
+               default:
+                       assert(0);
+                       break;
+               }
+       }
+       return 0;
+}
+
+/*
+ * Receive callback registered with ibw_init(): handles one received message.
+ *
+ * conn - connection the message arrived on
+ * buf  - message: uint32_t length, one opcode byte, then the payload
+ * n    - number of bytes in buf
+ *
+ * Server: verifies the sum of TESTOP_SEND_RND messages and bounces every
+ * other message except TESTOP_SEND_ID back to the sender.
+ * Client: runs the variable size scenario when an id arrives (if maxsize
+ * is set), then sends one text message per received message until nmsg
+ * reaches zero, at which point the wrapper is stopped.
+ *
+ * Returns 0 on success, non-zero on error.
+ */
+int ibwtest_receive_handler(struct ibw_conn *conn, void *buf, int n)
+{
+       struct ibwtest_conn *tconn;
+       enum testopcode op;
+       struct ibwtest_ctx *tcx;
+       char    *text;
+       int     textlen;
+       int     rc = 0;
+
+       /* validate the arguments before anything is dereferenced */
+       assert(conn!=NULL);
+       assert(buf!=NULL);
+       /* signed comparison: a negative n must not pass the check */
+       assert(n>=(int)(sizeof(uint32_t)+1));
+       tcx = talloc_get_type(conn->ctx->ctx_userdata, struct ibwtest_ctx);
+       tconn = talloc_get_type(conn->conn_userdata, struct ibwtest_conn);
+       assert(tconn!=NULL);
+
+       op = (enum testopcode)((char *)buf)[sizeof(uint32_t)];
+       /* payload that follows the length prefix and the opcode byte */
+       text = ((char *)buf)+sizeof(uint32_t)+1;
+       textlen = n - (int)(sizeof(uint32_t)+1);
+       if (op==TESTOP_SEND_ID) {
+               /* bounded copy: do not rely on the peer's NUL terminator */
+               tconn->id = talloc_strndup(tconn, text, textlen);
+       }
+       if (op==TESTOP_SEND_ID || op==TESTOP_SEND_TEXT) {
+               DEBUG(DEBUG_DEBUG, ("[%d]msg from %s: \"%.*s\"(%d)\n", (int)op,
+                       tconn->id ? tconn->id : "NULL", textlen, text, n));
+       }
+
+       if (tcx->is_server) {
+               if (op==TESTOP_SEND_RND) {
+                       unsigned char sum;
+                       uint32_t rndlen;
+
+                       /* needs room for the trailing sum byte as well */
+                       if (n < (int)(sizeof(uint32_t)+2)) {
+                               DEBUG(DEBUG_ERR, ("ERROR: random message too short (%d)\n", n));
+                               goto error;
+                       }
+                       rndlen = n - sizeof(uint32_t) - 2;
+                       sum = ibwtest_get_sum((unsigned char *)buf + sizeof(uint32_t) + 1,
+                               rndlen);
+                       DEBUG(DEBUG_DEBUG, ("[%d]msg varsize %u/sum %u from %s\n",
+                               (int)op,
+                               (unsigned int)rndlen,
+                               (unsigned int)sum,
+                               tconn->id ? tconn->id : "NULL"));
+                       if (sum!=((unsigned char *)buf)[n-1]) {
+                               DEBUG(DEBUG_ERR, ("ERROR: checksum mismatch %u!=%u\n",
+                                       (unsigned int)sum, (unsigned int)((unsigned char *)buf)[n-1]));
+                               ibw_stop(tcx->ibwctx);
+                               goto error;
+                       }
+               } else if (op!=TESTOP_SEND_ID) {
+                       char *buf2;
+                       void *key2;
+
+                       /* bounce message regardless what it is */
+                       if (ibw_alloc_send_buf(conn, (void **)&buf2, &key2, n)) {
+                               fprintf(stderr, "ibw_alloc_send_buf error #2\n");
+                               goto error;
+                       }
+                       memcpy(buf2, buf, n);
+                       if (ibw_send(conn, buf2, key2, n)) {
+                               fprintf(stderr, "ibw_send error #2\n");
+                               goto error;
+                       }
+                       tcx->nsent++;
+               }
+       } else { /* client: */
+               if (op==TESTOP_SEND_ID && tcx->maxsize) {
+                       /* send them in one blow */
+                       rc = ibwtest_do_varsize_scenario_conn(tcx, conn);
+               }
+
+               /* a failed scenario must not be hidden by the next send */
+               if (rc==0 && tcx->nmsg) {
+                       char    msg[26];
+                       snprintf(msg, sizeof(msg), "hello world %d", tcx->nmsg--);
+                       rc = ibwtest_send_test_msg(tcx, conn, msg);
+                       if (tcx->nmsg==0) {
+                               ibw_stop(tcx->ibwctx);
+                               tcx->stopping = 1;
+                       }
+               }
+       }
+
+       if (rc)
+               tcx->error = rc;
+
+       return rc;
+error:
+       return -1;
+}
+
+/*
+ * Timed event callback: on the client, send a "hello world <cnt>" text
+ * to every connection that is in IBWC_CONNECTED state.  A failed send is
+ * recorded in tcx->error.  The server does nothing here.
+ */
+void ibwtest_timeout_handler(struct event_context *ev, struct timed_event *te,
+       struct timeval t, void *private_data)
+{
+       struct ibwtest_ctx *tcx = talloc_get_type(private_data, struct ibwtest_ctx);
+       struct ibw_conn *conn;
+       char    text[50];
+       int     rc;
+
+       /* server: nothing to send, just allow the main loop to run */
+       if (tcx->is_server) {
+               return;
+       }
+
+       /* fill it with something variable... */
+       sprintf(text, "hello world %d", tcx->cnt++);
+
+       /* send something to everybody... */
+       for (conn = tcx->ibwctx->conn_list; conn != NULL; conn = conn->next) {
+               if (conn->state != IBWC_CONNECTED) {
+                       continue;
+               }
+               rc = ibwtest_send_test_msg(tcx, conn, text);
+               if (rc != 0) {
+                       tcx->error = rc;
+               }
+       }
+}
+
+static struct ibwtest_ctx *testctx = NULL;
+
+/*
+ * SIGINT handler: the first signal asks the wrapper to stop, a further
+ * one (or a signal in any other wrapper state) makes the main loop exit.
+ *
+ * NOTE(review): DEBUG() and ibw_stop() are called from signal context -
+ * confirm that this is acceptable for this test program.
+ */
+void ibwtest_sigint_handler(int sig)
+{
+       DEBUG(DEBUG_ERR, ("got SIGINT\n"));
+       if (testctx) {
+               if (testctx->ibwctx == NULL) {
+                       /* the handler is installed before ibw_init() runs:
+                        * there is nothing to stop yet, just exit */
+                       testctx->kill_me = 1;
+               } else if (testctx->ibwctx->state==IBWS_READY ||
+                       testctx->ibwctx->state==IBWS_CONNECT_REQUEST ||
+                       testctx->ibwctx->state==IBWS_ERROR)
+               {
+                       if (testctx->stopping) {
+                               DEBUG(DEBUG_DEBUG, ("forcing exit...\n"));
+                               testctx->kill_me = 1;
+                       } else {
+                               /* mostly expected case */
+                               ibw_stop(testctx->ibwctx);
+                               testctx->stopping = 1;
+                       }
+               } else
+                       testctx->kill_me = 1;
+       }
+}
+
+/*
+ * Split "name1:value1,name2:value2,..." into an array of ibw_initattr.
+ *
+ * tcx    - talloc parent of the returned array
+ * optext - option text; it is modified in place (separators become NUL)
+ *          and the returned entries point into it
+ * pattrs - receives the array, NULL on error
+ * nattrs - receives the number of parsed entries, 0 on error
+ * op     - option letter, only used in error messages
+ *
+ * Returns 0 on success, -1 on allocation failure or if an entry has no ':'.
+ */
+int ibwtest_parse_attrs(struct ibwtest_ctx *tcx, char *optext,
+       struct ibw_initattr **pattrs, int *nattrs, char op)
+{
+       int     i = 0, n = 1;
+       int     process_next = 1;
+       char    *p, *q;
+       struct ibw_initattr *attrs = NULL;
+
+       *pattrs = NULL;
+       *nattrs = 0;
+       /* one entry per comma separated part: upper bound for the array */
+       for(p = optext; *p!='\0'; p++) {
+               if (*p==',')
+                       n++;
+       }
+
+       attrs = (struct ibw_initattr *)talloc_size(tcx,
+               n * sizeof(struct ibw_initattr));
+       if (attrs == NULL) {
+               fprintf(stderr, "-%c out of memory\n", op);
+               return -1;
+       }
+       memset(attrs, 0, n * sizeof(struct ibw_initattr));
+
+       for(p = optext; *p!='\0'; p++) {
+               if (process_next) {
+                       attrs[i].name = p;
+                       q = strchr(p, ':');
+                       if (q==NULL) {
+                               fprintf(stderr, "-%c format error\n", op);
+                               talloc_free(attrs);
+                               return -1;
+                       }
+                       *q = '\0';
+                       attrs[i].value = q + 1;
+
+                       process_next = 0;
+                       i++;
+                       p = q; /* ++ at end */
+               }
+               if (*p==',') {
+                       *p = '\0'; /* ++ at end */
+                       process_next = 1;
+               }
+       }
+       *pattrs = attrs;
+       /* report only the entries that were filled in: an empty text or a
+        * trailing comma leaves fewer than n of them */
+       *nattrs = i;
+
+       return 0;
+}
+
+/*
+ * Convert a dotted quad or a host name into an IPv4 address.
+ *
+ * address - text to convert
+ * addr    - receives the address on success
+ *
+ * Returns 0 on success, -1 if the text is neither a valid IPv4 address
+ * nor a name that resolves to one.
+ */
+static int ibwtest_get_address(const char *address, struct in_addr *addr)
+{
+       if (inet_pton(AF_INET, address, addr) <= 0) {
+               struct hostent *he = gethostbyname(address);
+               /* accept only a complete IPv4 address, nothing shorter,
+                * longer or of another address family */
+               if (he == NULL || he->h_addrtype != AF_INET ||
+                       he->h_length != (int)sizeof(*addr) ||
+                       he->h_addr_list[0] == NULL) {
+                       DEBUG(DEBUG_ERR, ("invalid network address '%s'\n", address));
+                       return -1;
+               }
+               memcpy(addr, he->h_addr_list[0], sizeof(*addr));
+       }
+       return 0;
+}
+
+/*
+ * Parse the current option argument (optarg) as "addr1:port1,addr2:port2,..."
+ * and store the result in tcx->addrs / tcx->naddrs.
+ *
+ * tcx - test context, also the talloc parent of the allocations
+ * op  - option letter, only used in error messages
+ *
+ * Returns 0 on success, -1 on allocation failure, format error, an
+ * unresolvable address or an invalid port.
+ */
+int ibwtest_getdests(struct ibwtest_ctx *tcx, char op)
+{
+       int     i;
+       struct ibw_initattr     *attrs = NULL;
+       struct sockaddr_in      *p;
+       char    *tmp;
+
+       /* the parser modifies its input, so work on a copy */
+       tmp = talloc_strdup(tcx, optarg);
+       if (tmp == NULL) return -1;
+       /* hack to reuse the above ibw_initattr parser */
+       if (ibwtest_parse_attrs(tcx, tmp, &attrs, &tcx->naddrs, op))
+               return -1;
+
+       tcx->addrs = talloc_size(tcx,
+               tcx->naddrs * sizeof(struct sockaddr_in));
+       if (tcx->addrs == NULL) {
+               fprintf(stderr, "-%c out of memory\n", op);
+               return -1;
+       }
+       /* do not leave padding or unset members uninitialised */
+       memset(tcx->addrs, 0, tcx->naddrs * sizeof(struct sockaddr_in));
+
+       for(i=0; i<tcx->naddrs; i++) {
+               char    *end = NULL;
+               long    port;
+
+               p = tcx->addrs + i;
+               p->sin_family = AF_INET;
+               if (ibwtest_get_address(attrs[i].name, &p->sin_addr))
+                       return -1;
+               /* reject text that is not a number in the port range */
+               port = strtol(attrs[i].value, &end, 10);
+               if (end == attrs[i].value || *end != '\0' ||
+                       port < 0 || port > 65535) {
+                       fprintf(stderr, "-%c invalid port '%s'\n", op, attrs[i].value);
+                       return -1;
+               }
+               p->sin_port = htons((uint16_t)port);
+       }
+
+       return 0;
+}
+
+/*
+ * Server side start-up: bind to the single configured address and listen.
+ * Returns 0 on success, -1 if the number of addresses is not exactly one
+ * or if ibw_bind() / ibw_listen() fails.
+ */
+int ibwtest_init_server(struct ibwtest_ctx *tcx)
+{
+       int     rc = -1;
+
+       if (tcx->naddrs!=1) {
+               fprintf(stderr, "incorrect number of addrs(%d!=1)\n", tcx->naddrs);
+       } else if (ibw_bind(tcx->ibwctx, &tcx->addrs[0]) != 0) {
+               DEBUG(DEBUG_ERR, ("ERROR: ibw_bind failed\n"));
+       } else if (ibw_listen(tcx->ibwctx, 1) != 0) {
+               DEBUG(DEBUG_ERR, ("ERROR: ibw_listen failed\n"));
+       } else {
+               /* continued at IBWS_READY */
+               rc = 0;
+       }
+
+       return rc;
+}
+
+/*
+ * Print the command line help together with the current defaults.
+ *
+ * tcx  - test context the defaults are read from
+ * name - program name shown in the usage line
+ *
+ * The option letters match the getopt() handling in main(): -a takes the
+ * address list and -d the log level.
+ */
+void ibwtest_usage(struct ibwtest_ctx *tcx, char *name)
+{
+       printf("Usage:\n");
+       printf("\t%s -i <id> -o {name:value} -a {addr:port} -t nsec -s\n", name);
+       printf("\t-i <id> is a free text, acting as a server id, max 23 chars [mandatory]\n");
+       printf("\t-o name1:value1,name2:value2,... is a list of (name, value) pairs\n");
+       printf("\t-a addr1:port1,addr2:port2,... is a list of destination ip addresses\n");
+       /* nsec, sleep_usec and maxsize are unsigned: print them with %u */
+       printf("\t-t nsec delta time between sends in nanosec [default %u]\n", tcx->nsec);
+       printf("\t\t send message periodically and endless when nsec is non-zero\n");
+       printf("\t-s server mode (you have to give exactly one -a address:port in this case)\n");
+       printf("\t-n number of messages to send [default %d]\n", tcx->nmsg);
+       printf("\t-l usec time to sleep in the main loop [default %u]\n", tcx->sleep_usec);
+       printf("\t-v max variable msg size in bytes [default %u], 0=don't send var. size\n",
+               (unsigned int)tcx->maxsize);
+       printf("\t-d LogLevel [default %d]\n", LogLevel);
+       printf("Press ctrl+C to stop the program.\n");
+}
+
+/*
+ * Entry point of the ibwrapper test program.
+ *
+ * Parses the command line, initialises the wrapper, then either listens
+ * (server, -s) or connects to all given addresses (client) and runs the
+ * event loop until a handler sets kill_me or error.  The client finally
+ * prints the elapsed time per sent message.
+ *
+ * Returns 0 if everything went fine, 1 otherwise.
+ */
+int main(int argc, char *argv[])
+{
+       int     rc, op;
+       int     result = 1;
+       struct tevent_context *ev = NULL;
+       struct ibwtest_ctx *tcx = NULL;
+       double  usec;
+
+       /* talloc_zero() hands back zeroed memory, only non-zero defaults
+        * have to be set */
+       tcx = talloc_zero(NULL, struct ibwtest_ctx);
+       if (tcx == NULL) {
+               fprintf(stderr, "ERROR: out of memory\n");
+               goto cleanup;
+       }
+       tcx->nmsg = 1000;
+       LogLevel = 0;
+
+       /* here is the only case we can't avoid using global... */
+       testctx = tcx;
+       signal(SIGINT, ibwtest_sigint_handler);
+       srand((unsigned)time(NULL));
+
+       /* NOTE(review): "m:" is accepted by getopt() but has no case below,
+        * so -m ends up in the "unknown option" branch - confirm intent */
+       while ((op=getopt(argc, argv, "i:o:d:m:st:n:l:v:a:")) != -1) {
+               switch (op) {
+               case 'i':
+                       tcx->id = talloc_strdup(tcx, optarg);
+                       break;
+               case 'o':
+                       tcx->opts = talloc_strdup(tcx, optarg);
+                       /* give up only if the copy could NOT be made */
+                       if (tcx->opts == NULL) goto cleanup;
+                       if (ibwtest_parse_attrs(tcx, tcx->opts, &tcx->attrs,
+                               &tcx->nattrs, op))
+                               goto cleanup;
+                       break;
+               case 'a':
+                       if (ibwtest_getdests(tcx, op))
+                               goto cleanup;
+                       break;
+               case 's':
+                       tcx->is_server = 1;
+                       break;
+               case 't':
+                       tcx->nsec = (unsigned int)atoi(optarg);
+                       break;
+               case 'n':
+                       tcx->nmsg = atoi(optarg);
+                       break;
+               case 'l':
+                       tcx->sleep_usec = (unsigned int)atoi(optarg);
+                       break;
+               case 'v':
+                       tcx->maxsize = (unsigned int)atoi(optarg);
+                       break;
+               case 'd':
+                       LogLevel = atoi(optarg);
+                       break;
+               default:
+                       fprintf(stderr, "ERROR: unknown option -%c\n", (char)op);
+                       ibwtest_usage(tcx, argv[0]);
+                       goto cleanup;
+               }
+       }
+       if (tcx->id==NULL) {
+               ibwtest_usage(tcx, argv[0]);
+               goto cleanup;
+       }
+
+       ev = event_context_init(NULL);
+       assert(ev);
+
+       tcx->ibwctx = ibw_init(tcx->attrs, tcx->nattrs,
+               tcx,
+               ibwtest_connstate_handler,
+               ibwtest_receive_handler,
+               ev
+       );
+       if (!tcx->ibwctx)
+               goto cleanup;
+
+       if (tcx->is_server)
+               rc = ibwtest_init_server(tcx);
+       else
+               rc = ibwtest_connect_everybody(tcx);
+       if (rc)
+               goto cleanup;
+
+       while(!tcx->kill_me && !tcx->error) {
+               if (tcx->nsec) {
+                       event_add_timed(ev, tcx, timeval_current_ofs(0, tcx->nsec),
+                               ibwtest_timeout_handler, tcx);
+               }
+
+               event_loop_once(ev);
+
+               if (tcx->sleep_usec)
+                       usleep(tcx->sleep_usec);
+       }
+
+       if (!tcx->is_server && tcx->nsent!=0 && !tcx->error) {
+               if (gettimeofday(&tcx->end_time, NULL)) {
+                       DEBUG(DEBUG_ERR, ("gettimeofday error %d\n", errno));
+                       goto cleanup;
+               }
+               /* computed in double: no integer overflow and no loss of
+                * precision for runs longer than a few seconds */
+               usec = (double)(tcx->end_time.tv_sec - tcx->start_time.tv_sec) * 1000000.0 +
+                               (double)(tcx->end_time.tv_usec - tcx->start_time.tv_usec);
+               printf("usec: %f, nmsg: %d, usec/nmsg: %f\n",
+                       usec, tcx->nsent, usec/(double)tcx->nsent);
+       }
+
+       if (!tcx->error)
+               result = 0; /* everything OK */
+
+cleanup:
+       /* the SIGINT handler must not touch the context once it is freed */
+       testctx = NULL;
+       if (tcx)
+               talloc_free(tcx);
+       if (ev)
+               talloc_free(ev);
+       DEBUG(DEBUG_ERR, ("exited with code %d\n", result));
+       return result;
+}
diff --git a/ctdb/include/cmdline.h b/ctdb/include/cmdline.h
new file mode 100644 (file)
index 0000000..785595e
--- /dev/null
@@ -0,0 +1,7 @@
+
+extern struct poptOption popt_ctdb_cmdline[];
+
+#define POPT_CTDB_CMDLINE { NULL, 0, POPT_ARG_INCLUDE_TABLE, popt_ctdb_cmdline, 0, "Common ctdb test options:", NULL },
+
+struct ctdb_context *ctdb_cmdline_init(struct event_context *ev);
+
diff --git a/ctdb/include/ctdb.h b/ctdb/include/ctdb.h
new file mode 100644 (file)
index 0000000..c3da068
--- /dev/null
@@ -0,0 +1,1236 @@
+/*
+   ctdb database library
+
+   Copyright (C) Ronnie sahlberg 2010
+   Copyright (C) Rusty Russell 2010
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CTDB_H
+#define _CTDB_H
+#include <sys/types.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <tdb.h>
+#include <netinet/in.h>
+#include <ctdb_protocol.h>
+
+/**
+ * ctdb - a library for accessing tdbs controlled by ctdbd
+ *
+ * ctdbd (clustered tdb daemon) is a daemon designed to synchronize TDB
+ * databases across a cluster.  Using this library, you can communicate with
+ * the daemon to access the databases, pass messages across the cluster, and
+ * control the daemon itself.
+ *
+ * The general API is event-driven and asynchronous: you call the
+ * *_send functions, supplying callbacks, then when the ctdbd file
+ * descriptor is usable, call ctdb_service() to perform read from it
+ * and call your callbacks, which use the *_recv functions to unpack
+ * the replies from ctdbd.
+ *
+ * There is also a synchronous wrapper for each function for trivial
+ * programs; these can be found in the section marked "Synchronous API".
+ */
+
+/**
+ * ctdb_log_fn_t - logging function for ctdbd
+ * @log_priv: private (typesafe) arg via ctdb_connect
+ * @severity: syslog-style severity
+ * @format: printf-style format string.
+ * @ap: arguments for formatting.
+ *
+ * The severity passed to log() are as per syslog(3).  In particular,
+ * LOG_DEBUG is used for tracing, LOG_WARNING is used for unusual
+ * conditions which don't necessarily return an error through the API,
+ * LOG_ERR is used for errors such as lost communication with ctdbd or
+ * out-of-memory, LOG_ALERT is used for library usage bugs, LOG_CRIT is
+ * used for libctdb internal consistency checks.
+ *
+ * The log() function can be typesafe: the @log_priv arg to
+ * ctdb_connect and signature of log() should match.
+ */
+typedef void (*ctdb_log_fn_t)(void *log_priv,
+                             int severity, const char *format, va_list ap);
+
+/**
+ * ctdb_connect - connect to ctdb using the specified domain socket.
+ * @addr: the socket address, or NULL for default
+ * @log_fn: the logging function
+ * @log_priv: the private argument to the logging function.
+ *
+ * Returns a ctdb context if successful or NULL.  Use ctdb_disconnect() to
+ * release the returned ctdb_connection when finished.
+ *
+ * See Also:
+ *     ctdb_log_fn_t, ctdb_log_file()
+ */
+struct ctdb_connection *ctdb_connect(const char *addr,
+                                    ctdb_log_fn_t log_fn, void *log_priv);
+
+/**
+ * ctdb_log_file - example logging function
+ *
+ * Logs everything at priority LOG_WARNING or above to the file given (via
+ * the log_priv argument, usually stderr).
+ */
+void ctdb_log_file(FILE *, int, const char *, va_list);
+
+/**
+ * ctdb_log_level - level at which to call logging function
+ *
+ * This variable globally controls filtering on the logging function.
+ * It is initialized to LOG_WARNING, meaning that strange but nonfatal
+ * events, as well as errors and API misuses are reported.
+ *
+ * Set it to LOG_DEBUG to receive all messages.
+ */
+extern int ctdb_log_level;
+
+/**
+ * ctdb_disconnect - close down a connection to ctdbd.
+ * @ctdb: the ctdb connection returned from ctdb_connect.
+ *
+ * The @ctdb arg will be freed by this call, and must not be used again.
+ */
+void ctdb_disconnect(struct ctdb_connection *ctdb);
+
+/***
+ *
+ *  Asynchronous API
+ *
+ ***/
+
+/**
+ * ctdb_num_active - get the number of active commands
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ *
+ * This command can be used to find the number of active commands we have
+ * issued. An active command is a command we have queued, or sent
+ * to the ctdb daemon but which we have not yet received a reply to.
+ *
+ * See Also:
+ *     ctdb_num_in_flight(), ctdb_num_out_queue()
+ */
+int ctdb_num_active(struct ctdb_connection *ctdb);
+
+/**
+ * ctdb_num_in_flight - get the number of commands in flight.
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ *
+ * This command can be used to find the number of commands we have
+ * sent to the ctdb daemon to which we have not yet received/processed
+ * the reply.
+ *
+ * See Also:
+ *     ctdb_num_out_queue(), ctdb_num_active()
+ */
+int ctdb_num_in_flight(struct ctdb_connection *ctdb);
+
+/**
+ * ctdb_num_out_queue - get the number of commands in the out queue
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ *
+ * This command can be used to find the number of commands we have
+ * queued for delivery to the ctdb daemon but have not yet been
+ * written to the domain socket.
+ *
+ * See Also:
+ *     ctdb_num_in_flight(), ctdb_num_active()
+ */
+int ctdb_num_out_queue(struct ctdb_connection *ctdb);
+
+/**
+ * ctdb_get_fd - get the filedescriptor to select/poll on
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ *
+ * By using poll or select on this file descriptor, you will know when to call
+ * ctdb_service().
+ *
+ * See Also:
+ *     ctdb_which_events(), ctdb_service()
+ */
+int ctdb_get_fd(struct ctdb_connection *ctdb);
+
+/**
+ * ctdb_which_events - determine which events ctdb_service wants to see
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ *
+ * This returns POLLIN, possibly or'd with POLLOUT if there are writes
+ * pending.  You can set this straight into poll.events.
+ *
+ * See Also:
+ *     ctdb_service()
+ */
+int ctdb_which_events(struct ctdb_connection *ctdb);
+
+/**
+ * ctdb_service - service any I/O and callbacks from ctdbd communication
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @revents: which events are available.
+ *
+ * This is the core of the library: it reads from and writes to the ctdbd
+ * socket.  It may call callbacks registered with the various _send
+ * functions.
+ *
+ * revents is a bitset: POLLIN and/or POLLOUT may be set to indicate
+ * it is worth attempting to read/write the (nonblocking)
+ * filedescriptor respectively.
+ *
+ * Note that the synchronous functions call this internally.
+ * Returns false on catastrophic failure.
+ */
+bool ctdb_service(struct ctdb_connection *ctdb, int revents);
+
+/**
+ * struct ctdb_request - handle for an outstanding request
+ *
+ * This opaque structure returned from various *_send functions gives
+ * you a handle by which you can cancel a request.  You can't do
+ * anything else with it until the request is completed and it is
+ * handed to your callback function.
+ */
+struct ctdb_request;
+
+/**
+ * ctdb_request_free - free a completed request
+ *
+ * This frees a request: you should only call it once it has been
+ * handed to your callback.  For incomplete requests, see ctdb_cancel().
+ */
+void ctdb_request_free(struct ctdb_request *req);
+
+/**
+ * ctdb_callback_t - callback for completed requests.
+ *
+ * This would normally unpack the request using ctdb_*_recv().  You
+ * must free the request using ctdb_request_free().
+ *
+ * Note that due to macro magic, your actual callback can be typesafe:
+ * instead of taking a void *, it can take a type which matches the
+ * actual private parameter.
+ */
+typedef void (*ctdb_callback_t)(struct ctdb_connection *ctdb,
+                               struct ctdb_request *req, void *private_data);
+
+/**
+ * struct ctdb_db - connection to a particular open TDB
+ *
+ * This represents a particular open database: you receive it from
+ * ctdb_attachdb or ctdb_attachdb_recv to manipulate a database.
+ *
+ * You have to free the handle with ctdb_detachdb() when finished with it.
+ */
+struct ctdb_db;
+
+/**
+ * ctdb_attachdb_send - open a clustered TDB
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @name: the filename of the database (no /).
+ * @persistent: whether the database is persistent across ctdbd's life
+ * @tdb_flags: the flags to pass to tdb_open.
+ * @callback: the callback when we're attached or failed (typesafe)
+ * @cbdata: the argument to callback()
+ *
+ * This function connects to a TDB controlled by ctdbd.  It can create
+ * a new TDB if it does not exist, depending on tdb_flags.  Returns
+ * the pending request, or NULL on error.
+ */
+struct ctdb_request *
+ctdb_attachdb_send(struct ctdb_connection *ctdb,
+                  const char *name, bool persistent, uint32_t tdb_flags,
+                  ctdb_callback_t callback, void *cbdata);
+
+/**
+ * ctdb_attachdb_recv - read a ctdb_attachdb reply from ctdbd
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @req: the completed request.
+ *
+ * This returns NULL if something went wrong, or otherwise the open database.
+ */
+struct ctdb_db *ctdb_attachdb_recv(struct ctdb_connection *ctdb,
+                                  struct ctdb_request *req);
+
+
+/**
+ * struct ctdb_lock - a record lock on a clustered TDB database
+ *
+ * This locks a subset of the database across the entire cluster; it
+ * is the fundamental synchronization element for ctdb.  You cannot have
+ * more than one lock at once.
+ *
+ * You MUST NOT block during holding this lock and MUST release it
+ * quickly by performing ctdb_release_lock(lock).
+ * Do NOT make any system calls that may block while holding the lock.
+ *
+ * Try to release the lock as quickly as possible.
+ */
+struct ctdb_lock;
+
+/**
+ * ctdb_rrl_callback_t - callback for ctdb_readrecordlock_async
+ *
+ * This is not the standard ctdb_callback_t, because there is often no
+ * request required to access a database record (ie. if it is local already).
+ * So the callback is handed the lock directly: it might be NULL if there
+ * was an error obtaining the lock.
+ *
+ * See Also:
+ *     ctdb_readrecordlock_async(), ctdb_readrecordlock()
+ */
+typedef void (*ctdb_rrl_callback_t)(struct ctdb_db *ctdb_db,
+                                   struct ctdb_lock *lock,
+                                   TDB_DATA data,
+                                   void *private_data);
+
+/**
+ * ctdb_readrecordlock_async - read and lock a record
+ * @ctdb_db: the database handle from ctdb_attachdb/ctdb_attachdb_recv.
+ * @key: the key of the record to lock.
+ * @callback: the callback once the record is locked (typesafe).
+ * @cbdata: the argument to callback()
+ *
+ * This returns true on success.  Commonly, we can obtain the record
+ * immediately and so the callback will be invoked.  Otherwise a request
+ * will be queued to ctdbd for the record.
+ *
+ * If failure is immediate, false is returned.  Otherwise, the callback
+ * may receive a NULL lock arg to indicate asynchronous failure.
+ */
+bool ctdb_readrecordlock_async(struct ctdb_db *ctdb_db, TDB_DATA key,
+                              ctdb_rrl_callback_t callback, void *cbdata);
+
+/**
+ * ctdb_readonlyrecordlock_async - read and lock a record for read-only access
+ * @ctdb_db: the database handle from ctdb_attachdb/ctdb_attachdb_recv.
+ * @key: the key of the record to lock.
+ * @callback: the callback once the record is locked (typesafe).
+ * @cbdata: the argument to callback()
+ *
+ * This returns true on success.  Commonly, we can obtain the record
+ * immediately and so the callback will be invoked.  Otherwise a request
+ * will be queued to ctdbd for the record.
+ *
+ * If failure is immediate, false is returned.  Otherwise, the callback
+ * may receive a NULL lock arg to indicate asynchronous failure.
+ */
+bool ctdb_readonlyrecordlock_async(struct ctdb_db *ctdb_db, TDB_DATA key,
+                              ctdb_rrl_callback_t callback, void *cbdata);
+
+
+/**
+ * ctdb_writerecord - write a locked record in a TDB
+ * @ctdb_db: the database handle from ctdb_attachdb/ctdb_attachdb_recv.
+ * @lock: the lock from ctdb_readrecordlock/ctdb_readrecordlock_async
+ * @data: the new data to place in the record.
+ */
+bool ctdb_writerecord(struct ctdb_db *ctdb_db,
+                     struct ctdb_lock *lock, TDB_DATA data);
+
+/**
+ * ctdb_release_lock - release a record lock on a TDB
+ * @ctdb_db: the database handle from ctdb_attachdb/ctdb_attachdb_recv.
+ * @lock: the lock from ctdb_readrecordlock/ctdb_readrecordlock_async
+ */
+void ctdb_release_lock(struct ctdb_db *ctdb_db, struct ctdb_lock *lock);
+
+
+
+/**
+ * ctdb_traverse_callback_t - callback for ctdb_traverse_async.
+ * return 0 - to continue traverse
+ * return 1 - to abort the traverse
+ *
+ * See Also:
+ *     ctdb_traverse_async()
+ */
+#define TRAVERSE_STATUS_RECORD         0
+#define TRAVERSE_STATUS_FINISHED       1
+#define TRAVERSE_STATUS_ERROR          2
+typedef int (*ctdb_traverse_callback_t)(struct ctdb_connection *ctdb,
+                                   struct ctdb_db *ctdb_db,
+                                   int status,
+                                   TDB_DATA key,
+                                   TDB_DATA data,
+                                   void *private_data);
+
+/**
+ * ctdb_traverse_async - traverse a database.
+ * @ctdb_db: the database handle from ctdb_attachdb/ctdb_attachdb_recv.
+ * @callback: the callback invoked for each record and on completion (typesafe).
+ * @cbdata: the argument to callback()
+ *
+ * This returns true on success.
+ * When successful, the callback will be invoked for each record
+ * until the traversal is finished.
+ *
+ * status == 
+ * TRAVERSE_STATUS_RECORD         key/data contains a record.
+ * TRAVERSE_STATUS_FINISHED       traverse is finished. key/data is undefined.
+ * TRAVERSE_STATUS_ERROR          an error occurred during traverse.
+ *                                key/data is undefined.
+ *
+ * If failure is immediate, false is returned.
+ */
+bool ctdb_traverse_async(struct ctdb_db *ctdb_db,
+                        ctdb_traverse_callback_t callback, void *cbdata);
+
+/**
+ * ctdb_message_fn_t - messaging callback for ctdb messages
+ *
+ * ctdbd provides a simple messaging API; you can register for a particular
+ * 64-bit id on which you want to receive messages, and send to other ids.
+ *
+ * See Also:
+ *     ctdb_set_message_handler_send()
+ */
+typedef void (*ctdb_message_fn_t)(struct ctdb_connection *,
+                                 uint64_t srvid, TDB_DATA data, void *);
+
+/**
+ * ctdb_set_message_handler_send - register for messages to a srvid
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @srvid: the 64 bit identifier for our messages.
+ * @handler: the callback when we receive such a message (typesafe)
+ * @handler_data: the argument to handler()
+ * @callback: the callback when ctdb replies to our message (typesafe)
+ * @cbdata: the argument to callback()
+ *
+ * Note: our callback will always be called before handler.
+ *
+ * See Also:
+ *     ctdb_set_message_handler_recv(), ctdb_remove_message_handler_send()
+ */
+struct ctdb_request *
+ctdb_set_message_handler_send(struct ctdb_connection *ctdb, uint64_t srvid,
+                             ctdb_message_fn_t handler,
+                             void *handler_data,
+                             ctdb_callback_t callback,
+                             void *cbdata);
+
+/**
+ * ctdb_set_message_handler_recv - read a set_message_handler result
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @handle: the completed request
+ *
+ * If this returns true, the registered handler may be called from the next
+ * ctdb_service().  If this returns false, the registration failed.
+ */
+bool ctdb_set_message_handler_recv(struct ctdb_connection *ctdb,
+                                  struct ctdb_request *handle);
+
+/**
+ * ctdb_remove_message_handler_send - unregister for messages to a srvid
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @srvid: the 64 bit identifier for our messages.
+ * @handler: the callback when we receive such a message (typesafe)
+ * @handler_data: the argument to handler()
+ * @callback: the callback when ctdb replies to our message (typesafe)
+ * @cbdata: the argument to callback()
+ *
+ * This undoes a successful ctdb_set_message_handler or
+ * ctdb_set_message_handler_recv.
+ */
+struct ctdb_request *
+ctdb_remove_message_handler_send(struct ctdb_connection *ctdb, uint64_t srvid,
+                                ctdb_message_fn_t handler, void *handler_data,
+                                ctdb_callback_t callback, void *cbdata);
+
+/**
+ * ctdb_remove_message_handler_recv - read a remove_message_handler result
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @req: the completed request
+ *
+ * After this returns true, the registered handler will no longer be called.
+ * If this returns false, the de-registration failed.
+ */
+bool ctdb_remove_message_handler_recv(struct ctdb_connection *ctdb,
+                                     struct ctdb_request *req);
+
+
+/**
+ * ctdb_send_message - send a message via ctdbd
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @pnn: the physical node number to send to
+ * @srvid: the 64 bit identifier for this message type.
+ * @data: the data to send
+ *
+ * This allows arbitrary messages to be sent across the cluster to those
+ * listening (via ctdb_set_message_handler et al).
+ *
+ * This queues a message to be sent: you will need to call
+ * ctdb_service() to actually send the message.  There is no callback
+ * because there is no acknowledgement.
+ *
+ * See Also:
+ *     ctdb_getpnn_send(), ctdb_getpnn()
+ */
+bool ctdb_send_message(struct ctdb_connection *ctdb, uint32_t pnn, uint64_t srvid, TDB_DATA data);
+
+/**
+ * ctdb_getpnn_send - read the pnn number of a node.
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @destnode: the destination node (see below)
+ * @callback: the callback when ctdb replies to our message (typesafe)
+ * @cbdata: the argument to callback()
+ *
+ * There are several special values for destnode, detailed in
+ * ctdb_protocol.h, particularly CTDB_CURRENT_NODE which means the
+ * local ctdbd.
+ */
+struct ctdb_request *
+ctdb_getpnn_send(struct ctdb_connection *ctdb,
+                uint32_t destnode,
+                ctdb_callback_t callback,
+                void *cbdata);
+/**
+ * ctdb_getpnn_recv - read a ctdb_getpnn reply from ctdbd
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @req: the completed request.
+ * @pnn: a pointer to the pnn to fill in
+ *
+ * This returns false if something went wrong, or otherwise fills in pnn.
+ */
+bool ctdb_getpnn_recv(struct ctdb_connection *ctdb,
+                     struct ctdb_request *req, uint32_t *pnn);
+
+
+/**
+ * ctdb_getdbstat_send - read statistics for a db
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @destnode: the destination node (see below)
+ * @db_id:    the database to collect the statistics from
+ * @callback: the callback when ctdb replies to our message (typesafe)
+ * @cbdata: the argument to callback()
+ *
+ * There are several special values for destnode, detailed in
+ * ctdb_protocol.h, particularly CTDB_CURRENT_NODE which means the
+ * local ctdbd.
+ */
+struct ctdb_request *
+ctdb_getdbstat_send(struct ctdb_connection *ctdb,
+                    uint32_t destnode,
+                    uint32_t db_id,
+                    ctdb_callback_t callback,
+                    void *cbdata);
+/**
+ * ctdb_getdbstat_recv - read a ctdb_getdbstat reply from ctdbd
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @req: the completed request.
+ * @stat: a pointer to the *stat to fill in
+ *
+ * This returns false if something went wrong, or otherwise fills in *stat.
+ * stat must be freed later by calling ctdb_free_dbstat().
+ */
+bool ctdb_getdbstat_recv(struct ctdb_connection *ctdb,
+                        struct ctdb_request *req,
+                        struct ctdb_db_statistics **stat);
+
+void ctdb_free_dbstat(struct ctdb_db_statistics *stat);
+
+/**
+ * ctdb_check_message_handlers_send - check a list of message_handlers
+ * if they are registered
+ * message_handlers are registered on the daemon using the
+ *   ctdb_set_message_handler_send() call
+ *
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @destnode: the destination node (see below)
+ * @num: number of srvids to check
+ * @mhs: @num message_handlers values to check
+ * @callback: the callback when ctdb replies to our message (typesafe)
+ * @cbdata: the argument to callback()
+ *
+ * There are several special values for destnode, detailed in
+ * ctdb_protocol.h, particularly CTDB_CURRENT_NODE which means the
+ * local ctdbd.
+ */
+struct ctdb_request *
+ctdb_check_message_handlers_send(struct ctdb_connection *ctdb,
+                uint32_t destnode,
+                uint32_t num,
+                uint64_t *mhs,
+                ctdb_callback_t callback,
+                void *cbdata);
+/**
+ * ctdb_check_message_handlers_recv - read a ctdb_check_message_handlers
+ * reply from ctdbd
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @req: the completed request.
+ * @num: number of message_handlers to check
+ * @result: an array of @num uint8_t fields containing the result of the check
+ *     0: message_handler does not exist
+ *     1: message_handler exists
+ *
+ * This returns false if something went wrong, or otherwise fills in result.
+ */
+bool
+ctdb_check_message_handlers_recv(struct ctdb_connection *ctdb,
+                                 struct ctdb_request *req, uint32_t num,
+                                 uint8_t *result);
+
+
+/**
+ * ctdb_getcapabilities_send - read the capabilities of a node
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @destnode: the destination node (see below)
+ * @callback: the callback when ctdb replies to our message (typesafe)
+ * @cbdata: the argument to callback()
+ *
+ * There are several special values for destnode, detailed in
+ * ctdb_protocol.h, particularly CTDB_CURRENT_NODE which means the
+ * local ctdbd.
+ */
+struct ctdb_request *
+ctdb_getcapabilities_send(struct ctdb_connection *ctdb,
+                         uint32_t destnode,
+                         ctdb_callback_t callback, void *cbdata);
+
+/**
+ * ctdb_getcapabilities_recv - read a ctdb_getcapabilities reply from ctdbd
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @handle: the completed request.
+ * @capabilities: a pointer to the capabilities to fill in
+ *
+ * This returns false if something went wrong, or otherwise fills in
+ * capabilities.
+ */
+bool ctdb_getcapabilities_recv(struct ctdb_connection *ctdb,
+                              struct ctdb_request *handle,
+                              uint32_t *capabilities);
+
+/**
+ * ctdb_getdbseqnum_send - read the sequence number off a db
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @destnode: the destination node (see below)
+ * @dbid: database id
+ * @callback: the callback when ctdb replies to our message (typesafe)
+ * @cbdata: the argument to callback()
+ *
+ * There are several special values for destnode, detailed in
+ * ctdb_protocol.h, particularly CTDB_CURRENT_NODE which means the
+ * local ctdbd.
+ */
+struct ctdb_request *
+ctdb_getdbseqnum_send(struct ctdb_connection *ctdb,
+                uint32_t destnode,
+                uint32_t dbid,
+                ctdb_callback_t callback,
+                void *cbdata);
+/**
+ * ctdb_getdbseqnum_recv - read the sequence number off a database
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @req: the completed request.
+ * @seqnum: a pointer to the seqnum to fill in
+ *
+ * This returns false if something went wrong, or otherwise fills in seqnum.
+ */
+bool ctdb_getdbseqnum_recv(struct ctdb_connection *ctdb,
+                     struct ctdb_request *req, uint64_t *seqnum);
+
+/**
+ * ctdb_getnodemap_send - read the nodemap from a node.
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @destnode: the destination node (see below)
+ * @callback: the callback when ctdb replies to our message (typesafe)
+ * @cbdata: the argument to callback()
+ *
+ * There are several special values for destnode, detailed in
+ * ctdb_protocol.h, particularly CTDB_CURRENT_NODE which means the
+ * local ctdbd.
+ */
+struct ctdb_request *
+ctdb_getnodemap_send(struct ctdb_connection *ctdb,
+                uint32_t destnode,
+                ctdb_callback_t callback,
+                void *cbdata);
+/**
+ * ctdb_getnodemap_recv - read a ctdb_getnodemap reply from ctdbd
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @req: the completed request.
+ * @nodemap: a pointer to the returned nodemap structure
+ *
+ * This returns false if something went wrong.
+ * If the command failed, it guarantees to set nodemap to NULL.
+ * A non-NULL value for nodemap means the command was successful.
+ *
+ * A non-NULL value of the nodemap must be released/freed
+ * by ctdb_free_nodemap().
+ */
+bool ctdb_getnodemap_recv(struct ctdb_connection *ctdb,
+                     struct ctdb_request *req, struct ctdb_node_map **nodemap);
+
+/**
+ * ctdb_getifaces_send - read the list of interfaces from a node.
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @destnode: the destination node (see below)
+ * @callback: the callback when ctdb replies to our message (typesafe)
+ * @cbdata: the argument to callback()
+ *
+ * There are several special values for destnode, detailed in
+ * ctdb_protocol.h, particularly CTDB_CURRENT_NODE which means the
+ * local ctdbd.
+ */
+struct ctdb_request *
+ctdb_getifaces_send(struct ctdb_connection *ctdb,
+                uint32_t destnode,
+                ctdb_callback_t callback,
+                void *cbdata);
+/**
+ * ctdb_getifaces_recv - read a ctdb_getifaces reply from ctdbd
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @req: the completed request.
+ * @ifaces: the list of interfaces 
+ *
+ * This returns false if something went wrong.
+ * If the command failed, it guarantees to set ifaces to NULL.
+ * A non-NULL value for ifaces means the command was successful.
+ *
+ * A non-NULL value of the ifaces must be released/freed
+ * by ctdb_free_ifaces().
+ */
+bool ctdb_getifaces_recv(struct ctdb_connection *ctdb,
+                     struct ctdb_request *req, struct ctdb_ifaces_list **ifaces);
+
+/* Free a datastructure returned by ctdb_getifaces[_recv] */
+void ctdb_free_ifaces(struct ctdb_ifaces_list *ifaces);
+
+/**
+ * ctdb_getpublicips_send - read the public ip list from a node.
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @destnode: the destination node (see below)
+ * @callback: the callback when ctdb replies to our message (typesafe)
+ * @cbdata: the argument to callback()
+ *
+ * This control returns the list of public ips known to the local node.
+ * Daemons only know about those ips that are listed in the local
+ * public addresses file, which means the returned list of ips may
+ * be only a subset of all ips across the entire cluster.
+ *
+ * There are several special values for destnode, detailed in
+ * ctdb_protocol.h, particularly CTDB_CURRENT_NODE which means the
+ * local ctdbd.
+ */
+struct ctdb_request *
+ctdb_getpublicips_send(struct ctdb_connection *ctdb,
+                uint32_t destnode,
+                ctdb_callback_t callback,
+                void *cbdata);
+/**
+ * ctdb_getpublicips_recv - read the public ip list from a node
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @req: the completed request.
+ * @ips: a pointer to the returned public ip list
+ *
+ * This returns false if something went wrong.
+ * If the command failed, it guarantees to set ips to NULL.
+ * A non-NULL value for ips means the command was successful.
+ *
+ * A non-NULL value of ips must be released/freed
+ * by ctdb_free_publicips().
+ */
+bool ctdb_getpublicips_recv(struct ctdb_connection *ctdb,
+                     struct ctdb_request *req, struct ctdb_all_public_ips **ips);
+
+
+/**
+ * ctdb_getrecmaster_send - read the recovery master of a node
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @destnode: the destination node (see below)
+ * @callback: the callback when ctdb replies to our message (typesafe)
+ * @cbdata: the argument to callback()
+ *
+ * There are several special values for destnode, detailed in
+ * ctdb_protocol.h, particularly CTDB_CURRENT_NODE which means the
+ * local ctdbd.
+ */
+struct ctdb_request *
+ctdb_getrecmaster_send(struct ctdb_connection *ctdb,
+                       uint32_t destnode,
+                       ctdb_callback_t callback, void *cbdata);
+
+/**
+ * ctdb_getrecmaster_recv - read a ctdb_getrecmaster reply from ctdbd
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @handle: the completed request.
+ * @recmaster: a pointer to the recmaster to fill in
+ *
+ * This returns false if something went wrong, or otherwise fills in
+ * recmaster.
+ */
+bool ctdb_getrecmaster_recv(struct ctdb_connection *ctdb,
+                           struct ctdb_request *handle,
+                           uint32_t *recmaster);
+
+/**
+ * ctdb_getrecmode_send - read the recovery mode of a node
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @destnode: the destination node (see below)
+ * @callback: the callback when ctdb replies to our message (typesafe)
+ * @cbdata: the argument to callback()
+ *
+ * There are several special values for destnode, detailed in
+ * ctdb_protocol.h, particularly CTDB_CURRENT_NODE which means the
+ * local ctdbd.
+ */
+struct ctdb_request *
+ctdb_getrecmode_send(struct ctdb_connection *ctdb,
+                    uint32_t destnode,
+                    ctdb_callback_t callback, void *cbdata);
+
+/**
+ * ctdb_getrecmode_recv - read a ctdb_getrecmode reply from ctdbd
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @handle: the completed request.
+ * @recmode: a pointer to the recmode to fill in
+ *
+ * This returns false if something went wrong, or otherwise fills in
+ * recmode.
+ */
+bool ctdb_getrecmode_recv(struct ctdb_connection *ctdb,
+                         struct ctdb_request *handle,
+                         uint32_t *recmode);
+
+/**
+ * ctdb_getvnnmap_send - read the vnn map from a node.
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @destnode: the destination node (see below)
+ * @callback: the callback when ctdb replies to our message (typesafe)
+ * @cbdata: the argument to callback()
+ *
+ * There are several special values for destnode, detailed in
+ * ctdb_protocol.h, particularly CTDB_CURRENT_NODE which means the
+ * local ctdbd.
+ */
+struct ctdb_request *
+ctdb_getvnnmap_send(struct ctdb_connection *ctdb,
+                   uint32_t destnode,
+                   ctdb_callback_t callback,
+                   void *cbdata);
+/**
+ * ctdb_getvnnmap_recv - read a ctdb_getvnnmap reply from ctdbd
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @req: the completed request.
+ * @vnnmap: a pointer to the returned vnnmap structure
+ *
+ * This returns false if something went wrong.
+ * If the command failed, it guarantees to set vnnmap to NULL.
+ * A non-NULL value for vnnmap means the command was successful.
+ *
+ * A non-NULL value of the vnnmap must be released/freed
+ * by ctdb_free_vnnmap().
+ */
+bool ctdb_getvnnmap_recv(struct ctdb_connection *ctdb,
+                        struct ctdb_request *req, struct ctdb_vnn_map **vnnmap);
+
+/**
+ * ctdb_cancel - cancel an uncompleted request
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @req: the uncompleted request.
+ *
+ * This cancels a request.  You may not cancel a
+ * request which has already been completed (ie. once its callback has
+ * been called); you should simply use ctdb_request_free() in that case.
+ */
+void ctdb_cancel(struct ctdb_connection *ctdb, struct ctdb_request *req);
+
+/***
+ *
+ *  Synchronous API
+ *
+ ***/
+
+/**
+ * ctdb_attachdb - open a clustered TDB (synchronous)
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @name: the filename of the database (no /).
+ * @persistent: whether the database is persistent across ctdbd's life
+ * @tdb_flags: the flags to pass to tdb_open.
+ *
+ * Do a ctdb_attachdb_send and wait for it to complete.
+ * Returns NULL on failure.
+ */
+struct ctdb_db *ctdb_attachdb(struct ctdb_connection *ctdb,
+                             const char *name, bool persistent,
+                             uint32_t tdb_flags);
+
+/**
+ * ctdb_detachdb - close a clustered TDB.
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @db: the database from ctdb_attachdb/ctdb_attachdb_recv
+ *
+ * Closes a clustered tdb.
+ */
+void ctdb_detachdb(struct ctdb_connection *ctdb, struct ctdb_db *db);
+
+/**
+ * ctdb_readrecordlock - read and lock a record (synchronous)
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @ctdb_db: the database handle from ctdb_attachdb/ctdb_attachdb_recv.
+ * @key: the key of the record to lock.
+ * @data: a pointer to the TDB_DATA to fill in with the record's data.
+ *
+ * Do a ctdb_readrecordlock_async and wait for it to complete.
+ * Returns NULL on failure.
+ */
+struct ctdb_lock *ctdb_readrecordlock(struct ctdb_connection *ctdb,
+                                     struct ctdb_db *ctdb_db, TDB_DATA key,
+                                     TDB_DATA *data);
+
+
+/**
+ * ctdb_set_message_handler - register for messages to a srvid (synchronous)
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @srvid: the 64 bit identifier for our messages.
+ * @handler: the callback when we receive such a message (typesafe)
+ * @cbdata: the argument to handler()
+ *
+ * If this returns true, the message handler can be called from any
+ * ctdb_service() (which is also called indirectly by other
+ * synchronous functions).  If this returns false, the registration
+ * failed.
+ */
+bool ctdb_set_message_handler(struct ctdb_connection *ctdb, uint64_t srvid,
+                             ctdb_message_fn_t handler, void *cbdata);
+
+
+/**
+ * ctdb_remove_message_handler - deregister for messages (synchronous)
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @srvid: the 64 bit identifier for our messages.
+ * @handler: the callback when we receive such a message (typesafe)
+ * @handler_data: the argument to handler()
+ *
+ * If this returns true, the message handler will no longer be called.
+ * If this returns false, the deregistration failed.
+ */
+bool ctdb_remove_message_handler(struct ctdb_connection *ctdb, uint64_t srvid,
+                                ctdb_message_fn_t handler, void *handler_data);
+
+/**
+ * ctdb_getpnn - read the pnn number of a node (synchronous)
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @destnode: the destination node (see below)
+ * @pnn: a pointer to the pnn to fill in
+ *
+ * There are several special values for destnode, detailed in
+ * ctdb_protocol.h, particularly CTDB_CURRENT_NODE which means the
+ * local ctdbd.
+ *
+ * Returns true and fills in *pnn on success.
+ */
+bool ctdb_getpnn(struct ctdb_connection *ctdb,
+                uint32_t destnode,
+                uint32_t *pnn);
+
+/**
+ * ctdb_getdbstat - read the db stat of a node (synchronous)
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @destnode: the destination node (see below)
+ * @db_id:    the database to collect the statistics from
+ * @stat: a pointer to the *stat to fill in
+ *
+ * There are several special values for destnode, detailed in
+ * ctdb_protocol.h, particularly CTDB_CURRENT_NODE which means the
+ * local ctdbd.
+ *
+ * This returns false if something went wrong, or otherwise fills in *stat.
+ * stat must be freed later by calling ctdb_free_dbstat();
+ */
+bool ctdb_getdbstat(struct ctdb_connection *ctdb,
+                   uint32_t destnode,
+                   uint32_t db_id,
+                   struct ctdb_db_statistics **stat);
+
+
+/**
+ * ctdb_check_message_handlers - check a list of message_handlers (synchronous)
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @destnode: the destination node (see below)
+ * @num: number of srvids to check
+ * @mhs: @num message_handlers to check
+ * @result: an array of @num uint8_t fields containing the result of the check
+ *     0: message_handler does not exist
+ *     1: message_handler exists
+ *
+ * There are several special values for destnode, detailed in
+ * ctdb_protocol.h, particularly CTDB_CURRENT_NODE which means the
+ * local ctdbd.
+ */
+bool
+ctdb_check_message_handlers(struct ctdb_connection *ctdb,
+                          uint32_t destnode,
+                          uint32_t num,
+                          uint64_t *mhs,
+                          uint8_t *result);
+
+/**
+ * ctdb_getcapabilities - read the capabilities of a node (synchronous)
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @destnode: the destination node (see below)
+ * @capabilities: a pointer to the capabilities to fill in
+ *
+ * There are several special values for destnode, detailed in
+ * ctdb_protocol.h, particularly CTDB_CURRENT_NODE which means the
+ * local ctdbd.
+ *
+ * Returns true and fills in *capabilities on success.
+ */
+bool ctdb_getcapabilities(struct ctdb_connection *ctdb,
+                         uint32_t destnode,
+                         uint32_t *capabilities);
+
+
+/**
+ * ctdb_getdbseqnum - read the seqnum of a database
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @destnode: the destination node (see below)
+ * @dbid: database id
+ * @seqnum: sequence number for the database
+ *
+ * There are several special values for destnode, detailed in
+ * ctdb_protocol.h, particularly CTDB_CURRENT_NODE which means the
+ * local ctdbd.
+ *
+ * Returns true and fills in *seqnum on success.
+ */
+bool
+ctdb_getdbseqnum(struct ctdb_connection *ctdb,
+                uint32_t destnode,
+                uint32_t dbid,
+                uint64_t *seqnum);
+
+/**
+ * ctdb_getrecmaster - read the recovery master of a node (synchronous)
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @destnode: the destination node (see below)
+ * @recmaster: a pointer to the recmaster to fill in
+ *
+ * There are several special values for destnode, detailed in
+ * ctdb_protocol.h, particularly CTDB_CURRENT_NODE which means the
+ * local ctdbd.
+ *
+ * Returns true and fills in *recmaster on success.
+ */
+bool ctdb_getrecmaster(struct ctdb_connection *ctdb,
+                      uint32_t destnode,
+                      uint32_t *recmaster);
+
+
+/**
+ * ctdb_getrecmode - read the recovery mode of a node (synchronous)
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @destnode: the destination node (see below)
+ * @recmode: a pointer to the recmode to fill in
+ *
+ * There are several special values for destnode, detailed in
+ * ctdb_protocol.h, particularly CTDB_CURRENT_NODE which means the
+ * local ctdbd.
+ *
+ * Returns true and fills in *recmode on success.
+ */
+bool ctdb_getrecmode(struct ctdb_connection *ctdb,
+                    uint32_t destnode,
+                    uint32_t *recmode);
+
+
+/**
+ * ctdb_getnodemap - read the nodemap from a node (synchronous)
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @destnode: the destination node (see below)
+ * @nodemap: a pointer to the nodemap to fill in
+ *
+ * There are several special values for destnode, detailed in
+ * ctdb_protocol.h, particularly CTDB_CURRENT_NODE which means the
+ * local ctdbd.
+ *
+ * Returns true and fills in *nodemap on success.
+ * A non-NULL nodemap must be freed by calling ctdb_free_nodemap.
+ */
+bool ctdb_getnodemap(struct ctdb_connection *ctdb,
+                    uint32_t destnode, struct ctdb_node_map **nodemap);
+
+/**
+ * ctdb_getifaces - read the list of interfaces from a node (synchronous)
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @destnode: the destination node (see below)
+ * @ifaces: a pointer to the ifaces to fill in
+ *
+ * There are several special values for destnode, detailed in
+ * ctdb_protocol.h, particularly CTDB_CURRENT_NODE which means the
+ * local ctdbd.
+ *
+ * Returns true and fills in *ifaces on success.
+ * A non-NULL value of the ifaces must be released/freed
+ * by ctdb_free_ifaces().
+ */
+bool ctdb_getifaces(struct ctdb_connection *ctdb,
+                   uint32_t destnode, struct ctdb_ifaces_list **ifaces);
+
+/*
+ * This function is used to release/free the nodemap structure returned
+ * by ctdb_getnodemap() and ctdb_getnodemap_recv()
+ */
+void ctdb_free_nodemap(struct ctdb_node_map *nodemap);
+
+
+/**
+ * ctdb_getpublicips - read the public ip list from a node.
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @destnode: the destination node (see below)
+ * @ips: a pointer to the returned public ip list
+ *
+ * This control returns the list of public ips known to the local node.
+ * Daemons only know about those ips that are listed in the local
+ * public addresses file, which means the returned list of ips may
+ * be only a subset of all ips across the entire cluster.
+ *
+ * There are several special values for destnode, detailed in
+ * ctdb_protocol.h, particularly CTDB_CURRENT_NODE which means the
+ * local ctdbd.
+ *
+ * This returns false if something went wrong.
+ * If the command failed, it guarantees to set ips to NULL.
+ * A non-NULL value for ips means the command was successful.
+ *
+ * A non-NULL value of ips must be released/freed
+ * by ctdb_free_publicips().
+ */
+bool ctdb_getpublicips(struct ctdb_connection *ctdb,
+                    uint32_t destnode, struct ctdb_all_public_ips **ips);
+
+/*
+ * This function is used to release/free the public ip structure returned
+ * by ctdb_getpublicips() and ctdb_getpublicips_recv()
+ */
+void ctdb_free_publicips(struct ctdb_all_public_ips *ips);
+
+
+/**
+ * ctdb_getvnnmap - read the vnn map from a node (synchronous)
+ * @ctdb: the ctdb_connection from ctdb_connect.
+ * @destnode: the destination node (see below)
+ * @vnnmap: a pointer to the vnnmap to fill in
+ *
+ * There are several special values for destnode, detailed in
+ * ctdb_protocol.h, particularly CTDB_CURRENT_NODE which means the
+ * local ctdbd.
+ *
+ * Returns true and fills in *vnnmap on success.
+ * A non-NULL value of the vnnmap must be released/freed
+ * by ctdb_free_vnnmap().
+ */
+bool ctdb_getvnnmap(struct ctdb_connection *ctdb,
+                   uint32_t destnode, struct ctdb_vnn_map **vnnmap);
+
+/*
+ * This function is used to release/free the vnnmap structure returned
+ * by ctdb_getvnnmap() and ctdb_getvnnmap_recv()
+ */
+void ctdb_free_vnnmap(struct ctdb_vnn_map *vnnmap);
+
+/* These ugly macro wrappers make the callbacks typesafe. */
+#include <ctdb_typesafe_cb.h>
+#define ctdb_sendcb(cb, cbdata)                                                \
+        typesafe_cb_preargs(void, (cb), (cbdata),                      \
+                            struct ctdb_connection *, struct ctdb_request *)
+
+#define ctdb_msgcb(cb, cbdata)                                         \
+       typesafe_cb_preargs(void, (cb), (cbdata),                       \
+                           struct ctdb_connection *, uint64_t, TDB_DATA)
+
+#define ctdb_connect(addr, log, logpriv)                               \
+       ctdb_connect((addr),                                            \
+                    typesafe_cb_postargs(void, (log), (logpriv),       \
+                                         int, const char *, va_list),  \
+                    (logpriv))
+
+#define ctdb_set_message_handler(ctdb, srvid, handler, hdata)          \
+       ctdb_set_message_handler((ctdb), (srvid),                       \
+                                ctdb_msgcb((handler), (hdata)), (hdata))
+
+#define ctdb_remove_message_handler(ctdb, srvid, handler, hdata)       \
+       ctdb_remove_message_handler((ctdb), (srvid),                    \
+                                   ctdb_msgcb((handler), (hdata)), (hdata))
+
+#define ctdb_attachdb_send(ctdb, name, persistent, tdb_flags, cb, cbdata) \
+       ctdb_attachdb_send((ctdb), (name), (persistent), (tdb_flags),   \
+                          ctdb_sendcb((cb), (cbdata)), (cbdata))
+
+#define ctdb_readrecordlock_async(_ctdb_db, key, cb, cbdata)           \
+       ctdb_readrecordlock_async((_ctdb_db), (key),                    \
+               typesafe_cb_preargs(void, (cb), (cbdata),               \
+                                   struct ctdb_db *, struct ctdb_lock *, \
+                                   TDB_DATA), (cbdata))
+
+#define ctdb_set_message_handler_send(ctdb, srvid, handler, hdata, cb, cbdata) \
+       ctdb_set_message_handler_send((ctdb), (srvid),                  \
+                                     ctdb_msgcb((handler), (hdata)), (hdata), \
+                                     ctdb_sendcb((cb), (cbdata)), (cbdata))
+
+#define ctdb_remove_message_handler_send(ctdb, srvid, handler, hdata, cb, cbdata) \
+       ctdb_remove_message_handler_send((ctdb), (srvid),               \
+             ctdb_msgcb((handler), (hdata)), (hdata),                  \
+             ctdb_sendcb((cb), (cbdata)), (cbdata))
+
+#define ctdb_getpnn_send(ctdb, destnode, cb, cbdata)                   \
+       ctdb_getpnn_send((ctdb), (destnode),                            \
+                        ctdb_sendcb((cb), (cbdata)), (cbdata))
+
+#define ctdb_getcapabilities_send(ctdb, destnode, cb, cbdata)          \
+       ctdb_getcapabilities_send((ctdb), (destnode),                   \
+                                 ctdb_sendcb((cb), (cbdata)), (cbdata))
+
+#define ctdb_getdbstat_send(ctdb, destnode, db_id, cb, cbdata)         \
+       ctdb_getdbstat_send((ctdb), (destnode), (db_id),                \
+                           ctdb_sendcb((cb), (cbdata)), (cbdata))
+
+#define ctdb_check_message_handlers_send(ctdb, destnode, num, mhs,     \
+                        cb, cbdata)                                    \
+       ctdb_check_message_handlers_send((ctdb), (destnode), (num),     \
+                        (mhs),                                         \
+                        ctdb_sendcb((cb), (cbdata)), (cbdata))
+
+#define ctdb_getrecmaster_send(ctdb, destnode, cb, cbdata)             \
+       ctdb_getrecmaster_send((ctdb), (destnode),                      \
+                              ctdb_sendcb((cb), (cbdata)), (cbdata))
+
+#define ctdb_getrecmode_send(ctdb, destnode, cb, cbdata)               \
+       ctdb_getrecmode_send((ctdb), (destnode),                        \
+                              ctdb_sendcb((cb), (cbdata)), (cbdata))
+
+#define ctdb_getnodemap_send(ctdb, destnode, cb, cbdata)               \
+       ctdb_getnodemap_send((ctdb), (destnode),                        \
+                        ctdb_sendcb((cb), (cbdata)), (cbdata))
+
+#define ctdb_getpublicips_send(ctdb, destnode, cb, cbdata)             \
+       ctdb_getpublicips_send((ctdb), (destnode),                      \
+                        ctdb_sendcb((cb), (cbdata)), (cbdata))
+
+#define ctdb_getdbseqnum_send(ctdb, destnode, dbid, cb, cbdata)                \
+       ctdb_getdbseqnum_send((ctdb), (destnode), (dbid),               \
+                        ctdb_sendcb((cb), (cbdata)), (cbdata))
+
+#define ctdb_getifaces_send(ctdb, destnode, cb, cbdata)                        \
+       ctdb_getifaces_send((ctdb), (destnode),                         \
+                        ctdb_sendcb((cb), (cbdata)), (cbdata))
+
+#define ctdb_getvnnmap_send(ctdb, destnode, cb, cbdata)                        \
+       ctdb_getvnnmap_send((ctdb), (destnode),                         \
+                        ctdb_sendcb((cb), (cbdata)), (cbdata))
+
+#endif
diff --git a/ctdb/include/ctdb_client.h b/ctdb/include/ctdb_client.h
new file mode 100644 (file)
index 0000000..53b0829
--- /dev/null
@@ -0,0 +1,623 @@
+/*
+   ctdb database library: old client interface
+
+   Copyright (C) Andrew Tridgell  2006
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CTDB_CLIENT_H
+#define _CTDB_CLIENT_H
+#include "ctdb_protocol.h"
+
+enum control_state {CTDB_CONTROL_WAIT, CTDB_CONTROL_DONE, CTDB_CONTROL_ERROR, CTDB_CONTROL_TIMEOUT};
+
+struct ctdb_client_control_state {
+       struct ctdb_context *ctdb;
+       uint32_t reqid;
+       int32_t status;
+       TDB_DATA outdata;
+       enum control_state state;
+       char *errormsg;
+       struct ctdb_req_control *c;
+
+       /* if we have a callback registered for the completion (or failure) of
+          this control
+          if a callback is used, it MUST talloc_free the cb_data passed to it
+       */
+       struct {
+               void (*fn)(struct ctdb_client_control_state *);
+               void *private_data;
+       } async;
+};
+
+struct ctdb_client_notify_register {
+       uint64_t srvid;
+       uint32_t len;
+       uint8_t notify_data[1];
+};
+
+struct ctdb_client_notify_deregister {
+       uint64_t srvid;
+};
+
+struct tevent_context;
+
+/*
+  initialise ctdb subsystem
+*/
+struct ctdb_context *ctdb_init(struct tevent_context *ev);
+
+/*
+  choose the transport
+*/
+int ctdb_set_transport(struct ctdb_context *ctdb, const char *transport);
+
+/*
+  set some flags
+*/
+void ctdb_set_flags(struct ctdb_context *ctdb, unsigned flags);
+
+/*
+  tell ctdb what address to listen on, in transport specific format
+*/
+int ctdb_set_address(struct ctdb_context *ctdb, const char *address);
+
+int ctdb_set_socketname(struct ctdb_context *ctdb, const char *socketname);
+const char *ctdb_get_socketname(struct ctdb_context *ctdb);
+
+/*
+  Check that a specific ip address exists in the node list and return
+  the id for the node or -1
+*/
+int ctdb_ip_to_nodeid(struct ctdb_context *ctdb, const char *nodeip);
+
+/*
+  start the ctdb protocol
+*/
+int ctdb_start(struct ctdb_context *ctdb);
+
+/*
+  attach to a ctdb database
+*/
+struct ctdb_db_context *ctdb_attach(struct ctdb_context *ctdb,
+                                   struct timeval timeout,
+                                   const char *name,
+                                   bool persistent,
+                                   uint32_t tdb_flags);
+
+
+/*
+  find an attached ctdb_db handle given a name
+ */
+struct ctdb_db_context *ctdb_db_handle(struct ctdb_context *ctdb, const char *name);
+
+/*
+  error string for last ctdb error
+*/
+const char *ctdb_errstr(struct ctdb_context *);
+
+/* a ctdb call function */
+typedef int (*ctdb_fn_t)(struct ctdb_call_info *);
+
+/*
+  setup a ctdb call function
+*/
+int ctdb_set_call(struct ctdb_db_context *ctdb_db, ctdb_fn_t fn, uint32_t id);
+
+
+
+/*
+  make a ctdb call. The associated ctdb call function will be called on the DMASTER
+  for the given record
+*/
+int ctdb_call(struct ctdb_db_context *ctdb_db, struct ctdb_call *call);
+
+/*
+  initiate an ordered ctdb cluster shutdown
+  this function will never return
+*/
+void ctdb_shutdown(struct ctdb_context *ctdb);
+
+/* return pnn of this node */
+uint32_t ctdb_get_pnn(struct ctdb_context *ctdb);
+
+/*
+  return the number of nodes
+*/
+uint32_t ctdb_get_num_nodes(struct ctdb_context *ctdb);
+
+/* setup a handler for ctdb messages */
+typedef void (*ctdb_msg_fn_t)(struct ctdb_context *, uint64_t srvid,
+                                 TDB_DATA data, void *);
+int ctdb_client_set_message_handler(struct ctdb_context *ctdb, uint64_t srvid,
+                            ctdb_msg_fn_t handler,
+                            void *private_data);
+int ctdb_client_remove_message_handler(struct ctdb_context *ctdb,
+                                      uint64_t srvid, void *private_data);
+int ctdb_client_check_message_handlers(struct ctdb_context *ctdb,
+                                      uint64_t *ids, uint32_t num,
+                                      uint8_t *result);
+
+int ctdb_call(struct ctdb_db_context *ctdb_db, struct ctdb_call *call);
+struct ctdb_client_call_state *ctdb_call_send(struct ctdb_db_context *ctdb_db, struct ctdb_call *call);
+int ctdb_call_recv(struct ctdb_client_call_state *state, struct ctdb_call *call);
+
+/* send a ctdb message */
+int ctdb_client_send_message(struct ctdb_context *ctdb, uint32_t pnn,
+                     uint64_t srvid, TDB_DATA data);
+
+
+/*
+   Fetch a ctdb record from a remote node.
+   Underneath this will force the
+   dmaster for the record to be moved to the local node.
+*/
+struct ctdb_record_handle *ctdb_fetch_lock(struct ctdb_db_context *ctdb_db, TALLOC_CTX *mem_ctx,
+                                          TDB_DATA key, TDB_DATA *data);
+
+struct ctdb_record_handle *ctdb_fetch_readonly_lock(struct ctdb_db_context *ctdb_db, TALLOC_CTX *mem_ctx, TDB_DATA key, TDB_DATA *data, int read_only);
+
+int ctdb_record_store(struct ctdb_record_handle *h, TDB_DATA data);
+
+int ctdb_fetch(struct ctdb_db_context *ctdb_db, TALLOC_CTX *mem_ctx,
+              TDB_DATA key, TDB_DATA *data);
+
+int ctdb_register_message_handler(struct ctdb_context *ctdb,
+                                 TALLOC_CTX *mem_ctx,
+                                 uint64_t srvid,
+                                 ctdb_msg_fn_t handler,
+                                 void *private_data);
+
+struct ctdb_db_context *find_ctdb_db(struct ctdb_context *ctdb, uint32_t id);
+
+
+struct ctdb_context *ctdb_cmdline_client(struct tevent_context *ev,
+                                        struct timeval req_timeout);
+
+struct ctdb_statistics;
+int ctdb_ctrl_statistics(struct ctdb_context *ctdb, uint32_t destnode, struct ctdb_statistics *status);
+int ctdb_ctrl_dbstatistics(struct ctdb_context *ctdb, uint32_t destnode, uint32_t dbid,
+                          TALLOC_CTX *mem_ctx, struct ctdb_db_statistics **dbstat);
+
+int ctdb_ctrl_shutdown(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode);
+
+struct ctdb_vnn_map;
+int ctdb_ctrl_getvnnmap(struct ctdb_context *ctdb,
+               struct timeval timeout, uint32_t destnode,
+               TALLOC_CTX *mem_ctx, struct ctdb_vnn_map **vnnmap);
+int ctdb_ctrl_setvnnmap(struct ctdb_context *ctdb,
+               struct timeval timeout, uint32_t destnode,
+               TALLOC_CTX *mem_ctx, struct ctdb_vnn_map *vnnmap);
+
+/* table that contains a list of all dbids on a node
+ */
+struct ctdb_dbid_map {
+       uint32_t num;
+       struct ctdb_dbid {
+               uint32_t dbid;
+#define CTDB_DB_FLAGS_PERSISTENT       0x01
+#define CTDB_DB_FLAGS_READONLY         0x02
+#define CTDB_DB_FLAGS_STICKY           0x04
+               uint8_t flags;
+       } dbs[1];
+};
+int ctdb_ctrl_getdbmap(struct ctdb_context *ctdb,
+       struct timeval timeout, uint32_t destnode,
+       TALLOC_CTX *mem_ctx, struct ctdb_dbid_map **dbmap);
+
+
+struct ctdb_node_map;
+
+int ctdb_ctrl_getnodemap(struct ctdb_context *ctdb,
+                   struct timeval timeout, uint32_t destnode,
+                   TALLOC_CTX *mem_ctx, struct ctdb_node_map **nodemap);
+
+int ctdb_ctrl_getnodemapv4(struct ctdb_context *ctdb,
+                   struct timeval timeout, uint32_t destnode,
+                   TALLOC_CTX *mem_ctx, struct ctdb_node_map **nodemap);
+
+int ctdb_ctrl_reload_nodes_file(struct ctdb_context *ctdb,
+                   struct timeval timeout, uint32_t destnode);
+
+struct ctdb_key_list {
+       uint32_t dbid;
+       uint32_t num;
+       TDB_DATA *keys;
+       struct ctdb_ltdb_header *headers;
+       TDB_DATA *data;
+};
+
+int ctdb_ctrl_pulldb(
+       struct ctdb_context *ctdb, uint32_t destnode, uint32_t dbid,
+       uint32_t lmaster, TALLOC_CTX *mem_ctx,
+       struct timeval timeout, TDB_DATA *outdata);
+
+struct ctdb_client_control_state *ctdb_ctrl_pulldb_send(
+       struct ctdb_context *ctdb, uint32_t destnode, uint32_t dbid,
+       uint32_t lmaster, TALLOC_CTX *mem_ctx, struct timeval timeout);
+
+int ctdb_ctrl_pulldb_recv(
+       struct ctdb_context *ctdb,
+       TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state,
+       TDB_DATA *outdata);
+
+int ctdb_ctrl_pushdb(
+       struct ctdb_context *ctdb, uint32_t destnode, uint32_t dbid,
+       TALLOC_CTX *mem_ctx,
+       struct timeval timeout, TDB_DATA indata);
+
+struct ctdb_client_control_state *ctdb_ctrl_pushdb_send(
+       struct ctdb_context *ctdb, uint32_t destnode, uint32_t dbid,
+       TALLOC_CTX *mem_ctx, struct timeval timeout,
+       TDB_DATA indata);
+
+int ctdb_ctrl_pushdb_recv(
+       struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
+       struct ctdb_client_control_state *state);
+
+
+int ctdb_ctrl_copydb(struct ctdb_context *ctdb,
+       struct timeval timeout, uint32_t sourcenode,
+       uint32_t destnode, uint32_t dbid, uint32_t lmaster,
+       TALLOC_CTX *mem_ctx);
+
+int ctdb_ctrl_getdbpath(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t dbid, TALLOC_CTX *mem_ctx, const char **path);
+int ctdb_ctrl_getdbname(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t dbid, TALLOC_CTX *mem_ctx, const char **name);
+int ctdb_ctrl_getdbhealth(struct ctdb_context *ctdb,
+                         struct timeval timeout,
+                         uint32_t destnode,
+                         uint32_t dbid, TALLOC_CTX *mem_ctx,
+                         const char **reason);
+int ctdb_ctrl_getdbseqnum(struct ctdb_context *ctdb, struct timeval timeout,
+                         uint32_t destnode, uint32_t dbid, uint64_t *seqnum);
+int ctdb_ctrl_createdb(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, TALLOC_CTX *mem_ctx, const char *name, bool persistent);
+
+int ctdb_ctrl_process_exists(struct ctdb_context *ctdb, uint32_t destnode, pid_t pid);
+
+int ctdb_ctrl_ping(struct ctdb_context *ctdb, uint32_t destnode);
+
+int ctdb_ctrl_get_runstate(struct ctdb_context *ctdb, 
+                          struct timeval timeout, 
+                          uint32_t destnode,
+                          uint32_t *runstate);
+
+int ctdb_ctrl_get_config(struct ctdb_context *ctdb);
+
+int ctdb_ctrl_get_debuglevel(struct ctdb_context *ctdb, uint32_t destnode, int32_t *level);
+int ctdb_ctrl_set_debuglevel(struct ctdb_context *ctdb, uint32_t destnode, int32_t level);
+
+/*
+  change dmaster for all keys in the database to the new value
+ */
+int ctdb_ctrl_setdmaster(struct ctdb_context *ctdb,
+       struct timeval timeout, uint32_t destnode,
+       TALLOC_CTX *mem_ctx, uint32_t dbid, uint32_t dmaster);
+
+/*
+  write a record on a specific db (this implicitly updates dmaster of the record to locally be the vnn of the node where the control is executed)
+ */
+int ctdb_ctrl_write_record(struct ctdb_context *ctdb, uint32_t destnode, TALLOC_CTX *mem_ctx, uint32_t dbid, TDB_DATA key, TDB_DATA data);
+
+#define CTDB_RECOVERY_NORMAL           0
+#define CTDB_RECOVERY_ACTIVE           1
+
+/*
+  get the recovery mode of a remote node
+ */
+int ctdb_ctrl_getrecmode(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, uint32_t *recmode);
+
+struct ctdb_client_control_state *ctdb_ctrl_getrecmode_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode);
+
+int ctdb_ctrl_getrecmode_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, uint32_t *recmode);
+
+
+/*
+  set the recovery mode of a remote node
+ */
+int ctdb_ctrl_setrecmode(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t recmode);
+/*
+  get the monitoring mode of a remote node
+ */
+int ctdb_ctrl_getmonmode(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t *monmode);
+
+/*
+  set the monitoring mode of a remote node to active
+ */
+int ctdb_ctrl_enable_monmode(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode);
+
+/*
+  set the monitoring mode of a remote node to disabled
+ */
+int ctdb_ctrl_disable_monmode(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode);
+
+
+/*
+  get the recovery master of a remote node
+ */
+int ctdb_ctrl_getrecmaster(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, uint32_t *recmaster);
+
+struct ctdb_client_control_state *ctdb_ctrl_getrecmaster_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode);
+
+int ctdb_ctrl_getrecmaster_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, uint32_t *recmaster);
+
+
+
+/*
+  set the recovery master of a remote node
+ */
+int ctdb_ctrl_setrecmaster(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t recmaster);
+
+uint32_t *ctdb_get_connected_nodes(struct ctdb_context *ctdb,
+                                  struct timeval timeout,
+                                  TALLOC_CTX *mem_ctx,
+                                  uint32_t *num_nodes);
+
+int ctdb_statistics_reset(struct ctdb_context *ctdb, uint32_t destnode);
+
+int ctdb_set_logfile(struct ctdb_context *ctdb, const char *logfile, bool use_syslog);
+
+typedef int (*ctdb_traverse_func)(struct ctdb_context *, TDB_DATA, TDB_DATA, void *);
+int ctdb_traverse(struct ctdb_db_context *ctdb_db, ctdb_traverse_func fn, void *private_data);
+
+struct ctdb_dump_db_context {
+       FILE *f;
+       bool printemptyrecords;
+       bool printdatasize;
+       bool printlmaster;
+       bool printhash;
+       bool printrecordflags;
+};
+
+int ctdb_dumpdb_record(struct ctdb_context *ctdb, TDB_DATA key, TDB_DATA data, void *p);
+int ctdb_dump_db(struct ctdb_db_context *ctdb_db,
+                struct ctdb_dump_db_context *ctx);
+
+/*
+  get the pid of a ctdb daemon
+ */
+int ctdb_ctrl_getpid(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t *pid);
+
+int ctdb_ctrl_freeze(struct ctdb_context *ctdb, struct timeval timeout,
+                       uint32_t destnode);
+int ctdb_ctrl_freeze_priority(struct ctdb_context *ctdb, struct timeval timeout,
+                             uint32_t destnode, uint32_t priority);
+
+struct ctdb_client_control_state *
+ctdb_ctrl_freeze_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
+                     struct timeval timeout, uint32_t destnode,
+                     uint32_t priority);
+
+int ctdb_ctrl_freeze_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
+                       struct ctdb_client_control_state *state);
+
+int ctdb_ctrl_thaw_priority(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t priority);
+int ctdb_ctrl_thaw(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode);
+
+int ctdb_ctrl_getpnn(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode);
+
+int ctdb_ctrl_get_tunable(struct ctdb_context *ctdb,
+                         struct timeval timeout,
+                         uint32_t destnode,
+                         const char *name, uint32_t *value);
+
+int ctdb_ctrl_set_tunable(struct ctdb_context *ctdb,
+                         struct timeval timeout,
+                         uint32_t destnode,
+                         const char *name, uint32_t value);
+
+int ctdb_ctrl_list_tunables(struct ctdb_context *ctdb,
+                           struct timeval timeout,
+                           uint32_t destnode,
+                           TALLOC_CTX *mem_ctx,
+                           const char ***list, uint32_t *count);
+
+int ctdb_ctrl_modflags(struct ctdb_context *ctdb,
+                      struct timeval timeout,
+                      uint32_t destnode,
+                      uint32_t set, uint32_t clear);
+
+enum ctdb_server_id_type {
+       SERVER_TYPE_SAMBA=1,
+       SERVER_TYPE_NFSD=2,
+       SERVER_TYPE_ISCSID=3
+};
+
+struct ctdb_server_id {
+       enum ctdb_server_id_type type;
+       uint32_t pnn;
+       uint32_t server_id;
+};
+
+struct ctdb_server_id_list {
+       uint32_t num;
+       struct ctdb_server_id server_ids[1];
+};
+
+
+int ctdb_ctrl_register_server_id(struct ctdb_context *ctdb,
+               struct timeval timeout,
+               struct ctdb_server_id *id);
+int ctdb_ctrl_unregister_server_id(struct ctdb_context *ctdb,
+               struct timeval timeout,
+               struct ctdb_server_id *id);
+int ctdb_ctrl_check_server_id(struct ctdb_context *ctdb,
+               struct timeval timeout, uint32_t destnode,
+               struct ctdb_server_id *id, uint32_t *status);
+int ctdb_ctrl_get_server_id_list(struct ctdb_context *ctdb,
+               TALLOC_CTX *mem_ctx,
+               struct timeval timeout, uint32_t destnode,
+               struct ctdb_server_id_list **svid_list);
+
+struct ctdb_uptime {
+       struct timeval current_time;
+       struct timeval ctdbd_start_time;
+       struct timeval last_recovery_started;
+       struct timeval last_recovery_finished;
+};
+
+/*
+  struct for tcp_client control
+  this is an ipv4 only version of this structure used by samba
+  samba will later be migrated over to use the
+  ctdb_control_tcp_addr structure instead
+ */
+struct ctdb_control_tcp {
+       struct sockaddr_in src;  /* samba uses this */
+       struct sockaddr_in dest; /* samba uses this */
+};
+/* new style structure */
+struct ctdb_control_tcp_addr {
+       ctdb_sock_addr src;
+       ctdb_sock_addr dest;
+};
+
+int ctdb_socket_connect(struct ctdb_context *ctdb);
+
+/*
+  get the uptime of a remote node
+ */
+int ctdb_ctrl_uptime(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, struct ctdb_uptime **uptime);
+
+struct ctdb_client_control_state *ctdb_ctrl_uptime_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode);
+
+int ctdb_ctrl_uptime_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, struct ctdb_uptime **uptime);
+
+int ctdb_ctrl_end_recovery(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode);
+
+int ctdb_ctrl_getreclock(struct ctdb_context *ctdb,
+       struct timeval timeout, uint32_t destnode,
+       TALLOC_CTX *mem_ctx, const char **reclock);
+int ctdb_ctrl_setreclock(struct ctdb_context *ctdb,
+       struct timeval timeout, uint32_t destnode,
+       const char *reclock);
+
+
+uint32_t *list_of_nodes(struct ctdb_context *ctdb,
+                       struct ctdb_node_map *node_map,
+                       TALLOC_CTX *mem_ctx,
+                       uint32_t mask,
+                       int exclude_pnn);
+uint32_t *list_of_connected_nodes(struct ctdb_context *ctdb,
+                               struct ctdb_node_map *node_map,
+                               TALLOC_CTX *mem_ctx,
+                               bool include_self);
+uint32_t *list_of_active_nodes(struct ctdb_context *ctdb,
+                               struct ctdb_node_map *node_map,
+                               TALLOC_CTX *mem_ctx,
+                               bool include_self);
+uint32_t *list_of_vnnmap_nodes(struct ctdb_context *ctdb,
+                               struct ctdb_vnn_map *vnn_map,
+                               TALLOC_CTX *mem_ctx,
+                               bool include_self);
+
+int ctdb_read_pnn_lock(int fd, int32_t pnn);
+
+/*
+  get capabilities of a remote node
+ */
+int ctdb_ctrl_getcapabilities(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t *capabilities);
+
+struct ctdb_client_control_state *ctdb_ctrl_getcapabilities_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode);
+
+int ctdb_ctrl_getcapabilities_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state, uint32_t *capabilities);
+
+struct ctdb_marshall_buffer *ctdb_marshall_add(TALLOC_CTX *mem_ctx,
+                                              struct ctdb_marshall_buffer *m,
+                                              uint64_t db_id,
+                                              uint32_t reqid,
+                                              TDB_DATA key,
+                                              struct ctdb_ltdb_header *header,
+                                              TDB_DATA data);
+TDB_DATA ctdb_marshall_finish(struct ctdb_marshall_buffer *m);
+
+struct ctdb_transaction_handle *ctdb_transaction_start(struct ctdb_db_context *ctdb_db,
+                                                      TALLOC_CTX *mem_ctx);
+int ctdb_transaction_fetch(struct ctdb_transaction_handle *h,
+                          TALLOC_CTX *mem_ctx,
+                          TDB_DATA key, TDB_DATA *data);
+int ctdb_transaction_store(struct ctdb_transaction_handle *h,
+                          TDB_DATA key, TDB_DATA data);
+int ctdb_transaction_commit(struct ctdb_transaction_handle *h);
+
+int ctdb_ctrl_recd_ping(struct ctdb_context *ctdb);
+
+int switch_from_server_to_client(struct ctdb_context *ctdb, const char *fmt,
+                                ...);
+
+int ctdb_ctrl_getscriptstatus(struct ctdb_context *ctdb,
+                   struct timeval timeout, uint32_t destnode,
+                   TALLOC_CTX *mem_ctx, enum ctdb_eventscript_call type,
+                   struct ctdb_scripts_wire **script_status);
+
+
+struct debug_levels {
+       int32_t level;
+       const char *description;
+};
+extern struct debug_levels debug_levels[];
+
+const char *get_debug_by_level(int32_t level);
+int32_t get_debug_by_desc(const char *desc);
+
+int ctdb_ctrl_stop_node(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode);
+int ctdb_ctrl_continue_node(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode);
+
+int ctdb_ctrl_setnatgwstate(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t natgwstate);
+int ctdb_ctrl_setlmasterrole(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t lmasterrole);
+int ctdb_ctrl_setrecmasterrole(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t recmasterrole);
+
+int ctdb_ctrl_enablescript(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, const char *script);
+int ctdb_ctrl_disablescript(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, const char *script);
+
+struct ctdb_ban_time {
+       uint32_t pnn;
+       uint32_t time;
+};
+
+int ctdb_ctrl_set_ban(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, struct ctdb_ban_time *bantime);
+int ctdb_ctrl_get_ban(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, TALLOC_CTX *mem_ctx, struct ctdb_ban_time **bantime);
+
+struct ctdb_db_priority {
+       uint32_t db_id;
+       uint32_t priority;
+};
+
+int ctdb_ctrl_set_db_priority(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, struct ctdb_db_priority *db_prio);
+int ctdb_ctrl_get_db_priority(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t db_id, uint32_t *priority);
+
+int ctdb_ctrl_getstathistory(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, TALLOC_CTX *mem_ctx, struct ctdb_statistics_wire **stats);
+
+
+
+struct ctdb_client_control_state *
+ctdb_ctrl_updaterecord_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data);
+
+int ctdb_ctrl_updaterecord_recv(struct ctdb_context *ctdb, struct ctdb_client_control_state *state);
+
+int
+ctdb_ctrl_updaterecord(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data);
+
+
+struct ctdb_client_control_state *
+ctdb_ctrl_set_db_readonly_send(struct ctdb_context *ctdb, uint32_t destnode, uint32_t dbid);
+int ctdb_ctrl_set_db_readonly_recv(struct ctdb_context *ctdb, struct ctdb_client_control_state *state);
+int ctdb_ctrl_set_db_readonly(struct ctdb_context *ctdb, uint32_t destnode, uint32_t dbid);
+
+struct ctdb_client_control_state *
+ctdb_ctrl_set_db_sticky_send(struct ctdb_context *ctdb, uint32_t destnode, uint32_t dbid);
+int ctdb_ctrl_set_db_sticky_recv(struct ctdb_context *ctdb, struct ctdb_client_control_state *state);
+int ctdb_ctrl_set_db_sticky(struct ctdb_context *ctdb, uint32_t destnode, uint32_t dbid);
+
+#endif /* _CTDB_CLIENT_H */
diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h
new file mode 100644 (file)
index 0000000..b4966b8
--- /dev/null
@@ -0,0 +1,1591 @@
+/* 
+   ctdb database library
+
+   Copyright (C) Andrew Tridgell  2006
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CTDB_PRIVATE_H
+#define _CTDB_PRIVATE_H
+
+#include "ctdb_client.h"
+#include <sys/socket.h>
+
+/*
+ * Structure to support SRVID requests and replies
+ */
+struct srvid_request {
+       uint32_t pnn;   /* NOTE(review): presumably the node a reply should be sent to - confirm against callers */
+       uint64_t srvid; /* 64-bit service id, same width as ctdb_message_list_header.srvid */
+       uint32_t data;  /* NOTE(review): 32-bit payload; its meaning is not visible in this header */
+};
+
+/*
+ * pid of the ctdbd daemon
+ */
+extern pid_t ctdbd_pid;
+
+/*
+  a tcp connection description
+  also used by tcp_add and tcp_remove controls
+ */
+struct ctdb_tcp_connection {
+       /* same two members, in the same order, as struct ctdb_control_killtcp */
+       ctdb_sock_addr src_addr;
+       ctdb_sock_addr dst_addr;
+};
+
+/* the wire representation for a tcp tickle array */
+struct ctdb_tcp_wire_array {
+       uint32_t num; /* NOTE(review): presumably the number of valid entries in connections[] - confirm */
+       /* declared with a single element; NOTE(review): looks like the
+          trailing-array idiom where the allocation is sized for num
+          entries - confirm against the code that builds it */
+       struct ctdb_tcp_connection connections[1];
+};     
+
+/* the list of tcp tickles used by get/set tcp tickle list */
+struct ctdb_control_tcp_tickle_list {
+       ctdb_sock_addr addr; /* NOTE(review): presumably the address the tickle list belongs to - confirm */
+       /* last member; it ends in the connections[1] array of
+          struct ctdb_tcp_wire_array */
+       struct ctdb_tcp_wire_array tickles;
+};
+
+/*
+  array of tcp connections
+ */
+struct ctdb_tcp_array {
+       uint32_t num; /* NOTE(review): presumably the number of entries connections points at - confirm */
+       struct ctdb_tcp_connection *connections; /* a pointer, unlike the inline array in struct ctdb_tcp_wire_array */
+};     
+
+
+/* all tunable variables go in here */
+struct ctdb_tunable {
+       /* every member is a uint32_t; NOTE(review): code elsewhere may rely
+          on this uniform layout and on the member order - confirm before
+          inserting or reordering members */
+       uint32_t max_redirect_count;
+       uint32_t seqnum_interval; /* unit is ms */
+       uint32_t control_timeout;
+       uint32_t traverse_timeout;
+       uint32_t keepalive_interval;
+       uint32_t keepalive_limit;
+       uint32_t recover_timeout;
+       uint32_t recover_interval;
+       uint32_t election_timeout;
+       uint32_t takeover_timeout;
+       uint32_t monitor_interval;
+       uint32_t tickle_update_interval;
+       uint32_t script_timeout;
+       uint32_t script_timeout_count; /* allow dodgy scripts to hang this many times in a row before we mark the node unhealthy */
+       uint32_t script_unhealthy_on_timeout; /* obsolete */
+       uint32_t recovery_grace_period;
+       uint32_t recovery_ban_period;
+       uint32_t database_hash_size;
+       uint32_t database_max_dead;
+       uint32_t rerecovery_timeout;
+       uint32_t enable_bans;
+       uint32_t deterministic_public_ips;
+       uint32_t reclock_ping_period;
+       uint32_t no_ip_failback;
+       uint32_t disable_ip_failover;
+       uint32_t verbose_memory_names;
+       uint32_t recd_ping_timeout;
+       uint32_t recd_ping_failcount;
+       uint32_t log_latency_ms; /* threshold for the "High latency" message of CTDB_UPDATE_LATENCY and CTDB_UPDATE_DB_LATENCY; 0 disables the message */
+       uint32_t reclock_latency_ms; /* threshold for the "High RECLOCK latency" message of CTDB_UPDATE_RECLOCK_LATENCY; 0 disables the message */
+       uint32_t recovery_drop_all_ips;
+       uint32_t verify_recovery_lock;
+       uint32_t vacuum_interval;
+       uint32_t vacuum_max_run_time;
+       uint32_t repack_limit;
+       uint32_t vacuum_limit;
+       uint32_t max_queue_depth_drop_msg;
+       uint32_t use_status_events_for_monitoring;
+       uint32_t allow_unhealthy_db_read;
+       uint32_t stat_history_interval;
+       uint32_t deferred_attach_timeout;
+       uint32_t vacuum_fast_path_count;
+       uint32_t lcp2_public_ip_assignment;
+       uint32_t allow_client_db_attach;
+       uint32_t recover_pdb_by_seqnum;
+       uint32_t deferred_rebalance_on_node_add;
+       uint32_t fetch_collapse;
+       uint32_t hopcount_make_sticky;
+       uint32_t sticky_duration;
+       uint32_t sticky_pindown;
+       uint32_t no_ip_takeover;
+       uint32_t db_record_count_warn;
+       uint32_t db_record_size_warn;
+       uint32_t db_size_warn;
+       uint32_t pulldb_preallocation_size;
+       uint32_t no_ip_host_on_all_disabled;
+       uint32_t samba3_hack;
+};
+
+/*
+  an installed ctdb remote call
+*/
+struct ctdb_registered_call {
+       struct ctdb_registered_call *next, *prev; /* list links; ctdb_db_context.calls points at a list of these */
+       uint32_t id;  /* call id; the same id/fn pair appears in struct ctdb_control_set_call */
+       ctdb_fn_t fn; /* the function registered for id */
+};
+
+/*
+  this address structure might need to be generalised later for some
+  transports
+*/
+struct ctdb_address {
+       const char *address; /* NOTE(review): presumably a textual host or IP address - confirm in ctdb_parse_address() */
+       int port;
+};
+
+/*
+  check that a pnn is valid: true when pnn, converted to uint32_t, is
+  below ctdb->num_nodes.  Because of the conversion a negative pnn turns
+  into a large unsigned value rather than comparing as negative.
+ */
+#define ctdb_validate_pnn(ctdb, pnn) (((uint32_t)(pnn)) < (ctdb)->num_nodes)
+
+
+/* called from the queue code when a packet comes in. Called with data==NULL
+   on error.  This is the type of the callback parameter of
+   ctdb_queue_setup(); NOTE(review): private_data is presumably the
+   pointer handed to ctdb_queue_setup() alongside the callback - confirm */
+typedef void (*ctdb_queue_cb_fn_t)(uint8_t *data, size_t length,
+                                  void *private_data);
+
+/* used for callbacks in ctdb_control requests.  The unnamed first
+   parameter is the ctdb context.  NOTE(review): the meaning of status and
+   whether errormsg may be NULL are not visible in this header - check the
+   code that invokes the callback */
+typedef void (*ctdb_control_callback_fn_t)(struct ctdb_context *,
+                                          int32_t status, TDB_DATA data, 
+                                          const char *errormsg,
+                                          void *private_data);
+/*
+  structure describing a connected client in the daemon
+ */
+struct ctdb_client {
+       struct ctdb_context *ctdb;
+       /* NOTE(review): fd and queue are presumably the client's socket and
+          the packet queue set up on it (see ctdb_queue_setup()) - confirm */
+       int fd;
+       struct ctdb_queue *queue;
+       uint32_t client_id;
+       pid_t pid; /* NOTE(review): presumably the key used by ctdb_context.client_pids - confirm */
+       struct ctdb_tcp_list *tcp_list;
+       uint32_t db_id;
+       uint32_t num_persistent_updates;
+       struct ctdb_client_notify_list *notify;
+};
+
+struct ctdb_iface;
+
+/* state associated with a public ip address */
+struct ctdb_vnn {
+       /* list links; ctdb_context.vnn is the list of public ip addresses */
+       struct ctdb_vnn *prev, *next;
+
+       /* NOTE(review): presumably the interface currently used and the
+          names of the candidate interfaces - confirm */
+       struct ctdb_iface *iface;
+       const char **ifaces;
+       ctdb_sock_addr public_address;
+       uint8_t public_netmask_bits;
+
+       /* the node number that is serving this public address, if any. 
+          If no node serves this ip it is set to -1.
+          Note the signed type: other structures in this header hold pnn
+          as uint32_t */
+       int32_t pnn;
+
+       /* List of clients to tickle for this public address */
+       struct ctdb_tcp_array *tcp_array;
+
+       /* whether we need to update the other nodes with changes to our list
+          of connected clients */
+       bool tcp_update_needed;
+
+       /* a context to hang sending gratuitous arp events off */
+       TALLOC_CTX *takeover_ctx;
+
+       struct ctdb_kill_tcp *killtcp;
+
+       /* Set to true any time an update to this VNN is in flight.
+          This helps to avoid races. */
+       bool update_in_flight;
+};
+
+/*
+  state associated with one node
+*/
+struct ctdb_node {
+       /* pointers to these structures are kept in the ctdb_context.nodes array */
+       struct ctdb_context *ctdb;
+       struct ctdb_address address;
+       const char *name; /* for debug messages */
+       void *private_data; /* private to transport */
+       uint32_t pnn; /* compared against ctdb_context.num_nodes by ctdb_validate_pnn() */
+       uint32_t flags;
+
+       /* used by the dead node monitoring */
+       uint32_t dead_count;
+       uint32_t rx_cnt;
+       uint32_t tx_cnt;
+
+       /* used to track node capabilities, is only valid/tracked inside the
+          recovery daemon.
+       */
+       uint32_t capabilities;
+
+       /* a list of controls pending to this node, so we can time them out quickly
+          if the node becomes disconnected */
+       struct daemon_control_state *pending_controls;
+
+       /* used by the recovery daemon when distributing ip addresses 
+          across the nodes.  it needs to know which public ip's can be handled
+          by each node.
+       */
+       struct ctdb_all_public_ips *known_public_ips;
+       struct ctdb_all_public_ips *available_public_ips;
+       /* used by the recovery daemon to track when a node should be banned */
+       struct ctdb_banning_state *ban_state; 
+};
+
+/*
+  transport specific methods
+*/
+struct ctdb_methods {
+       /* a table of function pointers; ctdb_context.methods points at one */
+       int (*initialise)(struct ctdb_context *); /* initialise transport structures */ 
+       int (*start)(struct ctdb_context *); /* start the transport */
+       int (*add_node)(struct ctdb_node *); /* setup a new node */     
+       int (*connect_node)(struct ctdb_node *); /* connect to node */
+       /* NOTE(review): presumably queues length bytes at data for sending to the node - confirm */
+       int (*queue_pkt)(struct ctdb_node *, uint8_t *data, uint32_t length);
+       /* NOTE(review): presumably returns a packet buffer of the given size allocated under mem_ctx - confirm */
+       void *(*allocate_pkt)(TALLOC_CTX *mem_ctx, size_t );
+       void (*shutdown)(struct ctdb_context *); /* shutdown transport */
+       void (*restart)(struct ctdb_node *); /* stop and restart the connection */
+};
+
+/*
+  transport calls up to the ctdb layer
+*/
+struct ctdb_upcalls {
+       /* a table of function pointers; ctdb_context.upcalls points at one */
+
+       /* recv_pkt is called when a packet comes in */
+       void (*recv_pkt)(struct ctdb_context *, uint8_t *data, uint32_t length);
+
+       /* node_dead is called when an attempt to send to a node fails */
+       void (*node_dead)(struct ctdb_node *);
+
+       /* node_connected is called when a connection to a node is established */
+       void (*node_connected)(struct ctdb_node *);
+};
+
+/* list of message handlers - needs to be changed to a more efficient data
+   structure so we can find a message handler given a srvid quickly */
+struct ctdb_message_list_header {
+       struct ctdb_message_list_header *next, *prev; /* list links; ctdb_context.message_list_header points at this list */
+       struct ctdb_context *ctdb;
+       uint64_t srvid;
+       struct ctdb_message_list *m; /* NOTE(review): presumably the handlers registered for srvid - confirm */
+};
+struct ctdb_message_list {
+       struct ctdb_message_list *next, *prev;
+       struct ctdb_message_list_header *h; /* pointer to a struct ctdb_message_list_header, whose m member points at this type */
+       ctdb_msg_fn_t message_handler;
+       void *message_private; /* NOTE(review): presumably passed to message_handler when it is invoked - confirm */
+};
+
+/* additional data required for the daemon mode */
+struct ctdb_daemon_data {
+       /* embedded in struct ctdb_context as the member daemon */
+       int sd;     /* NOTE(review): presumably a socket descriptor - confirm */
+       char *name; /* NOTE(review): presumably the name/path of that socket - confirm */
+       struct ctdb_queue *queue;
+};
+
+
+/*
+  Record a high-water mark: when value exceeds the figure stored in
+  ctdb->statistics.counter, store value there, and likewise for
+  ctdb->statistics_current.counter.  Each copy is checked and updated
+  independently.
+
+  The figure stored is the macro's own value argument; the expansion does
+  not depend on any variable in the caller's scope.
+
+  counter is a member designator inside struct ctdb_statistics and is
+  therefore not parenthesised.  value is evaluated more than once, so pass
+  an expression without side effects.  The expansion is a bare { } block,
+  not do { } while (0), so it cannot be used as the unbraced body of an
+  if that has an else.
+*/
+#define CTDB_UPDATE_STAT(ctdb, counter, value) \
+       {                                                                               \
+               if ((value) > (ctdb)->statistics.counter) {                             \
+                       (ctdb)->statistics.counter = (value);                           \
+               }                                                                       \
+               if ((value) > (ctdb)->statistics_current.counter) {                     \
+                       (ctdb)->statistics_current.counter = (value);                   \
+               }                                                                       \
+       }
+
+/*
+  Add one to counter in both ctdb->statistics and ctdb->statistics_current.
+  counter is a member designator inside the statistics structure.  The
+  expansion is a bare { } block, so it cannot be used as the unbraced body
+  of an if that has an else.
+*/
+#define CTDB_INCREMENT_STAT(ctdb, counter) \
+       {                                                                               \
+               ctdb->statistics.counter++;                                             \
+               ctdb->statistics_current.counter++;                                     \
+       }
+
+/*
+  Subtract one from counter in ctdb->statistics and in
+  ctdb->statistics_current.  Each copy is only decremented while it is
+  greater than zero, so the two copies are guarded independently and
+  neither goes below zero.
+*/
+#define CTDB_DECREMENT_STAT(ctdb, counter) \
+       {                                                                               \
+               if (ctdb->statistics.counter > 0)                                       \
+                       ctdb->statistics.counter--;                                     \
+               if (ctdb->statistics_current.counter > 0)                               \
+                       ctdb->statistics_current.counter--;                             \
+       }
+
+/*
+  Add one to counter in ctdb_db->statistics (the per-database statistics
+  member of struct ctdb_db_context).  Unlike CTDB_INCREMENT_STAT only a
+  single copy is updated.
+*/
+#define CTDB_INCREMENT_DB_STAT(ctdb_db, counter) \
+       {                                                                               \
+               ctdb_db->statistics.counter++;                                          \
+       }
+
+/*
+  Subtract one from counter in ctdb_db->statistics, but only while it is
+  greater than zero, so the counter does not go below zero.
+*/
+#define CTDB_DECREMENT_DB_STAT(ctdb_db, counter) \
+       {                                                                               \
+               if (ctdb_db->statistics.counter > 0)                                    \
+                       ctdb_db->statistics.counter--;                                  \
+       }
+
+/*
+  Fold one latency sample (value) into the latency counter named by
+  counter, in both ctdb->statistics and ctdb->statistics_current:
+    - max is raised when value is larger;
+    - min is replaced when no sample has been recorded yet (num == 0) or
+      value is smaller;
+    - total accumulates value and num counts the samples.
+  When the reclock_latency_ms tunable is non-zero and value*1000 exceeds
+  it, a DEBUG_ERR message naming the operation (name) is logged.  value is
+  printed with %f and multiplied by 1000 for the comparison with a
+  millisecond tunable, i.e. it is treated as seconds held in a double.
+
+  All parameters except the member designator counter are parenthesised in
+  the expansion, so compound expressions such as (a - b) are safe to pass.
+  value is evaluated several times; pass an expression without side
+  effects.  The expansion is a bare { } block, not do { } while (0).
+*/
+#define CTDB_UPDATE_RECLOCK_LATENCY(ctdb, name, counter, value) \
+       {                                                                               \
+               if ((value) > (ctdb)->statistics.counter.max)                           \
+                       (ctdb)->statistics.counter.max = (value);                       \
+               if ((value) > (ctdb)->statistics_current.counter.max)                   \
+                       (ctdb)->statistics_current.counter.max = (value);               \
+                                                                                       \
+               if ((ctdb)->statistics.counter.num == 0 ||                              \
+                   (value) < (ctdb)->statistics.counter.min)                           \
+                       (ctdb)->statistics.counter.min = (value);                       \
+               if ((ctdb)->statistics_current.counter.num == 0 ||                      \
+                   (value) < (ctdb)->statistics_current.counter.min)                   \
+                       (ctdb)->statistics_current.counter.min = (value);               \
+                                                                                       \
+               (ctdb)->statistics.counter.total += (value);                            \
+               (ctdb)->statistics_current.counter.total += (value);                    \
+                                                                                       \
+               (ctdb)->statistics.counter.num++;                                       \
+               (ctdb)->statistics_current.counter.num++;                               \
+                                                                                       \
+               if ((ctdb)->tunable.reclock_latency_ms != 0) {                          \
+                       if ((value)*1000 > (ctdb)->tunable.reclock_latency_ms) {        \
+                               DEBUG(DEBUG_ERR,                                        \
+                                     ("High RECLOCK latency %fs for operation %s\n",   \
+                                      (value), (name)));                               \
+                       }                                                               \
+               }                                                                       \
+       }
+
+/*
+  Fold one latency sample (value) into the per-database latency counter
+  ctdb_db->statistics.counter: raise max, lower min (or initialise it when
+  num == 0), accumulate total and count the sample in num.
+  When the log_latency_ms tunable of the owning context
+  (ctdb_db->ctdb->tunable) is non-zero and value*1000 exceeds it, a
+  DEBUG_ERR message naming the operation and ctdb_db->db_name is logged.
+
+  All parameters except the member designator counter are parenthesised in
+  the expansion, so compound expressions such as (a - b) are safe to pass.
+  value is evaluated several times; pass an expression without side
+  effects.  The expansion is a bare { } block, not do { } while (0).
+*/
+#define CTDB_UPDATE_DB_LATENCY(ctdb_db, operation, counter, value)                     \
+       {                                                                               \
+               if ((value) > (ctdb_db)->statistics.counter.max)                        \
+                       (ctdb_db)->statistics.counter.max = (value);                    \
+               if ((ctdb_db)->statistics.counter.num == 0 ||                           \
+                   (value) < (ctdb_db)->statistics.counter.min)                        \
+                       (ctdb_db)->statistics.counter.min = (value);                    \
+                                                                                       \
+               (ctdb_db)->statistics.counter.total += (value);                         \
+               (ctdb_db)->statistics.counter.num++;                                    \
+                                                                                       \
+               if ((ctdb_db)->ctdb->tunable.log_latency_ms != 0) {                     \
+                       if ((value)*1000 > (ctdb_db)->ctdb->tunable.log_latency_ms) {   \
+                               DEBUG(DEBUG_ERR,                                        \
+                                     ("High latency %.6fs for operation %s on database %s\n",\
+                                      (value), (operation), (ctdb_db)->db_name));      \
+                       }                                                               \
+               }                                                                       \
+       }
+
+/*
+  Measure the time elapsed since the struct timeval t (via
+  timeval_elapsed(&t)) and fold it into the latency counter named by
+  counter, in both ctdb->statistics and ctdb->statistics_current: raise
+  max, lower min (or initialise it when num == 0), accumulate total and
+  count the sample in num.
+  When the log_latency_ms tunable is non-zero and the elapsed time * 1000
+  exceeds it, a DEBUG_WARNING message naming the operation and db->db_name
+  is logged.
+
+  t must be an lvalue, because its address is taken.  All parameters
+  except the member designator counter are parenthesised in the expansion.
+  The elapsed time is held in a block-local variable called latency_;
+  arguments must not refer to an identifier of that name.  The expansion
+  is a bare { } block, not do { } while (0).
+*/
+#define CTDB_UPDATE_LATENCY(ctdb, db, operation, counter, t) \
+       {                                                                               \
+               double latency_ = timeval_elapsed(&(t));                                \
+                                                                                       \
+               if (latency_ > (ctdb)->statistics.counter.max)                          \
+                       (ctdb)->statistics.counter.max = latency_;                      \
+               if (latency_ > (ctdb)->statistics_current.counter.max)                  \
+                       (ctdb)->statistics_current.counter.max = latency_;              \
+                                                                                       \
+               if ((ctdb)->statistics.counter.num == 0 ||                              \
+                   latency_ < (ctdb)->statistics.counter.min)                          \
+                       (ctdb)->statistics.counter.min = latency_;                      \
+               if ((ctdb)->statistics_current.counter.num == 0 ||                      \
+                   latency_ < (ctdb)->statistics_current.counter.min)                  \
+                       (ctdb)->statistics_current.counter.min = latency_;              \
+                                                                                       \
+               (ctdb)->statistics.counter.total += latency_;                           \
+               (ctdb)->statistics_current.counter.total += latency_;                   \
+                                                                                       \
+               (ctdb)->statistics.counter.num++;                                       \
+               (ctdb)->statistics_current.counter.num++;                               \
+                                                                                       \
+               if ((ctdb)->tunable.log_latency_ms != 0) {                              \
+                       if (latency_*1000 > (ctdb)->tunable.log_latency_ms) {           \
+                               DEBUG(DEBUG_WARNING,                                    \
+                                     ("High latency %.6fs for operation %s on database %s\n",\
+                                      latency_, (operation), (db)->db_name));          \
+                       }                                                               \
+               }                                                                       \
+       }
+
+
+
+/* a structure that contains the elements required for the write record
+   control
+*/
+struct ctdb_write_record {
+       uint32_t dbid;
+       uint32_t keylen;
+       uint32_t datalen;
+       /* declared with a single element; NOTE(review): presumably holds
+          keylen bytes of key followed by datalen bytes of data in an
+          over-sized allocation - confirm against the code that fills it */
+       unsigned char blob[1];
+};
+
+/* element type of ctdb_context.freeze_mode[]; CTDB_FREEZE_NONE is the first enumerator and so has the value 0 */
+enum ctdb_freeze_mode {CTDB_FREEZE_NONE, CTDB_FREEZE_PENDING, CTDB_FREEZE_FROZEN};
+
+/*
+  type of ctdb_context.runstate; set through ctdb_set_runstate() and
+  converted to and from text by runstate_to_string() and
+  runstate_from_string().
+  NOTE(review): the enumerators look ordered by daemon lifecycle; check
+  whether any code compares them numerically before inserting new ones.
+*/
+enum ctdb_runstate {
+       CTDB_RUNSTATE_UNKNOWN, /* first enumerator, so its value is 0 */
+       CTDB_RUNSTATE_INIT,
+       CTDB_RUNSTATE_SETUP,
+       CTDB_RUNSTATE_FIRST_RECOVERY,
+       CTDB_RUNSTATE_STARTUP,
+       CTDB_RUNSTATE_RUNNING,
+       CTDB_RUNSTATE_SHUTDOWN,
+};
+
+const char *runstate_to_string(enum ctdb_runstate runstate);
+enum ctdb_runstate runstate_from_string(const char *label);
+void ctdb_set_runstate(struct ctdb_context *ctdb, enum ctdb_runstate runstate);
+
+void ctdb_shutdown_sequence(struct ctdb_context *ctdb, int exit_code);
+
+#define CTDB_MONITORING_ACTIVE         0
+#define CTDB_MONITORING_DISABLED       1
+
+#define NUM_DB_PRIORITIES 3
+/* main state of the ctdb daemon */
+struct ctdb_context {
+       struct tevent_context *ev;
+       struct timeval ctdbd_start_time;
+       struct timeval last_recovery_started;
+       struct timeval last_recovery_finished;
+       uint32_t recovery_mode;
+       TALLOC_CTX *tickle_update_context;
+       TALLOC_CTX *keepalive_ctx;
+       TALLOC_CTX *check_public_ifaces_ctx;
+       struct ctdb_tunable tunable; /* read by the CTDB_UPDATE_*LATENCY macros for their logging thresholds */
+       /* both arrays have NUM_DB_PRIORITIES+1 slots; NOTE(review): presumably
+          indexed directly by database priority, leaving slot 0 special or
+          unused - confirm */
+       enum ctdb_freeze_mode freeze_mode[NUM_DB_PRIORITIES+1];
+       struct ctdb_freeze_handle *freeze_handles[NUM_DB_PRIORITIES+1];
+       bool freeze_transaction_started;
+       uint32_t freeze_transaction_id;
+       struct ctdb_address address;
+       const char *name;
+       const char *db_directory;
+       const char *db_directory_persistent;
+       const char *db_directory_state;
+       struct tdb_wrap *db_persistent_health;
+       uint32_t db_persistent_startup_generation;
+       uint64_t db_persistent_check_errors;
+       uint64_t max_persistent_check_errors;
+       const char *transport;
+       char *recovery_lock_file;
+       int recovery_lock_fd;
+       uint32_t pnn; /* our own pnn */
+       uint32_t num_nodes; /* upper bound used by ctdb_validate_pnn() */
+       uint32_t num_connected;
+       unsigned flags;
+       uint32_t capabilities;
+       struct idr_context *idr;
+       int lastid;
+       struct ctdb_node **nodes; /* array of nodes in the cluster - indexed by vnn */
+       struct ctdb_vnn *vnn; /* list of public ip addresses and interfaces */
+       struct ctdb_vnn *single_ip_vnn; /* a structure for the single ip */
+       struct ctdb_iface *ifaces; /* list of local interfaces */
+       char *err_msg;
+       const struct ctdb_methods *methods; /* transport methods */
+       const struct ctdb_upcalls *upcalls; /* transport upcalls */
+       void *private_data; /* private to transport */
+       struct ctdb_db_context *db_list;
+       struct ctdb_message_list_header *message_list_header;
+       struct tdb_context *message_list_indexdb;
+       struct ctdb_daemon_data daemon;
+       /* statistics and statistics_current are updated together by the
+          CTDB_*_STAT and CTDB_UPDATE_*LATENCY macros */
+       struct ctdb_statistics statistics;
+       struct ctdb_statistics statistics_current;
+#define MAX_STAT_HISTORY 100
+       struct ctdb_statistics statistics_history[MAX_STAT_HISTORY];
+       struct ctdb_vnn_map *vnn_map;
+       uint32_t num_clients;
+       uint32_t recovery_master;
+       struct ctdb_call_state *pending_calls;
+       struct ctdb_client_ip *client_ip_list;
+       bool do_checkpublicip;
+       struct trbt_tree *server_ids; 
+       bool do_setsched;
+       void *saved_scheduler_param;
+       const char *event_script_dir;
+       const char *notification_script;
+       const char *default_public_interface;
+       pid_t ctdbd_pid; /* note: this header also declares a global variable of the same name */
+       pid_t recoverd_pid;
+       pid_t syslogd_pid;
+       enum ctdb_runstate runstate;
+       struct ctdb_monitor_state *monitor;
+       struct ctdb_log_state *log;
+       int start_as_disabled;
+       int start_as_stopped;
+       bool valgrinding;
+       uint32_t event_script_timeouts; /* counting how many consecutive times an eventscript has timed out */
+       uint32_t *recd_ping_count;
+       TALLOC_CTX *recd_ctx; /* a context used to track recoverd monitoring events */
+       TALLOC_CTX *release_ips_ctx; /* a context used to automatically drop all IPs if we fail to recover the node */
+
+       TALLOC_CTX *event_script_ctx;
+
+       struct ctdb_event_script_state *current_monitor;
+       struct ctdb_scripts_wire *last_status[CTDB_EVENT_MAX];
+
+       TALLOC_CTX *banning_ctx;
+
+       struct ctdb_vacuum_child_context *vacuumers;
+
+       /* mapping from pid to ctdb_client * */
+       struct ctdb_client_pid_list *client_pids;
+
+       /* used in the recovery daemon to remember the ip allocation */
+       struct trbt_tree *ip_tree;
+
+       /* Used to defer db attach requests while in recovery mode */
+       struct ctdb_deferred_attach_context *deferred_attach;
+
+       /* if we are a child process, do we have a domain socket to send controls on */
+       bool can_send_controls;
+
+       /* list of event script callback functions that are active */
+       struct event_script_callback *script_callbacks;
+
+       struct ctdb_reloadips_handle *reload_ips;
+
+       const char *nodes_file;
+       const char *public_addresses_file;
+       struct trbt_tree *child_processes; 
+
+       /* Used for locking record/db/alldb */
+       int lock_num_current;
+       int lock_num_pending;
+       struct lock_context *lock_current;
+       struct lock_context *lock_pending;
+};
+
+/* state kept for one database; instances are linked from ctdb_context.db_list */
+struct ctdb_db_context {
+       struct ctdb_db_context *next, *prev;
+       struct ctdb_context *ctdb; /* owning context; CTDB_UPDATE_DB_LATENCY reads its tunables through this pointer */
+       uint32_t db_id;
+       uint32_t priority;
+       bool persistent;
+       bool readonly; /* Do we support read-only delegations ? */
+       bool sticky; /* Do we support sticky records ? */
+       const char *db_name; /* printed in the high-latency log messages */
+       const char *db_path;
+       struct tdb_wrap *ltdb;
+       struct tdb_context *rottdb; /* ReadOnly tracking TDB */
+       struct ctdb_registered_call *calls; /* list of registered calls */
+       uint32_t seqnum;
+       struct timed_event *seqnum_update;
+       struct ctdb_traverse_local_handle *traverse;
+       struct ctdb_vacuum_handle *vacuum_handle;
+       char *unhealthy_reason;
+       int pending_requests;
+       struct revokechild_handle *revokechild_active;
+       struct ctdb_persistent_state *persistent_state;
+       struct trbt_tree *delete_queue;
+       struct trbt_tree *sticky_records; 
+       /* store hook with the same parameter list and return type as
+          ctdb_ltdb_store() */
+       int (*ctdb_ltdb_store_fn)(struct ctdb_db_context *ctdb_db,
+                                 TDB_DATA key,
+                                 struct ctdb_ltdb_header *header,
+                                 TDB_DATA data);
+
+       /* used to track which records we are currently fetching
+          so we can avoid sending duplicate fetch requests
+       */
+       struct trbt_tree *deferred_fetch;
+
+       /* per-database counters, updated by CTDB_INCREMENT_DB_STAT,
+          CTDB_DECREMENT_DB_STAT and CTDB_UPDATE_DB_LATENCY */
+       struct ctdb_db_statistics statistics;
+};
+
+
+/*
+  Allocation check for functions returning int: when p is NULL (or
+  otherwise false) log at debug level 0, record an "Out of memory" error
+  on ctdb with ctdb_set_error() and return -1 from the calling function.
+  Wrapped in do { } while (0), so it behaves as a single statement.
+*/
+#define CTDB_NO_MEMORY(ctdb, p) do { if (!(p)) { \
+          DEBUG(0,("Out of memory for %s at %s\n", #p, __location__)); \
+          ctdb_set_error(ctdb, "Out of memory at %s:%d", __FILE__, __LINE__); \
+         return -1; }} while (0)
+
+/*
+  Allocation check for functions returning void: when p is NULL (or
+  otherwise false) log at debug level 0, record an "Out of memory" error
+  on ctdb with ctdb_set_error() and return from the calling function.
+*/
+#define CTDB_NO_MEMORY_VOID(ctdb, p) do { if (!(p)) { \
+          DEBUG(0,("Out of memory for %s at %s\n", #p, __location__)); \
+          ctdb_set_error(ctdb, "Out of memory at %s:%d", __FILE__, __LINE__); \
+         return; }} while (0)
+
+/*
+  Allocation check for functions returning a pointer: when p is NULL (or
+  otherwise false) log at debug level 0, record an "Out of memory" error
+  on ctdb with ctdb_set_error() and return NULL from the calling function.
+*/
+#define CTDB_NO_MEMORY_NULL(ctdb, p) do { if (!(p)) { \
+          DEBUG(0,("Out of memory for %s at %s\n", #p, __location__)); \
+          ctdb_set_error(ctdb, "Out of memory at %s:%d", __FILE__, __LINE__); \
+         return NULL; }} while (0)
+
+/*
+  Allocation check that treats failure as fatal: when p is NULL (or
+  otherwise false) log at debug level 0 and call ctdb_fatal().  Unlike the
+  other CTDB_NO_MEMORY* macros there is no return statement in the
+  expansion.  NOTE(review): ctdb_fatal() presumably does not return -
+  confirm; if it does, execution continues after the macro with p unset.
+*/
+#define CTDB_NO_MEMORY_FATAL(ctdb, p) do { if (!(p)) { \
+          DEBUG(0,("Out of memory for %s at %s\n", #p, __location__)); \
+          ctdb_fatal(ctdb, "Out of memory in " __location__ ); \
+         }} while (0)
+
+/*
+  structure passed in set_call control
+ */
+struct ctdb_control_set_call {
+       uint32_t db_id;
+       /* fn and id are the same pair that struct ctdb_registered_call stores */
+       ctdb_fn_t fn;
+       uint32_t id;
+};
+
+/*
+  struct for kill_tcp control
+ */
+struct ctdb_control_killtcp {
+       /* same two members, in the same order, as struct ctdb_tcp_connection */
+       ctdb_sock_addr src_addr;
+       ctdb_sock_addr dst_addr;
+};
+
+/*
+  struct holding a ctdb_sock_addr and an interface name,
+  used to add/remove public addresses
+ */
+struct ctdb_control_ip_iface {
+       /* member list is identical to struct ctdb_control_gratious_arp */
+       ctdb_sock_addr addr;
+       uint32_t mask;
+       uint32_t len; /* NOTE(review): presumably the length of the name stored in iface - confirm whether it counts the terminating NUL */
+       char iface[1]; /* declared with one element; NOTE(review): presumably over-allocated to hold the whole interface name - confirm */
+};
+
+/*
+  struct holding a ctdb_sock_addr and an interface name,
+  used for send_gratious_arp
+ */
+struct ctdb_control_gratious_arp {
+       /* member list is identical to struct ctdb_control_ip_iface */
+       ctdb_sock_addr addr;
+       uint32_t mask;
+       uint32_t len; /* NOTE(review): presumably the length of the name stored in iface - confirm whether it counts the terminating NUL */
+       char iface[1]; /* declared with one element; NOTE(review): presumably over-allocated to hold the whole interface name - confirm */
+};
+
+/*
+  persistent store control - update this record on all other nodes
+ */
+struct ctdb_control_persistent_store {
+       uint32_t db_id;
+       uint32_t len; /* NOTE(review): presumably the number of bytes carried in data - confirm */
+       uint8_t  data[1]; /* declared with one element; NOTE(review): presumably over-allocated to len bytes - confirm */
+};
+
+/*
+  structure used for CTDB_SRVID_NODE_FLAGS_CHANGED
+ */
+struct ctdb_node_flag_change {
+       uint32_t pnn; /* NOTE(review): presumably the node whose flags changed - confirm */
+       uint32_t new_flags;
+       uint32_t old_flags;
+};
+
+/*
+  struct for admin setting a ban
+ */
+struct ctdb_ban_info {
+       uint32_t pnn;
+       uint32_t ban_time; /* NOTE(review): unit is not visible in this header - confirm (likely seconds) */
+};
+
+/* type of ctdb_call_state.state; CTDB_CALL_WAIT is the first enumerator and so has the value 0 */
+enum call_state {CTDB_CALL_WAIT, CTDB_CALL_DONE, CTDB_CALL_ERROR};
+
+#define CTDB_LMASTER_ANY       0xffffffff
+
+/*
+  state of an in-progress ctdb call
+*/
+struct ctdb_call_state {
+       struct ctdb_call_state *next, *prev; /* list links; ctdb_context.pending_calls points at a list of these */
+       enum call_state state;
+       uint32_t reqid;
+       struct ctdb_req_call *c;
+       struct ctdb_db_context *ctdb_db;
+       const char *errmsg;
+       struct ctdb_call *call;
+       uint32_t generation;
+       /* optional completion hook; NOTE(review): presumably fn is invoked
+          with this state once the call finishes - confirm */
+       struct {
+               void (*fn)(struct ctdb_call_state *);
+               void *private_data;
+       } async;
+};
+
+
+/* used for fetch_lock */
+struct ctdb_fetch_handle {
+       struct ctdb_db_context *ctdb_db;
+       TDB_DATA key;
+       TDB_DATA *data; /* a pointer, unlike key which is held by value */
+       struct ctdb_ltdb_header header;
+};
+
+/* internal prototypes */
+void ctdb_set_error(struct ctdb_context *ctdb, const char *fmt, ...) PRINTF_ATTRIBUTE(2,3);
+void ctdb_fatal(struct ctdb_context *ctdb, const char *msg);
+void ctdb_die(struct ctdb_context *ctdb, const char *msg);
+void ctdb_external_trace(void);
+bool ctdb_same_address(struct ctdb_address *a1, struct ctdb_address *a2);
+int ctdb_parse_address(struct ctdb_context *ctdb,
+                      TALLOC_CTX *mem_ctx, const char *str,
+                      struct ctdb_address *address);
+bool ctdb_same_ip(const ctdb_sock_addr *ip1, const ctdb_sock_addr *ip2);
+bool ctdb_same_sockaddr(const ctdb_sock_addr *ip1, const ctdb_sock_addr *ip2);
+uint32_t ctdb_hash(const TDB_DATA *key);
+uint32_t ctdb_hash_string(const char *str);
+void ctdb_request_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr);
+void ctdb_request_dmaster(struct ctdb_context *ctdb, struct ctdb_req_header *hdr);
+void ctdb_request_message(struct ctdb_context *ctdb, struct ctdb_req_header *hdr);
+void ctdb_reply_dmaster(struct ctdb_context *ctdb, struct ctdb_req_header *hdr);
+void ctdb_reply_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr);
+void ctdb_reply_error(struct ctdb_context *ctdb, struct ctdb_req_header *hdr);
+
+uint32_t ctdb_lmaster(struct ctdb_context *ctdb, const TDB_DATA *key);
+int ctdb_ltdb_fetch(struct ctdb_db_context *ctdb_db, 
+                   TDB_DATA key, struct ctdb_ltdb_header *header, 
+                   TALLOC_CTX *mem_ctx, TDB_DATA *data);
+int ctdb_ltdb_store(struct ctdb_db_context *ctdb_db, TDB_DATA key, 
+                   struct ctdb_ltdb_header *header, TDB_DATA data);
+int ctdb_ltdb_delete(struct ctdb_db_context *ctdb_db, TDB_DATA key);
+int ctdb_ltdb_fetch_with_header(struct ctdb_db_context *ctdb_db, 
+                   TDB_DATA key, struct ctdb_ltdb_header *header, 
+                   TALLOC_CTX *mem_ctx, TDB_DATA *data);
+int32_t ctdb_control_start_persistent_update(struct ctdb_context *ctdb, 
+                       struct ctdb_req_control *c,
+                       TDB_DATA recdata);
+int32_t ctdb_control_cancel_persistent_update(struct ctdb_context *ctdb, 
+                       struct ctdb_req_control *c,
+                       TDB_DATA recdata);
+void ctdb_queue_packet(struct ctdb_context *ctdb, struct ctdb_req_header *hdr);
+void ctdb_queue_packet_opcode(struct ctdb_context *ctdb, struct ctdb_req_header *hdr, unsigned opcode);
+int ctdb_ltdb_lock_requeue(struct ctdb_db_context *ctdb_db, 
+                          TDB_DATA key, struct ctdb_req_header *hdr,
+                          void (*recv_pkt)(void *, struct ctdb_req_header *),
+                          void *recv_context, bool ignore_generation);
+int ctdb_ltdb_lock_fetch_requeue(struct ctdb_db_context *ctdb_db, 
+                                TDB_DATA key, struct ctdb_ltdb_header *header, 
+                                struct ctdb_req_header *hdr, TDB_DATA *data,
+                                void (*recv_pkt)(void *, struct ctdb_req_header *),
+                                void *recv_context, bool ignore_generation);
+void ctdb_input_pkt(struct ctdb_context *ctdb, struct ctdb_req_header *);
+
+struct ctdb_call_state *ctdb_call_local_send(struct ctdb_db_context *ctdb_db, 
+                                            struct ctdb_call *call,
+                                            struct ctdb_ltdb_header *header,
+                                            TDB_DATA *data);
+
+
+int ctdb_start_daemon(struct ctdb_context *ctdb, bool do_fork,
+                     bool use_syslog);
+struct ctdb_call_state *ctdbd_call_send(struct ctdb_db_context *ctdb_db, struct ctdb_call *call);
+int ctdbd_call_recv(struct ctdb_call_state *state, struct ctdb_call *call);
+
+/*
+  queue a packet for sending
+*/
+int ctdb_queue_send(struct ctdb_queue *queue, uint8_t *data, uint32_t length);
+
+/*
+  setup the fd used by the queue
+ */
+int ctdb_queue_set_fd(struct ctdb_queue *queue, int fd);
+
+/*
+  setup a packet queue on a socket
+ */
+struct ctdb_queue *ctdb_queue_setup(struct ctdb_context *ctdb,
+                                   TALLOC_CTX *mem_ctx, int fd, int alignment,
+                                   
+                                   ctdb_queue_cb_fn_t callback,
+                                   void *private_data, const char *fmt, ...)
+       PRINTF_ATTRIBUTE(7,8);
+
+/*
+  allocate a packet for use in client<->daemon communication
+ */
+struct ctdb_req_header *_ctdbd_allocate_pkt(struct ctdb_context *ctdb,
+                                           TALLOC_CTX *mem_ctx, 
+                                           enum ctdb_operation operation, 
+                                           size_t length, size_t slength,
+                                           const char *type);
+/*
+  typed wrapper around _ctdbd_allocate_pkt(): supplies sizeof(type) as
+  slength and the stringified type name, and casts the result to (type *).
+  The whole expansion is parenthesized so that the cast applies before any
+  operator the caller writes after the macro invocation (e.g. "->").
+ */
+#define ctdbd_allocate_pkt(ctdb, mem_ctx, operation, length, type) \
+       ((type *)_ctdbd_allocate_pkt(ctdb, mem_ctx, operation, length, sizeof(type), #type))
+
+struct ctdb_req_header *_ctdb_transport_allocate(struct ctdb_context *ctdb,
+                                                TALLOC_CTX *mem_ctx, 
+                                                enum ctdb_operation operation, 
+                                                size_t length, size_t slength,
+                                                const char *type);
+/*
+  typed wrapper around _ctdb_transport_allocate(): supplies sizeof(type) as
+  slength and the stringified type name, and casts the result to (type *).
+  The whole expansion is parenthesized so that the cast applies before any
+  operator the caller writes after the macro invocation (e.g. "->").
+ */
+#define ctdb_transport_allocate(ctdb, mem_ctx, operation, length, type) \
+       ((type *)_ctdb_transport_allocate(ctdb, mem_ctx, operation, length, sizeof(type), #type))
+
+int ctdb_queue_length(struct ctdb_queue *queue);
+
+/*
+  lock a record in the ltdb, given a key
+ */
+int ctdb_ltdb_lock(struct ctdb_db_context *ctdb_db, TDB_DATA key);
+
+/*
+  unlock a record in the ltdb, given a key
+ */
+int ctdb_ltdb_unlock(struct ctdb_db_context *ctdb_db, TDB_DATA key);
+
+
+/*
+  make a ctdb call to the local daemon - async send. Called from client context.
+
+  This constructs a ctdb_call request and queues it for processing. 
+  This call never blocks.
+*/
+struct ctdb_call_state *ctdb_client_call_send(struct ctdb_db_context *ctdb_db, 
+                                             struct ctdb_call *call);
+
+/*
+  make a recv call to the local ctdb daemon - called from client context
+
+  This is called when the program wants to wait for a ctdb_call to complete and get the 
+  results. This call will block unless the call has already completed.
+*/
+int ctdb_client_call_recv(struct ctdb_call_state *state, struct ctdb_call *call);
+
+int ctdb_client_send_message(struct ctdb_context *ctdb, uint32_t vnn,
+                            uint64_t srvid, TDB_DATA data);
+
+/*
+  send a ctdb message
+*/
+int ctdb_daemon_send_message(struct ctdb_context *ctdb, uint32_t pnn,
+                            uint64_t srvid, TDB_DATA data);
+
+
+struct ctdb_call_state *ctdb_daemon_call_send(struct ctdb_db_context *ctdb_db, 
+                                             struct ctdb_call *call);
+
+int ctdb_daemon_call_recv(struct ctdb_call_state *state, struct ctdb_call *call);
+
+struct ctdb_call_state *ctdb_daemon_call_send_remote(struct ctdb_db_context *ctdb_db, 
+                                                    struct ctdb_call *call, 
+                                                    struct ctdb_ltdb_header *header);
+
+int ctdb_call_local(struct ctdb_db_context *ctdb_db, struct ctdb_call *call,
+                   struct ctdb_ltdb_header *header, TALLOC_CTX *mem_ctx,
+                   TDB_DATA *data, bool updatetdb);
+
+/*
+  typed wrapper around _ctdb_reqid_find(): passes the stringified type name
+  and the call site (__location__) and casts the returned void * to (type *).
+  The expansion is fully parenthesized so it is safe inside larger expressions.
+ */
+#define ctdb_reqid_find(ctdb, reqid, type)     ((type *)_ctdb_reqid_find(ctdb, reqid, #type, __location__))
+
+void ctdb_recv_raw_pkt(void *p, uint8_t *data, uint32_t length);
+
+int ctdb_socket_connect(struct ctdb_context *ctdb);
+void ctdb_client_read_cb(uint8_t *data, size_t cnt, void *args);
+
+/* (uint32_t)-1, i.e. all bits set. NOTE(review): presumably the "no valid reqid" marker for the reqid functions below - confirm */
+#define CTDB_BAD_REQID ((uint32_t)-1)
+uint32_t ctdb_reqid_new(struct ctdb_context *ctdb, void *state);
+void *_ctdb_reqid_find(struct ctdb_context *ctdb, uint32_t reqid, const char *type, const char *location);
+void ctdb_reqid_remove(struct ctdb_context *ctdb, uint32_t reqid);
+
+void ctdb_request_control(struct ctdb_context *ctdb, struct ctdb_req_header *hdr);
+void ctdb_reply_control(struct ctdb_context *ctdb, struct ctdb_req_header *hdr);
+
+int ctdb_daemon_send_control(struct ctdb_context *ctdb, uint32_t destnode,
+                            uint64_t srvid, uint32_t opcode, uint32_t client_id, uint32_t flags,
+                            TDB_DATA data,
+                            ctdb_control_callback_fn_t callback,
+                            void *private_data);
+
+int32_t ctdb_control_db_attach(struct ctdb_context *ctdb, TDB_DATA indata, 
+                              TDB_DATA *outdata, uint64_t tdb_flags,
+                              bool persistent, uint32_t client_id,
+                              struct ctdb_req_control *c,
+                              bool *async_reply);
+
+int ctdb_daemon_set_call(struct ctdb_context *ctdb, uint32_t db_id,
+                        ctdb_fn_t fn, int id);
+
+int ctdb_control(struct ctdb_context *ctdb, uint32_t destnode, uint64_t srvid, 
+                uint32_t opcode, uint32_t flags, TDB_DATA data, 
+                TALLOC_CTX *mem_ctx, TDB_DATA *outdata, int32_t *status,
+                struct timeval *timeout, char **errormsg);
+int ctdb_control_recv(struct ctdb_context *ctdb, 
+               struct ctdb_client_control_state *state, 
+               TALLOC_CTX *mem_ctx,
+               TDB_DATA *outdata, int32_t *status, char **errormsg);
+
+struct ctdb_client_control_state *
+ctdb_control_send(struct ctdb_context *ctdb, 
+               uint32_t destnode, uint64_t srvid, 
+               uint32_t opcode, uint32_t flags, TDB_DATA data, 
+               TALLOC_CTX *mem_ctx,
+               struct timeval *timeout,
+               char **errormsg);
+
+
+
+
+/*
+  return -1 from the enclosing function unless indata.dsize equals size.
+  Relies on variables named "indata" (with a dsize member) and "opcode"
+  being in scope at the point of use. "size" is parenthesized so that any
+  expression can be passed; it is evaluated a second time for the log
+  message on failure, so it should not have side effects.
+ */
+#define CHECK_CONTROL_DATA_SIZE(size) do { \
+ if (indata.dsize != (size)) { \
+        DEBUG(0,(__location__ " Invalid data size in opcode %u. Got %u expected %u\n", \
+                 opcode, (unsigned)indata.dsize, (unsigned)(size)));     \
+        return -1; \
+ } \
+ } while (0)
+
+/*
+  return -1 from the enclosing function if indata.dsize is smaller than size.
+  Relies on variables named "indata" (with a dsize member) and "opcode"
+  being in scope at the point of use. "size" is parenthesized so that any
+  expression can be passed; it is evaluated a second time for the log
+  message on failure, so it should not have side effects.
+ */
+#define CHECK_CONTROL_MIN_DATA_SIZE(size) do { \
+ if (indata.dsize < (size)) { \
+        DEBUG(0,(__location__ " Invalid data size in opcode %u. Got %u expected >= %u\n", \
+                 opcode, (unsigned)indata.dsize, (unsigned)(size)));     \
+        return -1; \
+ } \
+ } while (0)
+
+int ctdb_control_getvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata);
+int ctdb_control_setvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata);
+int ctdb_control_getdbmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata);
+int ctdb_control_getnodemapv4(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata);
+int ctdb_control_getnodemap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata);
+int ctdb_control_writerecord(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata);
+
+
+/*
+  structure used for pulldb control
+  NOTE(review): presumably the indata payload of ctdb_control_pull_db() - confirm
+ */
+struct ctdb_control_pulldb {
+       uint32_t db_id;         /* database id */
+       uint32_t lmaster;       /* NOTE(review): presumably an lmaster node number used to select records - confirm */
+};
+
+/*
+  structure used for sending lists of records
+  (this is the buffer type walked by ctdb_marshall_loop_next())
+ */
+struct ctdb_marshall_buffer {
+       uint32_t db_id;         /* database id */
+       uint32_t count;         /* NOTE(review): presumably the number of records in data - confirm */
+       uint8_t data[1];        /* one-element trailing array; NOTE(review): presumably the start of variable-length record data - confirm */
+};
+
+/*
+  structure for setting a tunable
+ */
+struct ctdb_control_set_tunable {
+       uint32_t value;         /* NOTE(review): presumably the new value for the tunable - confirm */
+       uint32_t length;        /* NOTE(review): presumably the byte length of name - confirm */
+       uint8_t  name[1];       /* one-element trailing array; NOTE(review): presumably the start of the tunable name - confirm */
+};
+
+/*
+  structure for getting a tunable
+ */
+struct ctdb_control_get_tunable {
+       uint32_t length;        /* NOTE(review): presumably the byte length of name - confirm */
+       uint8_t  name[1];       /* one-element trailing array; NOTE(review): presumably the start of the tunable name - confirm */
+};
+
+/*
+  structure for listing tunables
+ */
+struct ctdb_control_list_tunable {
+       uint32_t length;        /* NOTE(review): presumably the byte length of data - confirm */
+       /* returns a ':'-separated list of tunable names */
+       uint8_t  data[1];       /* one-element trailing array holding the start of that list */
+};
+
+
+/* one node entry with an IPv4 (struct sockaddr_in) address; element type of ctdb_node_mapv4.nodes */
+struct ctdb_node_and_flagsv4 {
+       uint32_t pnn;           /* NOTE(review): presumably the node number - confirm */
+       uint32_t flags;         /* NOTE(review): presumably the node's state flags - confirm */
+       struct sockaddr_in sin; /* IPv4 socket address */
+};
+
+/* list of ctdb_node_and_flagsv4 entries */
+struct ctdb_node_mapv4 {
+       uint32_t num;           /* NOTE(review): presumably the number of entries in nodes - confirm */
+       struct ctdb_node_and_flagsv4 nodes[1];  /* one-element trailing array; NOTE(review): presumably used as a variable-length tail - confirm */
+};
+
+/* NOTE(review): presumably the indata payload of ctdb_control_wipe_database() - confirm */
+struct ctdb_control_wipe_database {
+       uint32_t db_id;                 /* database id */
+       uint32_t transaction_id;        /* NOTE(review): presumably identifies the transaction the wipe belongs to - confirm */
+};
+
+/*
+  state of an in-progress ctdb call in the client
+*/
+struct ctdb_client_call_state {
+       enum call_state state;                  /* current state of the call */
+       uint32_t reqid;                         /* request id associated with this call */
+       struct ctdb_db_context *ctdb_db;        /* database context the call refers to */
+       struct ctdb_call *call;                 /* the call itself */
+       struct {
+               /* callback receiving this state; NOTE(review): presumably invoked when the call completes - confirm */
+               void (*fn)(struct ctdb_client_call_state *);
+               void *private_data;             /* opaque pointer stored alongside fn */
+       } async;
+};
+
+
+int32_t ctdb_control_traverse_start_ext(struct ctdb_context *ctdb,
+                                       TDB_DATA indata,
+                                       TDB_DATA *outdata,
+                                       uint32_t srcnode,
+                                       uint32_t client_id);
+int32_t ctdb_control_traverse_start(struct ctdb_context *ctdb, TDB_DATA indata, 
+                                   TDB_DATA *outdata, uint32_t srcnode, uint32_t client_id);
+int32_t ctdb_control_traverse_all(struct ctdb_context *ctdb, TDB_DATA data, TDB_DATA *outdata);
+int32_t ctdb_control_traverse_all_ext(struct ctdb_context *ctdb, TDB_DATA data, TDB_DATA *outdata);
+int32_t ctdb_control_traverse_data(struct ctdb_context *ctdb, TDB_DATA data, TDB_DATA *outdata);
+int32_t ctdb_control_traverse_kill(struct ctdb_context *ctdb, TDB_DATA indata, 
+                                   TDB_DATA *outdata, uint32_t srcnode);
+
+int ctdb_dispatch_message(struct ctdb_context *ctdb, uint64_t srvid, TDB_DATA data);
+bool ctdb_check_message_handler(struct ctdb_context *ctdb, uint64_t srvid);
+
+int daemon_register_message_handler(struct ctdb_context *ctdb, uint32_t client_id, uint64_t srvid);
+int ctdb_deregister_message_handler(struct ctdb_context *ctdb, uint64_t srvid, void *private_data);
+int daemon_deregister_message_handler(struct ctdb_context *ctdb, uint32_t client_id, uint64_t srvid);
+int daemon_check_srvids(struct ctdb_context *ctdb, TDB_DATA indata,
+                       TDB_DATA *outdata);
+
+int32_t ctdb_ltdb_enable_seqnum(struct ctdb_context *ctdb, uint32_t db_id);
+int32_t ctdb_ltdb_update_seqnum(struct ctdb_context *ctdb, uint32_t db_id, uint32_t srcnode);
+
+struct ctdb_rec_data *ctdb_marshall_record(TALLOC_CTX *mem_ctx, uint32_t reqid,        
+                                          TDB_DATA key, struct ctdb_ltdb_header *, TDB_DATA data);
+
+struct ctdb_rec_data *ctdb_marshall_loop_next(struct ctdb_marshall_buffer *m, struct ctdb_rec_data *r,
+                                             uint32_t *reqid,
+                                             struct ctdb_ltdb_header *header,
+                                             TDB_DATA *key, TDB_DATA *data);
+
+int32_t ctdb_control_pull_db(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata);
+int32_t ctdb_control_push_db(struct ctdb_context *ctdb, TDB_DATA indata);
+
+int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb, 
+                                struct ctdb_req_control *c,
+                                TDB_DATA indata, bool *async_reply,
+                                const char **errormsg);
+void ctdb_request_control_reply(struct ctdb_context *ctdb, struct ctdb_req_control *c,
+                               TDB_DATA *outdata, int32_t status, const char *errormsg);
+
+int32_t ctdb_control_freeze(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply);
+int32_t ctdb_control_thaw(struct ctdb_context *ctdb, uint32_t priority);
+
+int ctdb_start_recoverd(struct ctdb_context *ctdb);
+void ctdb_stop_recoverd(struct ctdb_context *ctdb);
+
+uint32_t ctdb_get_num_active_nodes(struct ctdb_context *ctdb);
+
+void ctdb_disable_monitoring(struct ctdb_context *ctdb);
+void ctdb_enable_monitoring(struct ctdb_context *ctdb);
+void ctdb_stop_monitoring(struct ctdb_context *ctdb);
+void ctdb_start_monitoring(struct ctdb_context *ctdb);
+void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb);
+void ctdb_send_keepalive(struct ctdb_context *ctdb, uint32_t destnode);
+void ctdb_start_keepalive(struct ctdb_context *ctdb);
+void ctdb_stop_keepalive(struct ctdb_context *ctdb);
+int32_t ctdb_run_eventscripts(struct ctdb_context *ctdb, struct ctdb_req_control *c, TDB_DATA data, bool *async_reply);
+
+
+void ctdb_daemon_cancel_controls(struct ctdb_context *ctdb, struct ctdb_node *node);
+void ctdb_call_resend_all(struct ctdb_context *ctdb);
+void ctdb_node_dead(struct ctdb_node *node);
+void ctdb_node_connected(struct ctdb_node *node);
+bool ctdb_blocking_freeze(struct ctdb_context *ctdb);
+void ctdb_set_scheduler(struct ctdb_context *ctdb);
+void ctdb_restore_scheduler(struct ctdb_context *ctdb);
+
+struct tevent_signal *ctdb_init_sigchld(struct ctdb_context *ctdb);
+pid_t ctdb_fork(struct ctdb_context *ctdb);
+pid_t ctdb_fork_no_free_ringbuffer(struct ctdb_context *ctdb);
+void ctdb_set_child_info(TALLOC_CTX *mem_ctx, const char *child_name_fmt, ...);
+bool ctdb_is_child_process(void);
+int ctdb_kill(struct ctdb_context *ctdb, pid_t pid, int signum);
+
+int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, 
+                                struct ctdb_req_control *c,
+                                TDB_DATA indata, 
+                                bool *async_reply);
+int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
+                                struct ctdb_req_control *c,
+                                TDB_DATA indata, 
+                                bool *async_reply);
+int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, 
+                                struct ctdb_req_control *c,
+                                TDB_DATA indata, 
+                                bool *async_reply);
+int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb, 
+                                struct ctdb_req_control *c,
+                                TDB_DATA indata, 
+                                bool *async_reply);
+int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb, 
+                                struct ctdb_req_control *c,
+                                bool *async_reply);
+int32_t ctdb_control_start_recovery(struct ctdb_context *ctdb, 
+                                struct ctdb_req_control *c,
+                                bool *async_reply);
+int32_t ctdb_control_end_recovery(struct ctdb_context *ctdb, 
+                                struct ctdb_req_control *c,
+                                bool *async_reply);
+
+/* a public address in IPv4 (struct sockaddr_in) form; element type of ctdb_all_public_ipsv4.ips */
+struct ctdb_public_ipv4 {
+       uint32_t pnn;           /* NOTE(review): presumably the node number associated with the address - confirm */
+       struct sockaddr_in sin; /* IPv4 socket address */
+};
+
+int ctdb_ctrl_takeover_ip(struct ctdb_context *ctdb, struct timeval timeout, 
+                         uint32_t destnode, struct ctdb_public_ip *ip);
+int ctdb_ctrl_release_ip(struct ctdb_context *ctdb, struct timeval timeout, 
+                        uint32_t destnode, struct ctdb_public_ip *ip);
+
+/* list of ctdb_public_ipv4 entries */
+struct ctdb_all_public_ipsv4 {
+       uint32_t num;           /* NOTE(review): presumably the number of entries in ips - confirm */
+       struct ctdb_public_ipv4 ips[1]; /* one-element trailing array; NOTE(review): presumably used as a variable-length tail - confirm */
+};
+
+int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, struct ctdb_req_control *c, TDB_DATA *outdata);
+int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, TDB_DATA *outdata);
+int ctdb_ctrl_get_public_ips(struct ctdb_context *ctdb, 
+                            struct timeval timeout,
+                            uint32_t destnode,
+                            TALLOC_CTX *mem_ctx,
+                            struct ctdb_all_public_ips **ips);
+/* single flag bit (bit 16). NOTE(review): presumably for the flags argument of ctdb_ctrl_get_public_ips_flags() - confirm */
+#define CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE 0x00010000
+int ctdb_ctrl_get_public_ips_flags(struct ctdb_context *ctdb,
+                                  struct timeval timeout, uint32_t destnode,
+                                  TALLOC_CTX *mem_ctx,
+                                  uint32_t flags,
+                                  struct ctdb_all_public_ips **ips);
+int ctdb_ctrl_get_public_ipsv4(struct ctdb_context *ctdb, 
+                       struct timeval timeout, uint32_t destnode, 
+                       TALLOC_CTX *mem_ctx, struct ctdb_all_public_ips **ips);
+
+/*
+  description of one network interface; element type of the ifaces arrays in
+  ctdb_control_public_ip_info and ctdb_control_get_ifaces, and the argument
+  of ctdb_ctrl_set_iface_link()
+ */
+struct ctdb_control_iface_info {
+       char name[CTDB_IFACE_SIZE+2];   /* interface name, fixed-size buffer of CTDB_IFACE_SIZE+2 bytes */
+       uint16_t link_state;            /* NOTE(review): presumably non-zero when the link is up - confirm */
+       uint32_t references;            /* NOTE(review): presumably a count of users of this interface - confirm */
+};
+
+/* a public ip together with a list of interfaces; output type of ctdb_ctrl_get_public_ip_info() */
+struct ctdb_control_public_ip_info {
+       struct ctdb_public_ip ip;       /* the public ip described */
+       uint32_t active_idx;            /* NOTE(review): presumably the index into ifaces of the active interface - confirm */
+       uint32_t num;                   /* NOTE(review): presumably the number of entries in ifaces - confirm */
+       struct ctdb_control_iface_info ifaces[1];       /* one-element trailing array; NOTE(review): presumably used as a variable-length tail - confirm */
+};
+
+/* list of interfaces; output type of ctdb_ctrl_get_ifaces() */
+struct ctdb_control_get_ifaces {
+       uint32_t num;           /* NOTE(review): presumably the number of entries in ifaces - confirm */
+       struct ctdb_control_iface_info ifaces[1];       /* one-element trailing array; NOTE(review): presumably used as a variable-length tail - confirm */
+};
+
+int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
+                                       struct ctdb_req_control *c,
+                                       TDB_DATA indata,
+                                       TDB_DATA *outdata);
+int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
+                               struct ctdb_req_control *c,
+                               TDB_DATA *outdata);
+int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
+                                   struct ctdb_req_control *c,
+                                   TDB_DATA indata);
+int ctdb_ctrl_get_public_ip_info(struct ctdb_context *ctdb,
+                                struct timeval timeout, uint32_t destnode,
+                                TALLOC_CTX *mem_ctx,
+                                const ctdb_sock_addr *addr,
+                                struct ctdb_control_public_ip_info **info);
+int ctdb_ctrl_get_ifaces(struct ctdb_context *ctdb,
+                        struct timeval timeout, uint32_t destnode,
+                        TALLOC_CTX *mem_ctx,
+                        struct ctdb_control_get_ifaces **ifaces);
+int ctdb_ctrl_set_iface_link(struct ctdb_context *ctdb,
+                            struct timeval timeout, uint32_t destnode,
+                            TALLOC_CTX *mem_ctx,
+                            const struct ctdb_control_iface_info *info);
+
+/* from takeover/system.c */
+uint32_t uint16_checksum(uint16_t *data, size_t n);
+int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface);
+bool ctdb_sys_have_ip(ctdb_sock_addr *addr);
+char *ctdb_sys_find_ifname(ctdb_sock_addr *addr);
+bool ctdb_sys_check_iface_exists(const char *iface);
+int ctdb_get_peer_pid(const int fd, pid_t *peer_pid);
+int ctdb_sys_send_tcp(const ctdb_sock_addr *dest, 
+                     const ctdb_sock_addr *src,
+                     uint32_t seq, uint32_t ack, int rst);
+
+/*
+  Details of a byte range lock
+  (filled in by ctdb_get_lock_info() and passed to ctdb_get_blocker_pid())
+ */
+struct ctdb_lock_info {
+       ino_t inode;            /* NOTE(review): presumably the inode of the locked file - confirm */
+       off_t start, end;       /* bounds of the byte range; NOTE(review): inclusive/exclusive end not visible here - confirm */
+       bool waiting;           /* NOTE(review): presumably true when the lock is requested but not yet held - confirm */
+       bool read_only;         /* NOTE(review): presumably true for a read lock - confirm */
+};
+
+char *ctdb_get_process_name(pid_t pid);
+int ctdb_set_process_name(const char *name);
+bool ctdb_get_lock_info(pid_t req_pid, struct ctdb_lock_info *lock_info);
+bool ctdb_get_blocker_pid(struct ctdb_lock_info *reqlock, pid_t *blocker_pid);
+
+/*
+  callback type for asynchronous client controls: receives the ctdb context,
+  a node pnn, a result code, the returned data and the caller's callback_data.
+  Used for the callback/fail_callback members of struct client_async_data and
+  as arguments of ctdb_client_async_control() and ctdb_takeover_run().
+ */
+typedef void (*client_async_callback)(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data);
+
+int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses);
+int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
+                             const char *iface,
+                             const char *ip);
+int ctdb_set_event_script(struct ctdb_context *ctdb, const char *script);
+int ctdb_set_notification_script(struct ctdb_context *ctdb, const char *script);
+int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
+                     uint32_t *force_rebalance_nodes,
+                     client_async_callback fail_callback, void *callback_data);
+
+int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id, 
+                               TDB_DATA indata);
+int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed);
+int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata);
+int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn);
+int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata);
+int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata);
+int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata);
+int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata);
+
+void ctdb_takeover_client_destructor_hook(struct ctdb_client *client);
+int ctdb_event_script(struct ctdb_context *ctdb, enum ctdb_eventscript_call call);
+int ctdb_event_script_args(struct ctdb_context *ctdb, enum ctdb_eventscript_call call,
+                          const char *fmt, ...) PRINTF_ATTRIBUTE(3,4);
+int ctdb_event_script_callback(struct ctdb_context *ctdb, 
+                              TALLOC_CTX *mem_ctx,
+                              void (*callback)(struct ctdb_context *, int, void *),
+                              void *private_data,
+                              bool from_user,
+                              enum ctdb_eventscript_call call,
+                              const char *fmt, ...) PRINTF_ATTRIBUTE(7,8);
+void ctdb_release_all_ips(struct ctdb_context *ctdb);
+
+void set_nonblocking(int fd);
+void set_close_on_exec(int fd);
+
+bool ctdb_recovery_lock(struct ctdb_context *ctdb, bool keep);
+
+int ctdb_set_recovery_lock_file(struct ctdb_context *ctdb, const char *file);
+
+int32_t ctdb_control_get_tunable(struct ctdb_context *ctdb, TDB_DATA indata, 
+                                TDB_DATA *outdata);
+int32_t ctdb_control_set_tunable(struct ctdb_context *ctdb, TDB_DATA indata);
+int32_t ctdb_control_list_tunables(struct ctdb_context *ctdb, TDB_DATA *outdata);
+int32_t ctdb_control_try_delete_records(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata);
+int32_t ctdb_control_receive_records(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata);
+int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata);
+int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata);
+
+void ctdb_tunables_set_defaults(struct ctdb_context *ctdb);
+
+int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata);
+
+int ctdb_ctrl_get_all_tunables(struct ctdb_context *ctdb, 
+                              struct timeval timeout, 
+                              uint32_t destnode,
+                              struct ctdb_tunable *tunables);
+
+void ctdb_start_freeze(struct ctdb_context *ctdb, uint32_t priority);
+
+bool parse_ip_mask(const char *s, const char *iface, ctdb_sock_addr *addr, unsigned *mask);
+bool parse_ip_port(const char *s, ctdb_sock_addr *addr);
+bool parse_ip(const char *s, const char *iface, unsigned port, ctdb_sock_addr *addr);
+bool parse_ipv4(const char *s, unsigned port, struct sockaddr_in *sin);
+
+int ctdb_sys_open_capture_socket(const char *iface, void **private_data);
+int ctdb_sys_close_capture_socket(void *private_data);
+int ctdb_sys_read_tcp_packet(int s, void *private_data, ctdb_sock_addr *src, ctdb_sock_addr *dst, uint32_t *ack_seq, uint32_t *seq);
+
+int ctdb_ctrl_killtcp(struct ctdb_context *ctdb, 
+                     struct timeval timeout, 
+                     uint32_t destnode,
+                     struct ctdb_control_killtcp *killtcp);
+
+int ctdb_ctrl_add_public_ip(struct ctdb_context *ctdb, 
+                     struct timeval timeout, 
+                     uint32_t destnode,
+                     struct ctdb_control_ip_iface *pub);
+
+int ctdb_ctrl_del_public_ip(struct ctdb_context *ctdb, 
+                     struct timeval timeout, 
+                     uint32_t destnode,
+                     struct ctdb_control_ip_iface *pub);
+
+int ctdb_ctrl_gratious_arp(struct ctdb_context *ctdb, 
+                     struct timeval timeout, 
+                     uint32_t destnode,
+                     ctdb_sock_addr *addr,
+                     const char *ifname);
+
+int ctdb_ctrl_get_tcp_tickles(struct ctdb_context *ctdb, 
+                     struct timeval timeout, 
+                     uint32_t destnode,
+                     TALLOC_CTX *mem_ctx,
+                     ctdb_sock_addr *addr,
+                     struct ctdb_control_tcp_tickle_list **list);
+
+
+int32_t ctdb_control_register_server_id(struct ctdb_context *ctdb, 
+                     uint32_t client_id,
+                     TDB_DATA indata);
+int32_t ctdb_control_check_server_id(struct ctdb_context *ctdb, 
+                     TDB_DATA indata);
+int32_t ctdb_control_unregister_server_id(struct ctdb_context *ctdb, 
+                     TDB_DATA indata);
+int32_t ctdb_control_get_server_id_list(struct ctdb_context *ctdb, 
+                     TDB_DATA *outdata);
+int32_t ctdb_control_uptime(struct ctdb_context *ctdb, 
+                     TDB_DATA *outdata);
+
+int ctdb_attach_databases(struct ctdb_context *ctdb);
+
+int32_t ctdb_control_persistent_store(struct ctdb_context *ctdb, 
+                                     struct ctdb_req_control *c, 
+                                     TDB_DATA recdata, bool *async_reply);
+int32_t ctdb_control_update_record(struct ctdb_context *ctdb, 
+                                  struct ctdb_req_control *c, TDB_DATA recdata, 
+                                  bool *async_reply);
+int32_t ctdb_control_trans2_commit(struct ctdb_context *ctdb, 
+                                  struct ctdb_req_control *c, 
+                                  TDB_DATA recdata, bool *async_reply);
+
+int32_t ctdb_control_trans3_commit(struct ctdb_context *ctdb,
+                                  struct ctdb_req_control *c,
+                                  TDB_DATA recdata, bool *async_reply);
+
+void ctdb_persistent_finish_trans3_commits(struct ctdb_context *ctdb);
+
+int32_t ctdb_control_transaction_start(struct ctdb_context *ctdb, uint32_t id);
+int32_t ctdb_control_transaction_commit(struct ctdb_context *ctdb, uint32_t id);
+int32_t ctdb_control_transaction_cancel(struct ctdb_context *ctdb);
+int32_t ctdb_control_wipe_database(struct ctdb_context *ctdb, TDB_DATA indata);
+int32_t ctdb_control_db_set_healthy(struct ctdb_context *ctdb, TDB_DATA indata);
+int32_t ctdb_control_db_get_health(struct ctdb_context *ctdb,
+                                  TDB_DATA indata,
+                                  TDB_DATA *outdata);
+
+
+int ctdb_vacuum(struct ctdb_context *ctdb, int argc, const char **argv);
+int ctdb_repack(struct ctdb_context *ctdb, int argc, const char **argv);
+
+void ctdb_block_signal(int signum);
+void ctdb_unblock_signal(int signum);
+int32_t ctdb_monitoring_mode(struct ctdb_context *ctdb);
+bool ctdb_stopped_monitoring(struct ctdb_context *ctdb);
+int ctdb_set_child_logging(struct ctdb_context *ctdb);
+void ctdb_lockdown_memory(struct ctdb_context *ctdb);
+
+/*
+  bookkeeping for a batch of asynchronous client controls; passed to
+  ctdb_client_async_add() and ctdb_client_async_wait()
+ */
+struct client_async_data {
+       enum ctdb_controls opcode;              /* control opcode of the batch */
+       bool dont_log_errors;                   /* NOTE(review): presumably suppresses error logging when true - confirm */
+       uint32_t count;                         /* NOTE(review): presumably the number of controls still outstanding - confirm */
+       uint32_t fail_count;                    /* NOTE(review): presumably the number of controls that failed - confirm */
+       client_async_callback callback;         /* NOTE(review): presumably called per node on success - confirm */
+       client_async_callback fail_callback;    /* NOTE(review): presumably called per node on failure - confirm */
+       void *callback_data;                    /* opaque pointer; matches the callback_data parameter of client_async_callback */
+};
+void ctdb_client_async_add(struct client_async_data *data, struct ctdb_client_control_state *state);
+int ctdb_client_async_wait(struct ctdb_context *ctdb, struct client_async_data *data);
+int ctdb_client_async_control(struct ctdb_context *ctdb,
+                               enum ctdb_controls opcode,
+                               uint32_t *nodes,
+                               uint64_t srvid,
+                               struct timeval timeout,
+                               bool dont_log_errors,
+                               TDB_DATA data,
+                               client_async_callback client_callback,
+                               client_async_callback fail_callback,
+                               void *callback_data);
+
+void ctdb_load_nodes_file(struct ctdb_context *ctdb);
+
+int ctdb_control_reload_nodes_file(struct ctdb_context *ctdb, uint32_t opcode);
+
+int32_t ctdb_dump_memory(struct ctdb_context *ctdb, TDB_DATA *outdata);
+int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA *outdata);
+
+int32_t ctdb_control_trans2_finished(struct ctdb_context *ctdb, 
+                                    struct ctdb_req_control *c);
+int32_t ctdb_control_trans2_error(struct ctdb_context *ctdb, 
+                                 struct ctdb_req_control *c);
+int32_t ctdb_control_trans2_active(struct ctdb_context *ctdb,
+                                  struct ctdb_req_control *c,
+                                  uint32_t db_id);
+
+char *ctdb_addr_to_str(ctdb_sock_addr *addr);
+unsigned ctdb_addr_to_port(ctdb_sock_addr *addr);
+void ctdb_canonicalize_ip(const ctdb_sock_addr *ip, ctdb_sock_addr *cip);
+
+int32_t ctdb_control_recd_ping(struct ctdb_context *ctdb);
+int32_t ctdb_control_set_recmaster(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata);
+
+extern int script_log_level;
+extern bool fast_start;
+extern const char *ctdbd_pidfile;
+
+int32_t ctdb_control_get_event_script_status(struct ctdb_context *ctdb,
+                                            uint32_t call_type,
+                                            TDB_DATA *outdata);
+
+int ctdb_log_event_script_output(struct ctdb_context *ctdb, char *str, uint16_t len);
+int ctdb_ctrl_report_recd_lock_latency(struct ctdb_context *ctdb, struct timeval timeout, double latency);
+
+int32_t ctdb_control_stop_node(struct ctdb_context *ctdb);
+int32_t ctdb_control_continue_node(struct ctdb_context *ctdb);
+
+void ctdb_stop_vacuuming(struct ctdb_context *ctdb);
+int ctdb_vacuum_init(struct ctdb_db_context *ctdb_db);
+
+int32_t ctdb_control_enable_script(struct ctdb_context *ctdb, TDB_DATA indata);
+int32_t ctdb_control_disable_script(struct ctdb_context *ctdb, TDB_DATA indata);
+
+void ctdb_local_node_got_banned(struct ctdb_context *ctdb);
+int32_t ctdb_control_set_ban_state(struct ctdb_context *ctdb, TDB_DATA indata);
+int32_t ctdb_control_get_ban_state(struct ctdb_context *ctdb, TDB_DATA *outdata);
+int32_t ctdb_control_set_db_priority(struct ctdb_context *ctdb, TDB_DATA indata);
+void ctdb_ban_self(struct ctdb_context *ctdb);
+
+int32_t ctdb_control_register_notify(struct ctdb_context *ctdb, uint32_t client_id, TDB_DATA indata);
+
+int32_t ctdb_control_deregister_notify(struct ctdb_context *ctdb, uint32_t client_id, TDB_DATA indata);
+
+int start_syslog_daemon(struct ctdb_context *ctdb);
+
+/*
+  Where to send the log messages back to
+  (argument type of ctdb_collect_log())
+ */
+struct ctdb_get_log_addr {
+       uint32_t pnn;           /* NOTE(review): presumably the node number to reply to - confirm */
+       uint64_t srvid;         /* NOTE(review): presumably the srvid the reply message is sent to - confirm */
+       int32_t level;          /* NOTE(review): presumably a log level used to filter messages - confirm */
+};
+
+extern int log_ringbuf_size;
+
+void ctdb_collect_log(struct ctdb_context *ctdb, struct ctdb_get_log_addr *log_addr);
+void ctdb_clear_log(struct ctdb_context *ctdb);
+int32_t ctdb_control_get_log(struct ctdb_context *ctdb, TDB_DATA addr);
+int32_t ctdb_control_clear_log(struct ctdb_context *ctdb);
+void ctdb_log_ringbuffer_free(void);
+
+struct ctdb_log_state *ctdb_fork_with_logging(TALLOC_CTX *mem_ctx,
+                                             struct ctdb_context *ctdb,
+                                             const char *log_prefix,
+                                             void (*logfn)(const char *, uint16_t, void *),
+                                             void *logfn_private, pid_t *pid);
+
+int32_t ctdb_control_process_exists(struct ctdb_context *ctdb, pid_t pid);
+struct ctdb_client *ctdb_find_client_by_pid(struct ctdb_context *ctdb, pid_t pid);
+
+int32_t ctdb_control_get_db_seqnum(struct ctdb_context *ctdb,
+                                  TDB_DATA indata,
+                                  TDB_DATA *outdata);
+
+int ctdb_load_persistent_health(struct ctdb_context *ctdb,
+                               struct ctdb_db_context *ctdb_db);
+int ctdb_update_persistent_health(struct ctdb_context *ctdb,
+                                 struct ctdb_db_context *ctdb_db,
+                                 const char *reason,/* NULL means healthy */
+                                 int num_healthy_nodes);
+int ctdb_recheck_persistent_health(struct ctdb_context *ctdb);
+
+void ctdb_run_notification_script(struct ctdb_context *ctdb, const char *event);
+
+void ctdb_fault_setup(void);
+
+int verify_remote_ip_allocation(struct ctdb_context *ctdb, 
+                               struct ctdb_all_public_ips *ips,
+                               uint32_t pnn);
+int update_ip_assignment_tree(struct ctdb_context *ctdb,
+                               struct ctdb_public_ip *ip);
+
+int ctdb_init_tevent_logging(struct ctdb_context *ctdb);
+
+int ctdb_statistics_init(struct ctdb_context *ctdb);
+
+int32_t ctdb_control_get_stat_history(struct ctdb_context *ctdb,
+                                     struct ctdb_req_control *c,
+                                     TDB_DATA *outdata);
+
+int ctdb_deferred_drop_all_ips(struct ctdb_context *ctdb);
+
+int ctdb_process_deferred_attach(struct ctdb_context *ctdb);
+
+/**
+ * structure to pass to a schedule_for_deletion_control (presumably the indata payload of ctdb_control_schedule_for_deletion() - confirm)
+ */
+struct ctdb_control_schedule_for_deletion {
+       uint32_t db_id;
+       struct ctdb_ltdb_header hdr;
+       uint32_t keylen; /* presumably the number of bytes in key[] - confirm */
+       uint8_t key[1]; /* key[]: declared with one element; presumably keylen bytes are stored in place - confirm */
+};
+
+int32_t ctdb_control_schedule_for_deletion(struct ctdb_context *ctdb,
+                                          TDB_DATA indata);
+
+
+int32_t ctdb_local_schedule_for_deletion(struct ctdb_db_context *ctdb_db,
+                                        const struct ctdb_ltdb_header *hdr,
+                                        TDB_DATA key);
+
+void ctdb_local_remove_from_delete_queue(struct ctdb_db_context *ctdb_db,
+                                        const struct ctdb_ltdb_header *hdr,
+                                        const TDB_DATA key);
+
+struct ctdb_ltdb_header *ctdb_header_from_record_handle(struct ctdb_record_handle *h);
+
+int ctdb_trackingdb_add_pnn(struct ctdb_context *ctdb, TDB_DATA *data, uint32_t pnn);
+
+typedef void (*ctdb_trackingdb_cb)(struct ctdb_context *ctdb, uint32_t pnn, void *private_data);
+
+void ctdb_trackingdb_traverse(struct ctdb_context *ctdb, TDB_DATA data, ctdb_trackingdb_cb cb, void *private_data);
+
+int ctdb_start_revoke_ro_record(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data);
+
+typedef void (*deferred_requeue_fn)(void *call_context, struct ctdb_req_header *hdr);
+
+int ctdb_add_revoke_deferred_call(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_req_header *hdr, deferred_requeue_fn fn, void *call_context);
+
+int ctdb_set_db_readonly(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db);
+
+int ctdb_null_func(struct ctdb_call_info *call);
+
+int ctdb_fetch_func(struct ctdb_call_info *call);
+
+int ctdb_fetch_with_header_func(struct ctdb_call_info *call);
+
+int32_t ctdb_control_get_db_statistics(struct ctdb_context *ctdb,
+                               uint32_t db_id,
+                               TDB_DATA *outdata);
+
+int ctdb_set_db_sticky(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db);
+
+/*
+  description for a message to reload all ips via recovery master/daemon
+ */
+struct reloadips_all_reply {
+       uint32_t pnn;
+       uint64_t srvid;
+};
+
+int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply);
+
+int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb);
+
+/* from server/ctdb_lock.c */
+struct lock_request;
+
+int ctdb_lockall_mark_prio(struct ctdb_context *ctdb, uint32_t priority);
+int ctdb_lockall_unmark_prio(struct ctdb_context *ctdb, uint32_t priority);
+
+void ctdb_lock_free_request_context(struct lock_request *lock_req);
+
+struct lock_request *ctdb_lock_record(struct ctdb_db_context *ctdb_db,
+                                     TDB_DATA key,
+                                     bool auto_mark,
+                                     void (*callback)(void *, bool),
+                                     void *private_data);
+
+struct lock_request *ctdb_lock_db(struct ctdb_db_context *ctdb_db,
+                                 bool auto_mark,
+                                 void (*callback)(void *, bool),
+                                 void *private_data);
+
+struct lock_request *ctdb_lock_alldb_prio(struct ctdb_context *ctdb,
+                                         uint32_t priority,
+                                         bool auto_mark,
+                                         void (*callback)(void *, bool),
+                                         void *private_data);
+
+struct lock_request *ctdb_lock_alldb(struct ctdb_context *ctdb,
+                                    bool auto_mark,
+                                    void (*callback)(void *, bool),
+                                    void *private_data);
+
+int mkdir_p(const char *dir, int mode);
+void ctdb_mkdir_p_or_die(struct ctdb_context *ctdb, const char *dir, int mode);
+
+#endif
diff --git a/ctdb/include/ctdb_protocol.h b/ctdb/include/ctdb_protocol.h
new file mode 100644 (file)
index 0000000..15c87be
--- /dev/null
@@ -0,0 +1,764 @@
+/*
+   ctdb database library
+
+   Copyright (C) Andrew Tridgell  2006
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CTDB_PROTOCOL_H
+#define _CTDB_PROTOCOL_H
+
+/* location of daemon socket, set at configure time */
+#ifdef SOCKPATH
+#define CTDB_PATH      SOCKPATH
+#else
+#define CTDB_PATH      "/var/run/ctdb/ctdbd.socket"
+#endif
+
+/* default ctdb port number */
+#define CTDB_PORT 4379
+
+/* we must align packets to ensure ctdb works on all architectures (eg. sparc) */
+#define CTDB_DS_ALIGNMENT 8
+
+
+#define CTDB_NULL_FUNC                  0xFF000001
+#define CTDB_FETCH_FUNC                 0xFF000002
+#define CTDB_FETCH_WITH_HEADER_FUNC     0xFF000003
+
+
+struct ctdb_call { /* NOTE(review): appears to describe one call request together with its result - confirm against users */
+       int call_id;
+       TDB_DATA key;
+       TDB_DATA call_data;
+       TDB_DATA reply_data;
+       uint32_t status;
+#define CTDB_IMMEDIATE_MIGRATION               0x00000001 /* bit 0 */
+#define CTDB_CALL_FLAG_VACUUM_MIGRATION                0x00000002 /* bit 1 */
+#define CTDB_WANT_READONLY                     0x00000004 /* bit 2 */
+       uint32_t flags; /* presumably an OR of the three single-bit values defined just above - confirm */
+};
+
+/*
+  structure passed to a ctdb call backend function
+*/
+struct ctdb_call_info {
+       TDB_DATA key;          /* record key */
+       struct ctdb_ltdb_header *header;
+       TDB_DATA record_data;  /* current data in the record */
+       TDB_DATA *new_data;    /* optionally updated record data */
+       TDB_DATA *call_data;   /* optionally passed from caller */
+       TDB_DATA *reply_data;  /* optionally returned by function */
+       uint32_t status;       /* optional reply status - defaults to zero */
+};
+
+#define CTDB_ERR_INVALID 1
+#define CTDB_ERR_NOMEM 2
+
+/*
+  ctdb flags
+*/
+#define CTDB_FLAG_TORTURE      (1<<1)
+
+/*
+   a message handler ID meaning "give me all messages"
+ */
+#define CTDB_SRVID_ALL (~(uint64_t)0)
+
+/*
+  srvid type : RECOVERY
+*/
+#define CTDB_SRVID_RECOVERY    0xF100000000000000LL
+
+/*
+   a message handler ID meaning that the cluster has been reconfigured
+ */
+#define CTDB_SRVID_RECONFIGURE 0xF200000000000000LL
+
+/*
+   a message handler ID meaning that an IP address has been released
+ */
+#define CTDB_SRVID_RELEASE_IP 0xF300000000000000LL
+
+/*
+   a message handler ID meaning that an IP address has been taken
+ */
+#define CTDB_SRVID_TAKE_IP 0xF301000000000000LL
+
+/*
+   a message ID to set the node flags in the recovery daemon
+ */
+#define CTDB_SRVID_SET_NODE_FLAGS 0xF400000000000000LL
+
+/*
+   a message ID to ask the recovery daemon to update the expected node
+   assignment for a public ip
+ */
+#define CTDB_SRVID_RECD_UPDATE_IP 0xF500000000000000LL
+
+/*
+  a message to tell the recovery daemon to fetch a set of records
+ */
+#define CTDB_SRVID_VACUUM_FETCH 0xF700000000000000LL
+
+/*
+  a message to tell the recovery daemon to write a talloc memdump
+  to the log
+ */
+#define CTDB_SRVID_MEM_DUMP 0xF800000000000000LL
+
+/* A message id used to ask the recovery daemon to send logs
+*/
+#define CTDB_SRVID_GETLOG  0xF801000000000000LL
+
+/* A message id used to ask the recovery daemon to clear its logs
+*/
+#define CTDB_SRVID_CLEARLOG  0xF802000000000000LL
+
+/*
+   a message ID to get the recovery daemon to push the node flags out
+ */
+#define CTDB_SRVID_PUSH_NODE_FLAGS 0xF900000000000000LL
+
+/*
+   a message ID to get the recovery daemon to reload the nodes file
+ */
+#define CTDB_SRVID_RELOAD_NODES 0xFA00000000000000LL
+
+/*
+   a message ID to get the recovery daemon to perform a takeover run
+ */
+#define CTDB_SRVID_TAKEOVER_RUN 0xFB00000000000000LL
+
+/* request recovery daemon to rebalance ips for a node.
+   input is uint32_t for the node id.
+*/
+#define CTDB_SRVID_REBALANCE_NODE 0xFB01000000000000LL
+
+/* A message handler ID to stop takeover runs from occurring */
+#define CTDB_SRVID_DISABLE_TAKEOVER_RUNS 0xFB03000000000000LL
+
+/* A message id to ask the recovery daemon to temporarily disable the
+   public ip checks
+*/
+#define CTDB_SRVID_DISABLE_IP_CHECK  0xFC00000000000000LL
+
+/* A dummy port used for sending back ipreallocate responses to the main
+   daemon
+*/
+#define CTDB_SRVID_TAKEOVER_RUN_RESPONSE  0xFD00000000000000LL
+
+/* A range of ports reserved for registering a PID (top 8 bits)
+ * All ports matching the 8 top bits are reserved for exclusive use by
+ * registering a SRVID that matches the process-id of the requesting process
+ */
+#define CTDB_SRVID_PID_RANGE   0x0000000000000000LL
+
+/* A range of ports reserved for samba (top 8 bits)
+ * All ports matching the 8 top bits are reserved for exclusive use by
+ * CIFS server
+ */
+#define CTDB_SRVID_SAMBA_NOTIFY  0xFE00000000000000LL
+#define CTDB_SRVID_SAMBA_RANGE   0xFE00000000000000LL
+
+/* A range of ports reserved for a CTDB NFS server (top 8 bits)
+ * All ports matching the 8 top bits are reserved for exclusive use by
+ * NFS server
+ */
+#define CTDB_SRVID_NFSD_RANGE  0xEE00000000000000LL
+
+/* A range of ports reserved for a CTDB ISCSI server (top 8 bits)
+ * All ports matching the 8 top bits are reserved for exclusive use by
+ * ISCSI server
+ */
+#define CTDB_SRVID_ISCSID_RANGE  0xDE00000000000000LL
+
+/* A range of ports reserved for testing (top 8 bits)
+ * All ports matching the 8 top bits are reserved for exclusive use by
+ * test applications
+ */
+#define CTDB_SRVID_TEST_RANGE  0xCE00000000000000LL
+
+/* Range of ports reserved for traversals */
+#define CTDB_SRVID_TRAVERSE_RANGE  0xBE00000000000000LL
+
+/* used on the domain socket, send a pdu to the local daemon */
+#define CTDB_CURRENT_NODE     0xF0000001
+/* send a broadcast to all nodes in the cluster, active or not */
+#define CTDB_BROADCAST_ALL    0xF0000002
+/* send a broadcast to all nodes in the current vnn map */
+#define CTDB_BROADCAST_VNNMAP 0xF0000003
+/* send a broadcast to all connected nodes */
+#define CTDB_BROADCAST_CONNECTED 0xF0000004
+/* send a broadcast to selected connected nodes */
+#define CTDB_MULTICAST 0xF0000005
+
+/* the key used for transaction locking on persistent databases */
+#define CTDB_TRANSACTION_LOCK_KEY "__transaction_lock__"
+
+/* the key used to store persistent db sequence number */
+#define CTDB_DB_SEQNUM_KEY "__db_sequence_number__"
+
+#define MONITOR_SCRIPT_OK      0
+#define MONITOR_SCRIPT_TIMEOUT 1
+
+#define MAX_SCRIPT_NAME 31 /* name[] below is sized MAX_SCRIPT_NAME+1, i.e. 32 bytes */
+#define MAX_SCRIPT_OUTPUT 511 /* output[] below is sized MAX_SCRIPT_OUTPUT+1, i.e. 512 bytes */
+struct ctdb_script_wire { /* fixed-size record; struct ctdb_scripts_wire holds an array of these */
+       char name[MAX_SCRIPT_NAME+1]; /* presumably NUL-terminated, hence the +1 - confirm */
+       struct timeval start;
+       struct timeval finished;
+       int32_t status; /* NOTE(review): MONITOR_SCRIPT_OK / MONITOR_SCRIPT_TIMEOUT above may be among its values - confirm */
+       char output[MAX_SCRIPT_OUTPUT+1]; /* presumably NUL-terminated, hence the +1 - confirm */
+};
+
+struct ctdb_scripts_wire {
+       uint32_t num_scripts;
+       struct ctdb_script_wire scripts[1];
+};
+
+/* different calls to event scripts. */
+enum ctdb_eventscript_call {
+       CTDB_EVENT_INIT,                /* CTDB starting up: no args */
+       CTDB_EVENT_SETUP,               /* CTDB starting up after transport is ready: no args. */
+       CTDB_EVENT_STARTUP,             /* CTDB starting up after initial recovery: no args. */
+       CTDB_EVENT_START_RECOVERY,      /* CTDB recovery starting: no args. */
+       CTDB_EVENT_RECOVERED,           /* CTDB recovery finished: no args. */
+       CTDB_EVENT_TAKE_IP,             /* IP taken: interface, IP address, netmask bits. */
+       CTDB_EVENT_RELEASE_IP,          /* IP released: interface, IP address, netmask bits. */
+       CTDB_EVENT_STOPPED,             /* Deprecated, do not use. */
+       CTDB_EVENT_MONITOR,             /* Please check if service is healthy: no args. */
+       CTDB_EVENT_STATUS,              /* Report service status: no args. */
+       CTDB_EVENT_SHUTDOWN,            /* CTDB shutting down: no args. */
+       CTDB_EVENT_RELOAD,              /* magic */
+       CTDB_EVENT_UPDATE_IP,           /* IP updating: old interface, new interface, IP address, netmask bits. */
+       CTDB_EVENT_IPREALLOCATED,       /* when a takeover_run() completes */
+       CTDB_EVENT_MAX                  /* value 14 == number of enumerators above; presumably the length of ctdb_eventscript_call_names[] - confirm */
+};
+
+/* Mapping from enum to names. */
+extern const char *ctdb_eventscript_call_names[];
+
+/*
+  operation IDs (values are assigned explicitly; 6 is unused, see the "#6 removed" note below)
+*/
+enum ctdb_operation { /* presumably the value carried in ctdb_req_header.operation - confirm */
+       CTDB_REQ_CALL           = 0,
+       CTDB_REPLY_CALL         = 1,
+       CTDB_REQ_DMASTER        = 2,
+       CTDB_REPLY_DMASTER      = 3,
+       CTDB_REPLY_ERROR        = 4,
+       CTDB_REQ_MESSAGE        = 5,
+       /* #6 removed */
+       CTDB_REQ_CONTROL        = 7,
+       CTDB_REPLY_CONTROL      = 8,
+       CTDB_REQ_KEEPALIVE      = 9,
+};
+
+#define CTDB_MAGIC 0x43544442 /* CTDB */
+#define CTDB_VERSION 1
+
+enum ctdb_controls {CTDB_CONTROL_PROCESS_EXISTS          = 0,
+                   CTDB_CONTROL_STATISTICS              = 1,
+                   /* #2 removed */
+                   CTDB_CONTROL_PING                    = 3,
+                   CTDB_CONTROL_GETDBPATH               = 4,
+                   CTDB_CONTROL_GETVNNMAP               = 5,
+                   CTDB_CONTROL_SETVNNMAP               = 6,
+                   CTDB_CONTROL_GET_DEBUG               = 7,
+                   CTDB_CONTROL_SET_DEBUG               = 8,
+                   CTDB_CONTROL_GET_DBMAP               = 9,
+                   CTDB_CONTROL_GET_NODEMAPv4           = 10, /* obsolete */
+                   CTDB_CONTROL_SET_DMASTER             = 11, /* obsolete */
+                   /* #12 removed */
+                   CTDB_CONTROL_PULL_DB                 = 13,
+                   CTDB_CONTROL_PUSH_DB                 = 14,
+                   CTDB_CONTROL_GET_RECMODE             = 15,
+                   CTDB_CONTROL_SET_RECMODE             = 16,
+                   CTDB_CONTROL_STATISTICS_RESET        = 17,
+                   CTDB_CONTROL_DB_ATTACH               = 18,
+                   CTDB_CONTROL_SET_CALL                = 19,
+                   CTDB_CONTROL_TRAVERSE_START          = 20,
+                   CTDB_CONTROL_TRAVERSE_ALL            = 21,
+                   CTDB_CONTROL_TRAVERSE_DATA           = 22,
+                   CTDB_CONTROL_REGISTER_SRVID          = 23,
+                   CTDB_CONTROL_DEREGISTER_SRVID        = 24,
+                   CTDB_CONTROL_GET_DBNAME              = 25,
+                   CTDB_CONTROL_ENABLE_SEQNUM           = 26,
+                   CTDB_CONTROL_UPDATE_SEQNUM           = 27,
+                   /* #28 removed */
+                   CTDB_CONTROL_DUMP_MEMORY             = 29,
+                   CTDB_CONTROL_GET_PID                 = 30,
+                   CTDB_CONTROL_GET_RECMASTER           = 31,
+                   CTDB_CONTROL_SET_RECMASTER           = 32,
+                   CTDB_CONTROL_FREEZE                  = 33,
+                   CTDB_CONTROL_THAW                    = 34,
+                   CTDB_CONTROL_GET_PNN                 = 35,
+                   CTDB_CONTROL_SHUTDOWN                = 36,
+                   CTDB_CONTROL_GET_MONMODE             = 37,
+                   /* #38 removed */
+                   /* #39 removed */
+                   /* #40 removed */
+                   /* #41 removed */
+                   CTDB_CONTROL_TAKEOVER_IPv4           = 42, /* obsolete */
+                   CTDB_CONTROL_RELEASE_IPv4            = 43, /* obsolete */
+                   CTDB_CONTROL_TCP_CLIENT              = 44,
+                   CTDB_CONTROL_TCP_ADD                 = 45,
+                   CTDB_CONTROL_TCP_REMOVE              = 46,
+                   CTDB_CONTROL_STARTUP                 = 47,
+                   CTDB_CONTROL_SET_TUNABLE             = 48,
+                   CTDB_CONTROL_GET_TUNABLE             = 49,
+                   CTDB_CONTROL_LIST_TUNABLES           = 50,
+                   CTDB_CONTROL_GET_PUBLIC_IPSv4        = 51, /* obsolete */
+                   CTDB_CONTROL_MODIFY_FLAGS            = 52,
+                   CTDB_CONTROL_GET_ALL_TUNABLES        = 53,
+                   CTDB_CONTROL_KILL_TCP                = 54,
+                   CTDB_CONTROL_GET_TCP_TICKLE_LIST     = 55,
+                   CTDB_CONTROL_SET_TCP_TICKLE_LIST     = 56,
+                   CTDB_CONTROL_REGISTER_SERVER_ID      = 57,
+                   CTDB_CONTROL_UNREGISTER_SERVER_ID    = 58,
+                   CTDB_CONTROL_CHECK_SERVER_ID         = 59,
+                   CTDB_CONTROL_GET_SERVER_ID_LIST      = 60,
+                   CTDB_CONTROL_DB_ATTACH_PERSISTENT    = 61,
+                   CTDB_CONTROL_PERSISTENT_STORE        = 62, /* obsolete */
+                   CTDB_CONTROL_UPDATE_RECORD           = 63,
+                   CTDB_CONTROL_SEND_GRATIOUS_ARP       = 64,
+                   CTDB_CONTROL_TRANSACTION_START       = 65,
+                   CTDB_CONTROL_TRANSACTION_COMMIT      = 66,
+                   CTDB_CONTROL_WIPE_DATABASE           = 67,
+                   /* #68 removed */
+                   CTDB_CONTROL_UPTIME                  = 69,
+                   CTDB_CONTROL_START_RECOVERY          = 70,
+                   CTDB_CONTROL_END_RECOVERY            = 71,
+                   CTDB_CONTROL_RELOAD_NODES_FILE       = 72,
+                   /* #73 removed */
+                   CTDB_CONTROL_TRY_DELETE_RECORDS      = 74,
+                   CTDB_CONTROL_ENABLE_MONITOR          = 75,
+                   CTDB_CONTROL_DISABLE_MONITOR         = 76,
+                   CTDB_CONTROL_ADD_PUBLIC_IP           = 77,
+                   CTDB_CONTROL_DEL_PUBLIC_IP           = 78,
+                   CTDB_CONTROL_RUN_EVENTSCRIPTS        = 79,
+                   CTDB_CONTROL_GET_CAPABILITIES        = 80,
+                   CTDB_CONTROL_START_PERSISTENT_UPDATE = 81,
+                   CTDB_CONTROL_CANCEL_PERSISTENT_UPDATE= 82,
+                   CTDB_CONTROL_TRANS2_COMMIT           = 83, /* obsolete */
+                   CTDB_CONTROL_TRANS2_FINISHED         = 84, /* obsolete */
+                   CTDB_CONTROL_TRANS2_ERROR            = 85, /* obsolete */
+                   CTDB_CONTROL_TRANS2_COMMIT_RETRY     = 86, /* obsolete */
+                   CTDB_CONTROL_RECD_PING               = 87,
+                   CTDB_CONTROL_RELEASE_IP              = 88,
+                   CTDB_CONTROL_TAKEOVER_IP             = 89,
+                   CTDB_CONTROL_GET_PUBLIC_IPS          = 90,
+                   CTDB_CONTROL_GET_NODEMAP             = 91,
+                   CTDB_CONTROL_GET_EVENT_SCRIPT_STATUS = 96,
+                   CTDB_CONTROL_TRAVERSE_KILL           = 97,
+                   CTDB_CONTROL_RECD_RECLOCK_LATENCY    = 98,
+                   CTDB_CONTROL_GET_RECLOCK_FILE        = 99,
+                   CTDB_CONTROL_SET_RECLOCK_FILE        = 100,
+                   CTDB_CONTROL_STOP_NODE               = 101,
+                   CTDB_CONTROL_CONTINUE_NODE           = 102,
+                   CTDB_CONTROL_SET_NATGWSTATE          = 103,
+                   CTDB_CONTROL_SET_LMASTERROLE         = 104,
+                   CTDB_CONTROL_SET_RECMASTERROLE       = 105,
+                   CTDB_CONTROL_ENABLE_SCRIPT           = 107,
+                   CTDB_CONTROL_DISABLE_SCRIPT          = 108,
+                   CTDB_CONTROL_SET_BAN_STATE           = 109,
+                   CTDB_CONTROL_GET_BAN_STATE           = 110,
+                   CTDB_CONTROL_SET_DB_PRIORITY         = 111,
+                   CTDB_CONTROL_GET_DB_PRIORITY         = 112,
+                   CTDB_CONTROL_TRANSACTION_CANCEL      = 113,
+                   CTDB_CONTROL_REGISTER_NOTIFY         = 114,
+                   CTDB_CONTROL_DEREGISTER_NOTIFY       = 115,
+                   CTDB_CONTROL_TRANS2_ACTIVE           = 116, /* obsolete */
+                   CTDB_CONTROL_GET_LOG                 = 117,
+                   CTDB_CONTROL_CLEAR_LOG               = 118,
+                   CTDB_CONTROL_TRANS3_COMMIT           = 119,
+                   CTDB_CONTROL_GET_DB_SEQNUM           = 120,
+                   CTDB_CONTROL_DB_SET_HEALTHY          = 121,
+                   CTDB_CONTROL_DB_GET_HEALTH           = 122,
+                   CTDB_CONTROL_GET_PUBLIC_IP_INFO      = 123,
+                   CTDB_CONTROL_GET_IFACES              = 124,
+                   CTDB_CONTROL_SET_IFACE_LINK_STATE    = 125,
+                   CTDB_CONTROL_TCP_ADD_DELAYED_UPDATE  = 126,
+                   CTDB_CONTROL_GET_STAT_HISTORY        = 127,
+                   CTDB_CONTROL_SCHEDULE_FOR_DELETION   = 128,
+                   CTDB_CONTROL_SET_DB_READONLY         = 129,
+                   CTDB_CONTROL_CHECK_SRVIDS            = 130,
+                   CTDB_CONTROL_TRAVERSE_START_EXT      = 131,
+                   CTDB_CONTROL_GET_DB_STATISTICS       = 132,
+                   CTDB_CONTROL_SET_DB_STICKY           = 133,
+                   CTDB_CONTROL_RELOAD_PUBLIC_IPS       = 134,
+                   CTDB_CONTROL_TRAVERSE_ALL_EXT        = 135,
+                   CTDB_CONTROL_RECEIVE_RECORDS         = 136,
+                   CTDB_CONTROL_IPREALLOCATED           = 137,
+                   CTDB_CONTROL_GET_RUNSTATE            = 138,
+};
+
+/*
+  packet structures
+*/
+struct ctdb_req_header { /* eight uint32_t fields; embedded as the first member `hdr` of the request/reply structs below */
+       uint32_t length; /* presumably the total packet length in bytes - confirm */
+       uint32_t ctdb_magic; /* presumably CTDB_MAGIC - confirm */
+       uint32_t ctdb_version; /* presumably CTDB_VERSION - confirm */
+       uint32_t generation;
+       uint32_t operation; /* presumably an enum ctdb_operation value - confirm */
+       uint32_t destnode; /* NOTE(review): CTDB_CURRENT_NODE / CTDB_BROADCAST_* look like special values for this field - confirm */
+       uint32_t srcnode;
+       uint32_t reqid;
+};
+
+struct ctdb_req_call { /* presumably the packet for CTDB_REQ_CALL (matching name) - confirm */
+       struct ctdb_req_header hdr;
+       uint32_t flags;
+       uint32_t db_id;
+       uint32_t callid;
+       uint32_t hopcount;
+       uint32_t keylen; /* presumably the byte length of the key[] part of data - confirm */
+       uint32_t calldatalen; /* presumably the byte length of the calldata[] part of data - confirm */
+       uint8_t data[1]; /* key[] followed by calldata[] */
+};
+
+struct ctdb_reply_call {
+       struct ctdb_req_header hdr;
+       uint32_t status;
+       uint32_t datalen;
+       uint8_t  data[1];
+};
+
+struct ctdb_reply_error {
+       struct ctdb_req_header hdr;
+       uint32_t status;
+       uint32_t msglen;
+       uint8_t  msg[1];
+};
+
+struct ctdb_req_dmaster {
+       struct ctdb_req_header hdr;
+       uint32_t db_id;
+       uint64_t rsn;
+       uint32_t dmaster;
+       uint32_t keylen;
+       uint32_t datalen;
+       uint8_t  data[1];
+};
+
+struct ctdb_reply_dmaster {
+       struct ctdb_req_header hdr;
+       uint32_t db_id;
+       uint64_t rsn;
+       uint32_t keylen;
+       uint32_t datalen;
+       uint8_t  data[1];
+};
+
+struct ctdb_req_message {
+       struct ctdb_req_header hdr;
+       uint64_t srvid;
+       uint32_t datalen;
+       uint8_t data[1];
+};
+
+struct ctdb_req_getdbpath {
+       struct ctdb_req_header hdr;
+       uint32_t db_id;
+};
+
+struct ctdb_reply_getdbpath {
+       struct ctdb_req_header hdr;
+       uint32_t datalen;
+       uint8_t data[1];
+};
+
+struct ctdb_req_control { /* presumably the packet for CTDB_REQ_CONTROL (matching name) - confirm */
+       struct ctdb_req_header hdr;
+       uint32_t opcode; /* presumably an enum ctdb_controls value - confirm */
+       uint32_t pad; /* NOTE(review): looks like explicit padding so the uint64_t srvid starts on an 8-byte offset (32-byte hdr + opcode + pad = 40) - confirm */
+       uint64_t srvid;
+       uint32_t client_id;
+#define CTDB_CTRL_FLAG_NOREPLY   1 /* bit 0 */
+#define CTDB_CTRL_FLAG_OPCODE_SPECIFIC   0xFFFF0000 /* mask covering the upper 16 bits */
+       uint32_t flags; /* presumably an OR of the CTDB_CTRL_FLAG_* values above - confirm */
+       uint32_t datalen; /* presumably the number of payload bytes in data - confirm */
+       uint8_t data[1]; /* declared with one element; presumably datalen bytes are stored in place - confirm */
+};
+
+struct ctdb_reply_control {
+       struct ctdb_req_header hdr;
+       int32_t  status;
+       uint32_t datalen;
+       uint32_t errorlen;
+       uint8_t data[1];
+};
+
+struct ctdb_req_keepalive {
+       struct ctdb_req_header hdr;
+};
+
+
+/*
+  the extended header for records in the ltdb
+*/
+struct ctdb_ltdb_header {
+       uint64_t rsn;
+       uint32_t dmaster;
+       uint32_t reserved1;
+#define CTDB_REC_FLAG_DEFAULT                  0x00000000 /* no flag bits set */
+#define CTDB_REC_FLAG_MIGRATED_WITH_DATA       0x00010000
+#define CTDB_REC_FLAG_VACUUM_MIGRATED          0x00020000
+#define CTDB_REC_FLAG_AUTOMATIC                        0x00040000
+#define CTDB_REC_RO_HAVE_DELEGATIONS           0x01000000
+#define CTDB_REC_RO_HAVE_READONLY              0x02000000
+#define CTDB_REC_RO_REVOKING_READONLY          0x04000000
+#define CTDB_REC_RO_REVOKE_COMPLETE            0x08000000
+#define CTDB_REC_RO_FLAGS                      (CTDB_REC_RO_HAVE_DELEGATIONS|\
+                                                CTDB_REC_RO_HAVE_READONLY|\
+                                                CTDB_REC_RO_REVOKING_READONLY|\
+                                                CTDB_REC_RO_REVOKE_COMPLETE) /* mask of the four CTDB_REC_RO_* bits: 0x0F000000 */
+       uint32_t flags; /* presumably an OR of the CTDB_REC_* values above - confirm */
+};
+
+
+/*
+  definitions for different socket structures
+ */
+typedef struct sockaddr_in ctdb_addr_in;
+typedef struct sockaddr_in6 ctdb_addr_in6;
+typedef union {
+       struct sockaddr sa;
+       ctdb_addr_in    ip;
+       ctdb_addr_in6   ip6;
+} ctdb_sock_addr;
+
+/*
+   A structure describing a single node, its flags and its address
+*/
+struct ctdb_node_and_flags {
+       uint32_t pnn;
+       uint32_t flags;
+       ctdb_sock_addr addr;
+};
+
+
+/*
+   Structure used for a nodemap. 
+   The nodemap is the structure containing a list of all nodes
+   known to the cluster and their associated flags.
+*/
+struct ctdb_node_map {
+       uint32_t num;
+       struct ctdb_node_and_flags nodes[1];
+};
+
+/*
+ * Node flags: six single-bit values followed by two composite masks
+ */
+#define NODE_FLAGS_DISCONNECTED                0x00000001 /* node isn't connected */
+#define NODE_FLAGS_UNHEALTHY           0x00000002 /* monitoring says node is unhealthy */
+#define NODE_FLAGS_PERMANENTLY_DISABLED        0x00000004 /* administrator has disabled node */
+#define NODE_FLAGS_BANNED              0x00000008 /* recovery daemon has banned the node */
+#define NODE_FLAGS_DELETED             0x00000010 /* this node has been deleted */
+#define NODE_FLAGS_STOPPED             0x00000020 /* this node has been stopped */
+#define NODE_FLAGS_DISABLED            (NODE_FLAGS_UNHEALTHY|NODE_FLAGS_PERMANENTLY_DISABLED) /* composite mask: 0x00000006 */
+#define NODE_FLAGS_INACTIVE            (NODE_FLAGS_DELETED|NODE_FLAGS_DISCONNECTED|NODE_FLAGS_BANNED|NODE_FLAGS_STOPPED) /* composite mask: 0x00000039 */
+
+/*
+ * Node capabilities
+ */
+#define CTDB_CAP_RECMASTER             0x00000001
+#define CTDB_CAP_LMASTER               0x00000002
+/* This capability is set if CTDB_LVS_PUBLIC_IP is set */
+#define CTDB_CAP_LVS                   0x00000004
+/* This capability is set if NATGW is enabled */
+#define CTDB_CAP_NATGW                 0x00000008
+
+
+struct ctdb_public_ip {
+       uint32_t pnn;
+       ctdb_sock_addr addr;
+};
+
+struct ctdb_all_public_ips {
+       uint32_t num;
+       struct ctdb_public_ip ips[1];
+};
+
+
+struct latency_counter {
+       int num;
+       double min;
+       double max;
+       double total;
+};
+
+/*
+  structure used to pass record data between the child and parent
+ */
+struct ctdb_rec_data {
+       uint32_t length;
+       uint32_t reqid;
+       uint32_t keylen;
+       uint32_t datalen;
+       uint8_t  data[1];
+};
+
+struct ctdb_traverse_start {
+       uint32_t db_id;
+       uint32_t reqid;
+       uint64_t srvid;
+};
+
+struct ctdb_traverse_start_ext { /* same leading fields as struct ctdb_traverse_start, plus withemptyrecords */
+       uint32_t db_id;
+       uint32_t reqid;
+       uint64_t srvid;
+       bool withemptyrecords; /* NOTE(review): sizeof(bool) is implementation-defined; if this struct is exchanged between nodes, confirm all peers agree on its layout */
+};
+
+/*
+  ctdb statistics information
+ */
+#define MAX_COUNT_BUCKETS 16
+#define MAX_HOT_KEYS      10
+
+struct ctdb_statistics {
+       uint32_t num_clients;
+       uint32_t frozen;
+       uint32_t recovering;
+       uint32_t client_packets_sent;
+       uint32_t client_packets_recv;
+       uint32_t node_packets_sent;
+       uint32_t node_packets_recv;
+       uint32_t keepalive_packets_sent;
+       uint32_t keepalive_packets_recv;
+       struct {
+               uint32_t req_call;
+               uint32_t reply_call;
+               uint32_t req_dmaster;
+               uint32_t reply_dmaster;
+               uint32_t reply_error;
+               uint32_t req_message;
+               uint32_t req_control;
+               uint32_t reply_control;
+       } node;
+       struct {
+               uint32_t req_call;
+               uint32_t req_message;
+               uint32_t req_control;
+       } client;
+       struct {
+               uint32_t call;
+               uint32_t control;
+               uint32_t traverse;
+       } timeouts;
+       struct {
+               struct latency_counter ctdbd;
+               struct latency_counter recd;
+       } reclock;
+       struct {
+               uint32_t num_calls;
+               uint32_t num_current;
+               uint32_t num_pending;
+               uint32_t num_failed;
+               struct latency_counter latency;
+               uint32_t buckets[MAX_COUNT_BUCKETS];
+       } locks;
+       uint32_t total_calls;
+       uint32_t pending_calls;
+       uint32_t childwrite_calls;
+       uint32_t pending_childwrite_calls;
+       uint32_t memory_used;
+       uint32_t __last_counter; /* hack for control_statistics_all */
+       uint32_t max_hop_count;
+       uint32_t hop_count_bucket[MAX_COUNT_BUCKETS];
+       struct latency_counter call_latency;
+       struct latency_counter childwrite_latency;
+       uint32_t num_recoveries;
+       struct timeval statistics_start_time;
+       struct timeval statistics_current_time;
+       uint32_t total_ro_delegations;
+       uint32_t total_ro_revokes;
+};
+
+/*
+ * wire format for statistics history
+ */
+struct ctdb_statistics_wire {
+       uint32_t num;
+       struct ctdb_statistics stats[1];
+};
+
+/*
+ * db statistics (the locks member has the same fields as the locks member of struct ctdb_statistics)
+ */
+struct ctdb_db_statistics {
+       struct {
+               uint32_t num_calls;
+               uint32_t num_current;
+               uint32_t num_pending;
+               uint32_t num_failed;
+               struct latency_counter latency;
+               uint32_t buckets[MAX_COUNT_BUCKETS]; /* MAX_COUNT_BUCKETS is 16 */
+       } locks;
+       uint32_t db_ro_delegations;
+       uint32_t db_ro_revokes;
+       uint32_t hop_count_bucket[MAX_COUNT_BUCKETS]; /* MAX_COUNT_BUCKETS is 16 */
+       uint32_t num_hot_keys; /* presumably the number of hot_keys[] entries in use - confirm */
+       struct {
+               uint32_t count;
+               TDB_DATA key;
+       } hot_keys[MAX_HOT_KEYS]; /* MAX_HOT_KEYS is 10 */
+       char hot_keys_wire[1]; /* NOTE(review): one-element trailing array; the name suggests a variable-length wire copy of the hot keys - confirm */
+};
+
+/*
+ * wire format for interface list
+ */
+#ifdef IFNAMSIZ /* only taken if a header defining IFNAMSIZ was included before this point */
+#define CTDB_IFACE_SIZE IFNAMSIZ
+#else
+#define CTDB_IFACE_SIZE 16 /* fallback used when IFNAMSIZ is not defined */
+#endif
+
+struct ctdb_iface_info { /* NOTE(review): sizeof(name) follows IFNAMSIZ when that is defined, so the layout can differ between builds - confirm all peers agree */
+       char name[CTDB_IFACE_SIZE+2]; /* 18 bytes with the fallback size of 16; the reason for the +2 is not visible here */
+       uint16_t link_state;
+       uint32_t references;
+};
+
+struct ctdb_ifaces_list {
+       uint32_t num;
+       struct ctdb_iface_info ifaces[1];
+};
+
+#define INVALID_GENERATION 1
+/* table that contains the mapping between a hash value and lmaster
+ */
+struct ctdb_vnn_map {
+       uint32_t generation;
+       uint32_t size;
+       uint32_t *map;
+};
+
+/* 
+   a wire representation of the vnn map
+ */
+struct ctdb_vnn_map_wire {
+       uint32_t generation;
+       uint32_t size;
+       uint32_t map[1];
+};
+
+#endif
diff --git a/ctdb/include/ctdb_typesafe_cb.h b/ctdb/include/ctdb_typesafe_cb.h
new file mode 100644 (file)
index 0000000..b1f2c5f
--- /dev/null
@@ -0,0 +1,177 @@
+#ifndef CCAN_CAST_IF_TYPE_H
+#define CCAN_CAST_IF_TYPE_H
+
+#if (__GNUC__ >= 3)
+#define HAVE_TYPEOF 1
+#define HAVE_BUILTIN_CHOOSE_EXPR 1
+#define HAVE_BUILTIN_TYPES_COMPATIBLE_P 1
+#endif
+
+#if HAVE_TYPEOF && HAVE_BUILTIN_CHOOSE_EXPR && HAVE_BUILTIN_TYPES_COMPATIBLE_P
+/**
+ * cast_if_type - only cast an expression if test matches a given type
+ * @desttype: the type to cast to
+ * @expr: the expression to cast
+ * @test: the expression to test
+ * @oktype: the type we allow
+ *
+ * This macro is used to create functions which allow multiple types.
+ * The result of this macro is used somewhere that a @desttype type is
+ * expected: if @expr was of type @oktype, it will be cast to
+ * @desttype type.  As a result, if @expr is any type other than
+ * @oktype or @desttype, a compiler warning will be issued.
+ *
+ * This macro can be used in static initializers.
+ *
+ * This is merely useful for warnings: if the compiler does not
+ * support the primitives required for cast_if_type(), it becomes an
+ * unconditional cast, and the @test and @oktype arguments are not used.  In
+ * particular, this means that @oktype can be a type which uses
+ * the "typeof": it will not be evaluated if typeof is not supported.
+ *
+ * Example:
+ *     // We can take either an unsigned long or a void *.
+ *     void _set_some_value(void *val);
+ *     #define set_some_value(e)                       \
+ *             _set_some_value(cast_if_type(void *, (e), (e), unsigned long))
+ */
+#define cast_if_type(desttype, expr, test, oktype)                     \
+__builtin_choose_expr(__builtin_types_compatible_p(typeof(1?(test):0), oktype), \
+                       (desttype)(expr), (expr))
+#else
+#define cast_if_type(desttype, expr, test, oktype) ((desttype)(expr))
+#endif
+
+/**
+ * cast_if_any - only cast an expression if it is one of the three given types
+ * @desttype: the type to cast to
+ * @expr: the expression to cast
+ * @test: the expression whose type is checked against @ok1, @ok2 and @ok3
+ * @ok1: the first type we allow
+ * @ok2: the second type we allow
+ * @ok3: the third type we allow
+ *
+ * This is a convenient wrapper for multiple cast_if_type() calls.  You can
+ * chain them inside each other (ie. use cast_if_any() for expr) if you need
+ * more than 3 allowed types.
+ *
+ * Example:
+ *     // We can take either a long, unsigned long, void * or a const void *.
+ *     void _set_some_value(void *val);
+ *     #define set_some_value(expr)                                    \
+ *             _set_some_value(cast_if_any(void *, (expr), (expr),     \
+ *                                         long, unsigned long, const void *))
+ */
+#define cast_if_any(desttype, expr, test, ok1, ok2, ok3)               \
+       cast_if_type(desttype,                                          \
+                    cast_if_type(desttype,                             \
+                                 cast_if_type(desttype, (expr), (test), ok1), \
+                                 (test), ok2),                         \
+                    (test), ok3)
+
+/**
+ * typesafe_cb - cast a callback function if it matches the arg
+ * @rtype: the return type of the callback function
+ * @fn: the callback function to cast
+ * @arg: the (pointer) argument to hand to the callback function.
+ *
+ * If a callback function takes a single argument, this macro does
+ * appropriate casts to a function which takes a single void * argument if the
+ * callback provided matches the @arg (or a const or volatile version).
+ *
+ * It is assumed that @arg is of pointer type: usually @arg is passed
+ * or assigned to a void * elsewhere anyway.
+ *
+ * Example:
+ *     void _register_callback(void (*fn)(void *arg), void *arg);
+ *     #define register_callback(fn, arg) \
+ *             _register_callback(typesafe_cb(void, (fn), (arg)), (arg))
+ */
+#define typesafe_cb(rtype, fn, arg)                    \
+       cast_if_type(rtype (*)(void *), (fn), (fn)(arg), rtype)
+
+/**
+ * typesafe_cb_const - cast a const callback function if it matches the arg
+ * @rtype: the return type of the callback function
+ * @fn: the callback function to cast
+ * @arg: the (pointer) argument to hand to the callback function.
+ *
+ * If a callback function takes a single argument, this macro does appropriate
+ * casts to a function which takes a single const void * argument if the
+ * callback provided matches the @arg.
+ *
+ * It is assumed that @arg is of pointer type: usually @arg is passed
+ * or assigned to a void * elsewhere anyway.
+ *
+ * Example:
+ *     void _register_callback(void (*fn)(const void *arg), const void *arg);
+ *     #define register_callback(fn, arg) \
+ *             _register_callback(typesafe_cb_const(void, (fn), (arg)), (arg))
+ */
+#define typesafe_cb_const(rtype, fn, arg)                              \
+       sizeof((fn)((const void *)0)),                                  \
+               cast_if_type(rtype (*)(const void *),                   \
+                            (fn), (fn)(arg), rtype (*)(typeof(arg)))
+
+/**
+ * typesafe_cb_preargs - cast a callback function if it matches the arg
+ * @rtype: the return type of the callback function
+ * @fn: the callback function to cast
+ * @arg: the (pointer) argument to hand to the callback function.
+ *
+ * This is a version of typesafe_cb() for callbacks that take other arguments
+ * before the @arg.
+ *
+ * Example:
+ *     void _register_callback(void (*fn)(int, void *arg), void *arg);
+ *     #define register_callback(fn, arg) \
+ *             _register_callback(typesafe_cb_preargs(void, (fn), (arg), int),\
+ *                                (arg))
+ */
+#define typesafe_cb_preargs(rtype, fn, arg, ...)                       \
+       cast_if_type(rtype (*)(__VA_ARGS__, void *), (fn), (fn),        \
+                    rtype (*)(__VA_ARGS__, typeof(arg)))
+/**
+ * typesafe_cb_postargs - cast a callback function if it matches the arg
+ * @rtype: the return type of the callback function
+ * @fn: the callback function to cast
+ * @arg: the (pointer) argument to hand to the callback function.
+ *
+ * This is a version of typesafe_cb() for callbacks that take other arguments
+ * after the @arg.
+ *
+ * Example:
+ *     void _register_callback(void (*fn)(void *arg, int), void *arg);
+ *     #define register_callback(fn, arg) \
+ *             _register_callback(typesafe_cb_postargs(void, (fn), (arg), int),\
+ *                                (arg))
+ */
+#define typesafe_cb_postargs(rtype, fn, arg, ...)                      \
+       cast_if_type(rtype (*)(void *, __VA_ARGS__), (fn), (fn),        \
+                    rtype (*)(typeof(arg), __VA_ARGS__))
+/**
+ * typesafe_cb_cmp - cast a compare function if it matches the arg
+ * @rtype: the return type of the callback function
+ * @fn: the callback function to cast
+ * @arg: the (pointer) argument(s) to hand to the compare function.
+ *
+ * If a callback function takes two matching-type arguments, this macro does
+ * appropriate casts to a function which takes two const void * arguments if
+ * the callback provided takes two const pointers to the type @arg points to.
+ *
+ * It is assumed that @arg is of pointer type: usually @arg is passed
+ * or assigned to a void * elsewhere anyway.  Note also that the type
+ * arg points to must be defined.
+ *
+ * Example:
+ *     void _my_qsort(void *base, size_t nmemb, size_t size,
+ *                    int (*cmp)(const void *, const void *));
+ *     #define my_qsort(base, nmemb, cmpfn) \
+ *             _my_qsort((base), (nmemb), sizeof(*(base)), \
+ *                       typesafe_cb_cmp(int, (cmpfn), (base)))
+ */
+#define typesafe_cb_cmp(rtype, cmpfn, arg)                             \
+       cast_if_type(rtype (*)(const void *, const void *), (cmpfn), (cmpfn), \
+                    rtype (*)(const typeof(*(arg))*, const typeof(*(arg))*))
+                    
+#endif /* CCAN_CAST_IF_TYPE_H */
diff --git a/ctdb/include/idtree.h b/ctdb/include/idtree.h
new file mode 100644 (file)
index 0000000..259af91
--- /dev/null
@@ -0,0 +1,7 @@
+struct idr_context *idr_init(TALLOC_CTX *mem_ctx);
+int idr_get_new(struct idr_context *idp, void *ptr, int limit);
+int idr_get_new_above(struct idr_context *idp, void *ptr, int starting_id, int limit);
+int idr_get_new_random(struct idr_context *idp, void *ptr, int limit);
+void *idr_find(struct idr_context *idp, int id);
+int idr_remove(struct idr_context *idp, int id);
+
diff --git a/ctdb/include/includes.h b/ctdb/include/includes.h
new file mode 100644 (file)
index 0000000..3747198
--- /dev/null
@@ -0,0 +1,68 @@
+#ifndef _CTDB_INCLUDES_H
+#define _CTDB_INCLUDES_H
+
+#define HAVE_UNIXSOCKET 1
+
+#include "replace.h"
+#include "talloc.h"
+#include "system/wait.h"
+#include "system/network.h"
+#include "tdb.h"
+#include "idtree.h"
+#include "ctdb_client.h"
+
+/* Allow use of deprecated function tevent_loop_allow_nesting() */
+#define TEVENT_DEPRECATED
+/* Saves ctdb from massive churn. */
+#define TEVENT_COMPAT_DEFINES 1
+
+#include "tevent.h"
+
+extern int LogLevel;
+extern int this_log_level;
+
+enum debug_level { 
+       DEBUG_EMERG   = -3, 
+       DEBUG_ALERT   = -2, 
+       DEBUG_CRIT    = -1,
+       DEBUG_ERR     =  0,
+       DEBUG_WARNING =  1,
+       DEBUG_NOTICE  =  2,     
+       DEBUG_INFO    =  3,
+       DEBUG_DEBUG   =  4,
+};
+
+#define DEBUGLVL(lvl) ((lvl) <= LogLevel)
+#define DEBUG(lvl, x) do { this_log_level = (lvl); if ((lvl) < DEBUG_DEBUG) { log_ringbuffer x; } if ((lvl) <= LogLevel) { do_debug x; }} while (0)
+#define DEBUGADD(lvl, x) do { if ((lvl) <= LogLevel) { this_log_level = (lvl); do_debug_add x; }} while (0)
+
+#define _PUBLIC_
+#define _NORETURN_
+#define _PURE_
+
+#define ZERO_STRUCT(x) memset((char *)&(x), 0, sizeof(x))
+
+#ifndef discard_const
+#define discard_const(ptr) ((void *)((intptr_t)(ptr)))
+#endif
+
+struct timeval timeval_zero(void);
+bool timeval_is_zero(const struct timeval *tv);
+struct timeval timeval_current(void);
+struct timeval timeval_set(uint32_t secs, uint32_t usecs);
+int timeval_compare(const struct timeval *tv1, const struct timeval *tv2);
+struct timeval timeval_until(const struct timeval *tv1,
+                            const struct timeval *tv2);
+_PUBLIC_ struct timeval timeval_current_ofs(uint32_t secs, uint32_t usecs);
+double timeval_elapsed(struct timeval *tv);
+double timeval_delta(struct timeval *tv2, struct timeval *tv);
+char **file_lines_load(const char *fname, int *numlines, TALLOC_CTX *mem_ctx);
+char *hex_encode_talloc(TALLOC_CTX *mem_ctx, const unsigned char *buff_in, size_t len);
+uint8_t *hex_decode_talloc(TALLOC_CTX *mem_ctx, const char *hex_in, size_t *len);
+_PUBLIC_ const char **str_list_add(const char **list, const char *s);
+_PUBLIC_ int set_blocking(int fd, bool set);
+
+#include "lib/util/debug.h"
+#include "lib/util/util.h"
+
+#endif /* _CTDB_INCLUDES_H */
diff --git a/ctdb/install-sh b/ctdb/install-sh
new file mode 100755 (executable)
index 0000000..5871924
--- /dev/null
@@ -0,0 +1,238 @@
+#! /bin/sh
+#
+# install - install a program, script, or datafile
+# This comes from X11R5.
+#
+# Calling this script install-sh is preferred over install.sh, to prevent
+# `make' implicit rules from creating a file called install from it
+# when there is no Makefile.
+#
+# This script is compatible with the BSD install script, but was written
+# from scratch.
+#
+
+
+# set DOITPROG to echo to test this script
+
+# Don't use :- since 4.3BSD and earlier shells don't like it.
+doit="${DOITPROG-}"
+
+
+# put in absolute paths if you don't have them in your path; or use env. vars.
+
+mvprog="${MVPROG-mv}"
+cpprog="${CPPROG-cp}"
+chmodprog="${CHMODPROG-chmod}"
+chownprog="${CHOWNPROG-chown}"
+chgrpprog="${CHGRPPROG-chgrp}"
+stripprog="${STRIPPROG-strip}"
+rmprog="${RMPROG-rm}"
+mkdirprog="${MKDIRPROG-mkdir}"
+
+transformbasename=""
+transformarg=""
+instcmd="$mvprog"
+chmodcmd="$chmodprog 0755"
+chowncmd=""
+chgrpcmd=""
+stripcmd=""
+rmcmd="$rmprog -f"
+mvcmd="$mvprog"
+src=""
+dst=""
+dir_arg=""
+
+while [ x"$1" != x ]; do
+    case $1 in
+       -c) instcmd="$cpprog"
+           shift
+           continue;;
+
+       -d) dir_arg=true
+           shift
+           continue;;
+
+       -m) chmodcmd="$chmodprog $2"
+           shift
+           shift
+           continue;;
+
+       -o) chowncmd="$chownprog $2"
+           shift
+           shift
+           continue;;
+
+       -g) chgrpcmd="$chgrpprog $2"
+           shift
+           shift
+           continue;;
+
+       -s) stripcmd="$stripprog"
+           shift
+           continue;;
+
+       -t=*) transformarg=`echo $1 | sed 's/-t=//'`
+           shift
+           continue;;
+
+       -b=*) transformbasename=`echo $1 | sed 's/-b=//'`
+           shift
+           continue;;
+
+       *)  if [ x"$src" = x ]
+           then
+               src=$1
+           else
+               # this colon is to work around a 386BSD /bin/sh bug
+               :
+               dst=$1
+           fi
+           shift
+           continue;;
+    esac
+done
+
+if [ x"$src" = x ]
+then
+       echo "install:  no input file specified"
+       exit 1
+else
+       true
+fi
+
+if [ x"$dir_arg" != x ]; then
+       dst=$src
+       src=""
+       
+       if [ -d $dst ]; then
+               instcmd=:
+       else
+               instcmd=mkdir
+       fi
+else
+
+# Waiting for this to be detected by the "$instcmd $src $dsttmp" command
+# might cause directories to be created, which would be especially bad 
+# if $src (and thus $dsttmp) contains '*'.
+
+       if [ -f $src -o -d $src ]
+       then
+               true
+       else
+               echo "install:  $src does not exist"
+               exit 1
+       fi
+       
+       if [ x"$dst" = x ]
+       then
+               echo "install:  no destination specified"
+               exit 1
+       else
+               true
+       fi
+
+# If destination is a directory, append the input filename; if your system
+# does not like double slashes in filenames, you may need to add some logic
+
+       if [ -d $dst ]
+       then
+               dst="$dst"/`basename $src`
+       else
+               true
+       fi
+fi
+
+## this sed command emulates the dirname command
+dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'`
+
+# Make sure that the destination directory exists.
+#  this part is taken from Noah Friedman's mkinstalldirs script
+
+# Skip lots of stat calls in the usual case.
+if [ ! -d "$dstdir" ]; then
+defaultIFS='   
+'
+IFS="${IFS-${defaultIFS}}"
+
+oIFS="${IFS}"
+# Some sh's can't handle IFS=/ for some reason.
+IFS='%'
+set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'`
+IFS="${oIFS}"
+
+pathcomp=''
+
+while [ $# -ne 0 ] ; do
+       pathcomp="${pathcomp}${1}"
+       shift
+
+       if [ ! -d "${pathcomp}" ] ;
+        then
+               $mkdirprog "${pathcomp}"
+       else
+               true
+       fi
+
+       pathcomp="${pathcomp}/"
+done
+fi
+
+if [ x"$dir_arg" != x ]
+then
+       $doit $instcmd $dst &&
+
+       if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi &&
+       if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi &&
+       if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi &&
+       if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi
+else
+
+# If we're going to rename the final executable, determine the name now.
+
+       if [ x"$transformarg" = x ] 
+       then
+               dstfile=`basename $dst`
+       else
+               dstfile=`basename $dst $transformbasename | 
+                       sed $transformarg`$transformbasename
+       fi
+
+# don't allow the sed command to completely eliminate the filename
+
+       if [ x"$dstfile" = x ] 
+       then
+               dstfile=`basename $dst`
+       else
+               true
+       fi
+
+# Make a temp file name in the proper directory.
+
+       dsttmp=$dstdir/#inst.$$#
+
+# Move or copy the file name to the temp name
+
+       $doit $instcmd $src $dsttmp &&
+
+       trap "rm -f ${dsttmp}" 0 &&
+
+# and set any options; do chmod last to preserve setuid bits
+
+# If any of these fail, we abort the whole thing.  If we want to
+# ignore errors from any of these, just make sure not to ignore
+# errors from the above "$doit $instcmd $src $dsttmp" command.
+
+       if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi &&
+       if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi &&
+       if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi &&
+       if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi &&
+
+# Now rename the file to the real destination.
+
+       $doit $rmcmd -f $dstdir/$dstfile &&
+       $doit $mvcmd $dsttmp $dstdir/$dstfile 
+
+fi &&
+
+
+exit 0
diff --git a/ctdb/lib/popt/CHANGES b/ctdb/lib/popt/CHANGES
new file mode 100644 (file)
index 0000000..db16a5f
--- /dev/null
@@ -0,0 +1,46 @@
+1.5 -> 1.6
+       - add ability to perform callbacks for every, not just first, match.
+
+1.3 -> 1.5
+       - heavy dose of const's
+       - poptParseArgvString() now NULL terminates the list
+
+1.2.3 -> 1.3
+       - added support for single -
+       - misc bug fixes
+       - portability improvements
+
+1.2.2 -> 1.2.3
+       - fixed memset() in help message generation (Dale Hawkins)
+       - added extern "C" stuff to popt.h for C++ compilers (Dale Hawkins)
+       - const'ified poptParseArgvString (Jeff Garzik)
+
+1.2.1 -> 1.2.2
+       - fixed bug in chained alias happens which seems to have only
+         affected --triggers in rpm
+       - added POPT_ARG_VAL
+       - popt.3 installed by default
+
+1.2 -> 1.2.1
+       - added POPT_ARG_INTL_DOMAIN (Elliot Lee)
+       - updated Makefile's to be more GNUish (Elliot Lee)
+
+1.1 -> 1.2
+       - added popt.3 man page (Robert Lynch)
+       - don't use mmap anymore (its lack of portability isn't worth the
+         trouble)
+       - added test script
+       - added support for exec
+       - removed support for *_POPT_ALIASES env variable -- it was a bad
+         idea
+       - reorganized into multiple source files
+       - added automatic help generation, POPT_AUTOHELP
+       - added table callbacks
+       - added table inclusion
+       - updated man page for new features
+       - added test scripts
+
+1.0 -> 1.1
+       - moved to autoconf (Fred Fish)
+       - added STRERROR replacement (Norbert Warmuth)
+       - added const keywords (Bruce Perens)
diff --git a/ctdb/lib/popt/COPYING b/ctdb/lib/popt/COPYING
new file mode 100644 (file)
index 0000000..b4c7ca8
--- /dev/null
@@ -0,0 +1,22 @@
+Copyright (c) 1998  Red Hat Software
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+X CONSORTIUM BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+Except as contained in this notice, the name of the X Consortium shall not be
+used in advertising or otherwise to promote the sale, use or other dealings
+in this Software without prior written authorization from the X Consortium.
diff --git a/ctdb/lib/popt/README b/ctdb/lib/popt/README
new file mode 100644 (file)
index 0000000..0b5205b
--- /dev/null
@@ -0,0 +1,18 @@
+This is the popt command line option parsing library. While it is similar
+to getopt(3), it contains a number of enhancements, including:
+
+       1) popt is fully reentrant
+       2) popt can parse arbitrary argv[] style arrays while 
+          getopt(3) makes this quite difficult
+       3) popt allows users to alias command line arguments
+       4) popt provides convenience functions for parsing strings
+          into argv[] style arrays
+
+popt is used by rpm, the Red Hat install program, and many other Red Hat
+utilities, all of which provide excellent examples of how to use popt. 
+Complete documentation on popt is available in popt.ps (included in this
+tarball), which is excerpted with permission from the book "Linux
+Application Development" by Michael K. Johnson and Erik Troan (available
+from Addison Wesley in May, 1998).
+
+Comments on popt should be addressed to ewt@redhat.com.
diff --git a/ctdb/lib/popt/findme.c b/ctdb/lib/popt/findme.c
new file mode 100644 (file)
index 0000000..a950e50
--- /dev/null
@@ -0,0 +1,50 @@
+/** \ingroup popt
+ * \file popt/findme.c
+ */
+
+/* (C) 1998-2002 Red Hat, Inc. -- Licensing details are in the COPYING
+   file accompanying popt source distributions, available from 
+   ftp://ftp.rpm.org/pub/rpm/dist. */
+
+#include "system.h"
+#include "findme.h"
+
+const char * findProgramPath(const char * argv0) {
+    char * path = getenv("PATH");
+    char * pathbuf;
+    char * start, * chptr;
+    char * buf;
+
+    if (argv0 == NULL) return NULL;    /* XXX can't happen */
+    /* If argv[0] contains a '/', it is already a path: skip the PATH search */
+    if (strchr(argv0, '/'))
+       return xstrdup(argv0);
+
+    if (path == NULL) return NULL;
+
+    start = pathbuf = alloca(strlen(path) + 1);
+    buf = malloc(strlen(path) + strlen(argv0) + sizeof("/"));
+    if (buf == NULL) return NULL;      /* XXX can't happen */
+    strcpy(pathbuf, path);
+
+    chptr = NULL;
+    /*@-branchstate@*/
+    do {
+       if ((chptr = strchr(start, ':')))
+           *chptr = '\0';
+       sprintf(buf, "%s/%s", start, argv0);
+
+       if (!access(buf, X_OK))
+           return buf;
+
+       if (chptr) 
+           start = chptr + 1;
+       else
+           start = NULL;
+    } while (start && *start);
+    /*@=branchstate@*/
+
+    free(buf);
+
+    return NULL;
+}
diff --git a/ctdb/lib/popt/findme.h b/ctdb/lib/popt/findme.h
new file mode 100644 (file)
index 0000000..a016b86
--- /dev/null
@@ -0,0 +1,20 @@
+/** \ingroup popt
+ * \file popt/findme.h
+ */
+
+/* (C) 1998-2000 Red Hat, Inc. -- Licensing details are in the COPYING
+   file accompanying popt source distributions, available from 
+   ftp://ftp.rpm.org/pub/rpm/dist. */
+
+#ifndef H_FINDME
+#define H_FINDME
+
+/**
+ * Return absolute path to executable by searching PATH.
+ * @param argv0                name of executable
+ * @return             (malloc'd) absolute path to executable (or NULL)
+ */
+/*@null@*/ const char * findProgramPath(/*@null@*/ const char * argv0)
+       /*@*/;
+
+#endif
diff --git a/ctdb/lib/popt/libpopt.m4 b/ctdb/lib/popt/libpopt.m4
new file mode 100644 (file)
index 0000000..79980d1
--- /dev/null
@@ -0,0 +1,43 @@
+dnl Check to see if we should use the included popt
+
+INCLUDED_POPT=auto
+AC_ARG_WITH(included-popt,
+[  --with-included-popt    use bundled popt library, not from system],
+[ INCLUDED_POPT=$withval ])
+
+AC_SUBST(POPT_LIBS)
+AC_SUBST(POPT_CFLAGS)
+
+if test x"$INCLUDED_POPT" != x"yes"; then
+       AC_CHECK_HEADERS(popt.h)
+       AC_CHECK_LIB(popt, poptGetContext, [ POPT_LIBS="-lpopt" ])
+       if test x"$ac_cv_header_popt_h" = x"no" -o x"$ac_cv_lib_popt_poptGetContext" = x"no"; then
+               INCLUDED_POPT=yes
+               POPT_CFLAGS=""
+       else
+               INCLUDED_POPT=no
+       fi
+fi
+
+AC_MSG_CHECKING(whether to use included popt)
+AC_MSG_RESULT($INCLUDED_POPT)
+if test x"$INCLUDED_POPT" != x"no"; then
+       dnl find the popt sources. This is meant to work both for 
+       dnl popt standalone builds, and builds of packages using popt
+       poptdir=""
+       poptpaths="$srcdir $srcdir/lib/popt $srcdir/popt $srcdir/../popt"
+       for d in $poptpaths; do
+               if test -f "$d/popt.c"; then
+                       poptdir="$d"            
+                       POPT_CFLAGS="-I$d"
+                       AC_SUBST(poptdir)
+                       break
+               fi
+       done
+        if test x"$poptdir" = "x"; then
+               AC_MSG_ERROR([cannot find popt source in $poptpaths])
+       fi
+       POPT_OBJ="popt.o findme.o poptconfig.o popthelp.o poptparse.o"
+       AC_SUBST(POPT_OBJ)
+       AC_CHECK_HEADERS([float.h alloca.h])
+fi
diff --git a/ctdb/lib/popt/popt.c b/ctdb/lib/popt/popt.c
new file mode 100644 (file)
index 0000000..33a5ec7
--- /dev/null
@@ -0,0 +1,1240 @@
+/** \ingroup popt
+ * \file popt/popt.c
+ */
+
+/* (C) 1998-2002 Red Hat, Inc. -- Licensing details are in the COPYING
+   file accompanying popt source distributions, available from
+   ftp://ftp.rpm.org/pub/rpm/dist */
+
+#undef MYDEBUG
+
+#include "system.h"
+
+#if HAVE_MATH_H
+#include <math.h>
+#endif
+#if HAVE_FLOAT_H
+#include <float.h>
+#endif
+
+#include "findme.h"
+#include "poptint.h"
+
+#ifdef MYDEBUG
+/*@unchecked@*/
+int _popt_debug = 0;
+#endif
+
+#ifndef HAVE_STRERROR
+static char * strerror(int errno) {
+    extern int sys_nerr;
+    extern char * sys_errlist[];
+
+    if ((0 <= errno) && (errno < sys_nerr))
+       return sys_errlist[errno];
+    else
+       return POPT_("unknown errno");
+}
+#endif
+
+#ifdef MYDEBUG
+/*@unused@*/ static void prtcon(const char *msg, poptContext con)
+{
+    if (msg) fprintf(stderr, "%s", msg);
+    fprintf(stderr, "\tcon %p os %p nextCharArg \"%s\" nextArg \"%s\" argv[%d] \"%s\"\n",
+       con, con->os,
+       (con->os->nextCharArg ? con->os->nextCharArg : ""),
+       (con->os->nextArg ? con->os->nextArg : ""),
+       con->os->next,
+       (con->os->argv && con->os->argv[con->os->next]
+               ? con->os->argv[con->os->next] : ""));
+}
+#endif
+
+void poptSetExecPath(poptContext con, const char * path, int allowAbsolute)
+{
+    con->execPath = _free(con->execPath);
+    con->execPath = xstrdup(path);
+    con->execAbsolute = allowAbsolute;
+    /*@-nullstate@*/ /* LCL: con->execPath can be NULL? */
+    return;
+    /*@=nullstate@*/
+}
+
+static void invokeCallbacksPRE(poptContext con, const struct poptOption * opt)
+       /*@globals internalState@*/
+       /*@modifies internalState@*/
+{
+    if (opt != NULL)
+    for (; opt->longName || opt->shortName || opt->arg; opt++) {
+       if (opt->arg == NULL) continue;         /* XXX program error. */
+       if ((opt->argInfo & POPT_ARG_MASK) == POPT_ARG_INCLUDE_TABLE) {
+           /* Recurse on included sub-tables. */
+           invokeCallbacksPRE(con, opt->arg);
+       } else if ((opt->argInfo & POPT_ARG_MASK) == POPT_ARG_CALLBACK &&
+                  (opt->argInfo & POPT_CBFLAG_PRE))
+       {   /*@-castfcnptr@*/
+           poptCallbackType cb = (poptCallbackType)opt->arg;
+           /*@=castfcnptr@*/
+           /* Perform callback. */
+           /*@-moduncon -noeffectuncon @*/
+           cb(con, POPT_CALLBACK_REASON_PRE, NULL, NULL, opt->descrip);
+           /*@=moduncon =noeffectuncon @*/
+       }
+    }
+}
+
+static void invokeCallbacksPOST(poptContext con, const struct poptOption * opt)
+       /*@globals internalState@*/
+       /*@modifies internalState@*/
+{
+    if (opt != NULL)
+    for (; opt->longName || opt->shortName || opt->arg; opt++) {
+       if (opt->arg == NULL) continue;         /* XXX program error. */
+       if ((opt->argInfo & POPT_ARG_MASK) == POPT_ARG_INCLUDE_TABLE) {
+           /* Recurse on included sub-tables. */
+           invokeCallbacksPOST(con, opt->arg);
+       } else if ((opt->argInfo & POPT_ARG_MASK) == POPT_ARG_CALLBACK &&
+                  (opt->argInfo & POPT_CBFLAG_POST))
+       {   /*@-castfcnptr@*/
+           poptCallbackType cb = (poptCallbackType)opt->arg;
+           /*@=castfcnptr@*/
+           /* Perform callback. */
+           /*@-moduncon -noeffectuncon @*/
+           cb(con, POPT_CALLBACK_REASON_POST, NULL, NULL, opt->descrip);
+           /*@=moduncon =noeffectuncon @*/
+       }
+    }
+}
+
+static void invokeCallbacksOPTION(poptContext con,
+                                 const struct poptOption * opt,
+                                 const struct poptOption * myOpt,
+                                 /*@null@*/ const void * myData, int shorty)
+       /*@globals internalState@*/
+       /*@modifies internalState@*/
+{
+    const struct poptOption * cbopt = NULL;
+
+    if (opt != NULL)
+    for (; opt->longName || opt->shortName || opt->arg; opt++) {
+       if ((opt->argInfo & POPT_ARG_MASK) == POPT_ARG_INCLUDE_TABLE) {
+           /* Recurse on included sub-tables. */
+           if (opt->arg != NULL)       /* XXX program error */
+               invokeCallbacksOPTION(con, opt->arg, myOpt, myData, shorty);
+       } else if ((opt->argInfo & POPT_ARG_MASK) == POPT_ARG_CALLBACK &&
+                 !(opt->argInfo & POPT_CBFLAG_SKIPOPTION)) {
+           /* Save callback info. */
+           cbopt = opt;
+       } else if (cbopt != NULL &&
+                  ((myOpt->shortName && opt->shortName && shorty &&
+                       myOpt->shortName == opt->shortName) ||
+                   (myOpt->longName && opt->longName &&
+               /*@-nullpass@*/         /* LCL: opt->longName != NULL */
+                       !strcmp(myOpt->longName, opt->longName)))
+               /*@=nullpass@*/
+                  )
+       {   /*@-castfcnptr@*/
+           poptCallbackType cb = (poptCallbackType)cbopt->arg;
+           /*@=castfcnptr@*/
+           const void * cbData = (cbopt->descrip ? cbopt->descrip : myData);
+           /* Perform callback. */
+           if (cb != NULL) {   /* XXX program error */
+               /*@-moduncon -noeffectuncon @*/
+               cb(con, POPT_CALLBACK_REASON_OPTION, myOpt,
+                       con->os->nextArg, cbData);
+               /*@=moduncon =noeffectuncon @*/
+           }
+           /* Terminate (unless explicitly continuing). */
+           if (!(cbopt->argInfo & POPT_CBFLAG_CONTINUE))
+               return;
+       }
+    }
+}
+
+poptContext poptGetContext(const char * name, int argc, const char ** argv,
+                          const struct poptOption * options, int flags)
+{
+    poptContext con = malloc(sizeof(*con));
+
+    if (con == NULL) return NULL;      /* XXX can't happen */
+    memset(con, 0, sizeof(*con));
+
+    con->os = con->optionStack;
+    con->os->argc = argc;
+    /*@-dependenttrans -assignexpose@*/        /* FIX: W2DO? */
+    con->os->argv = argv;
+    /*@=dependenttrans =assignexpose@*/
+    con->os->argb = NULL;
+
+    if (!(flags & POPT_CONTEXT_KEEP_FIRST))
+       con->os->next = 1;                      /* skip argv[0] */
+
+    con->leftovers = calloc( (argc + 1), sizeof(*con->leftovers) );
+    /*@-dependenttrans -assignexpose@*/        /* FIX: W2DO? */
+    con->options = options;
+    /*@=dependenttrans =assignexpose@*/
+    con->aliases = NULL;
+    con->numAliases = 0;
+    con->flags = flags;
+    con->execs = NULL;
+    con->numExecs = 0;
+    con->finalArgvAlloced = argc * 2;
+    con->finalArgv = calloc( con->finalArgvAlloced, sizeof(*con->finalArgv) );
+    con->execAbsolute = 1;
+    con->arg_strip = NULL;
+
+    if (getenv("POSIXLY_CORRECT") || getenv("POSIX_ME_HARDER"))
+       con->flags |= POPT_CONTEXT_POSIXMEHARDER;
+
+    if (name) {
+       char * t = malloc(strlen(name) + 1);
+       if (t) con->appName = strcpy(t, name);
+    }
+
+    /*@-internalglobs@*/
+    invokeCallbacksPRE(con, con->options);
+    /*@=internalglobs@*/
+
+    return con;
+}
+
+static void cleanOSE(/*@special@*/ struct optionStackEntry *os)
+       /*@uses os @*/
+       /*@releases os->nextArg, os->argv, os->argb @*/
+       /*@modifies os @*/
+{  /* Release what one option-stack entry holds (pending option argument,
+     * argv, argb bitmap) and store each helper's return value back into
+     * the field.  The entry itself is not freed. */
+    os->nextArg = _free(os->nextArg);
+    os->argv = _free(os->argv);
+    os->argb = PBM_FREE(os->argb);
+}
+
+/*@-boundswrite@*/
+/**
+ * Rewind a context so that its original argument vector can be parsed again.
+ *
+ * Pops (and releases) every alias / stuffed-argument frame above the base
+ * option-stack entry, clears the per-parse state of the base entry, forgets
+ * collected leftovers and any pending exec item, and frees the strings
+ * recorded in finalArgv (the vector itself is kept for reuse).
+ *
+ * @param con  context to reset; NULL is accepted and ignored
+ */
+void poptResetContext(poptContext con)
+{
+    int i;
+
+    if (con == NULL) return;
+    while (con->os > con->optionStack) {
+       cleanOSE(con->os--);
+    }
+    con->os->argb = PBM_FREE(con->os->argb);
+    con->os->currAlias = NULL;
+    con->os->nextCharArg = NULL;
+    /* An option argument that was never fetched with poptGetOptArg() is
+     * still owned by the context; release it instead of dropping it. */
+    con->os->nextArg = _free(con->os->nextArg);
+    /* Same starting index poptGetContext() uses: argv[0] is skipped unless
+     * the context was created with POPT_CONTEXT_KEEP_FIRST. */
+    con->os->next = (con->flags & POPT_CONTEXT_KEEP_FIRST) ? 0 : 1;
+
+    con->numLeftovers = 0;
+    con->nextLeftover = 0;
+    con->restLeftover = 0;
+    con->doExec = NULL;
+
+    if (con->finalArgv != NULL)
+    for (i = 0; i < con->finalArgvCount; i++) {
+       /*@-unqualifiedtrans@*/         /* FIX: typedef double indirection. */
+       con->finalArgv[i] = _free(con->finalArgv[i]);
+       /*@=unqualifiedtrans@*/
+    }
+
+    con->finalArgvCount = 0;
+    con->arg_strip = PBM_FREE(con->arg_strip);
+    /*@-nullstate@*/   /* FIX: con->finalArgv != NULL */
+    return;
+    /*@=nullstate@*/
+}
+/*@=boundswrite@*/
+
+/* Only one of longName, shortName should be set, not both. */
+/*@-boundswrite@*/
+/**
+ * Check whether an option names a registered exec item and, if so, record it.
+ *
+ * The first matching option becomes con->doExec.  Later matching options are
+ * appended to con->finalArgv (as "--long" or "-c") so they can be handed to
+ * the program that is eventually executed.
+ *
+ * @param con        parsing context
+ * @param longName   long option text without dashes, or NULL for a short option
+ * @param shortName  short option character; only compared when longName is NULL
+ * @return           0 if no exec item matches, 1 if the option was consumed
+ */
+static int handleExec(/*@special@*/ poptContext con,
+               /*@null@*/ const char * longName, char shortName)
+       /*@uses con->execs, con->numExecs, con->flags, con->doExec,
+               con->finalArgv, con->finalArgvAlloced, con->finalArgvCount @*/
+       /*@modifies con @*/
+{
+    poptItem item;
+    int i;
+
+    if (con->execs == NULL || con->numExecs <= 0) /* XXX can't happen */
+       return 0;
+
+    /* Search newest-first.  A long name is matched on the long name alone:
+     * comparing the (empty) short name as well would reject every item that
+     * defines both a long and a short name. */
+    for (i = con->numExecs - 1; i >= 0; i--) {
+       item = con->execs + i;
+       if (longName != NULL) {
+           if (item->option.longName != NULL &&
+               !strcmp(longName, item->option.longName))
+               break;
+       } else if (shortName == item->option.shortName)
+           break;
+    }
+    if (i < 0) return 0;
+
+
+    if (con->flags & POPT_CONTEXT_NO_EXEC)
+       return 1;
+
+    if (con->doExec == NULL) {
+       con->doExec = con->execs + i;
+       return 1;
+    }
+
+    /* We already have an exec to do; remember this option for next
+       time 'round */
+    if ((con->finalArgvCount + 1) >= (con->finalArgvAlloced)) {
+       /* Grow through a temporary so a failed realloc() neither leaks nor
+        * loses the entries already stored. */
+       void * grown = realloc(con->finalArgv,
+                       sizeof(*con->finalArgv) * (con->finalArgvAlloced + 10));
+       if (grown == NULL)
+           return 1;   /* option recognised, but it cannot be recorded */
+       con->finalArgv = grown;
+       con->finalArgvAlloced += 10;
+    }
+
+    i = con->finalArgvCount++;
+    if (con->finalArgv != NULL)        /* XXX can't happen */
+    {  /* "--" + name + NUL, or "-c" + NUL: both fit in strlen + 3 bytes. */
+       char *s  = malloc((longName ? strlen(longName) : 0) + 3);
+       if (s != NULL) {        /* XXX can't happen */
+           if (longName)
+               sprintf(s, "--%s", longName);
+           else
+               sprintf(s, "-%c", shortName);
+           con->finalArgv[i] = s;
+       } else
+           con->finalArgv[i] = NULL;
+    }
+
+    /*@-nullstate@*/   /* FIX: con->finalArgv[] == NULL */
+    return 1;
+    /*@=nullstate@*/
+}
+/*@=boundswrite@*/
+
+/* Only one of longName, shortName may be set at a time */
+/**
+ * Expand an option that names a registered alias.
+ *
+ * On a match a new option-stack frame is pushed holding a copy of the
+ * alias's argv, so the following parse steps read the expansion.
+ *
+ * @param con          parsing context
+ * @param longName     long option text without dashes, or NULL
+ * @param shortName    short option character; only compared when longName
+ *                     is NULL
+ * @param nextCharArg  remaining bundled short options to resume with once
+ *                     the alias frame is exhausted, or NULL
+ * @return             0 if no alias applies (including when the option is
+ *                     the alias currently being expanded),
+ *                     POPT_ERROR_OPTSTOODEEP if the stack is full, the
+ *                     poptDupArgv() result if that is non-zero, otherwise 1
+ */
+static int handleAlias(/*@special@*/ poptContext con,
+               /*@null@*/ const char * longName, char shortName,
+               /*@exposed@*/ /*@null@*/ const char * nextCharArg)
+       /*@uses con->aliases, con->numAliases, con->optionStack, con->os,
+               con->os->currAlias, con->os->currAlias->option.longName @*/
+       /*@modifies con @*/
+{
+    poptItem item = con->os->currAlias;
+    int rc;
+    int i;
+
+    /* Do not re-expand the alias whose expansion is being read right now. */
+    if (item) {
+       if (longName && (item->option.longName &&
+               !strcmp(longName, item->option.longName)))
+           return 0;
+       if (shortName && shortName == item->option.shortName)
+           return 0;
+    }
+
+    if (con->aliases == NULL || con->numAliases <= 0) /* XXX can't happen */
+       return 0;
+
+    /* Search newest-first.  A long name is matched on the long name alone:
+     * comparing the (empty) short name as well would reject every alias
+     * that defines both a long and a short name. */
+    for (i = con->numAliases - 1; i >= 0; i--) {
+       item = con->aliases + i;
+       if (longName != NULL) {
+           if (item->option.longName != NULL &&
+               !strcmp(longName, item->option.longName))
+               break;
+       } else if (shortName == item->option.shortName)
+           break;
+    }
+    if (i < 0) return 0;
+
+    if ((con->os - con->optionStack + 1) == POPT_OPTION_DEPTH)
+       return POPT_ERROR_OPTSTOODEEP;
+
+/*@-boundsread@*/
+    if (nextCharArg && *nextCharArg)
+       con->os->nextCharArg = nextCharArg;
+/*@=boundsread@*/
+
+    /* Push a fresh frame that reads from a copy of the alias expansion. */
+    con->os++;
+    con->os->next = 0;
+    con->os->stuffed = 0;
+    con->os->nextArg = NULL;
+    con->os->nextCharArg = NULL;
+    con->os->currAlias = con->aliases + i;
+    rc = poptDupArgv(con->os->currAlias->argc, con->os->currAlias->argv,
+               &con->os->argc, &con->os->argv);
+    con->os->argb = NULL;
+
+    return (rc ? rc : 1);
+}
+
+/*@-bounds -boundswrite @*/
+static int execCommand(poptContext con)
+       /*@globals internalState @*/
+       /*@modifies internalState @*/
+{  /* Build the argument vector for the pending exec item (con->doExec),
+     * try to drop privileges where the build provides a call for it, and
+     * execvp() the program.  Returns a POPT_ERROR_* code; control only
+     * comes back to the caller when setup or execvp() itself fails. */
+    poptItem item = con->doExec;
+    const char ** argv;
+    int argc = 0;
+#if defined(__hpux) || defined(HAVE_SETUID) || defined(HAVE_SETREUID)
+    int rc;
+#endif
+
+    if (item == NULL) /*XXX can't happen*/
+       return POPT_ERROR_NOARG;
+
+    if (item->argv == NULL || item->argc < 1 ||
+       (!con->execAbsolute && strchr(item->argv[0], '/')))
+           return POPT_ERROR_NOARG;    /* no command, or a path given while execAbsolute is off */
+
+    argv = malloc(sizeof(*argv) *
+                       (6 + item->argc + con->numLeftovers + con->finalArgvCount));
+    if (argv == NULL) return POPT_ERROR_MALLOC;        /* XXX can't happen */
+
+    if (!strchr(item->argv[0], '/') && con->execPath) {
+       char *s = alloca(strlen(con->execPath) + strlen(item->argv[0]) + sizeof("/"));
+       sprintf(s, "%s/%s", con->execPath, item->argv[0]);
+       argv[argc] = s;
+    } else {
+       argv[argc] = findProgramPath(item->argv[0]);
+    }
+    if (argv[argc++] == NULL) return POPT_ERROR_NOARG; /* NOTE(review): argv
+       does not appear to be released on this or the later error returns */
+
+    if (item->argc > 1) {      /* the exec item's own extra arguments */
+       memcpy(argv + argc, item->argv + 1, sizeof(*argv) * (item->argc - 1));
+       argc += (item->argc - 1);
+    }
+
+    if (con->finalArgv != NULL && con->finalArgvCount > 0) {   /* options recorded during parsing */
+       memcpy(argv + argc, con->finalArgv,
+               sizeof(*argv) * con->finalArgvCount);
+       argc += con->finalArgvCount;
+    }
+
+    if (con->leftovers != NULL && con->numLeftovers > 0) {     /* non-option arguments */
+#if 0
+       argv[argc++] = "--";
+#endif
+       memcpy(argv + argc, con->leftovers, sizeof(*argv) * con->numLeftovers);
+       argc += con->numLeftovers;
+    }
+
+    argv[argc] = NULL;
+
+#ifdef __hpux
+    rc = setresuid(getuid(), getuid(),-1);
+    if (rc) return POPT_ERROR_ERRNO;
+#else
+/*
+ * XXX " ... on BSD systems setuid() should be preferred over setreuid()"
+ * XXX         sez' Timur Bakeyev <mc@bat.ru>
+ * XXX from Norbert Warmuth <nwarmuth@privat.circular.de>
+ */
+#if defined(HAVE_SETUID)
+    rc = setuid(getuid());
+    if (rc) return POPT_ERROR_ERRNO;
+#elif defined (HAVE_SETREUID)
+    rc = setreuid(getuid(), getuid()); /*hlauer: not portable to hpux9.01 */
+    if (rc) return POPT_ERROR_ERRNO;
+#else
+    ; /* Can't drop privileges */
+#endif
+#endif
+
+    if (argv[0] == NULL)
+       return POPT_ERROR_NOARG;
+
+#ifdef MYDEBUG
+if (_popt_debug)
+    {  const char ** avp;
+       fprintf(stderr, "==> execvp(%s) argv[%d]:", argv[0], argc);
+       for (avp = argv; *avp; avp++)
+           fprintf(stderr, " '%s'", *avp);
+       fprintf(stderr, "\n");
+    }
+#endif
+
+    (void) execvp(argv[0], (char *const *)(intptr_t)argv);     /* returns only on failure */
+
+    return POPT_ERROR_ERRNO;
+}
+/*@=bounds =boundswrite @*/
+
+/*@-boundswrite@*/
+/*@observer@*/ /*@null@*/ static const struct poptOption *
+findOption(const struct poptOption * opt, /*@null@*/ const char * longName,
+               char shortName,
+               /*@null@*/ /*@out@*/ poptCallbackType * callback,
+               /*@null@*/ /*@out@*/ const void ** callbackData,
+               int singleDash)
+       /*@modifies *callback, *callbackData */
+{  /* Search an option table, recursing into POPT_ARG_INCLUDE_TABLE
+     * entries, for an entry matching longName or shortName.  Returns the
+     * entry, or NULL if the table's terminating entry is reached first.
+     * On a match in this table, *callback and *callbackData are reset to
+     * NULL and then filled in from the last POPT_ARG_CALLBACK entry seen
+     * before the match, if there was one. */
+    const struct poptOption * cb = NULL;
+
+    /* This happens when a single - is given */
+    if (singleDash && !shortName && (longName && *longName == '\0'))
+       shortName = '-';
+
+    for (; opt->longName || opt->shortName || opt->arg; opt++) {
+
+       if ((opt->argInfo & POPT_ARG_MASK) == POPT_ARG_INCLUDE_TABLE) {
+           const struct poptOption * opt2;
+
+           /* Recurse on included sub-tables. */
+           if (opt->arg == NULL) continue;     /* XXX program error */
+           opt2 = findOption(opt->arg, longName, shortName, callback,
+                             callbackData, singleDash);
+           if (opt2 == NULL) continue;
+           /* Sub-table data will be inherited if no data yet. */
+           if (!(callback && *callback)) return opt2;
+           if (!(callbackData && *callbackData == NULL)) return opt2;
+           /*@-observertrans -dependenttrans @*/
+           *callbackData = opt->descrip;
+           /*@=observertrans =dependenttrans @*/
+           return opt2;
+       } else if ((opt->argInfo & POPT_ARG_MASK) == POPT_ARG_CALLBACK) {
+           cb = opt;   /* remember the callback entry; it is not itself a match */
+       } else if (longName && opt->longName &&
+                  (!singleDash || (opt->argInfo & POPT_ARGFLAG_ONEDASH)) &&
+               /*@-nullpass@*/         /* LCL: opt->longName != NULL */
+                  !strcmp(longName, opt->longName))
+               /*@=nullpass@*/
+       {
+           break;
+       } else if (shortName && shortName == opt->shortName) {
+           break;
+       }
+    }
+
+    if (!opt->longName && !opt->shortName)     /* stopped on an entry with no name: no match */
+       return NULL;
+    /*@-modobserver -mods @*/
+    if (callback) *callback = NULL;
+    if (callbackData) *callbackData = NULL;
+    if (cb) {
+       if (callback)
+       /*@-castfcnptr@*/
+           *callback = (poptCallbackType)cb->arg;
+       /*@=castfcnptr@*/
+       if (!(cb->argInfo & POPT_CBFLAG_INC_DATA)) {    /* with INC_DATA set, callbackData stays NULL here */
+           if (callbackData)
+               /*@-observertrans@*/    /* FIX: typedef double indirection. */
+               *callbackData = cb->descrip;
+               /*@=observertrans@*/
+       }
+    }
+    /*@=modobserver =mods @*/
+
+    return opt;
+}
+/*@=boundswrite@*/
+
+/**
+ * Locate the argx'th upcoming non-option argument on the option stack.
+ *
+ * Starting with the current stack frame and moving toward the base frame,
+ * scans the unread part of each frame's argv for entries that do not start
+ * with '-' and are not already marked in the frame's argb bitmap.
+ *
+ * @param con         parsing context
+ * @param argx        1-based count of the wanted argument
+ * @param delete_arg  non-zero to mark the returned argument in argb so that
+ *                    later scans skip it
+ * @return            the argument, or NULL if none is available
+ */
+static const char * findNextArg(/*@special@*/ poptContext con,
+               unsigned argx, int delete_arg)
+       /*@uses con->optionStack, con->os,
+               con->os->next, con->os->argb, con->os->argc, con->os->argv @*/
+       /*@modifies con @*/
+{
+    struct optionStackEntry * os = con->os;
+    const char * arg;
+
+    do {
+       int i;
+       arg = NULL;
+       /* Skip frames that have nothing left to read. */
+       while (os->next == os->argc && os > con->optionStack) os--;
+       if (os->next == os->argc && os == con->optionStack) break;
+       if (os->argv != NULL)
+       for (i = os->next; i < os->argc; i++) {
+           /*@-sizeoftype@*/
+           if (os->argb && PBM_ISSET(i, os->argb))
+               /*@innercontinue@*/ continue;
+           if (*os->argv[i] == '-')
+               /*@innercontinue@*/ continue;
+           if (--argx > 0)
+               /*@innercontinue@*/ continue;
+           arg = os->argv[i];
+           if (delete_arg) {
+               if (os->argb == NULL) os->argb = PBM_ALLOC(os->argc);
+               if (os->argb != NULL)   /* XXX can't happen */
+               PBM_SET(i, os->argb);
+           }
+           /*@innerbreak@*/ break;
+           /*@=sizeoftype@*/
+       }
+       /* The base frame is the last place to look.  Stop here: rescanning
+        * it when it holds only options would never terminate. */
+       if (os == con->optionStack) break;
+       os--;
+    } while (arg == NULL);
+    return arg;
+}
+
+/*@-boundswrite@*/
+/**
+ * Copy an argument string, replacing each "!#:+" marker with the next
+ * non-option argument taken from the option stack.
+ *
+ * The substitute is fetched (and marked consumed) through findNextArg() the
+ * first time a marker is met and is reused for any later marker.  If no
+ * argument is available the marker is copied through unchanged.
+ *
+ * @param con  parsing context whose option stack supplies the substitute
+ * @param s    string to expand
+ * @return     newly allocated expanded string, or NULL if memory runs out
+ */
+static /*@only@*/ /*@null@*/ const char *
+expandNextArg(/*@special@*/ poptContext con, const char * s)
+       /*@uses con->optionStack, con->os,
+               con->os->next, con->os->argb, con->os->argc, con->os->argv @*/
+       /*@modifies con @*/
+{
+    const char * a = NULL;
+    size_t alen;
+    char *t, *te;
+    char *resized;
+    size_t used;
+    size_t tn = strlen(s) + 1;
+    char c;
+
+    te = t = malloc(tn);
+    if (t == NULL) return NULL;                /* XXX can't happen */
+    while ((c = *s++) != '\0') {
+       switch (c) {
+#if 0  /* XXX can't do this */
+       case '\\':      /* escape */
+           c = *s++;
+           /*@switchbreak@*/ break;
+#endif
+       case '!':
+           if (!(s[0] == '#' && s[1] == ':' && s[2] == '+'))
+               /*@switchbreak@*/ break;
+           /* XXX Make sure that findNextArg deletes only next arg. */
+           if (a == NULL) {
+               if ((a = findNextArg(con, 1, 1)) == NULL)
+                   /*@switchbreak@*/ break;
+           }
+           s += 3;
+
+           /* Make room for the substitute.  Keep the write offset across
+            * realloc() and give up cleanly if it fails, instead of writing
+            * through a NULL pointer. */
+           alen = strlen(a);
+           tn += alen;
+           used = (size_t)(te - t);
+           resized = realloc(t, tn);
+           if (resized == NULL) {
+               free(t);
+               return NULL;
+           }
+           t = resized;
+           te = t + used;
+           strncpy(te, a, alen); te += alen;
+           continue;
+           /*@notreached@*/ /*@switchbreak@*/ break;
+       default:
+           /*@switchbreak@*/ break;
+       }
+       *te++ = c;
+    }
+    *te = '\0';
+    /* Trim to the final length; if that fails the larger buffer is still
+     * valid, so keep it rather than losing the result. */
+    resized = realloc(t, strlen(t) + 1);
+    if (resized != NULL)
+       t = resized;
+    return t;  /* XXX memory leak, hard to plug */
+}
+/*@=boundswrite@*/
+
+static void poptStripArg(/*@special@*/ poptContext con, int which)
+       /*@uses con->arg_strip, con->optionStack @*/
+       /*@defines con->arg_strip @*/
+       /*@modifies con @*/
+{
+    /* Flag argument number `which` in the strip bitmap.  The bitmap is
+     * created on first use, sized from the base frame's argc. */
+    /*@-sizeoftype@*/
+    if (con->arg_strip == NULL)
+       con->arg_strip = PBM_ALLOC(con->optionStack[0].argc);
+
+    if (con->arg_strip == NULL) {      /* XXX can't happen */
+       /*@-compdef@*/ /* LCL: con->arg_strip undefined? */
+       return;
+       /*@=compdef@*/
+    }
+
+    PBM_SET(which, con->arg_strip);
+    /*@=sizeoftype@*/
+}
+
+int poptSaveLong(long * arg, int argInfo, long aLong)
+{
+    /* Store aLong into *arg, optionally complemented (POPT_ARGFLAG_NOT)
+     * and combined with the old value by OR / AND / XOR.  Returns 0, or
+     * POPT_ERROR_NULLARG / POPT_ERROR_BADOPERATION. */
+    const unsigned int op = (unsigned int)(argInfo & POPT_ARGFLAG_LOGICALOPS);
+    const long value = (argInfo & POPT_ARGFLAG_NOT) ? ~aLong : aLong;
+
+    /* XXX Check alignment, may fail on funky platforms. */
+    if (arg == NULL || (((unsigned long)arg) & (sizeof(*arg)-1)))
+       return POPT_ERROR_NULLARG;
+
+    if (op == 0)
+       *arg = value;
+    else if (op == POPT_ARGFLAG_OR)
+       *arg |= value;
+    else if (op == POPT_ARGFLAG_AND)
+       *arg &= value;
+    else if (op == POPT_ARGFLAG_XOR)
+       *arg ^= value;
+    else
+       return POPT_ERROR_BADOPERATION;
+
+    return 0;
+}
+
+int poptSaveInt(/*@null@*/ int * arg, int argInfo, long aLong)
+{
+    /* Store aLong into the int at *arg, optionally complemented
+     * (POPT_ARGFLAG_NOT) and combined with the old value by OR / AND / XOR.
+     * Returns 0, or POPT_ERROR_NULLARG / POPT_ERROR_BADOPERATION. */
+    const unsigned int op = (unsigned int)(argInfo & POPT_ARGFLAG_LOGICALOPS);
+    const long value = (argInfo & POPT_ARGFLAG_NOT) ? ~aLong : aLong;
+
+    /* XXX Check alignment, may fail on funky platforms. */
+    if (arg == NULL || (((unsigned long)arg) & (sizeof(*arg)-1)))
+       return POPT_ERROR_NULLARG;
+
+    if (op == 0)
+       *arg = value;
+    else if (op == POPT_ARGFLAG_OR)
+       *arg |= value;
+    else if (op == POPT_ARGFLAG_AND)
+       *arg &= value;
+    else if (op == POPT_ARGFLAG_XOR)
+       *arg ^= value;
+    else
+       return POPT_ERROR_BADOPERATION;
+
+    return 0;
+}
+
+/*@-boundswrite@*/
+/* returns 'val' element, -1 on last item, POPT_ERROR_* on error */
+int poptGetNextOpt(poptContext con)
+{  /* Parse forward until an option that must be reported to the caller is
+     * found.  Returns that option's val, -1 when the arguments are
+     * exhausted (or con is NULL), 0 for a non-option argument when
+     * POPT_CONTEXT_ARG_OPTS is set, or a POPT_ERROR_* code.  If an exec
+     * item is pending when the arguments run out, execCommand() is called
+     * and its result is returned instead of -1. */
+    const struct poptOption * opt = NULL;
+    int done = 0;
+
+    if (con == NULL)
+       return -1;
+    while (!done) {
+       const char * origOptString = NULL;
+       poptCallbackType cb = NULL;
+       const void * cbData = NULL;
+       const char * longArg = NULL;
+       int canstrip = 0;
+       int shorty = 0;
+
+       while (!con->os->nextCharArg && con->os->next == con->os->argc
+               && con->os > con->optionStack) {
+           cleanOSE(con->os--);        /* finished alias / stuffed frame: pop it */
+       }
+       if (!con->os->nextCharArg && con->os->next == con->os->argc) {  /* base frame exhausted too */
+           /*@-internalglobs@*/
+           invokeCallbacksPOST(con, con->options);
+           /*@=internalglobs@*/
+           if (con->doExec) return execCommand(con);
+           return -1;
+       }
+
+       /* Process next long option */
+       if (!con->os->nextCharArg) {
+           char * localOptString, * optString;
+           int thisopt;
+
+           /*@-sizeoftype@*/
+           if (con->os->argb && PBM_ISSET(con->os->next, con->os->argb)) {     /* already consumed via findNextArg */
+               con->os->next++;
+               continue;
+           }
+           /*@=sizeoftype@*/
+           thisopt = con->os->next;
+           if (con->os->argv != NULL)  /* XXX can't happen */
+           origOptString = con->os->argv[con->os->next++];
+
+           if (origOptString == NULL)  /* XXX can't happen */
+               return POPT_ERROR_BADOPT;
+
+           if (con->restLeftover || *origOptString != '-') {   /* non-option argument */
+               if (con->flags & POPT_CONTEXT_POSIXMEHARDER)
+                   con->restLeftover = 1;
+               if (con->flags & POPT_CONTEXT_ARG_OPTS) {
+                   con->os->nextArg = xstrdup(origOptString);
+                   return 0;
+               }
+               if (con->leftovers != NULL)     /* XXX can't happen */
+                   con->leftovers[con->numLeftovers++] = origOptString;
+               continue;
+           }
+
+           /* Make a copy we can hack at */
+           localOptString = optString =
+               strcpy(alloca(strlen(origOptString) + 1), origOptString);
+
+           if (optString[0] == '\0')
+               return POPT_ERROR_BADOPT;
+
+           if (optString[1] == '-' && !optString[2]) { /* bare "--": everything after it is a leftover */
+               con->restLeftover = 1;
+               continue;
+           } else {
+               char *oe;
+               int singleDash;
+
+               optString++;
+               if (*optString == '-')
+                   singleDash = 0, optString++;
+               else
+                   singleDash = 1;
+
+               /* XXX aliases with arg substitution need "--alias=arg" */
+               if (handleAlias(con, optString, '\0', NULL))
+                   continue;
+
+               if (handleExec(con, optString, '\0'))
+                   continue;
+
+               /* Check for "--long=arg" option. */
+               for (oe = optString; *oe && *oe != '='; oe++)
+                   {};
+               if (*oe == '=') {
+                   *oe++ = '\0';
+                   /* XXX longArg is mapped back to persistent storage. */
+                   longArg = origOptString + (oe - localOptString);
+               }
+
+               opt = findOption(con->options, optString, '\0', &cb, &cbData,
+                                singleDash);
+               if (!opt && !singleDash)
+                   return POPT_ERROR_BADOPT;
+           }
+
+           if (!opt) {
+               con->os->nextCharArg = origOptString + 1;       /* no -longname match: parse as bundled short options */
+           } else {
+               if (con->os == con->optionStack &&
+                  opt->argInfo & POPT_ARGFLAG_STRIP)
+               {
+                   canstrip = 1;
+                   poptStripArg(con, thisopt);
+               }
+               shorty = 0;
+           }
+       }
+
+       /* Process next short option */
+       /*@-branchstate@*/              /* FIX: W2DO? */
+       if (con->os->nextCharArg) {
+           origOptString = con->os->nextCharArg;
+
+           con->os->nextCharArg = NULL;
+
+           if (handleAlias(con, NULL, *origOptString, origOptString + 1))
+               continue;
+
+           if (handleExec(con, NULL, *origOptString)) {
+               /* Restore rest of short options for further processing */
+               origOptString++;
+               if (*origOptString != '\0')
+                   con->os->nextCharArg = origOptString;
+               continue;
+           }
+
+           opt = findOption(con->options, NULL, *origOptString, &cb,
+                            &cbData, 0);
+           if (!opt)
+               return POPT_ERROR_BADOPT;
+           shorty = 1;
+
+           origOptString++;
+           if (*origOptString != '\0')
+               con->os->nextCharArg = origOptString;
+       }
+       /*@=branchstate@*/
+
+       if (opt == NULL) return POPT_ERROR_BADOPT;      /* XXX can't happen */
+       if (opt->arg && (opt->argInfo & POPT_ARG_MASK) == POPT_ARG_NONE) {
+           if (poptSaveInt((int *)opt->arg, opt->argInfo, 1L))
+               return POPT_ERROR_BADOPERATION;
+       } else if ((opt->argInfo & POPT_ARG_MASK) == POPT_ARG_VAL) {
+           if (opt->arg) {
+               if (poptSaveInt((int *)opt->arg, opt->argInfo, (long)opt->val))
+                   return POPT_ERROR_BADOPERATION;
+           }
+       } else if ((opt->argInfo & POPT_ARG_MASK) != POPT_ARG_NONE) {   /* option takes an argument */
+           con->os->nextArg = _free(con->os->nextArg);
+           /*@-usedef@*/       /* FIX: W2DO? */
+           if (longArg) {
+           /*@=usedef@*/
+               longArg = expandNextArg(con, longArg);
+               con->os->nextArg = longArg;
+           } else if (con->os->nextCharArg) {  /* rest of the short-option bundle is the argument */
+               longArg = expandNextArg(con, con->os->nextCharArg);
+               con->os->nextArg = longArg;
+               con->os->nextCharArg = NULL;
+           } else {
+               while (con->os->next == con->os->argc &&
+                      con->os > con->optionStack) {
+                   cleanOSE(con->os--);
+               }
+               if (con->os->next == con->os->argc) {
+                   if (!(opt->argInfo & POPT_ARGFLAG_OPTIONAL))
+                       /*@-compdef@*/  /* FIX: con->os->argv not defined */
+                       return POPT_ERROR_NOARG;
+                       /*@=compdef@*/
+                   con->os->nextArg = NULL;
+               } else {
+
+                   /*
+                    * Make sure this isn't part of a short arg or the
+                    * result of an alias expansion.
+                    */
+                   if (con->os == con->optionStack &&
+                       (opt->argInfo & POPT_ARGFLAG_STRIP) &&
+                       canstrip) {
+                       poptStripArg(con, con->os->next);
+                   }
+               
+                   if (con->os->argv != NULL) {        /* XXX can't happen */
+                       /* XXX watchout: subtle side-effects live here. */
+                       longArg = con->os->argv[con->os->next++];
+                       longArg = expandNextArg(con, longArg);
+                       con->os->nextArg = longArg;
+                   }
+               }
+           }
+           longArg = NULL;
+
+           if (opt->arg) {
+               switch (opt->argInfo & POPT_ARG_MASK) {
+               case POPT_ARG_STRING:
+                   /* XXX memory leak, hard to plug */
+                   *((const char **) opt->arg) = (con->os->nextArg)
+                       ? xstrdup(con->os->nextArg) : NULL;
+                   /*@switchbreak@*/ break;
+
+               case POPT_ARG_INT:
+               case POPT_ARG_LONG:
+               {   long aLong = 0;
+                   char *end;
+
+                   if (con->os->nextArg) {
+                       aLong = strtol(con->os->nextArg, &end, 0);
+                       if (!(end && *end == '\0'))
+                           return POPT_ERROR_BADNUMBER;
+                   }
+
+                   if ((opt->argInfo & POPT_ARG_MASK) == POPT_ARG_LONG) {
+                       if (aLong == LONG_MIN || aLong == LONG_MAX)     /* NOTE(review): also rejects these exact values when typed in */
+                           return POPT_ERROR_OVERFLOW;
+                       if (poptSaveLong((long *)opt->arg, opt->argInfo, aLong))
+                           return POPT_ERROR_BADOPERATION;
+                   } else {
+                       if (aLong > INT_MAX || aLong < INT_MIN)
+                           return POPT_ERROR_OVERFLOW;
+                       if (poptSaveInt((int *)opt->arg, opt->argInfo, aLong))
+                           return POPT_ERROR_BADOPERATION;
+                   }
+               }   /*@switchbreak@*/ break;
+
+               case POPT_ARG_FLOAT:
+               case POPT_ARG_DOUBLE:
+               {   double aDouble = 0.0;
+                   char *end;
+
+                   if (con->os->nextArg) {
+                       /*@-mods@*/
+                       int saveerrno = errno;
+                       errno = 0;
+                       aDouble = strtod(con->os->nextArg, &end);
+                       if (errno == ERANGE)
+                           return POPT_ERROR_OVERFLOW; /* NOTE(review): errno is not restored on this path */
+                       errno = saveerrno;
+                       /*@=mods@*/
+                       if (*end != '\0')
+                           return POPT_ERROR_BADNUMBER;
+                   }
+
+                   if ((opt->argInfo & POPT_ARG_MASK) == POPT_ARG_DOUBLE) {
+                       *((double *) opt->arg) = aDouble;
+                   } else {
+#define _ABS(a)        ((((a) - 0.0) < DBL_EPSILON) ? -(a) : (a))
+                       if ((_ABS(aDouble) - FLT_MAX) > DBL_EPSILON)
+                           return POPT_ERROR_OVERFLOW;
+                       if ((FLT_MIN - _ABS(aDouble)) > DBL_EPSILON)
+                           return POPT_ERROR_OVERFLOW;
+                       *((float *) opt->arg) = aDouble;
+                   }
+               }   /*@switchbreak@*/ break;
+               default:
+                   fprintf(stdout,
+                       POPT_("option type (%d) not implemented in popt\n"),
+                       (opt->argInfo & POPT_ARG_MASK));
+                   exit(EXIT_FAILURE);
+                   /*@notreached@*/ /*@switchbreak@*/ break;
+               }
+           }
+       }
+
+       if (cb) {
+           /*@-internalglobs@*/
+           invokeCallbacksOPTION(con, con->options, opt, cbData, shorty);
+           /*@=internalglobs@*/
+       } else if (opt->val && ((opt->argInfo & POPT_ARG_MASK) != POPT_ARG_VAL))
+           done = 1;   /* no callback and a non-zero val: report this option to the caller */
+
+       if ((con->finalArgvCount + 2) >= (con->finalArgvAlloced)) {     /* room for the option and its argument */
+           con->finalArgvAlloced += 10;
+           con->finalArgv = realloc(con->finalArgv,
+                           sizeof(*con->finalArgv) * con->finalArgvAlloced);
+       }
+
+       if (con->finalArgv != NULL)
+       {   char *s = malloc((opt->longName ? strlen(opt->longName) : 0) + 3);
+           if (s != NULL) {    /* XXX can't happen */
+               if (opt->longName)
+                   sprintf(s, "%s%s",
+                       ((opt->argInfo & POPT_ARGFLAG_ONEDASH) ? "-" : "--"),
+                       opt->longName);
+               else
+                   sprintf(s, "-%c", opt->shortName);
+               con->finalArgv[con->finalArgvCount++] = s;
+           } else
+               con->finalArgv[con->finalArgvCount++] = NULL;
+       }
+
+       if (opt->arg && (opt->argInfo & POPT_ARG_MASK) == POPT_ARG_NONE)
+           /*@-ifempty@*/ ; /*@=ifempty@*/
+       else if ((opt->argInfo & POPT_ARG_MASK) == POPT_ARG_VAL)
+           /*@-ifempty@*/ ; /*@=ifempty@*/
+       else if ((opt->argInfo & POPT_ARG_MASK) != POPT_ARG_NONE) {
+           if (con->finalArgv != NULL && con->os->nextArg)
+               con->finalArgv[con->finalArgvCount++] =
+                       /*@-nullpass@*/ /* LCL: con->os->nextArg != NULL */
+                       xstrdup(con->os->nextArg);
+                       /*@=nullpass@*/
+       }
+    }
+
+    return (opt ? opt->val : -1);      /* XXX can't happen */
+}
+/*@=boundswrite@*/
+
+const char * poptGetOptArg(poptContext con)
+{
+    /* Hand the pending option argument to the caller and clear the
+     * context's pointer to it.  NULL context gives NULL. */
+    const char * pending;
+
+    if (con == NULL)
+       return NULL;
+
+    /*@-branchstate@*/
+    pending = con->os->nextArg;
+    con->os->nextArg = NULL;
+    /*@=branchstate@*/
+    return pending;
+}
+
+const char * poptGetArg(poptContext con)
+{
+    /* Return the next unread leftover argument and advance past it;
+     * NULL when there is no context, no leftovers array, or none left. */
+    if (con == NULL || con->leftovers == NULL)
+       return NULL;
+    if (con->nextLeftover >= con->numLeftovers)
+       return NULL;
+    return con->leftovers[con->nextLeftover++];
+}
+
+const char * poptPeekArg(poptContext con)
+{
+    /* Return the next unread leftover argument without consuming it;
+     * NULL when there is no context, no leftovers array, or none left. */
+    if (con == NULL || con->leftovers == NULL)
+       return NULL;
+    if (con->nextLeftover >= con->numLeftovers)
+       return NULL;
+    return con->leftovers[con->nextLeftover];
+}
+
+/*@-boundswrite@*/
+const char ** poptGetArgs(poptContext con)
+{
+    /* Return the unread leftover arguments as a NULL-terminated vector,
+     * or NULL when there are none. */
+    if (con == NULL || con->leftovers == NULL)
+       return NULL;
+    if (con->nextLeftover == con->numLeftovers)
+       return NULL;
+
+    /* some apps like [like RPM ;-) ] need this NULL terminated */
+    con->leftovers[con->numLeftovers] = NULL;
+
+    /*@-nullret -nullstate @*/ /* FIX: typedef double indirection. */
+    return &con->leftovers[con->nextLeftover];
+    /*@=nullret =nullstate @*/
+}
+/*@=boundswrite@*/
+
+poptContext poptFreeContext(poptContext con)
+{
+    /* Release a context and everything it holds.  Returns the value
+     * produced by the final _free(), or con itself when con is NULL. */
+    poptItem cur;
+    int left;
+
+    if (con == NULL)
+       return con;
+
+    poptResetContext(con);
+    con->os->argb = _free(con->os->argb);
+
+    /* Aliases: the name and description strings were duplicated by
+     * poptAddItem(); the argv pointer was stored as given. */
+    if (con->aliases != NULL) {
+       cur = con->aliases;
+       for (left = con->numAliases; left > 0; left--, cur++) {
+           /*@-modobserver -observertrans -dependenttrans@*/
+           cur->option.longName = _free(cur->option.longName);
+           cur->option.descrip = _free(cur->option.descrip);
+           cur->option.argDescrip = _free(cur->option.argDescrip);
+           /*@=modobserver =observertrans =dependenttrans@*/
+           cur->argv = _free(cur->argv);
+       }
+    }
+    con->aliases = _free(con->aliases);
+
+    /* Exec items are laid out and released the same way. */
+    if (con->execs != NULL) {
+       cur = con->execs;
+       for (left = con->numExecs; left > 0; left--, cur++) {
+           /*@-modobserver -observertrans -dependenttrans@*/
+           cur->option.longName = _free(cur->option.longName);
+           cur->option.descrip = _free(cur->option.descrip);
+           cur->option.argDescrip = _free(cur->option.argDescrip);
+           /*@=modobserver =observertrans =dependenttrans@*/
+           cur->argv = _free(cur->argv);
+       }
+    }
+    con->execs = _free(con->execs);
+
+    con->leftovers = _free(con->leftovers);
+    con->finalArgv = _free(con->finalArgv);
+    con->appName = _free(con->appName);
+    con->otherHelp = _free(con->otherHelp);
+    con->execPath = _free(con->execPath);
+    con->arg_strip = PBM_FREE(con->arg_strip);
+
+    con = _free(con);
+    return con;
+}
+
+int poptAddAlias(poptContext con, struct poptAlias alias,
+               /*@unused@*/ int flags)
+{
+    /* Wrap the alias description in a temporary, stack-allocated item and
+     * register it through poptAddItem() as an alias (flags value 0). */
+    poptItem tmp = alloca(sizeof(*tmp));
+
+    memset(tmp, 0, sizeof(*tmp));
+
+    /* What the alias expands to. */
+    tmp->argc = alias.argc;
+    tmp->argv = alias.argv;
+
+    /* How the alias is spelled; it is hidden from help output. */
+    tmp->option.longName = alias.longName;
+    tmp->option.shortName = alias.shortName;
+    tmp->option.argInfo = POPT_ARGFLAG_DOC_HIDDEN;
+    tmp->option.arg = 0;
+    tmp->option.val = 0;
+    tmp->option.descrip = NULL;
+    tmp->option.argDescrip = NULL;
+
+    return poptAddItem(con, tmp, 0);
+}
+
+/*@-boundswrite@*/
+/*@-mustmod@*/ /* LCL: con not modified? */
+/**
+ * Append an alias or exec item to a context.
+ *
+ * The long name and both description strings are duplicated; argc and the
+ * argv pointer are stored as given.
+ *
+ * @param con      context to extend
+ * @param newItem  item to copy
+ * @param flags    0 to add an alias, 1 to add an exec item
+ * @return         0 on success, 1 on an unknown flags value or when the
+ *                 table cannot be grown (the existing table is left intact)
+ */
+int poptAddItem(poptContext con, poptItem newItem, int flags)
+{
+    poptItem * items, item;
+    int * nitems;
+    void * grown;
+
+    switch (flags) {
+    case 1:
+       items = &con->execs;
+       nitems = &con->numExecs;
+       break;
+    case 0:
+       items = &con->aliases;
+       nitems = &con->numAliases;
+       break;
+    default:
+       return 1;
+       /*@notreached@*/ break;
+    }
+
+    /* Grow through a temporary: assigning realloc()'s result straight to
+     * *items would lose (and leak) the existing table on failure while the
+     * item count stayed non-zero. */
+    grown = realloc((*items), ((*nitems) + 1) * sizeof(**items));
+    if (grown == NULL)
+       return 1;
+    *items = grown;
+
+    item = (*items) + (*nitems);
+
+    item->option.longName =
+       (newItem->option.longName ? xstrdup(newItem->option.longName) : NULL);
+    item->option.shortName = newItem->option.shortName;
+    item->option.argInfo = newItem->option.argInfo;
+    item->option.arg = newItem->option.arg;
+    item->option.val = newItem->option.val;
+    item->option.descrip =
+       (newItem->option.descrip ? xstrdup(newItem->option.descrip) : NULL);
+    item->option.argDescrip =
+       (newItem->option.argDescrip ? xstrdup(newItem->option.argDescrip) : NULL);
+    item->argc = newItem->argc;
+    item->argv = newItem->argv;
+
+    (*nitems)++;
+
+    return 0;
+}
+/*@=mustmod@*/
+/*@=boundswrite@*/
+
+/**
+ * Return the argument that was read most recently, for error reporting.
+ *
+ * @param con    parsing context (NULL gives NULL)
+ * @param flags  POPT_BADOPTION_NOALIAS to look at the base option-stack
+ *               frame rather than the current one
+ * @return       the argument just before the frame's read position, or
+ *               NULL if the frame has no argv or nothing has been read yet
+ */
+const char * poptBadOption(poptContext con, int flags)
+{
+    struct optionStackEntry * os = NULL;
+
+    if (con != NULL)
+       os = (flags & POPT_BADOPTION_NOALIAS) ? con->optionStack : con->os;
+
+    /* A read position of 0 means nothing has been consumed from this frame;
+     * indexing argv[next - 1] would read before the start of the array. */
+    if (os == NULL || os->argv == NULL || os->next < 1)
+       return NULL;
+
+    /*@-nullderef@*/   /* LCL: os->argv != NULL */
+    return os->argv[os->next - 1];
+    /*@=nullderef@*/
+}
+
+const char *poptStrerror(const int error)
+{
+    /* Translate a POPT_ERROR_* code into a message string.
+     * POPT_ERROR_ERRNO defers to strerror(errno); unrecognised codes get
+     * a generic message. */
+    if (error == POPT_ERROR_NOARG)
+       return POPT_("missing argument");
+    if (error == POPT_ERROR_BADOPT)
+       return POPT_("unknown option");
+    if (error == POPT_ERROR_BADOPERATION)
+       return POPT_("mutually exclusive logical operations requested");
+    if (error == POPT_ERROR_NULLARG)
+       return POPT_("opt->arg should not be NULL");
+    if (error == POPT_ERROR_OPTSTOODEEP)
+       return POPT_("aliases nested too deeply");
+    if (error == POPT_ERROR_BADQUOTE)
+       return POPT_("error in parameter quoting");
+    if (error == POPT_ERROR_BADNUMBER)
+       return POPT_("invalid numeric value");
+    if (error == POPT_ERROR_OVERFLOW)
+       return POPT_("number too large or too small");
+    if (error == POPT_ERROR_MALLOC)
+       return POPT_("memory allocation failed");
+    if (error == POPT_ERROR_ERRNO)
+       return strerror(errno);
+
+    return POPT_("unknown error");
+}
+
+/**
+ * Push a NULL-terminated argument vector onto the option stack so that it
+ * is parsed before the remaining arguments of the current frame.
+ *
+ * @param con   parsing context
+ * @param argv  NULL-terminated vector; it is copied with poptDupArgv()
+ * @return      POPT_ERROR_OPTSTOODEEP if the stack is full, otherwise the
+ *              poptDupArgv() result
+ */
+int poptStuffArgs(poptContext con, const char ** argv)
+{
+    int argc;
+    int rc;
+
+    /* Use the same limit handleAlias() enforces before pushing a frame.
+     * The old test (depth == POPT_OPTION_DEPTH) let one more frame through.
+     * NOTE(review): assumes optionStack holds POPT_OPTION_DEPTH entries;
+     * its declaration is not visible here -- confirm. */
+    if ((con->os - con->optionStack + 1) >= POPT_OPTION_DEPTH)
+       return POPT_ERROR_OPTSTOODEEP;
+
+    for (argc = 0; argv[argc]; argc++)
+       {};
+
+    con->os++;
+    con->os->next = 0;
+    con->os->nextArg = NULL;
+    con->os->nextCharArg = NULL;
+    con->os->currAlias = NULL;
+    rc = poptDupArgv(argc, argv, &con->os->argc, &con->os->argv);
+    con->os->argb = NULL;
+    con->os->stuffed = 1;
+
+    return rc;
+}
+
+const char * poptGetInvocationName(poptContext con)
+{
+    /* argv[0] of the current option-stack frame, or "" if it has no argv. */
+    if (con->os->argv == NULL)
+       return "";
+    return con->os->argv[0];
+}
+
+/*@-boundswrite@*/
+int poptStrippedArgv(poptContext con, int argc, char ** argv)
+{
+    /* Compact argv in place, dropping every argument flagged in the
+     * context's strip bitmap, and return the new argument count.
+     * argv[0] is never touched. */
+    int kept = argc;
+    int dst = 1;
+    int src;
+
+    /*@-sizeoftype@*/
+    /* First pass: work out how many arguments survive. */
+    if (con->arg_strip != NULL) {
+       for (src = 1; src < argc; src++) {
+           if (PBM_ISSET(src, con->arg_strip))
+               kept--;
+       }
+    }
+
+    /* Second pass: slide the survivors down over the stripped slots. */
+    for (src = 1; src < argc; src++) {
+       if (con->arg_strip != NULL && PBM_ISSET(src, con->arg_strip))
+           continue;
+       if (dst < kept)
+           argv[dst] = argv[src];
+       else
+           argv[dst] = NULL;
+       dst++;
+    }
+    /*@=sizeoftype@*/
+
+    return kept;
+}
+/*@=boundswrite@*/
diff --git a/ctdb/lib/popt/popt.h b/ctdb/lib/popt/popt.h
new file mode 100644 (file)
index 0000000..04c9f65
--- /dev/null
@@ -0,0 +1,541 @@
+/** \file popt/popt.h
+ * \ingroup popt
+ */
+
+/* (C) 1998-2000 Red Hat, Inc. -- Licensing details are in the COPYING
+   file accompanying popt source distributions, available from 
+   ftp://ftp.rpm.org/pub/rpm/dist. */
+
+#ifndef H_POPT
+#define H_POPT
+
+#include <stdio.h>                     /* for FILE * */
+
+#define POPT_OPTION_DEPTH      10
+
+/** \ingroup popt
+ * \name Arg type identifiers
+ */
+/*@{*/
+#define POPT_ARG_NONE          0       /*!< no arg */
+#define POPT_ARG_STRING                1       /*!< arg will be saved as string */
+#define POPT_ARG_INT           2       /*!< arg will be converted to int */
+#define POPT_ARG_LONG          3       /*!< arg will be converted to long */
+#define POPT_ARG_INCLUDE_TABLE 4       /*!< arg points to table */
+#define POPT_ARG_CALLBACK      5       /*!< table-wide callback... must be
+                                          set first in table; arg points 
+                                          to callback, descrip points to 
+                                          callback data to pass */
+#define POPT_ARG_INTL_DOMAIN    6       /*!< set the translation domain
+                                          for this table and any
+                                          included tables; arg points
+                                          to the domain string */
+#define POPT_ARG_VAL           7       /*!< arg should take value val */
+#define        POPT_ARG_FLOAT          8       /*!< arg will be converted to float */
+#define        POPT_ARG_DOUBLE         9       /*!< arg will be converted to double */
+
+#define POPT_ARG_MASK          0x0000FFFF
+/*@}*/
+
+/** \ingroup popt
+ * \name Arg modifiers
+ */
+/*@{*/
+#define POPT_ARGFLAG_ONEDASH   0x80000000  /*!< allow -longoption */
+#define POPT_ARGFLAG_DOC_HIDDEN 0x40000000  /*!< don't show in help/usage */
+#define POPT_ARGFLAG_STRIP     0x20000000  /*!< strip this arg from argv(only applies to long args) */
+#define        POPT_ARGFLAG_OPTIONAL   0x10000000  /*!< arg may be missing */
+
+#define        POPT_ARGFLAG_OR         0x08000000  /*!< arg will be or'ed */
+#define        POPT_ARGFLAG_NOR        0x09000000  /*!< arg will be nor'ed */
+#define        POPT_ARGFLAG_AND        0x04000000  /*!< arg will be and'ed */
+#define        POPT_ARGFLAG_NAND       0x05000000  /*!< arg will be nand'ed */
+#define        POPT_ARGFLAG_XOR        0x02000000  /*!< arg will be xor'ed */
+#define        POPT_ARGFLAG_NOT        0x01000000  /*!< arg will be negated */
+#define POPT_ARGFLAG_LOGICALOPS \
+        (POPT_ARGFLAG_OR|POPT_ARGFLAG_AND|POPT_ARGFLAG_XOR)
+
+#define        POPT_BIT_SET    (POPT_ARG_VAL|POPT_ARGFLAG_OR)
+                                       /*!< set arg bit(s) */
+#define        POPT_BIT_CLR    (POPT_ARG_VAL|POPT_ARGFLAG_NAND)
+                                       /*!< clear arg bit(s) */
+
+#define        POPT_ARGFLAG_SHOW_DEFAULT 0x00800000 /*!< show default value in --help */
+
+/*@}*/
+
+/** \ingroup popt
+ * \name Callback modifiers
+ */
+/*@{*/
+#define POPT_CBFLAG_PRE                0x80000000  /*!< call the callback before parse */
+#define POPT_CBFLAG_POST       0x40000000  /*!< call the callback after parse */
+#define POPT_CBFLAG_INC_DATA   0x20000000  /*!< use data from the include line,
+                                              not the subtable */
+#define POPT_CBFLAG_SKIPOPTION 0x10000000  /*!< don't callback with option */
+#define POPT_CBFLAG_CONTINUE   0x08000000  /*!< continue callbacks with option */
+/*@}*/
+
+/** \ingroup popt
+ * \name Error return values
+ */
+/*@{*/
+#define POPT_ERROR_NOARG       -10     /*!< missing argument */
+#define POPT_ERROR_BADOPT      -11     /*!< unknown option */
+#define POPT_ERROR_OPTSTOODEEP -13     /*!< aliases nested too deeply */
+#define POPT_ERROR_BADQUOTE    -15     /*!< error in parameter quoting */
+#define POPT_ERROR_ERRNO       -16     /*!< errno set, use strerror(errno) */
+#define POPT_ERROR_BADNUMBER   -17     /*!< invalid numeric value */
+#define POPT_ERROR_OVERFLOW    -18     /*!< number too large or too small */
+#define        POPT_ERROR_BADOPERATION -19     /*!< mutually exclusive logical operations requested */
+#define        POPT_ERROR_NULLARG      -20     /*!< opt->arg should not be NULL */
+#define        POPT_ERROR_MALLOC       -21     /*!< memory allocation failed */
+/*@}*/
+
+/** \ingroup popt
+ * \name poptBadOption() flags
+ */
+/*@{*/
+#define POPT_BADOPTION_NOALIAS  (1 << 0)  /*!< don't go into an alias */
+/*@}*/
+
+/** \ingroup popt
+ * \name poptGetContext() flags
+ */
+/*@{*/
+#define POPT_CONTEXT_NO_EXEC   (1 << 0)  /*!< ignore exec expansions */
+#define POPT_CONTEXT_KEEP_FIRST        (1 << 1)  /*!< pay attention to argv[0] */
+#define POPT_CONTEXT_POSIXMEHARDER (1 << 2) /*!< options can't follow args */
+#define POPT_CONTEXT_ARG_OPTS  (1 << 4) /*!< return args as options with value 0 */
+/*@}*/
+
+/** \ingroup popt
+ */
+struct poptOption {
+/*@observer@*/ /*@null@*/ const char * longName; /*!< may be NULL */
+    char shortName;                    /*!< may be '\0' */
+    int argInfo;                       /*!< POPT_ARG_* type (extract with POPT_ARG_MASK) or'd with POPT_ARGFLAG_* modifiers */
+/*@shared@*/ /*@null@*/ void * arg;    /*!< depends on argInfo */
+    int val;                   /*!< 0 means don't return, just update flag */
+/*@observer@*/ /*@null@*/ const char * descrip;        /*!< description for autohelp -- may be NULL */
+/*@observer@*/ /*@null@*/ const char * argDescrip; /*!< argument description for autohelp */
+};
+
+/** \ingroup popt
+ * A popt alias argument for poptAddAlias().
+ */
+struct poptAlias {
+/*@owned@*/ /*@null@*/ const char * longName;  /*!< may be NULL */
+    char shortName;            /*!< may be '\0' */
+    int argc;
+/*@owned@*/ const char ** argv;        /*!< must be free()able */
+};
+
+/** \ingroup popt
+ * A popt alias or exec argument for poptAddItem().
+ */
+/*@-exporttype@*/
+typedef struct poptItem_s {
+    struct poptOption option;  /*!< alias/exec name(s) and description. */
+    int argc;                  /*!< (alias) no. of args. */
+/*@owned@*/ const char ** argv;        /*!< (alias) args, must be free()able. */
+} * poptItem;
+/*@=exporttype@*/
+
+/** \ingroup popt
+ * \name Auto-generated help/usage
+ */
+/*@{*/
+
+/**
+ * Empty table marker to enable displaying popt alias/exec options.
+ */
+/*@-exportvar@*/
+/*@unchecked@*/ /*@observer@*/
+extern struct poptOption poptAliasOptions[];
+/*@=exportvar@*/
+#define POPT_AUTOALIAS { NULL, '\0', POPT_ARG_INCLUDE_TABLE, poptAliasOptions, \
+                       0, "Options implemented via popt alias/exec:", NULL },
+
+/**
+ * Auto help table options.
+ */
+/*@-exportvar@*/
+/*@unchecked@*/ /*@observer@*/
+extern struct poptOption poptHelpOptions[];
+/*@=exportvar@*/
+#define POPT_AUTOHELP { NULL, '\0', POPT_ARG_INCLUDE_TABLE, poptHelpOptions, \
+                       0, "Help options:", NULL },
+
+#define POPT_TABLEEND { NULL, '\0', 0, 0, 0, NULL, NULL }
+/*@}*/
+
+/** \ingroup popt
+ */
+/*@-exporttype@*/
+typedef /*@abstract@*/ struct poptContext_s * poptContext;
+/*@=exporttype@*/
+
+/** \ingroup popt
+ */
+#ifndef __cplusplus
+/*@-exporttype -typeuse@*/
+typedef struct poptOption * poptOption;
+/*@=exporttype =typeuse@*/
+#endif
+
+/*@-exportconst@*/
+enum poptCallbackReason {
+    POPT_CALLBACK_REASON_PRE   = 0, 
+    POPT_CALLBACK_REASON_POST  = 1,
+    POPT_CALLBACK_REASON_OPTION = 2
+};
+/*@=exportconst@*/
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*@-type@*/
+
+/** \ingroup popt
+ * Table callback prototype.
+ * @param con          context
+ * @param reason       reason for callback
+ * @param opt          option that triggered callback
+ * @param arg          @todo Document.
+ * @param data         @todo Document.
+ */
+typedef void (*poptCallbackType) (poptContext con, 
+               enum poptCallbackReason reason,
+               /*@null@*/ const struct poptOption * opt,
+               /*@null@*/ const char * arg,
+               /*@null@*/ const void * data)
+       /*@*/;
+
+/** \ingroup popt
+ * Initialize popt context.
+ * @param name
+ * @param argc         no. of arguments
+ * @param argv         argument array
+ * @param options      address of popt option table
+ * @param flags                or'd POPT_CONTEXT_* bits
+ * @return             initialized popt context
+ */
+/*@only@*/ /*@null@*/ poptContext poptGetContext(
+               /*@dependent@*/ /*@keep@*/ const char * name,
+               int argc, /*@dependent@*/ /*@keep@*/ const char ** argv,
+               /*@dependent@*/ /*@keep@*/ const struct poptOption * options,
+               int flags)
+       /*@*/;
+
+/** \ingroup popt
+ * Reinitialize popt context.
+ * @param con          context
+ */
+/*@-exportlocal@*/
+void poptResetContext(/*@null@*/poptContext con)
+       /*@modifies con @*/;
+/*@=exportlocal@*/
+
+/** \ingroup popt
+ * Return value of next option found.
+ * @param con          context
+ * @return             next option val, -1 on last item, POPT_ERROR_* on error
+ */
+int poptGetNextOpt(/*@null@*/poptContext con)
+       /*@globals fileSystem, internalState @*/
+       /*@modifies con, fileSystem, internalState @*/;
+
+/*@-redecl@*/
+/** \ingroup popt
+ * Return next option argument (if any).
+ * @param con          context
+ * @return             option argument, NULL if no more options are available
+ */
+/*@observer@*/ /*@null@*/ const char * poptGetOptArg(/*@null@*/poptContext con)
+       /*@modifies con @*/;
+
+/** \ingroup popt
+ * Return current option's argument.
+ * @param con          context
+ * @return             option argument, NULL if no more options are available
+ */
+/*@observer@*/ /*@null@*/ const char * poptGetArg(/*@null@*/poptContext con)
+       /*@modifies con @*/;
+
+/** \ingroup popt
+ * Peek at current option's argument.
+ * @param con          context
+ * @return             option argument
+ */
+/*@observer@*/ /*@null@*/ const char * poptPeekArg(/*@null@*/poptContext con)
+       /*@*/;
+
+/** \ingroup popt
+ * Return remaining arguments.
+ * @param con          context
+ * @return             argument array, terminated with NULL
+ */
+/*@observer@*/ /*@null@*/ const char ** poptGetArgs(/*@null@*/poptContext con)
+       /*@modifies con @*/;
+
+/** \ingroup popt
+ * Return the option which caused the most recent error.
+ * @param con          context
+ * @param flags
+ * @return             offending option
+ */
+/*@observer@*/ const char * poptBadOption(/*@null@*/poptContext con, int flags)
+       /*@*/;
+/*@=redecl@*/
+
+/** \ingroup popt
+ * Destroy context.
+ * @param con          context
+ * @return             NULL always
+ */
+/*@null@*/ poptContext poptFreeContext( /*@only@*/ /*@null@*/ poptContext con)
+       /*@modifies con @*/;
+
+/** \ingroup popt
+ * Add arguments to context.
+ * @param con          context
+ * @param argv         argument array, NULL terminated
+ * @return             0 on success, POPT_ERROR_OPTSTOODEEP on failure
+ */
+int poptStuffArgs(poptContext con, /*@keep@*/ const char ** argv)
+       /*@modifies con @*/;
+
+/** \ingroup popt
+ * Add alias to context.
+ * @todo Pass alias by reference, not value.
+ * @deprecated Use poptAddItem instead.
+ * @param con          context
+ * @param alias                alias to add
+ * @param flags                (unused)
+ * @return             0 on success
+ */
+/*@unused@*/
+int poptAddAlias(poptContext con, struct poptAlias alias, int flags)
+       /*@modifies con @*/;
+
+/** \ingroup popt
+ * Add alias/exec item to context.
+ * @param con          context
+ * @param newItem      alias/exec item to add
+ * @param flags                0 for alias, 1 for exec
+ * @return             0 on success
+ */
+int poptAddItem(poptContext con, poptItem newItem, int flags)
+       /*@modifies con @*/;
+
+/** \ingroup popt
+ * Read configuration file.
+ * @param con          context
+ * @param fn           file name to read
+ * @return             0 on success, POPT_ERROR_ERRNO on failure
+ */
+int poptReadConfigFile(poptContext con, const char * fn)
+       /*@globals fileSystem, internalState @*/
+       /*@modifies con->execs, con->numExecs,
+               fileSystem, internalState @*/;
+
+/** \ingroup popt
+ * Read default configuration from /etc/popt and $HOME/.popt.
+ * @param con          context
+ * @param useEnv       (unused)
+ * @return             0 on success, POPT_ERROR_ERRNO on failure
+ */
+int poptReadDefaultConfig(poptContext con, /*@unused@*/ int useEnv)
+       /*@globals fileSystem, internalState @*/
+       /*@modifies con->execs, con->numExecs,
+               fileSystem, internalState @*/;
+
+/** \ingroup popt
+ * Duplicate an argument array.
+ * @note: The argument array is malloc'd as a single area, so only argv must
+ * be free'd.
+ *
+ * @param argc         no. of arguments
+ * @param argv         argument array
+ * @retval argcPtr     address of returned no. of arguments
+ * @retval argvPtr     address of returned argument array
+ * @return             0 on success, POPT_ERROR_NOARG on failure
+ */
+int poptDupArgv(int argc, /*@null@*/ const char **argv,
+               /*@null@*/ /*@out@*/ int * argcPtr,
+               /*@null@*/ /*@out@*/ const char *** argvPtr)
+       /*@modifies *argcPtr, *argvPtr @*/;
+
+/** \ingroup popt
+ * Parse a string into an argument array.
+ * The parse allows ', ", and \ quoting, but ' is treated the same as " and
+ * both may include \ quotes.
+ * @note: The argument array is malloc'd as a single area, so only argv must
+ * be free'd.
+ *
+ * @param s            string to parse
+ * @retval argcPtr     address of returned no. of arguments
+ * @retval argvPtr     address of returned argument array
+ */
+int poptParseArgvString(const char * s,
+               /*@out@*/ int * argcPtr, /*@out@*/ const char *** argvPtr)
+       /*@modifies *argcPtr, *argvPtr @*/;
+
+/** \ingroup popt
+ * Parses an input configuration file and returns a string that is a 
+ * command line.  For use with popt.  You must free the return value when done.
+ *
+ * Given the file:
+\verbatim
+# this line is ignored
+    #   this one too
+aaa
+  bbb
+    ccc   
+bla=bla
+
+this_is   =   fdsafdas
+     bad_line=        
+  reall bad line  
+  reall bad line  = again
+5555=   55555   
+  test = with lots of spaces
+\endverbatim
+*
+* The result is:
+\verbatim
+--aaa --bbb --ccc --bla="bla" --this_is="fdsafdas" --5555="55555" --test="with lots of spaces"
+\endverbatim
+*
+* Passing this to poptParseArgvString() yields an argv of:
+\verbatim
+'--aaa'
+'--bbb' 
+'--ccc' 
+'--bla=bla' 
+'--this_is=fdsafdas' 
+'--5555=55555' 
+'--test=with lots of spaces' 
+\endverbatim
+ *
+ * @bug NULL is returned if file line is too long.
+ * @bug Silently ignores invalid lines.
+ *
+ * @param fp           file handle to read
+ * @param *argstrp     return string of options (malloc'd)
+ * @param flags                unused
+ * @return             0 on success
+ * @see                        poptParseArgvString
+ */
+/*@-fcnuse@*/
+int poptConfigFileToString(FILE *fp, /*@out@*/ char ** argstrp, int flags)
+       /*@globals fileSystem @*/
+       /*@modifies *fp, *argstrp, fileSystem @*/;
+/*@=fcnuse@*/
+
+/** \ingroup popt
+ * Return formatted error string for popt failure.
+ * @param error                popt error
+ * @return             error string
+ */
+/*@-redecl@*/
+/*@observer@*/ const char *poptStrerror(const int error)
+       /*@*/;
+/*@=redecl@*/
+
+/** \ingroup popt
+ * Limit search for executables.
+ * @param con          context
+ * @param path         single path to search for executables
+ * @param allowAbsolute        absolute paths only?
+ */
+void poptSetExecPath(poptContext con, const char * path, int allowAbsolute)
+       /*@modifies con @*/;
+
+/** \ingroup popt
+ * Print detailed description of options.
+ * @param con          context
+ * @param fp           output file handle
+ * @param flags                (unused)
+ */
+void poptPrintHelp(poptContext con, FILE * fp, /*@unused@*/ int flags)
+       /*@globals fileSystem @*/
+       /*@modifies *fp, fileSystem @*/;
+
+/** \ingroup popt
+ * Print terse description of options.
+ * @param con          context
+ * @param fp           output file handle
+ * @param flags                (unused)
+ */
+void poptPrintUsage(poptContext con, FILE * fp, /*@unused@*/ int flags)
+       /*@globals fileSystem @*/
+       /*@modifies *fp, fileSystem @*/;
+
+/** \ingroup popt
+ * Provide text to replace default "[OPTION...]" in help/usage output.
+ * @param con          context
+ * @param text         replacement text
+ */
+/*@-fcnuse@*/
+void poptSetOtherOptionHelp(poptContext con, const char * text)
+       /*@modifies con @*/;
+/*@=fcnuse@*/
+
+/** \ingroup popt
+ * Return argv[0] from context.
+ * @param con          context
+ * @return             argv[0]
+ */
+/*@-redecl -fcnuse@*/
+/*@observer@*/ const char * poptGetInvocationName(poptContext con)
+       /*@*/;
+/*@=redecl =fcnuse@*/
+
+/** \ingroup popt
+ * Shuffle argv pointers to remove stripped args, returns new argc.
+ * @param con          context
+ * @param argc         no. of args
+ * @param argv         arg vector
+ * @return             new argc
+ */
+/*@-fcnuse@*/
+int poptStrippedArgv(poptContext con, int argc, char ** argv)
+       /*@modifies *argv @*/;
+/*@=fcnuse@*/
+
+/**
+ * Save a long, performing logical operation with value.
+ * @warning Alignment check may be too strict on certain platforms.
+ * @param arg          integer pointer, aligned on int boundary.
+ * @param argInfo      logical operation (see POPT_ARGFLAG_*)
+ * @param aLong                value to use
+ * @return             0 on success, POPT_ERROR_NULLARG/POPT_ERROR_BADOPERATION
+ */
+/*@-incondefs@*/
+int poptSaveLong(/*@null@*/ long * arg, int argInfo, long aLong)
+       /*@modifies *arg @*/
+       /*@requires maxSet(arg) >= 0 /\ maxRead(arg) == 0 @*/;
+/*@=incondefs@*/
+
+/**
+ * Save an integer, performing logical operation with value.
+ * @warning Alignment check may be too strict on certain platforms.
+ * @param arg          integer pointer, aligned on int boundary.
+ * @param argInfo      logical operation (see POPT_ARGFLAG_*)
+ * @param aLong                value to use
+ * @return             0 on success, POPT_ERROR_NULLARG/POPT_ERROR_BADOPERATION
+ */
+/*@-incondefs@*/
+int poptSaveInt(/*@null@*/ int * arg, int argInfo, long aLong)
+       /*@modifies *arg @*/
+       /*@requires maxSet(arg) >= 0 /\ maxRead(arg) == 0 @*/;
+/*@=incondefs@*/
+
+/*@=type@*/
+#ifdef  __cplusplus
+}
+#endif
+
+#endif
diff --git a/ctdb/lib/popt/poptconfig.c b/ctdb/lib/popt/poptconfig.c
new file mode 100644 (file)
index 0000000..5edf916
--- /dev/null
@@ -0,0 +1,191 @@
+/** \ingroup popt
+ * \file popt/poptconfig.c
+ */
+
+/* (C) 1998-2002 Red Hat, Inc. -- Licensing details are in the COPYING
+   file accompanying popt source distributions, available from 
+   ftp://ftp.rpm.org/pub/rpm/dist. */
+
+#include "system.h"
+#include "poptint.h"
+#include <stdlib.h>                    /* malloc(), free() */
+
+/*@-compmempass@*/     /* FIX: item->option.longName kept, not dependent. */
+/**
+ * Parse one config-file line and register it as a popt alias or exec item.
+ *
+ * Expected layout: <appName> <entryType> <option> <expansion...>, where
+ * entryType is "alias" or "exec" and option is "--long" or "-s".  A line
+ * that does not start with con->appName followed by white space, or that
+ * lacks any of the fields, is silently ignored.  The line is modified in
+ * place: the entryType and option tokens are NUL-terminated.
+ *
+ * @param con          context whose appName is matched and that receives
+ *                     the item through poptAddItem()
+ * @param line         writable, NUL-terminated line
+ */
+static void configLine(poptContext con, char * line)
+       /*@modifies con @*/
+{
+    /*@-type@*/
+    int nameLength = strlen(con->appName);
+    /*@=type@*/
+    const char * entryType;
+    const char * opt;
+    poptItem item = alloca(sizeof(*item));
+    int i, j;
+    
+/*@-boundswrite@*/
+    memset(item, 0, sizeof(*item));
+
+    /*@-type@*/
+    if (strncmp(line, con->appName, nameLength)) return;
+    /*@=type@*/
+
+    /* isspace() is only defined for unsigned char values (or EOF), hence
+     * the casts below. */
+    line += nameLength;
+    if (*line == '\0' || !isspace((unsigned char)*line)) return;
+
+    /* Second word: entry type.  Stop at white space OR at the end of the
+     * string, and only step over a separator that is really there. */
+    while (*line != '\0' && isspace((unsigned char)*line)) line++;
+    entryType = line;
+    while (*line != '\0' && !isspace((unsigned char)*line)) line++;
+    if (*line != '\0') *line++ = '\0';
+
+    /* Third word: the option being defined. */
+    while (*line != '\0' && isspace((unsigned char)*line)) line++;
+    if (*line == '\0') return;
+    opt = line;
+    while (*line != '\0' && !isspace((unsigned char)*line)) line++;
+    if (*line != '\0') *line++ = '\0';
+
+    /* Remainder: the expansion; an empty expansion is ignored. */
+    while (*line != '\0' && isspace((unsigned char)*line)) line++;
+    if (*line == '\0') return;
+
+    /*@-temptrans@*/ /* FIX: line alias is saved */
+    if (opt[0] == '-' && opt[1] == '-')
+       item->option.longName = opt + 2;
+    else if (opt[0] == '-' && opt[1] != '\0' && opt[2] == '\0')
+       item->option.shortName = opt[1];
+    /*@=temptrans@*/
+
+    if (poptParseArgvString(line, &item->argc, &item->argv)) return;
+
+    /* Pull the --POPTdesc= / --POPTargs= pseudo options out of the
+     * expansion, compacting the remaining arguments (j trails i). */
+    /*@-modobserver@*/
+    item->option.argInfo = POPT_ARGFLAG_DOC_HIDDEN;
+    for (i = 0, j = 0; i < item->argc; i++, j++) {
+       const char * f;
+       if (!strncmp(item->argv[i], "--POPTdesc=", sizeof("--POPTdesc=")-1)) {
+           /* The text starts one character past the '='.
+            * NOTE(review): presumably this drops the '$' of a $"..."
+            * string once the quotes have been removed -- confirm.
+            * Never step past the terminating NUL of an empty value. */
+           f = item->argv[i] + (sizeof("--POPTdesc=")-1);
+           if (*f != '\0') f++;
+           if (f[0] == '$' && f[1] == '"') f++;
+           item->option.descrip = f;
+           item->option.argInfo &= ~POPT_ARGFLAG_DOC_HIDDEN;
+           j--;
+       } else
+       if (!strncmp(item->argv[i], "--POPTargs=", sizeof("--POPTargs=")-1)) {
+           /* Same offset rule as for --POPTdesc= above. */
+           f = item->argv[i] + (sizeof("--POPTargs=")-1);
+           if (*f != '\0') f++;
+           if (f[0] == '$' && f[1] == '"') f++;
+           item->option.argDescrip = f;
+           item->option.argInfo &= ~POPT_ARGFLAG_DOC_HIDDEN;
+           item->option.argInfo |= POPT_ARG_STRING;
+           j--;
+       } else
+       if (j != i)
+           item->argv[j] = item->argv[i];
+    }
+    if (j != i) {
+       item->argv[j] = NULL;
+       item->argc = j;
+    }
+    /*@=modobserver@*/
+/*@=boundswrite@*/
+       
+    /* Any entry type other than "alias" or "exec" is dropped. */
+    /*@-nullstate@*/ /* FIX: item->argv[] may be NULL */
+    if (!strcmp(entryType, "alias"))
+       (void) poptAddItem(con, item, 0);
+    else if (!strcmp(entryType, "exec"))
+       (void) poptAddItem(con, item, 1);
+    /*@=nullstate@*/
+}
+/*@=compmempass@*/
+
+/*
+ * Read the config file fn and hand each logical line to configLine().
+ *
+ * A backslash immediately before a newline joins the two physical lines.
+ * Blank lines and lines whose first non-blank character is '#' are skipped.
+ * Text after the final newline is not processed.
+ *
+ * Returns 0 on success or when fn does not exist, POPT_ERROR_ERRNO (with
+ * errno set) on any other failure.
+ */
+int poptReadConfigFile(poptContext con, const char * fn)
+{
+    char * file = NULL;
+    char * buf = NULL;
+    const char * chptr, * end;
+/*@dependent@*/ char * dst;
+    int fd, rc;
+    off_t fileLength;
+
+    fd = open(fn, O_RDONLY);
+    if (fd < 0)
+       return (errno == ENOENT ? 0 : POPT_ERROR_ERRNO);
+
+    fileLength = lseek(fd, 0, SEEK_END);
+    if (fileLength == -1 || lseek(fd, 0, SEEK_SET) == -1)
+       goto ioerror;
+
+    /* Heap, not alloca(): the size comes from the file, so a large file
+     * must not be able to exhaust the stack.  Both buffers are released
+     * before returning, which is the lifetime alloca() gave them too. */
+    file = malloc((size_t)fileLength + 1);
+    buf = malloc((size_t)fileLength + 1);
+    if (file == NULL || buf == NULL) {
+       /*@-mods@*/
+       errno = ENOMEM;
+       /*@=mods@*/
+       goto ioerror;
+    }
+
+    if (read(fd, (char *)file, fileLength) != fileLength)
+       goto ioerror;
+
+    rc = close(fd);
+    fd = -1;                   /* never close twice on the error path */
+    if (rc == -1)
+       goto ioerror;
+
+/*@-boundswrite@*/
+    dst = buf;
+
+    chptr = file;
+    end = (file + fileLength);
+    /*@-infloops@*/    /* LCL: can't detect chptr++ */
+    while (chptr < end) {
+       switch (*chptr) {
+         case '\n':
+           /* End of a logical line: terminate it and dispatch it. */
+           *dst = '\0';
+           dst = buf;
+           while (*dst && isspace((unsigned char)*dst)) dst++;
+           if (*dst && *dst != '#')
+               configLine(con, dst);
+           chptr++;
+           /*@switchbreak@*/ break;
+         case '\\':
+           *dst++ = *chptr++;
+           if (chptr < end) {
+               if (*chptr == '\n') 
+                   dst--, chptr++;     
+                   /* \ at the end of a line does not insert a \n */
+               else
+                   *dst++ = *chptr++;
+           }
+           /*@switchbreak@*/ break;
+         default:
+           *dst++ = *chptr++;
+           /*@switchbreak@*/ break;
+       }
+    }
+    /*@=infloops@*/
+/*@=boundswrite@*/
+
+    free(file);
+    free(buf);
+    return 0;
+
+ioerror:
+    /* Preserve the errno of the failing call across the cleanup. */
+    rc = errno;
+    if (fd >= 0)
+       (void) close(fd);
+    free(file);
+    free(buf);
+    /*@-mods@*/
+    errno = rc;
+    /*@=mods@*/
+    return POPT_ERROR_ERRNO;
+}
+
+int poptReadDefaultConfig(poptContext con, /*@unused@*/ int useEnv)
+{
+    char * home;
+    char * fn;
+    int rc;
+
+    /* Without an application name there is nothing to look up. */
+    /*@-type@*/
+    if (con->appName == NULL)
+       return 0;
+    /*@=type@*/
+
+    /* System-wide configuration first; a failure here ends the lookup. */
+    rc = poptReadConfigFile(con, "/etc/popt");
+    if (rc != 0)
+       return rc;
+
+#if defined(HAVE_GETUID) && defined(HAVE_GETEUID)
+    /* Real and effective uid differ: skip the per-user file. */
+    if (getuid() != geteuid()) return 0;
+#endif
+
+    home = getenv("HOME");
+    if (home == NULL)
+       return 0;
+
+    /* Per-user configuration: $HOME/.popt */
+    fn = alloca(strlen(home) + 20);
+    strcat(strcpy(fn, home), "/.popt");
+    return poptReadConfigFile(con, fn);
+}
diff --git a/ctdb/lib/popt/popthelp.c b/ctdb/lib/popt/popthelp.c
new file mode 100644 (file)
index 0000000..7ae3de7
--- /dev/null
@@ -0,0 +1,742 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 4 -*- */
+
+/*@-type@*/
+/** \ingroup popt
+ * \file popt/popthelp.c
+ */
+
+/* (C) 1998-2002 Red Hat, Inc. -- Licensing details are in the COPYING
+   file accompanying popt source distributions, available from 
+   ftp://ftp.rpm.org/pub/rpm/dist. */
+
+#include "system.h"
+#include "poptint.h"
+
+/**
+ * Display arguments.
+ * @param con          context
+ * @param foo          (unused)
+ * @param key          option(s)
+ * @param arg          (unused)
+ * @param data         (unused)
+ */
+static void displayArgs(poptContext con,
+               /*@unused@*/ enum poptCallbackReason foo,
+               struct poptOption * key, 
+               /*@unused@*/ const char * arg, /*@unused@*/ void * data)
+       /*@globals fileSystem@*/
+       /*@modifies fileSystem@*/
+{
+    /* Only the option with short name '?' gets the detailed help; every
+     * other option routed here gets the terse usage line.  Both go to
+     * stdout and the process then exits with status 0. */
+    if (key->shortName != '?')
+       poptPrintUsage(con, stdout, 0);
+    else
+       poptPrintHelp(con, stdout, 0);
+    exit(0);
+}
+
+#ifdef NOTYET
+/*@unchecked@*/
+static int show_option_defaults = 0;
+#endif
+
+/**
+ * Empty table marker to enable displaying popt alias/exec options.
+ */
+/*@observer@*/ /*@unchecked@*/
+struct poptOption poptAliasOptions[] = {
+    POPT_TABLEEND
+};
+
+/**
+ * Auto help table options.
+ */
+/*@-castfcnptr@*/
+/*@observer@*/ /*@unchecked@*/
+struct poptOption poptHelpOptions[] = {
+  /* Callback entry: arg points at displayArgs(). */
+  { NULL, '\0', POPT_ARG_CALLBACK, (void *)&displayArgs, '\0', NULL, NULL },
+  /* --help / -?: displayArgs() prints the detailed help for shortName '?'. */
+  { "help", '?', 0, NULL, '?', N_("Show this help message"), NULL },
+  /* --usage: has no short name, so displayArgs() prints the usage line. */
+  { "usage", '\0', 0, NULL, 'u', N_("Display brief usage message"), NULL },
+#ifdef NOTYET
+  /* Compiled only with NOTYET; stores into show_option_defaults. */
+  { "defaults", '\0', POPT_ARG_NONE, &show_option_defaults, 0,
+       N_("Display option defaults in message"), NULL },
+#endif
+    POPT_TABLEEND
+} ;
+/*@=castfcnptr@*/
+
+/**
+ * @param table                option(s)
+ */
+/*@observer@*/ /*@null@*/ static const char *const
+getTableTranslationDomain(/*@null@*/ const struct poptOption *table)
+       /*@*/
+{
+    const struct poptOption *entry = table;
+
+    if (entry == NULL)
+       return NULL;
+
+    /* Walk up to the end marker (no longName, no shortName, no arg) and
+     * return the arg of the first POPT_ARG_INTL_DOMAIN entry found. */
+    while (entry->longName || entry->shortName || entry->arg) {
+       if (entry->argInfo == POPT_ARG_INTL_DOMAIN)
+           return entry->arg;
+       entry++;
+    }
+
+    return NULL;
+}
+
+/**
+ * @param opt          option(s)
+ * @param translation_domain   translation domain
+ */
+/*@observer@*/ /*@null@*/ static const char *const
+getArgDescrip(const struct poptOption * opt,
+               /*@-paramuse@*/ /* FIX: i18n macros disabled with lclint */
+               /*@null@*/ const char * translation_domain)
+               /*@=paramuse@*/
+       /*@*/
+{
+    const int argType = (opt->argInfo & POPT_ARG_MASK);
+
+    /* An option that takes no argument has nothing to describe. */
+    if (argType == 0)
+       return NULL;
+
+    /* An explicit description wins.  The "help" and "usage" entries of
+     * poptHelpOptions go through POPT_(), everything else through D_()
+     * with the table's domain. */
+    if (opt->argDescrip != NULL) {
+       if (opt == &poptHelpOptions[1] || opt == &poptHelpOptions[2])
+           return POPT_(opt->argDescrip);
+       return D_(translation_domain, opt->argDescrip);
+    }
+
+    /* Otherwise fall back to a generic name for the argument type. */
+    if (argType == POPT_ARG_NONE)
+       return POPT_("NONE");
+    if (argType == POPT_ARG_VAL)
+#ifdef DYING
+       return POPT_("VAL");
+#else
+       return NULL;
+#endif
+    if (argType == POPT_ARG_INT)
+       return POPT_("INT");
+    if (argType == POPT_ARG_LONG)
+       return POPT_("LONG");
+    if (argType == POPT_ARG_STRING)
+       return POPT_("STRING");
+    if (argType == POPT_ARG_FLOAT)
+       return POPT_("FLOAT");
+    if (argType == POPT_ARG_DOUBLE)
+       return POPT_("DOUBLE");
+    return POPT_("ARG");
+}
+
+/**
+ * Display default value for an option.
+ * @param lineLength
+ * @param opt          option(s)
+ * @param translation_domain   translation domain
+ * @return
+ */
+static /*@only@*/ /*@null@*/ char *
+singleOptionDefaultValue(int lineLength,
+               const struct poptOption * opt,
+               /*@-paramuse@*/ /* FIX: i18n macros disabled with lclint */
+               /*@null@*/ const char * translation_domain)
+               /*@=paramuse@*/
+       /*@*/
+{
+    const char * defstr = D_(translation_domain, "default");
+    /* Room that is always needed: "(" + defstr + ": " + one numeric value
+     * + ")" + NUL; 64 bytes comfortably cover the fixed parts. */
+    size_t minLimit = strlen(defstr) + 64;
+    /* Keep 4*lineLength as the usual size, but never less than minLimit,
+     * so a small or negative lineLength cannot overflow the buffer or
+     * underflow the string budget computed below. */
+    size_t limit = (lineLength > 0) ? 4 * (size_t)lineLength : 0;
+    char * le;
+    char * l;
+
+    if (limit < minLimit)
+       limit = minLimit;
+
+    le = malloc(limit + 1);
+    l = le;
+
+    if (le == NULL) return NULL;       /* XXX can't happen */
+/*@-boundswrite@*/
+    *le++ = '(';
+    strcpy(le, defstr);        le += strlen(le);
+    *le++ = ':';
+    *le++ = ' ';
+    /* With a NULL opt->arg nothing is appended and "(<defstr>: )" is
+     * returned. */
+    if (opt->arg)      /* XXX programmer error */
+    switch (opt->argInfo & POPT_ARG_MASK) {
+    case POPT_ARG_VAL:
+    case POPT_ARG_INT:
+    {  long aLong = *((int *)opt->arg);
+       le += sprintf(le, "%ld", aLong);
+    }  break;
+    case POPT_ARG_LONG:
+    {  long aLong = *((long *)opt->arg);
+       le += sprintf(le, "%ld", aLong);
+    }  break;
+    case POPT_ARG_FLOAT:
+    {  double aDouble = *((float *)opt->arg);
+       le += sprintf(le, "%g", aDouble);
+    }  break;
+    case POPT_ARG_DOUBLE:
+    {  double aDouble = *((double *)opt->arg);
+       le += sprintf(le, "%g", aDouble);
+    }  break;
+    case POPT_ARG_STRING:
+    {  const char * s = *(const char **)opt->arg;
+       if (s == NULL) {
+           strcpy(le, "null"); le += strlen(le);
+       } else {
+           /* Budget for the string itself: what is left after the prefix,
+            * both quotes, a possible "...", the ")" and the NUL.  Always
+            * positive because limit >= minLimit. */
+           size_t slen = limit - (size_t)(le - l) - sizeof("\"...\")");
+           *le++ = '"';
+           strncpy(le, s, slen); le[slen] = '\0'; le += strlen(le);    
+           if (slen < strlen(s)) {
+               strcpy(le, "...");      le += strlen(le);
+           }
+           *le++ = '"';
+       }
+    }  break;
+    case POPT_ARG_NONE:
+    default:
+       /* No default can be shown for this argument type. */
+       l = _free(l);
+       return NULL;
+       /*@notreached@*/ break;
+    }
+    *le++ = ')';
+    *le = '\0';
+/*@=boundswrite@*/
+
+    return l;
+}
+
+/**
+ * Display help text for an option.
+ * @param fp           output file handle
+ * @param maxLeftCol
+ * @param opt          option(s)
+ * @param translation_domain   translation domain
+ */
+static void singleOptionHelp(FILE * fp, int maxLeftCol, 
+               const struct poptOption * opt,
+               /*@null@*/ const char * translation_domain)
+       /*@globals fileSystem @*/
+       /*@modifies *fp, fileSystem @*/
+{
+    int indentLength = maxLeftCol + 5;
+    int lineLength = 79 - indentLength;
+    const char * help = D_(translation_domain, opt->descrip);
+    const char * argDescrip = getArgDescrip(opt, translation_domain);
+    int helpLength;
+    char * defs = NULL;
+    char * left;
+    int nb = maxLeftCol + 1;
+
+    /* Make sure there's more than enough room in target buffer. */
+    if (opt->longName) nb += strlen(opt->longName);
+    if (argDescrip)    nb += strlen(argDescrip);
+
+/*@-boundswrite@*/
+    left = malloc(nb);
+    if (left == NULL) return;  /* XXX can't happen */
+    left[0] = '\0';
+    left[maxLeftCol] = '\0';
+
+    if (opt->longName && opt->shortName)
+       sprintf(left, "-%c, %s%s", opt->shortName,
+               ((opt->argInfo & POPT_ARGFLAG_ONEDASH) ? "-" : "--"),
+               opt->longName);
+    else if (opt->shortName != '\0') 
+       sprintf(left, "-%c", opt->shortName);
+    else if (opt->longName)
+       sprintf(left, "%s%s",
+               ((opt->argInfo & POPT_ARGFLAG_ONEDASH) ? "-" : "--"),
+               opt->longName);
+    if (!*left) goto out;
+
+    if (argDescrip) {
+       char * le = left + strlen(left);
+
+       if (opt->argInfo & POPT_ARGFLAG_OPTIONAL)
+           *le++ = '[';
+
+       /* Choose type of output */
+       /*@-branchstate@*/
+       if (opt->argInfo & POPT_ARGFLAG_SHOW_DEFAULT) {
+           defs = singleOptionDefaultValue(lineLength, opt, translation_domain);
+           if (defs) {
+               char * t = malloc((help ? strlen(help) : 0) +
+                               strlen(defs) + sizeof(" "));
+               if (t) {
+                   char * te = t;
+                   *te = '\0';
+                   if (help) {
+                       strcpy(te, help);       te += strlen(te);
+                   }
+                   *te++ = ' ';
+                   strcpy(te, defs);
+                   defs = _free(defs);
+               }
+               defs = t;
+           }
+       }
+       /*@=branchstate@*/
+
+       if (opt->argDescrip == NULL) {
+           switch (opt->argInfo & POPT_ARG_MASK) {
+           case POPT_ARG_NONE:
+               break;
+           case POPT_ARG_VAL:
+#ifdef NOTNOW  /* XXX pug ugly nerdy output */
+           {   long aLong = opt->val;
+               int ops = (opt->argInfo & POPT_ARGFLAG_LOGICALOPS);
+               int negate = (opt->argInfo & POPT_ARGFLAG_NOT);
+
+               /* Don't bother displaying typical values */
+               if (!ops && (aLong == 0L || aLong == 1L || aLong == -1L))
+                   break;
+               *le++ = '[';
+               switch (ops) {
+               case POPT_ARGFLAG_OR:
+                   *le++ = '|';
+                   /*@innerbreak@*/ break;
+               case POPT_ARGFLAG_AND:
+                   *le++ = '&';
+                   /*@innerbreak@*/ break;
+               case POPT_ARGFLAG_XOR:
+                   *le++ = '^';
+                   /*@innerbreak@*/ break;
+               default:
+                   /*@innerbreak@*/ break;
+               }
+               *le++ = '=';
+               if (negate) *le++ = '~';
+               /*@-formatconst@*/
+               le += sprintf(le, (ops ? "0x%lx" : "%ld"), aLong);
+               /*@=formatconst@*/
+               *le++ = ']';
+           }
+#endif
+               break;
+           case POPT_ARG_INT:
+           case POPT_ARG_LONG:
+           case POPT_ARG_FLOAT:
+           case POPT_ARG_DOUBLE:
+           case POPT_ARG_STRING:
+               *le++ = '=';
+               strcpy(le, argDescrip);         le += strlen(le);
+               break;
+           default:
+               break;
+           }
+       } else {
+           *le++ = '=';
+           strcpy(le, argDescrip);             le += strlen(le);
+       }
+       if (opt->argInfo & POPT_ARGFLAG_OPTIONAL)
+           *le++ = ']';
+       *le = '\0';
+    }
+/*@=boundswrite@*/
+
+    if (help)
+       fprintf(fp,"  %-*s   ", maxLeftCol, left);
+    else {
+       fprintf(fp,"  %s\n", left); 
+       goto out;
+    }
+
+    left = _free(left);
+    if (defs) {
+       help = defs; defs = NULL;
+    }
+
+    helpLength = strlen(help);
+/*@-boundsread@*/
+    while (helpLength > lineLength) {
+       const char * ch;
+       char format[16];
+
+       ch = help + lineLength - 1;
+       while (ch > help && !isspace(*ch)) ch--;
+       if (ch == help) break;          /* give up */
+       while (ch > (help + 1) && isspace(*ch)) ch--;
+       ch++;
+
+       sprintf(format, "%%.%ds\n%%%ds", (int) (ch - help), indentLength);
+       /*@-formatconst@*/
+       fprintf(fp, format, help, " ");
+       /*@=formatconst@*/
+       help = ch;
+       while (isspace(*help) && *help) help++;
+       helpLength = strlen(help);
+    }
+/*@=boundsread@*/
+
+    if (helpLength) fprintf(fp, "%s\n", help);
+
+out:
+    /*@-dependenttrans@*/
+    defs = _free(defs);
+    /*@=dependenttrans@*/
+    left = _free(left);
+}
+
+/**
+ * Measure the widest left-hand (option name) column of a help listing.
+ * Tables included with POPT_ARG_INCLUDE_TABLE are measured recursively;
+ * options flagged POPT_ARGFLAG_DOC_HIDDEN do not contribute.
+ * @param opt          option(s)
+ * @param translation_domain   translation domain
+ * @return             widest entry found, 0 for a NULL or empty table
+ */
+static int maxArgWidth(const struct poptOption * opt,
+                      /*@null@*/ const char * translation_domain)
+       /*@*/
+{
+    const struct poptOption * cur;
+    const char * descr;
+    int widest = 0;
+    int width = 0;
+
+    if (opt == NULL)
+       return widest;
+
+    for (cur = opt; cur->longName || cur->shortName || cur->arg; cur++) {
+       if ((cur->argInfo & POPT_ARG_MASK) == POPT_ARG_INCLUDE_TABLE) {
+           /* XXX a NULL sub-table is a program error: keep the last width */
+           if (cur->arg)
+               width = maxArgWidth(cur->arg, translation_domain);
+       } else if (cur->argInfo & POPT_ARGFLAG_DOC_HIDDEN) {
+           continue;
+       } else {
+           /* two leading blanks */
+           width = sizeof("  ")-1;
+
+           /* "-X", and ", " when a long name follows */
+           if (cur->shortName != '\0') {
+               width += sizeof("-X")-1;
+               if (cur->longName)
+                   width += sizeof(", ")-1;
+           }
+
+           /* "-name" or "--name" */
+           if (cur->longName) {
+               if (cur->argInfo & POPT_ARGFLAG_ONEDASH)
+                   width += sizeof("-")-1;
+               else
+                   width += sizeof("--")-1;
+               width += strlen(cur->longName);
+           }
+
+           /* "=ARG" */
+           descr = getArgDescrip(cur, translation_domain);
+           if (descr)
+               width += sizeof("=")-1 + strlen(descr);
+
+           /* brackets around an optional argument */
+           if (cur->argInfo & POPT_ARGFLAG_OPTIONAL)
+               width += sizeof("[]")-1;
+       }
+
+       if (width > widest)
+           widest = width;
+    }
+
+    return widest;
+}
+
+/**
+ * Print help lines for an array of popt alias/exec items.
+ * Items with neither a long nor a short name, and items flagged
+ * POPT_ARGFLAG_DOC_HIDDEN, are skipped.
+ * @param fp           output file handle
+ * @param items                alias/exec array
+ * @param nitems       no. of alias/exec entries
+ * @param left         width of the left column, passed to singleOptionHelp()
+ * @param translation_domain   translation domain
+ */
+static void itemHelp(FILE * fp,
+               /*@null@*/ poptItem items, int nitems, int left,
+               /*@null@*/ const char * translation_domain)
+       /*@globals fileSystem @*/
+       /*@modifies *fp, fileSystem @*/
+{
+    int idx;
+
+    if (items == NULL)
+       return;
+
+    for (idx = 0; idx < nitems; idx++) {
+       const struct poptOption * entry = &items[idx].option;
+
+       if (!(entry->longName || entry->shortName))
+           continue;
+       if (entry->argInfo & POPT_ARGFLAG_DOC_HIDDEN)
+           continue;
+       singleOptionHelp(fp, left, entry, translation_domain);
+    }
+}
+
+/**
+ * Display help text for a table of options.
+ * The table's own options are printed first; afterwards every
+ * POPT_ARG_INCLUDE_TABLE entry is expanded recursively, preceded by its
+ * description (if any) as a heading.
+ * @param con          context
+ * @param fp           output file handle
+ * @param table                option(s)
+ * @param left         width of the left column, passed to singleOptionHelp()
+ * @param translation_domain   translation domain
+ */
+static void singleTableHelp(poptContext con, FILE * fp,
+               /*@null@*/ const struct poptOption * table, int left,
+               /*@null@*/ const char * translation_domain)
+       /*@globals fileSystem @*/
+       /*@modifies *fp, fileSystem @*/
+{
+    const struct poptOption * cur;
+
+    /* The alias pseudo-table stands for the context's aliases and execs. */
+    if (table == poptAliasOptions) {
+       itemHelp(fp, con->aliases, con->numAliases, left, NULL);
+       itemHelp(fp, con->execs, con->numExecs, left, NULL);
+       return;
+    }
+
+    if (table == NULL)
+       return;
+
+    /* Pass 1: options that belong to this table. */
+    for (cur = table; cur->longName || cur->shortName || cur->arg; cur++) {
+       if (!(cur->longName || cur->shortName))
+           continue;
+       if (cur->argInfo & POPT_ARGFLAG_DOC_HIDDEN)
+           continue;
+       singleOptionHelp(fp, left, cur, translation_domain);
+    }
+
+    /* Pass 2: included sub-tables. */
+    for (cur = table; cur->longName || cur->shortName || cur->arg; cur++) {
+       const char * domain;
+
+       if ((cur->argInfo & POPT_ARG_MASK) != POPT_ARG_INCLUDE_TABLE)
+           continue;
+
+       /* A sub-table may name its own domain; otherwise inherit ours. */
+       domain = getTableTranslationDomain(cur->arg);
+       if (domain == NULL)
+           domain = translation_domain;
+
+       if (cur->descrip)
+           fprintf(fp, "\n%s\n", D_(domain, cur->descrip));
+
+       singleTableHelp(con, fp, cur->arg, left, domain);
+    }
+}
+
+/**
+ * Print the "Usage:" prefix and, unless POPT_CONTEXT_KEEP_FIRST is set,
+ * the basename of argv[0].
+ * @param con          context
+ * @param fp           output file handle
+ * @return             number of bytes written (used as the output column)
+ */
+static int showHelpIntro(poptContext con, FILE * fp)
+       /*@globals fileSystem @*/
+       /*@modifies *fp, fileSystem @*/
+{
+    const char * usage = POPT_("Usage:");
+    const char * fn;
+    const char * slash;
+    /* Measure the (possibly translated) prefix instead of assuming 6. */
+    int len = (int) strlen(usage);
+
+    /* Never use a translatable message as the format string itself. */
+    fprintf(fp, "%s", usage);
+    if (!(con->flags & POPT_CONTEXT_KEEP_FIRST)) {
+/*@-boundsread@*/
+       /*@-nullderef@*/        /* LCL: wazzup? */
+       fn = (con->optionStack->argv != NULL
+               ? con->optionStack->argv[0] : NULL);
+       /*@=nullderef@*/
+/*@=boundsread@*/
+       if (fn == NULL) return len;
+
+       /* Show only the last path component of the program name. */
+       slash = strrchr(fn, '/');
+       if (slash != NULL) fn = slash + 1;
+
+       fprintf(fp, " %s", fn);
+       len += strlen(fn) + 1;
+    }
+
+    return len;
+}
+
+/*
+ * Print the full help screen: the usage intro line followed by the help
+ * text of every option table reachable from con->options.
+ * The flags argument is not used.
+ */
+void poptPrintHelp(poptContext con, FILE * fp, /*@unused@*/ int flags)
+{
+    const char * trailer;
+    int leftColWidth;
+
+    (void) showHelpIntro(con, fp);
+
+    /* Caller-supplied text wins over the generic placeholder. */
+    trailer = con->otherHelp ? con->otherHelp : POPT_("[OPTION...]");
+    fprintf(fp, " %s\n", trailer);
+
+    leftColWidth = maxArgWidth(con->options, NULL);
+    singleTableHelp(con, fp, con->options, leftColWidth, NULL);
+}
+
+/**
+ * Print one option in "[-x|--long ARG]" usage form, wrapping the line
+ * first when the entry would run past column 79.
+ * @param fp           output file handle
+ * @param cursor       current output column
+ * @param opt          option(s)
+ * @param translation_domain   translation domain
+ * @return             updated output column (unchanged if nothing printed)
+ */
+static int singleOptionUsage(FILE * fp, int cursor, 
+               const struct poptOption * opt,
+               /*@null@*/ const char *translation_domain)
+       /*@globals fileSystem @*/
+       /*@modifies *fp, fileSystem @*/
+{
+    /* len is the estimated width of the entry; 4 is the base value and
+     * doubles as the "option has no name at all" marker tested below. */
+    int len = 4;
+    char shortStr[2] = { '\0', '\0' };
+    const char * item = shortStr;
+    const char * argDescrip = getArgDescrip(opt, translation_domain);
+
+    if (opt->shortName != '\0' && opt->longName != NULL) {
+       /* both names: item is not used, the names are printed directly */
+       len += 2;
+       if (!(opt->argInfo & POPT_ARGFLAG_ONEDASH)) len++;
+       len += strlen(opt->longName);
+    } else if (opt->shortName != '\0') {
+       /* short name only: item stays pointing at shortStr */
+       len++;
+       shortStr[0] = opt->shortName;
+       shortStr[1] = '\0';
+    } else if (opt->longName) {
+       /* long name only */
+       len += strlen(opt->longName);
+       if (!(opt->argInfo & POPT_ARGFLAG_ONEDASH)) len++;
+       item = opt->longName;
+    }
+
+    /* Neither a short nor a long name: nothing to print. */
+    if (len == 4) return cursor;
+
+    if (argDescrip) 
+       len += strlen(argDescrip) + 1;
+
+    /* Start a continuation line (7 blanks of indent) if it would not fit. */
+    if ((cursor + len) > 79) {
+       fprintf(fp, "\n       ");
+       cursor = 7;
+    } 
+
+    if (opt->longName && opt->shortName) {
+       fprintf(fp, " [-%c|-%s%s%s%s]",
+           opt->shortName, ((opt->argInfo & POPT_ARGFLAG_ONEDASH) ? "" : "-"),
+           opt->longName,
+           (argDescrip ? " " : ""),
+           (argDescrip ? argDescrip : ""));
+    } else {
+       /* A short option separates its argument with a blank, a long
+        * option with '='. */
+       fprintf(fp, " [-%s%s%s%s]",
+           ((opt->shortName || (opt->argInfo & POPT_ARGFLAG_ONEDASH)) ? "" : "-"),
+           item,
+           (argDescrip ? (opt->shortName != '\0' ? " " : "=") : ""),
+           (argDescrip ? argDescrip : ""));
+    }
+
+    return cursor + len + 1;
+}
+
+/**
+ * Display popt alias and exec usage.
+ * A POPT_ARG_INTL_DOMAIN entry switches the translation domain used for
+ * the entries that follow it; hidden and unnamed entries are skipped.
+ * @param fp           output file handle
+ * @param cursor       current output column
+ * @param item         alias/exec array
+ * @param nitems       no. of alias/exec entries
+ * @param translation_domain   translation domain
+ * @return             updated output column
+ */
+static int itemUsage(FILE * fp, int cursor, poptItem item, int nitems,
+               /*@null@*/ const char * translation_domain)
+       /*@globals fileSystem @*/
+       /*@modifies *fp, fileSystem @*/
+{
+    int remaining;
+
+    /*@-branchstate@*/         /* FIX: W2DO? */
+    if (item == NULL)
+       return cursor;
+
+    for (remaining = nitems; remaining > 0; remaining--, item++) {
+       const struct poptOption * entry = &item->option;
+
+       if ((entry->argInfo & POPT_ARG_MASK) == POPT_ARG_INTL_DOMAIN) {
+           translation_domain = (const char *)entry->arg;
+           continue;
+       }
+       if (!(entry->longName || entry->shortName))
+           continue;
+       if (entry->argInfo & POPT_ARGFLAG_DOC_HIDDEN)
+           continue;
+
+       cursor = singleOptionUsage(fp, cursor, entry, translation_domain);
+    }
+    /*@=branchstate@*/
+
+    return cursor;
+}
+
+/**
+ * Keep track of option tables already processed.
+ * Used by singleTableUsage() so that a table included more than once is
+ * printed only once.
+ */
+typedef struct poptDone_s {
+    int nopts;                 /* number of entries currently stored in opts */
+    int maxopts;               /* capacity of opts; no more are recorded beyond it */
+    const void ** opts;                /* addresses of the tables already visited */
+} * poptDone;
+
+/**
+ * Display usage text for a table of options.
+ * Included sub-tables are expanded recursively; when a done list is
+ * supplied, a sub-table that is NULL or already listed there is skipped.
+ * @param con          context
+ * @param fp           output file handle
+ * @param cursor       current output column
+ * @param opt          option(s)
+ * @param translation_domain   translation domain
+ * @param done         tables already processed
+ * @return             updated output column
+ */
+static int singleTableUsage(poptContext con, FILE * fp, int cursor,
+               /*@null@*/ const struct poptOption * opt,
+               /*@null@*/ const char * translation_domain,
+               /*@null@*/ poptDone done)
+       /*@globals fileSystem @*/
+       /*@modifies *fp, done, fileSystem @*/
+{
+    /*@-branchstate@*/         /* FIX: W2DO? */
+    if (opt == NULL)
+       return cursor;
+
+    while (opt->longName || opt->shortName || opt->arg) {
+       const struct poptOption * cur = opt++;
+       int kind = cur->argInfo & POPT_ARG_MASK;
+
+       if (kind == POPT_ARG_INTL_DOMAIN) {
+           /* Applies to every entry after this one. */
+           translation_domain = (const char *)cur->arg;
+           continue;
+       }
+
+       if (kind == POPT_ARG_INCLUDE_TABLE) {
+           if (done) {
+               int seen = 0;
+               int i;
+
+/*@-boundsread@*/
+               for (i = 0; i < done->nopts && !seen; i++)
+                   seen = (done->opts[i] != NULL && done->opts[i] == cur->arg);
+/*@=boundsread@*/
+
+               /* Skip if this table has already been processed. */
+               if (cur->arg == NULL || seen)
+                   continue;
+
+               /* Remember it, as long as there is room left. */
+/*@-boundswrite@*/
+               if (done->nopts < done->maxopts)
+                   done->opts[done->nopts++] = (const void *) cur->arg;
+/*@=boundswrite@*/
+           }
+           cursor = singleTableUsage(con, fp, cursor, cur->arg,
+                       translation_domain, done);
+           continue;
+       }
+
+       if ((cur->longName || cur->shortName) &&
+           !(cur->argInfo & POPT_ARGFLAG_DOC_HIDDEN))
+           cursor = singleOptionUsage(fp, cursor, cur, translation_domain);
+    }
+    /*@=branchstate@*/
+
+    return cursor;
+}
+
+/**
+ * Return concatenated short options for display.
+ * Collects the short names of all argument-less options (recursing into
+ * included tables) and, in the outermost call only, prints them as
+ * " [-abc]" when at least one was found.
+ * @todo Sub-tables should be recursed.
+ * @param opt          option(s)
+ * @param fp           output file handle
+ * @retval str         concatenation of short options; NULL in the outermost
+ *                     call, which then provides the buffer itself
+ * @return             length of display string, 0 if nothing was printed
+ */
+static int showShortOptions(const struct poptOption * opt, FILE * fp,
+               /*@null@*/ char * str)
+       /*@globals fileSystem @*/
+       /*@modifies *str, *fp, fileSystem @*/
+{
+    enum { SHORT_OPTS_MAX = 300 };     /* larger than the ascii set */
+    char * s = NULL;
+
+    /*@-branchstate@*/         /* FIX: W2DO? */
+    if (str == NULL) {
+       /* Outermost call: allocate and clear the WHOLE buffer.  Appending
+        * below relies on every unused byte being a terminator. */
+       s = alloca(SHORT_OPTS_MAX);
+       memset(s, 0, SHORT_OPTS_MAX);
+       str = s;
+    }
+    /*@=branchstate@*/
+
+/*@-boundswrite@*/
+    if (opt != NULL)
+    for (; (opt->longName || opt->shortName || opt->arg); opt++) {
+       if (opt->shortName && !(opt->argInfo & POPT_ARG_MASK)) {
+           size_t used = strlen(str);
+
+           /* Always keep the final byte as the terminator. */
+           if (used + 1 < SHORT_OPTS_MAX)
+               str[used] = opt->shortName;
+       } else if ((opt->argInfo & POPT_ARG_MASK) == POPT_ARG_INCLUDE_TABLE)
+           if (opt->arg)       /* XXX program error */
+               (void) showShortOptions(opt->arg, fp, str);
+    }
+/*@=boundswrite@*/
+
+    /* Nested calls only collect; the outermost call prints, and only when
+     * there is something to show. */
+    if (s != str || *s == '\0')
+       return 0;
+
+    fprintf(fp, " [-%s]", s);
+    return strlen(s) + 4;
+}
+
+/*
+ * Print the compact usage summary: intro, combined short options, every
+ * option of every table, aliases, execs and finally the "other help" text.
+ * The flags argument is not used.
+ */
+void poptPrintUsage(poptContext con, FILE * fp, /*@unused@*/ int flags)
+{
+    const void * tables[64];
+    struct poptDone_s seen;
+    poptDone done = &seen;
+    int cursor;
+
+    /* Start with an empty "already printed" list ... */
+    memset(&seen, 0, sizeof(seen));
+    memset(tables, 0, sizeof(tables));
+    done->nopts = 0;
+    done->maxopts = (int) (sizeof(tables) / sizeof(tables[0]));
+    done->opts = tables;
+/*@-boundswrite@*/
+    /* ... and mark the top-level table as visited. */
+    done->opts[done->nopts++] = (const void *) con->options;
+/*@=boundswrite@*/
+
+    cursor = showHelpIntro(con, fp);
+    cursor += showShortOptions(con->options, fp, NULL);
+    cursor = singleTableUsage(con, fp, cursor, con->options, NULL, done);
+    cursor = itemUsage(fp, cursor, con->aliases, con->numAliases, NULL);
+    cursor = itemUsage(fp, cursor, con->execs, con->numExecs, NULL);
+
+    if (con->otherHelp) {
+       cursor += strlen(con->otherHelp) + 1;
+       /* Wrap before the trailing text if it would pass column 79. */
+       if (cursor > 79) fprintf(fp, "\n       ");
+       fprintf(fp, " %s", con->otherHelp);
+    }
+
+    fprintf(fp, "\n");
+}
+
+/*
+ * Replace the text printed after the program name in help/usage output.
+ * The previous text is released and a private copy of the new one is kept.
+ * Passing NULL clears the text, so poptPrintHelp() falls back to its
+ * default placeholder instead of handing NULL to the string copy.
+ */
+void poptSetOtherOptionHelp(poptContext con, const char * text)
+{
+    con->otherHelp = _free(con->otherHelp);
+    if (text != NULL)
+       con->otherHelp = xstrdup(text);
+}
+/*@=type@*/
diff --git a/ctdb/lib/popt/poptint.h b/ctdb/lib/popt/poptint.h
new file mode 100644 (file)
index 0000000..d078997
--- /dev/null
@@ -0,0 +1,116 @@
+/** \ingroup popt
+ * \file popt/poptint.h
+ */
+
+/* (C) 1998-2000 Red Hat, Inc. -- Licensing details are in the COPYING
+   file accompanying popt source distributions, available from 
+   ftp://ftp.rpm.org/pub/rpm/dist. */
+
+#ifndef H_POPTINT
+#define H_POPTINT
+
+/**
+ * Release memory and hand back NULL, so callers can write
+ * "p = _free(p);".  Takes a const pointer to spare callers the cast and
+ * tolerates NULL.
+ * @param p            memory to free
+ * @retval             NULL always
+ */
+/*@unused@*/ static inline /*@null@*/ void *
+_free(/*@only@*/ /*@null@*/ const void * p)
+       /*@modifies p @*/
+{
+    if (p == NULL)
+       return NULL;
+
+    /* Go through intptr_t to drop the const qualifier. */
+    free((void *)(intptr_t)p);
+    return NULL;
+}
+
+/* Bit mask macros.
+ * A pbm_set is an array of __pbm_bits words used as a bit set; bit d
+ * lives in word __PBM_IX(d) under mask __PBM_MASK(d). */
+/*@-exporttype -redef @*/
+typedef        unsigned int __pbm_bits;
+/*@=exporttype =redef @*/
+/* number of bits held by one word */
+#define        __PBM_NBITS             (8 * sizeof (__pbm_bits))
+/* index of the word holding bit d */
+#define        __PBM_IX(d)             ((d) / __PBM_NBITS)
+/* mask selecting bit d inside its word */
+#define __PBM_MASK(d)          ((__pbm_bits) 1 << (((unsigned)(d)) % __PBM_NBITS))
+/*@-exporttype -redef @*/
+/* Declared with one word; PBM_ALLOC sizes the real allocation. */
+typedef struct {
+    __pbm_bits bits[1];
+} pbm_set;
+/*@=exporttype =redef @*/
+#define        __PBM_BITS(set) ((set)->bits)
+
+/* Allocate a zeroed set large enough to address bits 0..d. */
+#define        PBM_ALLOC(d)    calloc(__PBM_IX (d) + 1, sizeof(__pbm_bits))
+/* NOTE(review): the expansion carries its own trailing ';', so this macro
+ * cannot be used as the body of an if/else without braces - confirm
+ * against the call sites before changing it. */
+#define        PBM_FREE(s)     _free(s);
+/* Set, clear and test bit d of set s. */
+#define PBM_SET(d, s)   (__PBM_BITS (s)[__PBM_IX (d)] |= __PBM_MASK (d))
+#define PBM_CLR(d, s)   (__PBM_BITS (s)[__PBM_IX (d)] &= ~__PBM_MASK (d))
+#define PBM_ISSET(d, s) ((__PBM_BITS (s)[__PBM_IX (d)] & __PBM_MASK (d)) != 0)
+
+/* One level of the option-parsing stack held in poptContext_s.
+ * NOTE(review): field roles other than argv are not visible in this part
+ * of the source; the notes below are assumptions to confirm in popt.c. */
+struct optionStackEntry {
+    int argc;                  /* presumably the number of entries in argv */
+/*@only@*/ /*@null@*/
+    const char ** argv;                /* argument vector; argv[0] of the first entry
+                                * is shown as the program name in usage output */
+/*@only@*/ /*@null@*/
+    pbm_set * argb;            /* bit set over the arguments - TODO confirm use */
+    int next;
+/*@only@*/ /*@null@*/
+    const char * nextArg;
+/*@observer@*/ /*@null@*/
+    const char * nextCharArg;
+/*@dependent@*/ /*@null@*/
+    poptItem currAlias;
+    int stuffed;
+};
+
+/* State behind a poptContext handle.
+ * Only the fields read by the help/usage code are described here; the
+ * remaining ones are used by code outside this header. */
+struct poptContext_s {
+    struct optionStackEntry optionStack[POPT_OPTION_DEPTH];
+/*@dependent@*/
+    struct optionStackEntry * os;
+/*@owned@*/ /*@null@*/
+    const char ** leftovers;
+    int numLeftovers;
+    int nextLeftover;
+/*@keep@*/
+    const struct poptOption * options; /* top-level option table */
+    int restLeftover;
+/*@only@*/ /*@null@*/
+    const char * appName;
+/*@only@*/ /*@null@*/
+    poptItem aliases;          /* alias items, numAliases entries */
+    int numAliases;
+    int flags;                 /* POPT_CONTEXT_* bits, e.g. POPT_CONTEXT_KEEP_FIRST */
+/*@owned@*/ /*@null@*/
+    poptItem execs;            /* exec items, numExecs entries */
+    int numExecs;
+/*@only@*/ /*@null@*/
+    const char ** finalArgv;
+    int finalArgvCount;
+    int finalArgvAlloced;
+/*@dependent@*/ /*@null@*/
+    poptItem doExec;
+/*@only@*/
+    const char * execPath;
+    int execAbsolute;
+/*@only@*/
+    const char * otherHelp;    /* text printed after the program name in
+                                * help/usage output; set by
+                                * poptSetOtherOptionHelp() */
+/*@null@*/
+    pbm_set * arg_strip;
+};
+
+/* Message translation helpers.  Each macro falls back to the untranslated
+ * string when the corresponding gettext facility is unavailable or when
+ * the file is being checked by lclint. */
+#ifdef HAVE_LIBINTL_H
+#include <libintl.h>
+#endif
+
+/* _(): translate in the current default domain. */
+#if defined(HAVE_GETTEXT) && !defined(__LCLINT__)
+#define _(foo) gettext(foo)
+#else
+#define _(foo) foo
+#endif
+
+/* D_(): translate in an explicit domain; POPT_(): popt's own messages.
+ * NOTE(review): the guard tests HAVE_DCGETTEXT while the expansion calls
+ * dgettext() - confirm this is intended. */
+#if defined(HAVE_DCGETTEXT) && !defined(__LCLINT__)
+#define D_(dom, str) dgettext(dom, str)
+#define POPT_(foo) D_("popt", foo)
+#else
+#define D_(dom, str) str
+#define POPT_(foo) foo
+#endif
+
+/* N_(): expands to its argument unchanged. */
+#define N_(foo) foo
+
+#endif
diff --git a/ctdb/lib/popt/poptparse.c b/ctdb/lib/popt/poptparse.c
new file mode 100644 (file)
index 0000000..a0dea80
--- /dev/null
@@ -0,0 +1,227 @@
+/** \ingroup popt
+ * \file popt/poptparse.c
+ */
+
+/* (C) 1998-2002 Red Hat, Inc. -- Licensing details are in the COPYING
+   file accompanying popt source distributions, available from 
+   ftp://ftp.rpm.org/pub/rpm/dist. */
+
+#include "system.h"
+
+#define POPT_ARGV_ARRAY_GROW_DELTA 5
+
+/*@-boundswrite@*/
+/*
+ * Duplicate an argument vector into ONE malloc'ed block: the pointer table
+ * (argc + 1 entries, NULL terminated) comes first and the string bytes
+ * follow it, so a single free() of *argvPtr releases everything.
+ * Returns 0 on success, POPT_ERROR_NOARG for an empty/NULL vector or a
+ * NULL element, POPT_ERROR_MALLOC when allocation fails.  If argvPtr is
+ * NULL the copy is discarded again.
+ */
+int poptDupArgv(int argc, const char **argv,
+               int * argcPtr, const char *** argvPtr)
+{
+    size_t tableBytes;
+    size_t total;
+    const char ** copy;
+    char * cursor;
+    int i;
+
+    if (argc <= 0 || argv == NULL)     /* XXX can't happen */
+       return POPT_ERROR_NOARG;
+
+    /* Size of the pointer table plus every string with its terminator. */
+    tableBytes = (argc + 1) * sizeof(*argv);
+    total = tableBytes;
+    for (i = 0; i < argc; i++) {
+       if (argv[i] == NULL)
+           return POPT_ERROR_NOARG;
+       total += strlen(argv[i]) + 1;
+    }
+
+    cursor = malloc(total);
+    if (cursor == NULL)                        /* XXX can't happen */
+       return POPT_ERROR_MALLOC;
+    copy = (void *) cursor;
+    cursor += tableBytes;
+
+    /*@-branchstate@*/
+    for (i = 0; i < argc; i++) {
+       copy[i] = cursor;
+       strcpy(cursor, argv[i]);
+       cursor += strlen(argv[i]) + 1;
+    }
+    /*@=branchstate@*/
+    copy[argc] = NULL;
+
+    if (argvPtr != NULL)
+       *argvPtr = copy;
+    else
+       free(copy);
+
+    if (argcPtr != NULL)
+       *argcPtr = argc;
+    return 0;
+}
+/*@=boundswrite@*/
+
+/*@-bounds@*/
+/*
+ * Split a string into an argument vector, shell style: whitespace
+ * separates arguments, single or double quotes group text, and a
+ * backslash escapes the next character.  The result is handed to
+ * poptDupArgv(), so *argvPtr is a single malloc'ed block.
+ * Returns 0 on success, POPT_ERROR_NULLARG for a NULL string,
+ * POPT_ERROR_BADQUOTE for a trailing backslash, POPT_ERROR_MALLOC on
+ * allocation failure, or whatever poptDupArgv() reports.
+ */
+int poptParseArgvString(const char * s, int * argcPtr, const char *** argvPtr)
+{
+    const char * src;
+    char quote = '\0';
+    int argvAlloced = POPT_ARGV_ARRAY_GROW_DELTA;
+    const char ** argv;
+    int argc = 0;
+    int buflen;
+    char * buf;
+    int rc = POPT_ERROR_MALLOC;
+
+    if (s == NULL) return POPT_ERROR_NULLARG;
+
+    argv = malloc(sizeof(*argv) * argvAlloced);
+    if (argv == NULL) return rc;
+
+    /* Scratch copy, pre-zeroed: skipping one byte terminates an argument. */
+    buflen = strlen(s) + 1;
+    buf = memset(alloca(buflen), 0, buflen);
+    argv[argc] = buf;
+
+    for (src = s; *src != '\0'; src++) {
+       if (quote == *src) {
+           /* closing quote */
+           quote = '\0';
+       } else if (quote != '\0') {
+           /* inside quotes a backslash only escapes the quote character */
+           if (*src == '\\') {
+               src++;
+               if (!*src) {
+                   rc = POPT_ERROR_BADQUOTE;
+                   goto exit;
+               }
+               if (*src != quote) *buf++ = '\\';
+           }
+           *buf++ = *src;
+       } else if (isspace((unsigned char) *src)) {
+           /* the cast keeps isspace() defined for bytes >= 0x80 */
+           if (*argv[argc] != '\0') {
+               buf++, argc++;
+               if (argc == argvAlloced) {
+                   const char ** grown;
+
+                   argvAlloced += POPT_ARGV_ARRAY_GROW_DELTA;
+                   /* Keep argv intact on failure so it is freed at exit. */
+                   grown = realloc(argv, sizeof(*argv) * argvAlloced);
+                   if (grown == NULL) goto exit;
+                   argv = grown;
+               }
+               argv[argc] = buf;
+           }
+       } else switch (*src) {
+         case '"':
+         case '\'':
+           quote = *src;
+           /*@switchbreak@*/ break;
+         case '\\':
+           src++;
+           if (!*src) {
+               rc = POPT_ERROR_BADQUOTE;
+               goto exit;
+           }
+           /*@fallthrough@*/
+         default:
+           *buf++ = *src;
+           /*@switchbreak@*/ break;
+       }
+    }
+
+    /* Count the last argument if the string did not end in whitespace. */
+    if (strlen(argv[argc])) {
+       argc++, buf++;
+    }
+
+    rc = poptDupArgv(argc, argv, argcPtr, argvPtr);
+
+exit:
+    if (argv) free(argv);
+    return rc;
+}
+/*@=bounds@*/
+
+/*
+ * Make sure the option string buffer can hold `needed` characters plus the
+ * terminating NUL, doubling *sizep until it does.  Returns the (possibly
+ * moved) buffer, or NULL after freeing the old buffer when out of memory.
+ */
+static char * poptGrowArgString(char * argstr, int needed, int * sizep)
+{
+    char * grown;
+
+    if (needed < *sizep)
+       return argstr;
+    while (needed >= *sizep)
+       *sizep *= 2;
+    grown = realloc(argstr, *sizep);
+    if (grown == NULL)
+       free(argstr);
+    return grown;
+}
+
+/* still in the dev stage.
+ * Read a config file of "name" and "name = value" lines and build the
+ * equivalent option string: " --name" and " --name=\"value\"".
+ * Blank lines, lines starting with '#', and lines that are neither form
+ * are ignored.  On success *argstrp receives a malloc'ed string and 0 is
+ * returned; on any error *argstrp stays NULL and nothing is leaked.
+ * Errors: POPT_ERROR_NULLARG (no file), POPT_ERROR_MALLOC,
+ * POPT_ERROR_OVERFLOW (line too long).  The flags argument is not used.
+ */
+int poptConfigFileToString(FILE *fp, char ** argstrp, /*@unused@*/ int flags)
+{
+    char line[999];
+    char * argstr;
+    char * p;
+    char * q;
+    char * x;
+    int t;
+    int argvlen = 0;
+    size_t maxlinelen = sizeof(line);
+    size_t linelen;
+    int maxargvlen = 480;
+
+    *argstrp = NULL;
+
+    /*   |   this_is   =   our_line
+     *      p             q      x
+     */
+
+    if (fp == NULL)
+       return POPT_ERROR_NULLARG;
+
+    argstr = calloc(maxargvlen, sizeof(*argstr));
+    if (argstr == NULL) return POPT_ERROR_MALLOC;
+
+    while (fgets(line, (int)maxlinelen, fp) != NULL) {
+       p = line;
+
+       /* loop until first non-space char or EOL */
+       while( *p != '\0' && isspace((unsigned char) *p) )
+           p++;
+
+       linelen = strlen(p);
+       if (linelen >= maxlinelen-1) {
+           free(argstr);
+           return POPT_ERROR_OVERFLOW; /* XXX line too long */
+       }
+
+       if (*p == '\0' || *p == '\n') continue; /* line is empty */
+       if (*p == '#') continue;                /* comment line */
+
+       /* find the end of the name */
+       q = p;
+       while (*q != '\0' && (!isspace((unsigned char) *q)) && *q != '=')
+           q++;
+
+       if (isspace((unsigned char) *q)) {
+           /* a space after the name, find next non space */
+           *q++='\0';
+           while( *q != '\0' && isspace((unsigned char) *q) ) q++;
+       }
+       if (*q == '\0') {
+           /* single command line option (ie, no name=val, just name).
+            * The name is already terminated here: either by the NUL
+            * written above (which also removed the fgets() newline) or
+            * by the end of a final line that has no newline. */
+           t = (int) strlen(p);
+           argvlen += t + (sizeof(" --")-1);
+           argstr = poptGrowArgString(argstr, argvlen, &maxargvlen);
+           if (argstr == NULL) return POPT_ERROR_MALLOC;
+           strcat(argstr, " --");
+           strcat(argstr, p);
+           continue;
+       }
+       if (*q != '=')
+           continue;   /* XXX for now, silently ignore bogus line */
+
+       /* *q is an equal sign. */
+       *q++ = '\0';
+
+       /* find next non-space letter of value */
+       while (*q != '\0' && isspace((unsigned char) *q))
+           q++;
+       if (*q == '\0')
+           continue;   /* XXX silently ignore missing value */
+
+       /* now, loop and strip all ending whitespace */
+       x = p + linelen;
+       while (isspace((unsigned char) *--x))
+           *x = 0;     /* null out last char if space (including fgets() NL) */
+
+       /* rest of line accept; t bounds strlen(name) + strlen(value) */
+       t = x - p;
+       argvlen += t + (sizeof("' --='")-1);
+       argstr = poptGrowArgString(argstr, argvlen, &maxargvlen);
+       if (argstr == NULL) return POPT_ERROR_MALLOC;
+       strcat(argstr, " --");
+       strcat(argstr, p);
+       strcat(argstr, "=\"");
+       strcat(argstr, q);
+       strcat(argstr, "\"");
+    }
+
+    *argstrp = argstr;
+    return 0;
+}
diff --git a/ctdb/lib/popt/samba.m4 b/ctdb/lib/popt/samba.m4
new file mode 100644 (file)
index 0000000..b6e939f
--- /dev/null
@@ -0,0 +1,10 @@
+# Select how LIBPOPT is provided, based on the result of libpopt.m4.
+m4_include(lib/popt/libpopt.m4)
+
+# An empty POPT_OBJ registers LIBPOPT as an external library linked with
+# ${POPT_LIBS}; otherwise the objects under lib/popt are built as a
+# subsystem with -Ilib/popt on the include path.
+if test x"$POPT_OBJ" = "x"; then
+       SMB_EXT_LIB(LIBPOPT, [${POPT_LIBS}])
+       SMB_ENABLE(LIBPOPT,YES)
+else
+       SMB_SUBSYSTEM(LIBPOPT,
+       [lib/popt/findme.o lib/popt/popt.o lib/popt/poptconfig.o lib/popt/popthelp.o lib/popt/poptparse.o], [], [-Ilib/popt])
+fi
+
diff --git a/ctdb/lib/popt/system.h b/ctdb/lib/popt/system.h
new file mode 100644 (file)
index 0000000..000e23d
--- /dev/null
@@ -0,0 +1,74 @@
+#include "config.h"
+
+#if defined (__GLIBC__) && defined(__LCLINT__)
+/*@-declundef@*/
+/*@unchecked@*/
+extern __const __int32_t *__ctype_tolower;
+/*@unchecked@*/
+extern __const __int32_t *__ctype_toupper;
+/*@=declundef@*/
+#endif
+
+#include <ctype.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+
+#if HAVE_MCHECK_H 
+#include <mcheck.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#if HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#ifdef __NeXT
+/* access macros are not declared in non posix mode in unistd.h -
+ don't try to use posix on NeXTstep 3.3 ! */
+#include <libc.h>
+#endif
+
+#if defined(__LCLINT__)
+/*@-declundef -incondefs -redecl@*/ /* LCL: missing annotation */
+/*@only@*/ void * alloca (size_t __size)
+       /*@ensures MaxSet(result) == (__size - 1) @*/
+       /*@*/;
+/*@=declundef =incondefs =redecl@*/
+#endif
+
+/* AIX requires this to be the first thing in the file.  */ 
+#ifndef __GNUC__
+# if HAVE_ALLOCA_H
+#  include <alloca.h>
+# else
+#  ifdef _AIX
+#pragma alloca
+#  else
+#   ifndef alloca /* predefined by HP cc +Olibcalls */
+char *alloca ();
+#   endif
+#  endif
+# endif
+#elif defined(__GNUC__) && defined(__STRICT_ANSI__)
+#define alloca __builtin_alloca
+#endif
+
+/* xstrdup(): duplicate a string.
+ * With <mcheck.h> and GCC it is built from malloc()+strcpy() and ends the
+ * program through vmefail() when malloc() fails; otherwise it is plain
+ * strdup(), which can return NULL. */
+/*@-redecl -redef@*/
+/*@mayexit@*/ /*@only@*/ char * xstrdup (const char *str)
+       /*@*/;
+/*@=redecl =redef@*/
+
+#if HAVE_MCHECK_H && defined(__GNUC__)
+/* Print a message and exit; the trailing NULL only gives the comma
+ * expression a pointer type. */
+#define        vmefail()       (fprintf(stderr, "virtual memory exhausted.\n"), exit(EXIT_FAILURE), NULL)
+/* Uses the GNU "a ? : b" extension; _str is evaluated twice. */
+#define xstrdup(_str)   (strcpy((malloc(strlen(_str)+1) ? : vmefail()), (_str)))
+#else
+#define        xstrdup(_str)   strdup(_str)
+#endif  /* HAVE_MCHECK_H && defined(__GNUC__) */
+
+
+#include "popt.h"
diff --git a/ctdb/lib/replace/.checker_innocent b/ctdb/lib/replace/.checker_innocent
new file mode 100644 (file)
index 0000000..e619176
--- /dev/null
@@ -0,0 +1,4 @@
+>>>MISTAKE21_create_files_6a9e68ada99a97cb
+>>>MISTAKE21_os2_delete_9b2bfa7f38711d09
+>>>MISTAKE21_os2_delete_2fcc29aaa99a97cb
+>>>SECURITY2_os2_delete_9b2bfa7f1c9396ca
diff --git a/ctdb/lib/replace/Makefile b/ctdb/lib/replace/Makefile
new file mode 100644 (file)
index 0000000..3649901
--- /dev/null
@@ -0,0 +1,63 @@
+# simple makefile wrapper to run waf
+# Every target below just forwards to the matching waf command.
+
+# waf is looked up in buildtools/bin, relative to this directory or two
+# levels up, before the normal PATH; WAF_MAKE=1 is exported to it.
+WAF=WAF_MAKE=1 PATH=buildtools/bin:../../buildtools/bin:$$PATH waf
+
+all:
+       $(WAF) build
+
+install:
+       $(WAF) install
+
+uninstall:
+       $(WAF) uninstall
+
+# TEST_OPTIONS can be set on the make command line to pass extra
+# arguments to "waf test".
+test:
+       $(WAF) test $(TEST_OPTIONS)
+
+testenv:
+       $(WAF) test --testenv $(TEST_OPTIONS)
+
+quicktest:
+       $(WAF) test --quick $(TEST_OPTIONS)
+
+# dist and distcheck run waf with WAFLOCK pointing at a scratch lock file.
+dist:
+       touch .tmplock
+       WAFLOCK=.tmplock $(WAF) dist
+
+distcheck:
+       touch .tmplock
+       WAFLOCK=.tmplock $(WAF) distcheck
+
+clean:
+       $(WAF) clean
+
+distclean:
+       $(WAF) distclean
+
+reconfigure: configure
+       $(WAF) reconfigure
+
+show_waf_options:
+       $(WAF) --help
+
+# some compatibility make targets
+everything: all
+
+testsuite: all
+
+check: test
+
+torture: all
+
+# this should do an install as well, once install is finished
+installcheck: test
+
+etags:
+       $(WAF) etags
+
+ctags:
+       $(WAF) ctags
+
+# Build a single target by name: "make bin/foo" runs waf --targets=foo.
+# FORCE has no rule body, so these targets are always considered stale.
+bin/%:: FORCE
+       $(WAF) --targets=`basename $@`
+FORCE:
diff --git a/ctdb/lib/replace/README b/ctdb/lib/replace/README
new file mode 100644 (file)
index 0000000..9dd4f73
--- /dev/null
@@ -0,0 +1,127 @@
+This subsystem ensures that we can always use a certain core set of 
+functions and types, that are either provided by the OS or by replacement 
+functions / definitions in this subsystem. The aim is to try to stick 
+to POSIX functions in here as much as possible. Convenience functions 
+that are available on no platform at all belong in other subsystems
+(such as LIBUTIL).
+
+The following functions are guaranteed:
+
+ftruncate
+strlcpy
+strlcat
+mktime
+rename
+initgroups
+memmove
+strdup
+setlinebuf
+vsyslog
+timegm
+setenv
+unsetenv
+strndup
+strnlen
+waitpid
+seteuid
+setegid
+asprintf
+snprintf
+vasprintf
+vsnprintf
+opendir
+readdir
+telldir
+seekdir
+clock_gettime
+closedir
+dlopen
+dlclose
+dlsym
+dlerror
+chroot
+bzero
+strerror
+errno
+mkdtemp
+mkstemp (a secure one!)
+pread
+pwrite
+chown
+lchown
+readline (the library)
+inet_ntoa
+inet_ntop
+inet_pton
+inet_aton
+strtoll
+strtoull
+socketpair
+strptime
+getaddrinfo
+freeaddrinfo
+getnameinfo
+gai_strerror
+getifaddrs
+freeifaddrs
+utime
+utimes
+dup2
+link
+readlink
+symlink
+realpath
+poll
+setproctitle
+
+Types:
+bool
+socklen_t
+uint{8,16,32,64}_t
+int{8,16,32,64}_t
+intptr_t
+sig_atomic_t
+blksize_t
+blkcnt_t
+
+Constants:
+PATH_NAME_MAX
+UINT{16,32,64}_MAX
+INT32_MAX
+RTLD_LAZY
+HOST_NAME_MAX
+UINT16_MAX
+UINT32_MAX
+UINT64_MAX
+CHAR_BIT
+
+Macros:
+va_copy
+__FUNCTION__
+__FILE__
+__LINE__
+__LINESTR__
+__location__
+__STRING
+__STRINGSTRING
+MIN
+MAX
+QSORT_CAST
+ZERO_STRUCT
+ZERO_STRUCTP
+ZERO_STRUCTPN
+ZERO_ARRAY
+ARRAY_SIZE
+PTR_DIFF
+
+Headers:
+stdint.h
+stdbool.h
+
+Optional C keywords:
+volatile
+
+Prerequisites:
+memset (for bzero)
+syslog (for vsyslog)
+mktemp (for mkstemp and mkdtemp)
diff --git a/ctdb/lib/replace/autoconf-2.60.m4 b/ctdb/lib/replace/autoconf-2.60.m4
new file mode 100644 (file)
index 0000000..b2694fd
--- /dev/null
@@ -0,0 +1,236 @@
+# AC_GNU_SOURCE
+# --------------
+AC_DEFUN([AC_GNU_SOURCE],
+[AH_VERBATIM([_GNU_SOURCE],
+[/* Enable GNU extensions on systems that have them.  */
+#ifndef _GNU_SOURCE
+# undef _GNU_SOURCE
+#endif])dnl
+AC_BEFORE([$0], [AC_COMPILE_IFELSE])dnl
+AC_BEFORE([$0], [AC_RUN_IFELSE])dnl
+AC_DEFINE([_GNU_SOURCE])
+])
+
+# _AC_C_STD_TRY(STANDARD, TEST-PROLOGUE, TEST-BODY, OPTION-LIST,
+#              ACTION-IF-AVAILABLE, ACTION-IF-UNAVAILABLE)
+# --------------------------------------------------------------
+# Check whether the C compiler accepts features of STANDARD (e.g `c89', `c99')
+# by trying to compile a program of TEST-PROLOGUE and TEST-BODY.  If this fails,
+# try again with each compiler option in the space-separated OPTION-LIST; if one
+# helps, append it to CC.  If eventually successful, run ACTION-IF-AVAILABLE,
+# else ACTION-IF-UNAVAILABLE.
+AC_DEFUN([_AC_C_STD_TRY],
+[AC_MSG_CHECKING([for $CC option to accept ISO ]m4_translit($1, [c], [C]))
+AC_CACHE_VAL(ac_cv_prog_cc_$1,
+[ac_cv_prog_cc_$1=no
+ac_save_CC=$CC
+AC_LANG_CONFTEST([AC_LANG_PROGRAM([$2], [$3])])
+for ac_arg in '' $4
+do
+  CC="$ac_save_CC $ac_arg"
+  _AC_COMPILE_IFELSE([], [ac_cv_prog_cc_$1=$ac_arg])
+  test "x$ac_cv_prog_cc_$1" != "xno" && break
+done
+rm -f conftest.$ac_ext
+CC=$ac_save_CC
+])# AC_CACHE_VAL
+case "x$ac_cv_prog_cc_$1" in
+  x)
+    AC_MSG_RESULT([none needed]) ;;
+  xno)
+    AC_MSG_RESULT([unsupported]) ;;
+  *)
+    CC="$CC $ac_cv_prog_cc_$1"
+    AC_MSG_RESULT([$ac_cv_prog_cc_$1]) ;;
+esac
+AS_IF([test "x$ac_cv_prog_cc_$1" != xno], [$5], [$6])
+])# _AC_C_STD_TRY
+
+# _AC_PROG_CC_C99 ([ACTION-IF-AVAILABLE], [ACTION-IF-UNAVAILABLE])
+# ----------------------------------------------------------------
+# If the C compiler is not in ISO C99 mode by default, try to add an
+# option to output variable CC to make it so.  This macro tries
+# various options that select ISO C99 on some system or another.  It
+# considers the compiler to be in ISO C99 mode if it handles mixed
+# code and declarations, _Bool, inline and restrict.
+AC_DEFUN([_AC_PROG_CC_C99],
+[_AC_C_STD_TRY([c99],
+[[#include <stdarg.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <wchar.h>
+#include <stdio.h>
+
+struct incomplete_array
+{
+  int datasize;
+  double data[];
+};
+
+struct named_init {
+  int number;
+  const wchar_t *name;
+  double average;
+};
+
+typedef const char *ccp;
+
+static inline int
+test_restrict(ccp restrict text)
+{
+  // See if C++-style comments work.
+  // Iterate through items via the restricted pointer.
+  // Also check for declarations in for loops.
+  for (unsigned int i = 0; *(text+i) != '\0'; ++i)
+    continue;
+  return 0;
+}
+
+// Check varargs and va_copy work.
+static void
+test_varargs(const char *format, ...)
+{
+  va_list args;
+  va_start(args, format);
+  va_list args_copy;
+  va_copy(args_copy, args);
+
+  const char *str;
+  int number;
+  float fnumber;
+
+  while (*format)
+    {
+      switch (*format++)
+       {
+       case 's': // string
+         str = va_arg(args_copy, const char *);
+         break;
+       case 'd': // int
+         number = va_arg(args_copy, int);
+         break;
+       case 'f': // float
+         fnumber = (float) va_arg(args_copy, double);
+         break;
+       default:
+         break;
+       }
+    }
+  va_end(args_copy);
+  va_end(args);
+}
+]],
+[[
+  // Check bool and long long datatypes.
+  _Bool success = false;
+  long long int bignum = -1234567890LL;
+  unsigned long long int ubignum = 1234567890uLL;
+
+  // Check restrict.
+  if (test_restrict("String literal") != 0)
+    success = true;
+  char *restrict newvar = "Another string";
+
+  // Check varargs.
+  test_varargs("s, d' f .", "string", 65, 34.234);
+
+  // Check incomplete arrays work.
+  struct incomplete_array *ia =
+    malloc(sizeof(struct incomplete_array) + (sizeof(double) * 10));
+  ia->datasize = 10;
+  for (int i = 0; i < ia->datasize; ++i)
+    ia->data[i] = (double) i * 1.234;
+
+  // Check named initialisers.
+  struct named_init ni = {
+    .number = 34,
+    .name = L"Test wide string",
+    .average = 543.34343,
+  };
+
+  ni.number = 58;
+
+  int dynamic_array[ni.number];
+  dynamic_array[43] = 543;
+
+  // work around unused variable warnings
+  return  bignum == 0LL || ubignum == 0uLL || newvar[0] == 'x';
+]],
+dnl Try
+dnl GCC                -std=gnu99 (unused restrictive modes: -std=c99 -std=iso9899:1999)
+dnl AIX                -qlanglvl=extc99 (unused restrictive mode: -qlanglvl=stdc99)
+dnl Intel ICC  -c99
+dnl IRIX       -c99
+dnl Solaris    (unused because it causes the compiler to assume C99 semantics for
+dnl            library functions, and this is invalid before Solaris 10: -xc99)
+dnl Tru64      -c99
+dnl with extended modes being tried first.
+[[-std=gnu99 -c99 -qlanglvl=extc99]], [$1], [$2])[]dnl
+])# _AC_PROG_CC_C99
+
+# AC_PROG_CC_C99
+# --------------
+AC_DEFUN([AC_PROG_CC_C99],
+[ AC_REQUIRE([AC_PROG_CC])dnl
+  _AC_PROG_CC_C99
+])
+
+# AC_USE_SYSTEM_EXTENSIONS
+# ------------------------
+# Enable extensions on systems that normally disable them,
+# typically due to standards-conformance issues.
+m4_ifndef([AC_USE_SYSTEM_EXTENSIONS],[
+AC_DEFUN([AC_USE_SYSTEM_EXTENSIONS],
+[AC_BEFORE([$0], [AC_COMPILE_IFELSE])dnl
+AC_BEFORE([$0], [AC_RUN_IFELSE])dnl
+
+  AC_CHECK_HEADER([minix/config.h], [MINIX=yes], [MINIX=])
+  if test "$MINIX" = yes; then
+    AC_DEFINE([_POSIX_SOURCE], [1],
+      [Define to 1 if you need to in order for `stat' and other
+       things to work.])
+    AC_DEFINE([_POSIX_1_SOURCE], [2],
+      [Define to 2 if the system does not provide POSIX.1 features
+       except with this defined.])
+    AC_DEFINE([_MINIX], [1],
+      [Define to 1 if on MINIX.])
+  fi
+
+  AH_VERBATIM([__EXTENSIONS__],
+[/* Enable extensions on AIX 3, Interix.  */
+#ifndef _ALL_SOURCE
+# undef _ALL_SOURCE
+#endif
+/* Enable GNU extensions on systems that have them.  */
+#ifndef _GNU_SOURCE
+# undef _GNU_SOURCE
+#endif
+/* Enable threading extensions on Solaris.  */
+#ifndef _POSIX_PTHREAD_SEMANTICS
+# undef _POSIX_PTHREAD_SEMANTICS
+#endif
+/* Enable extensions on HP NonStop.  */
+#ifndef _TANDEM_SOURCE
+# undef _TANDEM_SOURCE
+#endif
+/* Enable general extensions on Solaris.  */
+#ifndef __EXTENSIONS__
+# undef __EXTENSIONS__
+#endif
+])
+  AC_CACHE_CHECK([whether it is safe to define __EXTENSIONS__],
+    [ac_cv_safe_to_define___extensions__],
+    [AC_COMPILE_IFELSE(
+       [AC_LANG_PROGRAM([[
+#        define __EXTENSIONS__ 1
+         ]AC_INCLUDES_DEFAULT])],
+       [ac_cv_safe_to_define___extensions__=yes],
+       [ac_cv_safe_to_define___extensions__=no])])
+  test $ac_cv_safe_to_define___extensions__ = yes &&
+    AC_DEFINE([__EXTENSIONS__])
+  AC_DEFINE([_ALL_SOURCE])
+  AC_DEFINE([_GNU_SOURCE])
+  AC_DEFINE([_POSIX_PTHREAD_SEMANTICS])
+  AC_DEFINE([_TANDEM_SOURCE])
+])# AC_USE_SYSTEM_EXTENSIONS
+])
diff --git a/ctdb/lib/replace/configure b/ctdb/lib/replace/configure
new file mode 100755 (executable)
index 0000000..6a9f875
--- /dev/null
@@ -0,0 +1,21 @@
+#!/bin/sh
+
+# Wrapper that runs "waf configure" with the caller's arguments.
+#
+# waf is looked for relative to the directory holding this script
+# ($PREVPATH), but the WAF path that is then executed is relative to the
+# current working directory.
+# NOTE(review): this presumably expects to be run from the script's own
+# directory -- confirm before invoking it from elsewhere.
+#
+# All expansions of $0 / $PREVPATH are quoted so that a checkout path
+# containing whitespace does not get split into several words.
+PREVPATH=`dirname "$0"`
+
+if [ -f "$PREVPATH/../../buildtools/bin/waf" ]; then
+       WAF=../../buildtools/bin/waf
+elif [ -f "$PREVPATH/buildtools/bin/waf" ]; then
+       WAF=./buildtools/bin/waf
+else
+       echo "replace: Unable to find waf"
+       exit 1
+fi
+
+# using JOBS=1 gives maximum compatibility with
+# systems like AIX which have broken threading in python
+JOBS=1
+export JOBS
+
+cd . || exit 1
+$WAF configure "$@" || exit 1
+cd "$PREVPATH"
diff --git a/ctdb/lib/replace/crypt.c b/ctdb/lib/replace/crypt.c
new file mode 100644 (file)
index 0000000..3a067bc
--- /dev/null
@@ -0,0 +1,770 @@
+/*
+   This bit of code was derived from the UFC-crypt package which
+   carries the following copyright 
+   
+   Modified for use by Samba by Andrew Tridgell, October 1994
+
+   Note that this routine is only faster on some machines. Under Linux 1.1.51 
+   libc 4.5.26 I actually found this routine to be slightly slower.
+
+   Under SunOS I found a huge speedup by using these routines 
+   (a factor of 20 or so)
+
+   Warning: I've had a report from Steve Kennedy <steve@gbnet.org>
+   that this crypt routine may sometimes get the wrong answer. Only
+   use UFC_CRYPT if you really need it.
+
+*/
+
+#include "replace.h"
+
+#ifndef HAVE_CRYPT
+
+/*
+ * UFC-crypt: ultra fast crypt(3) implementation
+ *
+ * Copyright (C) 1991-1998, Free Software Foundation, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * @(#)crypt_util.c    2.31 02/08/92
+ *
+ * Support routines
+ *
+ */
+
+
+#ifndef long32
+#define long32 int32_t
+#endif
+
+#ifndef long64
+#define long64 int64_t
+#endif
+
+#ifndef ufc_long
+#define ufc_long unsigned
+#endif
+
+#ifndef _UFC_64_
+#define _UFC_32_
+#endif
+
+/* 
+ * Permutation done once on the 56 bit 
+ *  key derived from the original 8 byte ASCII key.
+ */
+static int pc1[56] = { 
+  57, 49, 41, 33, 25, 17,  9,  1, 58, 50, 42, 34, 26, 18,
+  10,  2, 59, 51, 43, 35, 27, 19, 11,  3, 60, 52, 44, 36,
+  63, 55, 47, 39, 31, 23, 15,  7, 62, 54, 46, 38, 30, 22,
+  14,  6, 61, 53, 45, 37, 29, 21, 13,  5, 28, 20, 12,  4
+};
+
+/*
+ * How much to rotate each 28 bit half of the pc1 permuted
+ *  56 bit key before using pc2 to give the i'th key
+ */
+static int rots[16] = { 
+  1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1 
+};
+
+/*
+ * Permutation giving the key
+ * of the i'th DES round
+ */
+static int pc2[48] = { 
+  14, 17, 11, 24,  1,  5,  3, 28, 15,  6, 21, 10,
+  23, 19, 12,  4, 26,  8, 16,  7, 27, 20, 13,  2,
+  41, 52, 31, 37, 47, 55, 30, 40, 51, 45, 33, 48,
+  44, 49, 39, 56, 34, 53, 46, 42, 50, 36, 29, 32
+};
+
+/*
+ * The E expansion table which selects
+ * bits from the 32 bit intermediate result.
+ */
+static int esel[48] = { 
+  32,  1,  2,  3,  4,  5,  4,  5,  6,  7,  8,  9,
+   8,  9, 10, 11, 12, 13, 12, 13, 14, 15, 16, 17,
+  16, 17, 18, 19, 20, 21, 20, 21, 22, 23, 24, 25,
+  24, 25, 26, 27, 28, 29, 28, 29, 30, 31, 32,  1
+};
+static int e_inverse[64];
+
+/* 
+ * Permutation done on the 
+ * result of sbox lookups 
+ */
+static int perm32[32] = {
+  16,  7, 20, 21, 29, 12, 28, 17,  1, 15, 23, 26,  5, 18, 31, 10,
+  2,   8, 24, 14, 32, 27,  3,  9, 19, 13, 30,  6, 22, 11,  4, 25
+};
+
+/* 
+ * The sboxes
+ */
+static int sbox[8][4][16]= {
+        { { 14,  4, 13,  1,  2, 15, 11,  8,  3, 10,  6, 12,  5,  9,  0,  7 },
+          {  0, 15,  7,  4, 14,  2, 13,  1, 10,  6, 12, 11,  9,  5,  3,  8 },
+          {  4,  1, 14,  8, 13,  6,  2, 11, 15, 12,  9,  7,  3, 10,  5,  0 },
+          { 15, 12,  8,  2,  4,  9,  1,  7,  5, 11,  3, 14, 10,  0,  6, 13 }
+        },
+
+        { { 15,  1,  8, 14,  6, 11,  3,  4,  9,  7,  2, 13, 12,  0,  5, 10 },
+          {  3, 13,  4,  7, 15,  2,  8, 14, 12,  0,  1, 10,  6,  9, 11,  5 },
+          {  0, 14,  7, 11, 10,  4, 13,  1,  5,  8, 12,  6,  9,  3,  2, 15 },
+          { 13,  8, 10,  1,  3, 15,  4,  2, 11,  6,  7, 12,  0,  5, 14,  9 }
+        },
+
+        { { 10,  0,  9, 14,  6,  3, 15,  5,  1, 13, 12,  7, 11,  4,  2,  8 },
+          { 13,  7,  0,  9,  3,  4,  6, 10,  2,  8,  5, 14, 12, 11, 15,  1 },
+          { 13,  6,  4,  9,  8, 15,  3,  0, 11,  1,  2, 12,  5, 10, 14,  7 },
+          {  1, 10, 13,  0,  6,  9,  8,  7,  4, 15, 14,  3, 11,  5,  2, 12 }
+        },
+
+        { {  7, 13, 14,  3,  0,  6,  9, 10,  1,  2,  8,  5, 11, 12,  4, 15 },
+          { 13,  8, 11,  5,  6, 15,  0,  3,  4,  7,  2, 12,  1, 10, 14,  9 },
+          { 10,  6,  9,  0, 12, 11,  7, 13, 15,  1,  3, 14,  5,  2,  8,  4 },
+          {  3, 15,  0,  6, 10,  1, 13,  8,  9,  4,  5, 11, 12,  7,  2, 14 }
+        },
+
+        { {  2, 12,  4,  1,  7, 10, 11,  6,  8,  5,  3, 15, 13,  0, 14,  9 },
+          { 14, 11,  2, 12,  4,  7, 13,  1,  5,  0, 15, 10,  3,  9,  8,  6 },
+          {  4,  2,  1, 11, 10, 13,  7,  8, 15,  9, 12,  5,  6,  3,  0, 14 },
+          { 11,  8, 12,  7,  1, 14,  2, 13,  6, 15,  0,  9, 10,  4,  5,  3 }
+        },
+
+        { { 12,  1, 10, 15,  9,  2,  6,  8,  0, 13,  3,  4, 14,  7,  5, 11 },
+          { 10, 15,  4,  2,  7, 12,  9,  5,  6,  1, 13, 14,  0, 11,  3,  8 },
+          {  9, 14, 15,  5,  2,  8, 12,  3,  7,  0,  4, 10,  1, 13, 11,  6 },
+          {  4,  3,  2, 12,  9,  5, 15, 10, 11, 14,  1,  7,  6,  0,  8, 13 }
+        },
+
+        { {  4, 11,  2, 14, 15,  0,  8, 13,  3, 12,  9,  7,  5, 10,  6,  1 },
+          { 13,  0, 11,  7,  4,  9,  1, 10, 14,  3,  5, 12,  2, 15,  8,  6 },
+          {  1,  4, 11, 13, 12,  3,  7, 14, 10, 15,  6,  8,  0,  5,  9,  2 },
+          {  6, 11, 13,  8,  1,  4, 10,  7,  9,  5,  0, 15, 14,  2,  3, 12 }
+        },
+
+        { { 13,  2,  8,  4,  6, 15, 11,  1, 10,  9,  3, 14,  5,  0, 12,  7 },
+          {  1, 15, 13,  8, 10,  3,  7,  4, 12,  5,  6, 11,  0, 14,  9,  2 },
+          {  7, 11,  4,  1,  9, 12, 14,  2,  0,  6, 10, 13, 15,  3,  5,  8 },
+          {  2,  1, 14,  7,  4, 10,  8, 13, 15, 12,  9,  0,  3,  5,  6, 11 }
+        }
+};
+
+/* 
+ * This is the final 
+ * permutation matrix
+ */
+static int final_perm[64] = {
+  40,  8, 48, 16, 56, 24, 64, 32, 39,  7, 47, 15, 55, 23, 63, 31,
+  38,  6, 46, 14, 54, 22, 62, 30, 37,  5, 45, 13, 53, 21, 61, 29,
+  36,  4, 44, 12, 52, 20, 60, 28, 35,  3, 43, 11, 51, 19, 59, 27,
+  34,  2, 42, 10, 50, 18, 58, 26, 33,  1, 41,  9, 49, 17, 57, 25
+};
+
+/* 
+ * The 16 DES keys in BITMASK format 
+ */
+#ifdef _UFC_32_
+long32 _ufc_keytab[16][2];
+#endif
+
+#ifdef _UFC_64_
+long64 _ufc_keytab[16];
+#endif
+
+
+/*
+ * Convert between the 64-character crypt alphabet "./0-9A-Za-z" and the
+ * 6 bit values 0..63:  '.' -> 0, '/' -> 1, '0'..'9' -> 2..11,
+ * 'A'..'Z' -> 12..37, 'a'..'z' -> 38..63.
+ *
+ * Every use of the argument is parenthesized so that an expression
+ * argument is not re-associated by operator precedence.  The argument is
+ * still evaluated more than once: do not pass expressions with side
+ * effects.
+ */
+#define ascii_to_bin(c) ((c)>='a'?((c)-59):(c)>='A'?((c)-53):(c)-'.')
+#define bin_to_ascii(c) ((c)>=38?((c)-38+'a'):(c)>=12?((c)-12+'A'):(c)+'.')
+
+/* Macro to set a bit (0..23) */
+#define BITMASK(i) ( (1<<(11-(i)%12+3)) << ((i)<12?16:0) )
+
+/*
+ * sb arrays:
+ *
+ * Workhorses of the inner loop of the DES implementation.
+ * They do sbox lookup, shifting of this  value, 32 bit
+ * permutation and E permutation for the next round.
+ *
+ * Kept in 'BITMASK' format.
+ */
+
+#ifdef _UFC_32_
+long32 _ufc_sb0[8192], _ufc_sb1[8192], _ufc_sb2[8192], _ufc_sb3[8192];
+static long32 *sb[4] = {_ufc_sb0, _ufc_sb1, _ufc_sb2, _ufc_sb3}; 
+#endif
+
+#ifdef _UFC_64_
+long64 _ufc_sb0[4096], _ufc_sb1[4096], _ufc_sb2[4096], _ufc_sb3[4096];
+static long64 *sb[4] = {_ufc_sb0, _ufc_sb1, _ufc_sb2, _ufc_sb3}; 
+#endif
+
+/* 
+ * eperm32tab: do 32 bit permutation and E selection
+ *
+ * The first index is the byte number in the 32 bit value to be permuted
+ *  -  second  -   is the value of this byte
+ *  -  third   -   selects the two 32 bit values
+ *
+ * The table is used and generated internally in init_des to speed it up
+ */
+static ufc_long eperm32tab[4][256][2];
+
+/*
+ * do_pc1: perform pc1 permutation in the key schedule generation.
+ *
+ * The first   index is the byte number in the 8 byte ASCII key
+ *  -  second    -      -    the two 28 bit halves of the result
+ *  -  third     -   selects the 7 bits actually used of each byte
+ *
+ * The result is kept with 28 bits per 32 bit word, with the 4 most
+ * significant bits zero.
+ */
+static ufc_long do_pc1[8][2][128];
+
+/*
+ * do_pc2: perform pc2 permutation in the key schedule generation.
+ *
+ * The first   index is the septet number in the two 28 bit intermediate values
+ *  -  second    -    -  -  septet values
+ *
+ * Knowledge of the structure of the pc2 permutation is used.
+ *
+ * The result is kept with 28 bits per 32 bit word, with the 4 most
+ * significant bits zero.
+ */
+static ufc_long do_pc2[8][128];
+
+/*
+ * efp: undo an extra e selection and do final
+ *      permutation giving the DES result.
+ *
+ *      Invoked 6 bits at a time on two 48 bit values
+ *      giving two 32 bit longs.
+ */
+static ufc_long efp[16][64][2];
+
+static unsigned char bytemask[8]  = {
+  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01
+};
+
+static ufc_long longmask[32] = {
+  0x80000000, 0x40000000, 0x20000000, 0x10000000,
+  0x08000000, 0x04000000, 0x02000000, 0x01000000,
+  0x00800000, 0x00400000, 0x00200000, 0x00100000,
+  0x00080000, 0x00040000, 0x00020000, 0x00010000,
+  0x00008000, 0x00004000, 0x00002000, 0x00001000,
+  0x00000800, 0x00000400, 0x00000200, 0x00000100,
+  0x00000080, 0x00000040, 0x00000020, 0x00000010,
+  0x00000008, 0x00000004, 0x00000002, 0x00000001
+};
+
+
+/*
+ * Zero 'cnt' bytes starting at 'start'.
+ *
+ * Hand-written stand-in for bzero/memset, kept because
+ * some machines lack bzero and some lack memset.
+ */
+
+static void clearmem(char *start, int cnt)
+{
+  char *p;
+
+  for(p = start; cnt != 0; cnt--)
+    *p++ = '\0';
+}
+
+static int initialized = 0;
+
+/*
+ * Look up the 6 bit value 's' in sbox number 'i': bit 5 and bit 0 of 's'
+ * select the row, bits 4..1 select the column.
+ *
+ * The definition deliberately carries no trailing semicolon, so the
+ * macro can be used inside a larger expression.
+ */
+
+#define s_lookup(i,s) sbox[(i)][(((s)>>4) & 0x2)|((s) & 0x1)][((s)>>1) & 0xf]
+
+/*
+ * Initialize unit: build every lookup table used by the DES core
+ * (do_pc1, do_pc2, eperm32tab, the four sb tables, e_inverse and efp)
+ * from the standard DES tables above, then mark the unit as
+ * initialized.
+ *
+ * The function is static; in this file it is called from setup_salt()
+ * while 'initialized' is still zero.
+ *
+ * do_pc1 and do_pc2 are only OR-ed into and never cleared here, so
+ * they depend on starting out zeroed (they are static arrays).
+ */
+
+static void ufc_init_des(void)
+  { int comes_from_bit;
+    int bit, sg;
+    ufc_long j;
+    ufc_long mask1, mask2;
+
+    /*
+     * Create the do_pc1 table used
+     * to effect pc1 permutation
+     * when generating keys
+     *
+     * The table is indexed with 7 bit values (j < 128), so bit n of
+     * a key byte is tested with bytemask[n + 1].  'longmask[... + 4]'
+     * leaves the 4 most significant bits of each result word zero.
+     */
+    for(bit = 0; bit < 56; bit++) {
+      comes_from_bit  = pc1[bit] - 1;
+      mask1 = bytemask[comes_from_bit % 8 + 1];
+      mask2 = longmask[bit % 28 + 4];
+      for(j = 0; j < 128; j++) {
+       if(j & mask1) 
+         do_pc1[comes_from_bit / 8][bit / 28][j] |= mask2;
+      }
+    }
+
+    /*
+     * Create the do_pc2 table used
+     * to effect pc2 permutation when
+     * generating keys
+     *
+     * The source bits are addressed as septets (groups of 7 bits),
+     * the result bits are stored in BITMASK format.
+     */
+    for(bit = 0; bit < 48; bit++) {
+      comes_from_bit  = pc2[bit] - 1;
+      mask1 = bytemask[comes_from_bit % 7 + 1];
+      mask2 = BITMASK(bit % 24);
+      for(j = 0; j < 128; j++) {
+       if(j & mask1)
+         do_pc2[comes_from_bit / 7][j] |= mask2;
+      }
+    }
+
+    /* 
+     * Now generate the table used to do combined
+     * 32 bit permutation and e expansion
+     *
+     * We use it because we have to permute 16384 32 bit
+     * longs into 48 bit in order to initialize sb.
+     *
+     * Looping 48 rounds per permutation becomes 
+     * just too slow...
+     *
+     */
+
+    clearmem((char*)eperm32tab, sizeof(eperm32tab));
+
+    for(bit = 0; bit < 48; bit++) {
+      ufc_long inner_mask1,comes_from;
+       
+      /* compose E selection with perm32: source bit of output bit 'bit' */
+      comes_from = perm32[esel[bit]-1]-1;
+      inner_mask1      = bytemask[comes_from % 8];
+       
+      for(j = 256; j--;) {
+       if(j & inner_mask1)
+         eperm32tab[comes_from / 8][j][bit / 24] |= BITMASK(bit % 24);
+      }
+    }
+    
+    /* 
+     * Create the sb tables:
+     *
+     * For each 12 bit segment of an 48 bit intermediate
+     * result, the sb table precomputes the two 4 bit
+     * values of the sbox lookups done with the two 6
+     * bit halves, shifts them to their proper place,
+     * sends them through perm32 and finally E expands
+     * them so that they are ready for the next
+     * DES round.
+     *
+     */
+    for(sg = 0; sg < 4; sg++) {
+      int j1, j2;
+      int s1, s2;
+    
+      for(j1 = 0; j1 < 64; j1++) {
+       s1 = s_lookup(2 * sg, j1);
+       for(j2 = 0; j2 < 64; j2++) {
+         ufc_long to_permute, inx;
+    
+         s2         = s_lookup(2 * sg + 1, j2);
+         /* place the two sbox results in byte number 'sg' of the word */
+         to_permute = ((s1 << 4)  | s2) << (24 - 8 * sg);
+
+#ifdef _UFC_32_
+         /* two long32 entries per (j1, j2) pair */
+         inx = ((j1 << 6)  | j2) << 1;
+         sb[sg][inx  ]  = eperm32tab[0][(to_permute >> 24) & 0xff][0];
+         sb[sg][inx+1]  = eperm32tab[0][(to_permute >> 24) & 0xff][1];
+         sb[sg][inx  ] |= eperm32tab[1][(to_permute >> 16) & 0xff][0];
+         sb[sg][inx+1] |= eperm32tab[1][(to_permute >> 16) & 0xff][1];
+         sb[sg][inx  ] |= eperm32tab[2][(to_permute >>  8) & 0xff][0];
+         sb[sg][inx+1] |= eperm32tab[2][(to_permute >>  8) & 0xff][1];
+         sb[sg][inx  ] |= eperm32tab[3][(to_permute)       & 0xff][0];
+         sb[sg][inx+1] |= eperm32tab[3][(to_permute)       & 0xff][1];
+#endif
+#ifdef _UFC_64_
+         /* one long64 entry per (j1, j2) pair: [0] in the high half */
+         inx = ((j1 << 6)  | j2);
+         sb[sg][inx]  = 
+           ((long64)eperm32tab[0][(to_permute >> 24) & 0xff][0] << 32) |
+            (long64)eperm32tab[0][(to_permute >> 24) & 0xff][1];
+         sb[sg][inx] |=
+           ((long64)eperm32tab[1][(to_permute >> 16) & 0xff][0] << 32) |
+            (long64)eperm32tab[1][(to_permute >> 16) & 0xff][1];
+         sb[sg][inx] |= 
+           ((long64)eperm32tab[2][(to_permute >>  8) & 0xff][0] << 32) |
+            (long64)eperm32tab[2][(to_permute >>  8) & 0xff][1];
+         sb[sg][inx] |=
+           ((long64)eperm32tab[3][(to_permute)       & 0xff][0] << 32) |
+            (long64)eperm32tab[3][(to_permute)       & 0xff][1];
+#endif
+       }
+      }
+    }  
+
+    /* 
+     * Create an inverse matrix for esel telling
+     * where to plug out bits if undoing it
+     *
+     * The loop counts downwards, so for a source bit that esel
+     * selects more than once the lowest position is the one
+     * that remains in e_inverse.
+     */
+    for(bit=48; bit--;) {
+      e_inverse[esel[bit] - 1     ] = bit;
+      e_inverse[esel[bit] - 1 + 32] = bit + 48;
+    }
+
+    /* 
+     * create efp: the matrix used to
+     * undo the E expansion and effect final permutation
+     */
+    clearmem((char*)efp, sizeof efp);
+    for(bit = 0; bit < 64; bit++) {
+      int o_bit, o_long;
+      ufc_long word_value, inner_mask1, inner_mask2;
+      int comes_from_f_bit, comes_from_e_bit;
+      int comes_from_word, bit_within_word;
+
+      /* See where bit i belongs in the two 32 bit long's */
+      o_long = bit / 32; /* 0..1  */
+      o_bit  = bit % 32; /* 0..31 */
+
+      /* 
+       * And find a bit in the e permutated value setting this bit.
+       *
+       * Note: the e selection may have selected the same bit several
+       * times. By the initialization of e_inverse, we only look
+       * for one specific instance.
+       */
+      comes_from_f_bit = final_perm[bit] - 1;         /* 0..63 */
+      comes_from_e_bit = e_inverse[comes_from_f_bit]; /* 0..95 */
+      comes_from_word  = comes_from_e_bit / 6;        /* 0..15 */
+      bit_within_word  = comes_from_e_bit % 6;        /* 0..5  */
+
+      /* '+ 26' picks the mask for a bit of a 6 bit value (low 6 bits) */
+      inner_mask1 = longmask[bit_within_word + 26];
+      inner_mask2 = longmask[o_bit];
+
+      for(word_value = 64; word_value--;) {
+       if(word_value & inner_mask1)
+         efp[comes_from_word][word_value][o_long] |= inner_mask2;
+      }
+    }
+    /* all tables are built: setup_salt() will not call us again */
+    initialized++;
+  }
+
+/* 
+ * Process the elements of the sb table permuting the
+ * bits swapped in the expansion by the current salt.
+ */
+
+#ifdef _UFC_32_
+/*
+ * 32 bit layout: the table at 'k' holds 4096 pairs of long32.  Within
+ * each pair, exchange the bits selected by 'saltbits' between the two
+ * words.
+ */
+static void shuffle_sb(long32 *k, ufc_long saltbits)
+{
+  const long32 mask = (long32)saltbits;
+  ufc_long pairs_left = 4096;
+
+  while(pairs_left-- != 0) {
+    long32 diff = (k[0] ^ k[1]) & mask;
+
+    k[0] ^= diff;
+    k[1] ^= diff;
+    k += 2;
+  }
+}
+#endif
+
+#ifdef _UFC_64_
+/*
+ * 64 bit layout: the table at 'k' holds 4096 long64 entries.  Within
+ * each entry, exchange the bits selected by 'saltbits' between the
+ * upper and the lower 32 bit half.
+ */
+static void shuffle_sb(long64 *k, ufc_long saltbits)
+{
+  const long64 mask = (long64)saltbits;
+  ufc_long entries_left = 4096;
+
+  while(entries_left-- != 0) {
+    long64 diff = ((*k >> 32) ^ *k) & mask;
+
+    *k ^= (diff << 32) | diff;
+    k++;
+  }
+}
+#endif
+
+/* 
+ * Setup the unit for a new salt
+ * Hopefully we'll not see a new salt in each crypt call.
+ */
+
+static unsigned char current_salt[3] = "&&"; /* invalid value */
+static ufc_long current_saltbits = 0;
+static int direction = 0;
+
+/*
+ * setup_salt: adapt the sb tables to the salt at 's1'.
+ *
+ * Builds the DES tables on first use, then, unless the salt equals the
+ * one the tables are currently set up for, converts the first two salt
+ * characters to a bit mask and re-shuffles the four sb tables by the
+ * difference between the old and the new mask.
+ *
+ * 's1' must point to a NUL terminated string.  Only the first two
+ * characters are used; a missing second character is treated as NUL and
+ * nothing is read past the terminator.  Characters outside the crypt
+ * alphabet contribute no salt bits.
+ */
+static void setup_salt(const char *s1)
+  { ufc_long i, j, saltbits;
+    const unsigned char *s2 = (const unsigned char *)s1;
+    unsigned char salt[2];
+
+    if(!initialized)
+      ufc_init_des();
+
+    /*
+     * An empty salt has no second byte: do not read s2[1] when
+     * s2[0] is already the terminator.
+     */
+    salt[0] = s2[0];
+    salt[1] = s2[0] ? s2[1] : 0;
+
+    /* tables are already set up for this salt: nothing to do */
+    if(salt[0] == current_salt[0] && salt[1] == current_salt[1])
+      return;
+    current_salt[0] = salt[0]; current_salt[1] = salt[1];
+
+    /* 
+     * This is the only crypt change to DES:
+     * entries are swapped in the expansion table
+     * according to the bits set in the salt.
+     */
+    saltbits = 0;
+    for(i = 0; i < 2; i++) {
+      long c=ascii_to_bin(salt[i]);
+      if(c < 0 || c > 63)
+       c = 0;
+      for(j = 0; j < 6; j++) {
+       if((c >> j) & 0x1)
+         saltbits |= BITMASK(6 * i + j);
+      }
+    }
+
+    /*
+     * Permute the sb table values
+     * to reflect the changed e
+     * selection table
+     *
+     * XOR with the previous mask: only bits that differ between
+     * the old and the new salt need to be swapped.
+     */
+    shuffle_sb(_ufc_sb0, current_saltbits ^ saltbits); 
+    shuffle_sb(_ufc_sb1, current_saltbits ^ saltbits);
+    shuffle_sb(_ufc_sb2, current_saltbits ^ saltbits);
+    shuffle_sb(_ufc_sb3, current_saltbits ^ saltbits);
+
+    current_saltbits = saltbits;
+  }
+
+/*
+ * ufc_mk_keytab: fill _ufc_keytab with the 16 round keys for 'key'.
+ *
+ * Exactly 8 bytes are read from 'key' (only the low 7 bits of each);
+ * the caller must supply at least that many.  Also resets 'direction'
+ * to 0.
+ */
+static void ufc_mk_keytab(char *key)
+  { ufc_long v1, v2, *k1;
+    int i;
+#ifdef _UFC_32_
+    long32 v, *k2 = &_ufc_keytab[0][0];
+#endif
+#ifdef _UFC_64_
+    long64 v, *k2 = &_ufc_keytab[0];
+#endif
+
+    /*
+     * pc1: k1 walks do_pc1 as a flat array, 128 entries at a time;
+     * for every key byte the first slot feeds v1 and the second v2.
+     */
+    v1 = v2 = 0; k1 = &do_pc1[0][0][0];
+    for(i = 8; i--;) {
+      v1 |= k1[*key   & 0x7f]; k1 += 128;
+      v2 |= k1[*key++ & 0x7f]; k1 += 128;
+    }
+
+    for(i = 0; i < 16; i++) {
+      k1 = &do_pc2[0][0];
+
+      /* rotate v1 by rots[i], then pc2 its four septets (do_pc2[0..3]) */
+      v1 = (v1 << rots[i]) | (v1 >> (28 - rots[i]));
+      v  = k1[(v1 >> 21) & 0x7f]; k1 += 128;
+      v |= k1[(v1 >> 14) & 0x7f]; k1 += 128;
+      v |= k1[(v1 >>  7) & 0x7f]; k1 += 128;
+      v |= k1[(v1      ) & 0x7f]; k1 += 128;
+
+#ifdef _UFC_32_
+      /* 32 bit: the v1 part is a keytab word of its own */
+      *k2++ = v;
+      v = 0;
+#endif
+#ifdef _UFC_64_
+      /* 64 bit: the v1 part becomes the upper half of the entry */
+      v <<= 32;
+#endif
+
+      /* same for v2, continuing with do_pc2[4..7] */
+      v2 = (v2 << rots[i]) | (v2 >> (28 - rots[i]));
+      v |= k1[(v2 >> 21) & 0x7f]; k1 += 128;
+      v |= k1[(v2 >> 14) & 0x7f]; k1 += 128;
+      v |= k1[(v2 >>  7) & 0x7f]; k1 += 128;
+      v |= k1[(v2      ) & 0x7f];
+
+      *k2++ = v;
+    }
+
+    direction = 0;
+  }
+
+/* 
+ * Undo an extra E selection and do final permutations
+ *
+ * l1/l2 and r1/r2 are the two halves of the result in BITMASK format.
+ * Returns a pointer to a static two element array holding the 64 bit
+ * result as two 32 bit values; the array is overwritten by the next
+ * call, so the function is not reentrant.
+ */
+
+ufc_long *_ufc_dofinalperm(ufc_long l1, ufc_long l2, ufc_long r1, ufc_long r2)
+  { ufc_long v1, v2, x;
+    static ufc_long ary[2];
+
+    /* swap the bits selected by the current salt between each pair */
+    x = (l1 ^ l2) & current_saltbits; l1 ^= x; l2 ^= x;
+    x = (r1 ^ r2) & current_saltbits; r1 ^= x; r2 ^= x;
+
+    /*
+     * BITMASK leaves the low 3 bits of each 16 bit half unused;
+     * after dropping them the 6 bit groups sit at bit offsets
+     * 0, 6, 16 and 22, hence the shift sequence 6, 10, 6 below.
+     */
+    v1=v2=0; l1 >>= 3; l2 >>= 3; r1 >>= 3; r2 >>= 3;
+
+    /*
+     * Each line shifts the word in the first statement and reuses
+     * the shifted value in the second: statement order matters.
+     */
+    v1 |= efp[15][ r2         & 0x3f][0]; v2 |= efp[15][ r2 & 0x3f][1];
+    v1 |= efp[14][(r2 >>= 6)  & 0x3f][0]; v2 |= efp[14][ r2 & 0x3f][1];
+    v1 |= efp[13][(r2 >>= 10) & 0x3f][0]; v2 |= efp[13][ r2 & 0x3f][1];
+    v1 |= efp[12][(r2 >>= 6)  & 0x3f][0]; v2 |= efp[12][ r2 & 0x3f][1];
+
+    v1 |= efp[11][ r1         & 0x3f][0]; v2 |= efp[11][ r1 & 0x3f][1];
+    v1 |= efp[10][(r1 >>= 6)  & 0x3f][0]; v2 |= efp[10][ r1 & 0x3f][1];
+    v1 |= efp[ 9][(r1 >>= 10) & 0x3f][0]; v2 |= efp[ 9][ r1 & 0x3f][1];
+    v1 |= efp[ 8][(r1 >>= 6)  & 0x3f][0]; v2 |= efp[ 8][ r1 & 0x3f][1];
+
+    v1 |= efp[ 7][ l2         & 0x3f][0]; v2 |= efp[ 7][ l2 & 0x3f][1];
+    v1 |= efp[ 6][(l2 >>= 6)  & 0x3f][0]; v2 |= efp[ 6][ l2 & 0x3f][1];
+    v1 |= efp[ 5][(l2 >>= 10) & 0x3f][0]; v2 |= efp[ 5][ l2 & 0x3f][1];
+    v1 |= efp[ 4][(l2 >>= 6)  & 0x3f][0]; v2 |= efp[ 4][ l2 & 0x3f][1];
+
+    v1 |= efp[ 3][ l1         & 0x3f][0]; v2 |= efp[ 3][ l1 & 0x3f][1];
+    v1 |= efp[ 2][(l1 >>= 6)  & 0x3f][0]; v2 |= efp[ 2][ l1 & 0x3f][1];
+    v1 |= efp[ 1][(l1 >>= 10) & 0x3f][0]; v2 |= efp[ 1][ l1 & 0x3f][1];
+    v1 |= efp[ 0][(l1 >>= 6)  & 0x3f][0]; v2 |= efp[ 0][ l1 & 0x3f][1];
+
+    ary[0] = v1; ary[1] = v2;
+    return ary;
+  }
+
+/*
+ * crypt only: convert the 64 bit result (v1 = upper, v2 = lower 32
+ * bits) to 11 characters of the crypt alphabet, 6 bits per character,
+ * prefixed with the two salt characters.
+ *
+ * A one character salt is doubled in the output.  'salt' must be NUL
+ * terminated; nothing is read past its terminator.
+ *
+ * Returns a pointer to a static 14 byte buffer that is overwritten by
+ * the next call.
+ */
+
+static char *output_conversion(ufc_long v1, ufc_long v2, const char *salt)
+  { static char outbuf[14];
+    int i, s;
+
+    outbuf[0] = salt[0];
+    /* only look at salt[1] when salt[0] is not already the terminator */
+    outbuf[1] = (salt[0] && salt[1]) ? salt[1] : salt[0];
+
+    /* characters 0..4: the upper 30 bits of v1 */
+    for(i = 0; i < 5; i++)
+      outbuf[i + 2] = bin_to_ascii((v1 >> (26 - 6 * i)) & 0x3f);
+
+    /*
+     * The last character takes the low 4 bits of v2 (padded with two
+     * zero bits); the remaining two bits of v1 are moved on top of v2.
+     */
+    s  = (v2 & 0xf) << 2;
+    v2 = (v2 >> 2) | ((v1 & 0x3) << 30);
+
+    /* characters 5..9: the upper 30 bits of the combined word */
+    for(i = 5; i < 10; i++)
+      outbuf[i + 2] = bin_to_ascii((v2 >> (56 - 6 * i)) & 0x3f);
+
+    outbuf[12] = bin_to_ascii(s);
+    outbuf[13] = 0;
+
+    return outbuf;
+  }
+
+/* 
+ * UNIX crypt function
+ */
+
+static ufc_long *_ufc_doit(ufc_long , ufc_long, ufc_long, ufc_long, ufc_long);
+   
+/*
+ * Hash 'key' with the two character 'salt'.
+ *
+ * At most the first 8 characters of 'key' take part.  The returned
+ * string lives in a static buffer owned by output_conversion() and is
+ * overwritten by the next call.
+ */
+char *ufc_crypt(const char *key,const char *salt)
+{
+  char padded_key[9];
+  ufc_long *block;
+
+  /* Hack DES tables according to salt */
+  setup_salt(salt);
+
+  /* Key schedule input: zero filled, then at most 8 key characters */
+  clearmem(padded_key, sizeof padded_key);
+  strncpy(padded_key, key, 8);
+  ufc_mk_keytab(padded_key);
+
+  /* Go for the 25 DES encryptions, starting from an all-zero block */
+  block = _ufc_doit((ufc_long)0, (ufc_long)0,
+                    (ufc_long)0, (ufc_long)0, (ufc_long)25);
+
+  /* And convert back to 6 bit ASCII */
+  return output_conversion(block[0], block[1], salt);
+}
+
+
+#ifdef _UFC_32_
+
+/*
+ * 32 bit version
+ */
+
+extern long32 _ufc_keytab[16][2];
+extern long32 _ufc_sb0[], _ufc_sb1[], _ufc_sb2[], _ufc_sb3[];
+
+#define SBA(sb, v) (*(long32*)((char*)(sb)+(v)))
+
+/*
+ * 32 bit DES core: run 'itr' encryptions over the block (l1,l2,r1,r2)
+ * using the key schedule in _ufc_keytab, then hand the result to
+ * _ufc_dofinalperm() and return its (static) result array.
+ *
+ * SBA() indexes an sb table by BYTE offset: the masked 16 bit value is
+ * used as the offset of the first long32 of a pair and '+4' addresses
+ * the second one.
+ */
+static ufc_long *_ufc_doit(ufc_long l1, ufc_long l2, ufc_long r1, ufc_long r2, ufc_long itr)
+  { int i;
+    long32 s, *k;
+
+    while(itr--) {
+      k = &_ufc_keytab[0][0];
+      /* each pass handles two rounds and consumes four keytab words */
+      for(i=8; i--; ) {
+       s = *k++ ^ r1;
+       l1 ^= SBA(_ufc_sb1, s & 0xffff); l2 ^= SBA(_ufc_sb1, (s & 0xffff)+4);  
+        l1 ^= SBA(_ufc_sb0, s >>= 16);   l2 ^= SBA(_ufc_sb0, (s)         +4); 
+        s = *k++ ^ r2; 
+        l1 ^= SBA(_ufc_sb3, s & 0xffff); l2 ^= SBA(_ufc_sb3, (s & 0xffff)+4);
+        l1 ^= SBA(_ufc_sb2, s >>= 16);   l2 ^= SBA(_ufc_sb2, (s)         +4);
+
+        s = *k++ ^ l1; 
+        r1 ^= SBA(_ufc_sb1, s & 0xffff); r2 ^= SBA(_ufc_sb1, (s & 0xffff)+4);  
+        r1 ^= SBA(_ufc_sb0, s >>= 16);   r2 ^= SBA(_ufc_sb0, (s)         +4); 
+        s = *k++ ^ l2; 
+        r1 ^= SBA(_ufc_sb3, s & 0xffff); r2 ^= SBA(_ufc_sb3, (s & 0xffff)+4);  
+        r1 ^= SBA(_ufc_sb2, s >>= 16);   r2 ^= SBA(_ufc_sb2, (s)         +4);
+      } 
+      /* exchange the left and right halves before the next encryption */
+      s=l1; l1=r1; r1=s; s=l2; l2=r2; r2=s;
+    }
+    return _ufc_dofinalperm(l1, l2, r1, r2);
+  }
+
+#endif
+
+#ifdef _UFC_64_
+
+/*
+ * 64 bit version
+ */
+
+extern long64 _ufc_keytab[16];
+extern long64 _ufc_sb0[], _ufc_sb1[], _ufc_sb2[], _ufc_sb3[];
+
+#define SBA(sb, v) (*(long64*)((char*)(sb)+(v)))
+
+/*
+ * 64 bit DES core: run 'itr' encryptions over the block (l1,l2,r1,r2)
+ * using the key schedule in _ufc_keytab, then hand the result to
+ * _ufc_dofinalperm() and return its (static) result array.
+ *
+ * Each half of the block is kept in one long64 (x1 in the upper, x2 in
+ * the lower 32 bits).  SBA() indexes an sb table by BYTE offset, using
+ * one 16 bit slice of the key-mixed value per table.
+ */
+static ufc_long *_ufc_doit(ufc_long l1, ufc_long l2, ufc_long r1, ufc_long r2, ufc_long itr)
+  { int i;
+    long64 l, r, s, *k;
+
+    l = (((long64)l1) << 32) | ((long64)l2);
+    r = (((long64)r1) << 32) | ((long64)r2);
+
+    while(itr--) {
+      k = &_ufc_keytab[0];
+      /* each pass handles two rounds and consumes two keytab entries */
+      for(i=8; i--; ) {
+       s = *k++ ^ r;
+       l ^= SBA(_ufc_sb3, (s >>  0) & 0xffff);
+        l ^= SBA(_ufc_sb2, (s >> 16) & 0xffff);
+        l ^= SBA(_ufc_sb1, (s >> 32) & 0xffff);
+        l ^= SBA(_ufc_sb0, (s >> 48) & 0xffff);
+
+       s = *k++ ^ l;
+       r ^= SBA(_ufc_sb3, (s >>  0) & 0xffff);
+        r ^= SBA(_ufc_sb2, (s >> 16) & 0xffff);
+        r ^= SBA(_ufc_sb1, (s >> 32) & 0xffff);
+        r ^= SBA(_ufc_sb0, (s >> 48) & 0xffff);
+      } 
+      /* exchange the left and right halves before the next encryption */
+      s=l; l=r; r=s;
+    }
+
+    /* split back into 32 bit words for the final permutation */
+    l1 = l >> 32; l2 = l & 0xffffffff;
+    r1 = r >> 32; r2 = r & 0xffffffff;
+    return _ufc_dofinalperm(l1, l2, r1, r2);
+  }
+
+#endif
+
+
+#else
+ int ufc_dummy_procedure(void);
+ int ufc_dummy_procedure(void) {return 0;}
+#endif
diff --git a/ctdb/lib/replace/crypt.m4 b/ctdb/lib/replace/crypt.m4
new file mode 100644 (file)
index 0000000..fae2a58
--- /dev/null
@@ -0,0 +1,6 @@
+###############################################
+# test for where we get crypt() from
+AC_CHECK_HEADERS(crypt.h)
+AC_SEARCH_LIBS_EXT(crypt, [crypt], CRYPT_LIBS,
+  [ AC_DEFINE(HAVE_CRYPT,1,[Whether the system has the crypt() function]) ],
+  [ LIBREPLACEOBJ="${LIBREPLACEOBJ} $libreplacedir/crypt.o" ])
diff --git a/ctdb/lib/replace/dlfcn.c b/ctdb/lib/replace/dlfcn.c
new file mode 100644 (file)
index 0000000..88431ed
--- /dev/null
@@ -0,0 +1,76 @@
+/* 
+   Unix SMB/CIFS implementation.
+   Samba system utilities
+   Copyright (C) Andrew Tridgell 1992-1998
+   Copyright (C) Jeremy Allison 1998-2002
+   Copyright (C) Jelmer Vernooij 2006
+
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#ifdef HAVE_DL_H
+#include <dl.h>
+#endif
+
+#ifndef HAVE_DLOPEN
+/*
+ * Stand-in for dlopen() on platforms without it.
+ *
+ * With shl_load() available the request is forwarded to it (a NULL name
+ * yields PROG_HANDLE); otherwise loading is unsupported and NULL is
+ * returned.  The type of `flags` follows the platform prototype.
+ */
+#ifdef DLOPEN_TAKES_UNSIGNED_FLAGS
+void *rep_dlopen(const char *name, unsigned int flags)
+#else
+void *rep_dlopen(const char *name, int flags)
+#endif
+{
+#ifndef HAVE_SHL_LOAD
+       /* no loader of any kind on this platform */
+       return NULL;
+#else
+       if (name != NULL) {
+               return (void *)shl_load(name, flags, 0);
+       }
+       /* no name given: hand back the predefined PROG_HANDLE */
+       return PROG_HANDLE;
+#endif
+}
+#endif
+
+#ifndef HAVE_DLSYM
+/*
+ * Stand-in for dlsym().
+ *
+ * Resolves `symbol` through shl_findsym() when that is available and
+ * returns its address; returns NULL when the lookup fails or when no
+ * lookup facility exists.
+ */
+void *rep_dlsym(void *handle, const char *symbol)
+{
+#ifdef HAVE_SHL_FINDSYM
+       void *found;
+       int rc;
+
+       /* shl_findsym() reports success with 0 */
+       rc = shl_findsym((shl_t *)&handle, symbol, TYPE_UNDEFINED, &found);
+       if (rc == 0) {
+               return found;
+       }
+#endif
+       return NULL;
+}
+#endif
+
+#ifndef HAVE_DLERROR
+/*
+ * Stand-in for dlerror(): always returns the same fixed message.
+ * The returned string is a literal and must not be modified or freed.
+ */
+char *rep_dlerror(void)
+{
+       char *message =
+               "dynamic loading of objects not supported on this platform";
+
+       return message;
+}
+#endif
+
+#ifndef HAVE_DLCLOSE
+/*
+ * Stand-in for dlclose().
+ *
+ * handle: value previously obtained from rep_dlopen().
+ * Returns the result of shl_unload() where that function exists,
+ * otherwise 0.
+ *
+ * The configure check probes for shl_unload, which produces
+ * HAVE_SHL_UNLOAD; testing only HAVE_SHL_CLOSE meant the unload branch
+ * was never compiled.  The old spelling is still honoured for builds
+ * that define it by hand.
+ */
+int rep_dlclose(void *handle)
+{
+#if defined(HAVE_SHL_UNLOAD) || defined(HAVE_SHL_CLOSE)
+       return shl_unload((shl_t)handle);
+#else
+       return 0;
+#endif
+}
+#endif
diff --git a/ctdb/lib/replace/dlfcn.m4 b/ctdb/lib/replace/dlfcn.m4
new file mode 100644 (file)
index 0000000..722e024
--- /dev/null
@@ -0,0 +1,31 @@
+dnl dummies provided by dlfcn.c if not available
+dnl
+dnl LIBS is emptied for the duration of these checks so that whatever
+dnl the library searches add can be captured in LIBDL at the end; the
+dnl saved value is restored on the last line.
+save_LIBS="$LIBS"
+LIBS=""
+
+dnl libreplace_cv_dlfcn becomes yes as soon as one of the four dl
+dnl functions is missing, which pulls dlfcn.o into LIBREPLACEOBJ below.
+libreplace_cv_dlfcn=no
+AC_SEARCH_LIBS(dlopen, dl)
+
+AC_CHECK_HEADERS(dlfcn.h)
+AC_CHECK_FUNCS([dlopen dlsym dlerror dlclose],[],[libreplace_cv_dlfcn=yes])
+
+dnl Probe for the shl_load family and its dl.h header.
+dnl libreplace_cv_shl is set here but not read again in this fragment.
+libreplace_cv_shl=no
+AC_SEARCH_LIBS(shl_load, sl)
+AC_CHECK_HEADERS(dl.h)
+AC_CHECK_FUNCS([shl_load shl_unload shl_findsym],[],[libreplace_cv_shl=yes])
+
+dnl Find out whether the dlopen prototype in dlfcn.h uses unsigned flags.
+AC_VERIFY_C_PROTOTYPE([void *dlopen(const char* filename, unsigned int flags)],
+       [
+       return 0;
+       ],[
+       AC_DEFINE(DLOPEN_TAKES_UNSIGNED_FLAGS, 1, [Whether dlopen takes unsigned int flags])
+       ],[],[
+       #include <dlfcn.h>
+       ])
+
+if test x"${libreplace_cv_dlfcn}" = x"yes";then
+       LIBREPLACEOBJ="${LIBREPLACEOBJ} $libreplacedir/dlfcn.o"
+fi
+
+dnl Export what accumulated in LIBS as LIBDL, then restore LIBS.
+LIBDL="$LIBS"
+AC_SUBST(LIBDL)
+LIBS="$save_LIBS"
diff --git a/ctdb/lib/replace/getaddrinfo.c b/ctdb/lib/replace/getaddrinfo.c
new file mode 100644 (file)
index 0000000..c5cd52b
--- /dev/null
@@ -0,0 +1,497 @@
+/*
+PostgreSQL Database Management System
+(formerly known as Postgres, then as Postgres95)
+
+Portions Copyright (c) 1996-2005, The PostgreSQL Global Development Group
+
+Portions Copyright (c) 1994, The Regents of the University of California
+
+Permission to use, copy, modify, and distribute this software and its
+documentation for any purpose, without fee, and without a written agreement
+is hereby granted, provided that the above copyright notice and this paragraph
+and the following two paragraphs appear in all copies.
+
+IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR
+DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
+LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION,
+EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
+
+THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
+ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS
+TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+
+*/
+
+/*-------------------------------------------------------------------------
+ *
+ * getaddrinfo.c
+ *       Support getaddrinfo() on platforms that don't have it.
+ *
+ * We also supply getnameinfo() here, assuming that the platform will have
+ * it if and only if it has getaddrinfo().     If this proves false on some
+ * platform, we'll need to split this file and provide a separate configure
+ * test for getnameinfo().
+ *
+ * Copyright (c) 2003-2007, PostgreSQL Global Development Group
+ *
+ * Copyright (C) 2007 Jeremy Allison.
+ * Modified to return multiple IPv4 addresses for Samba.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "replace.h"
+#include "system/network.h"
+
+#ifndef SMB_MALLOC
+#define SMB_MALLOC(s) malloc(s)
+#endif
+
+#ifndef SMB_STRDUP
+#define SMB_STRDUP(s) strdup(s)
+#endif
+
+/*
+ * Turn the outcome of a gethostby*() call into an EAI_* code.
+ *
+ * hp: result of the lookup, NULL if it failed.
+ * Returns 0 for a usable IPv4 entry, otherwise EAI_NONAME, EAI_AGAIN
+ * or EAI_FAIL.
+ */
+static int check_hostent_err(struct hostent *hp)
+{
+       int herr;
+
+       if (hp != NULL) {
+               /* a usable entry carries a name and is of type AF_INET */
+               if (hp->h_name != NULL && hp->h_addrtype == AF_INET) {
+                       return 0;
+               }
+               return EAI_FAIL;
+       }
+
+       /* the lookup itself failed: translate h_errno */
+       herr = h_errno;
+       if (herr == HOST_NOT_FOUND || herr == NO_DATA) {
+               return EAI_NONAME;
+       }
+       if (herr == TRY_AGAIN) {
+               return EAI_AGAIN;
+       }
+       /* NO_RECOVERY and anything unexpected */
+       return EAI_FAIL;
+}
+
+/*
+ * Duplicate the official name of a host entry.
+ *
+ * hp:   lookup result, may be NULL.
+ * perr: receives 0 on success, otherwise the EAI_* code.
+ * Returns a heap copy of hp->h_name that the caller must free, or NULL
+ * on failure.
+ */
+static char *canon_name_from_hostent(struct hostent *hp,
+                               int *perr)
+{
+       char *copy;
+       int status = check_hostent_err(hp);
+
+       *perr = status;
+       if (status != 0) {
+               return NULL;
+       }
+
+       copy = SMB_STRDUP(hp->h_name);
+       if (copy == NULL) {
+               *perr = EAI_MEMORY;
+               return NULL;
+       }
+       return copy;
+}
+
+/*
+ * Canonical name of the local machine.
+ *
+ * perr: receives the EAI_* code when NULL is returned.
+ * Returns a heap string owned by the caller, or NULL on failure.
+ */
+static char *get_my_canon_name(int *perr)
+{
+       char hostname[HOST_NAME_MAX+1];
+       struct hostent *self;
+
+       if (gethostname(hostname, HOST_NAME_MAX) == -1) {
+               *perr = EAI_FAIL;
+               return NULL;
+       }
+       /* force a terminator in the extra byte of the buffer */
+       hostname[HOST_NAME_MAX] = '\0';
+
+       self = gethostbyname(hostname);
+       return canon_name_from_hostent(self, perr);
+}
+
+/*
+ * Canonical name for an IPv4 address, found by reverse lookup.
+ *
+ * perr: receives 0 on success, otherwise the EAI_* code.
+ * Returns a heap string owned by the caller, or NULL on failure.
+ */
+static char *get_canon_name_from_addr(struct in_addr ip,
+                               int *perr)
+{
+       struct hostent *entry = gethostbyaddr(&ip, sizeof(ip), AF_INET);
+
+       return canon_name_from_hostent(entry, perr);
+}
+
+/*
+ * Build one IPv4 addrinfo node.
+ *
+ * hints: supplies ai_socktype and ai_protocol for the new node.
+ * ip:    address stored in the attached sockaddr_in.
+ * port:  port in host byte order; converted with htons() here.
+ * Returns the new node (ai_canonname and ai_next are NULL), or NULL if
+ * either allocation fails.
+ */
+static struct addrinfo *alloc_entry(const struct addrinfo *hints,
+                               struct in_addr ip,
+                               unsigned short port)
+{
+       struct addrinfo *entry;
+       struct sockaddr_in *sin;
+
+       entry = SMB_MALLOC(sizeof(*entry));
+       if (entry == NULL) {
+               return NULL;
+       }
+
+       sin = SMB_MALLOC(sizeof(*sin));
+       if (sin == NULL) {
+               free(entry);
+               return NULL;
+       }
+
+       /* socket address part */
+       memset(sin, '\0', sizeof(*sin));
+       sin->sin_family = AF_INET;
+       sin->sin_port = htons(port);
+       sin->sin_addr = ip;
+
+       /* addrinfo part */
+       memset(entry, '\0', sizeof(*entry));
+       entry->ai_flags = 0;
+       entry->ai_family = AF_INET;
+       entry->ai_socktype = hints->ai_socktype;
+       entry->ai_protocol = hints->ai_protocol;
+       entry->ai_addrlen = sizeof(*sin);
+       entry->ai_addr = (struct sockaddr *) sin;
+       entry->ai_canonname = NULL;
+       entry->ai_next = NULL;
+
+       return entry;
+}
+
+/*
+ * get address info for a single ipv4 address.
+ *
+ *     Bugs:   - servname can only be a number, not text.
+ */
+
+/*
+ * service: decimal port number as text, or NULL for port 0.
+ * addr:    IPv4 address in host byte order.
+ * hints:   socket type, protocol and flags to honour.
+ * res:     receives the single-node result list on success.
+ * Returns 0 on success or an EAI_* code.
+ */
+static int getaddr_info_single_addr(const char *service,
+                               uint32_t addr,
+                               const struct addrinfo *hints,
+                               struct addrinfo **res)
+{
+       struct addrinfo *entry;
+       struct in_addr ip;
+       unsigned short port = 0;
+       int want_canon;
+
+       if (service != NULL) {
+               port = (unsigned short)atoi(service);
+       }
+       ip.s_addr = htonl(addr);
+
+       entry = alloc_entry(hints, ip, port);
+       if (entry == NULL) {
+               return EAI_MEMORY;
+       }
+
+       /*
+        * A canonical name is looked up only when AI_CANONNAME is set
+        * and AI_NUMERICSERV is not.
+        * NOTE(review): the AI_NUMERICSERV test looks like it may have
+        * been meant as AI_NUMERICHOST - confirm before changing.
+        */
+       want_canon = (hints->ai_flags & AI_CANONNAME) &&
+                       !(hints->ai_flags & AI_NUMERICSERV);
+       if (want_canon) {
+               int err;
+               char *canon;
+
+               if (addr == INADDR_LOOPBACK || addr == INADDR_ANY) {
+                       canon = get_my_canon_name(&err);
+               } else {
+                       canon = get_canon_name_from_addr(ip,&err);
+               }
+               if (canon == NULL) {
+                       freeaddrinfo(entry);
+                       return err;
+               }
+               entry->ai_canonname = canon;
+       }
+
+       *res = entry;
+       return 0;
+}
+
+/*
+ * get address info for multiple ipv4 addresses.
+ *
+ *     Bugs:   - servname can only be a number, not text.
+ */
+
+/*
+ * Resolve `node` with gethostbyname() and build one addrinfo node per
+ * returned address.  Only the first node carries a canonical name.
+ *
+ * service: decimal port number as text, or NULL for port 0.
+ * res:     receives the head of the list on success.
+ * Returns 0 on success or an EAI_* code; on failure any partially
+ * built list has already been released.
+ */
+static int getaddr_info_name(const char *node,
+                               const char *service,
+                               const struct addrinfo *hints,
+                               struct addrinfo **res)
+{
+       struct addrinfo *head = NULL;
+       struct addrinfo **tail = &head;
+       struct hostent *hp;
+       unsigned short port = 0;
+       int status;
+       int idx;
+
+       if (service != NULL) {
+               port = (unsigned short)atoi(service);
+       }
+
+       hp = gethostbyname(node);
+       status = check_hostent_err(hp);
+       if (status != 0) {
+               return status;
+       }
+
+       for (idx = 0; hp->h_addr_list[idx] != NULL; idx++) {
+               struct in_addr ip = *(struct in_addr *)hp->h_addr_list[idx];
+               struct addrinfo *entry = alloc_entry(hints, ip, port);
+               int is_first = (head == NULL);
+
+               if (entry == NULL) {
+                       freeaddrinfo(head);
+                       return EAI_MEMORY;
+               }
+
+               /* append at the end of the list */
+               *tail = entry;
+               tail = &entry->ai_next;
+
+               if (is_first) {
+                       entry->ai_canonname = SMB_STRDUP(hp->h_name);
+                       if (entry->ai_canonname == NULL) {
+                               freeaddrinfo(head);
+                               return EAI_MEMORY;
+                       }
+               }
+       }
+
+       *res = head;
+       return 0;
+}
+
+/*
+ * get address info for ipv4 sockets.
+ *
+ *     Bugs:   - servname can only be a number, not text.
+ */
+
+/*
+ * IPv4-only getaddrinfo() replacement.
+ *
+ * node:    host name or dotted address; "" selects INADDR_ANY; NULL
+ *          selects INADDR_ANY with AI_PASSIVE, otherwise loopback.
+ * service: decimal port number as text, or NULL.
+ * hintp:   optional hints; NULL means AF_INET / SOCK_STREAM.
+ * res:     receives the result list, released with rep_freeaddrinfo().
+ * Returns 0 on success or an EAI_* code.
+ */
+int rep_getaddrinfo(const char *node,
+               const char *service,
+               const struct addrinfo * hintp,
+               struct addrinfo ** res)
+{
+       struct addrinfo hints;
+       struct in_addr ip;
+
+       /* work on a private copy of the hints */
+       if (hintp != NULL) {
+               memcpy(&hints, hintp, sizeof(hints));
+       } else {
+               memset(&hints, 0, sizeof(hints));
+               hints.ai_family = AF_INET;
+               hints.ai_socktype = SOCK_STREAM;
+       }
+
+       if (!(hints.ai_family == AF_INET || hints.ai_family == AF_UNSPEC)) {
+               return EAI_FAMILY;
+       }
+
+       if (hints.ai_socktype == 0) {
+               hints.ai_socktype = SOCK_STREAM;
+       }
+
+       if (node == NULL && service == NULL) {
+               return EAI_NONAME;
+       }
+
+       if (node == NULL) {
+               /* service only: wildcard for passive use, else loopback */
+               if (hints.ai_flags & AI_PASSIVE) {
+                       return getaddr_info_single_addr(service,
+                                       INADDR_ANY, &hints, res);
+               }
+               return getaddr_info_single_addr(service,
+                               INADDR_LOOPBACK, &hints, res);
+       }
+
+       if (node[0] == '\0') {
+               return getaddr_info_single_addr(service,
+                               INADDR_ANY, &hints, res);
+       }
+
+       if (!(hints.ai_flags & AI_NUMERICHOST)) {
+               return getaddr_info_name(node, service, &hints, res);
+       }
+
+       /* numeric host requested: parse without any name lookup */
+       if (!inet_aton(node, &ip)) {
+               return EAI_FAIL;
+       }
+       return getaddr_info_single_addr(service,
+                       ntohl(ip.s_addr), &hints, res);
+}
+
+
+/*
+ * Release a list built by rep_getaddrinfo(): every node together with
+ * its canonical name and socket address.  A NULL list is accepted.
+ */
+void rep_freeaddrinfo(struct addrinfo *res)
+{
+       while (res != NULL) {
+               struct addrinfo *following = res->ai_next;
+
+               /* free(NULL) is a no-op, so absent members need no test */
+               free(res->ai_canonname);
+               free(res->ai_addr);
+               free(res);
+
+               res = following;
+       }
+}
+
+
+/*
+ * Text for an EAI_* code.
+ *
+ * With hstrerror() available the code is mapped to an h_errno value
+ * (anything other than EAI_NONAME / EAI_AGAIN becomes NO_RECOVERY) and
+ * that function supplies the text; otherwise a built-in table is used.
+ */
+const char *rep_gai_strerror(int errcode)
+{
+#ifdef HAVE_HSTRERROR
+       int                     hcode = NO_RECOVERY;
+
+       if (errcode == EAI_NONAME) {
+               hcode = HOST_NOT_FOUND;
+       } else if (errcode == EAI_AGAIN) {
+               hcode = TRY_AGAIN;
+       }
+
+       return hstrerror(hcode);
+#else                                                  /* !HAVE_HSTRERROR */
+
+       if (errcode == EAI_NONAME) {
+               return "Unknown host";
+       }
+       if (errcode == EAI_AGAIN) {
+               return "Host name lookup failure";
+       }
+#ifdef EAI_BADFLAGS
+       if (errcode == EAI_BADFLAGS) {
+               return "Invalid argument";
+       }
+#endif
+#ifdef EAI_FAMILY
+       if (errcode == EAI_FAMILY) {
+               return "Address family not supported";
+       }
+#endif
+#ifdef EAI_MEMORY
+       if (errcode == EAI_MEMORY) {
+               return "Not enough memory";
+       }
+#endif
+#ifdef EAI_NODATA
+       if (errcode == EAI_NODATA) {
+               return "No host data of that type was found";
+       }
+#endif
+#ifdef EAI_SERVICE
+       if (errcode == EAI_SERVICE) {
+               return "Class type not found";
+       }
+#endif
+#ifdef EAI_SOCKTYPE
+       if (errcode == EAI_SOCKTYPE) {
+               return "Socket type not supported";
+       }
+#endif
+       return "Unknown server error";
+#endif   /* HAVE_HSTRERROR */
+}
+
+/*
+ * Write the host part for getnameinfo() into `node`.
+ *
+ * A reverse lookup is attempted unless NI_NUMERICHOST is set.  When it
+ * fails, NI_NAMEREQD turns that into an error; otherwise the dotted
+ * address is written instead.  NI_NOFQDN cuts a looked-up name at its
+ * first dot.
+ * Returns 0, the lookup's EAI_* code, or EAI_MEMORY when `node` is too
+ * small.
+ */
+static int gethostnameinfo(const struct sockaddr *sa,
+                       char *node,
+                       size_t nodelen,
+                       int flags)
+{
+       const struct sockaddr_in *sin = (const struct sockaddr_in *)sa;
+       int written;
+
+       if ((flags & NI_NUMERICHOST) == 0) {
+               struct hostent *hp = gethostbyaddr(&sin->sin_addr,
+                                                  sizeof(struct in_addr),
+                                                  sa->sa_family);
+               int lookup = check_hostent_err(hp);
+
+               if (lookup != 0) {
+                       if (flags & NI_NAMEREQD) {
+                               /* a name was demanded and none exists */
+                               return lookup;
+                       }
+                       /* otherwise continue with the numeric form */
+               } else {
+                       written = snprintf(node, nodelen, "%s", hp->h_name);
+                       if (written < 0 || (size_t)written >= nodelen) {
+                               return EAI_MEMORY;
+                       }
+                       if (flags & NI_NOFQDN) {
+                               char *dot = strchr(node,'.');
+                               if (dot != NULL) {
+                                       *dot = '\0';
+                               }
+                       }
+                       return 0;
+               }
+       }
+
+       written = snprintf(node, nodelen, "%s", inet_ntoa(sin->sin_addr));
+       if (written < 0 || (size_t)written >= nodelen) {
+               return EAI_MEMORY;
+       }
+       return 0;
+}
+
+/*
+ * Write the service part for getnameinfo() into `service`.
+ *
+ * sa:         IPv4 socket address whose port is reported.
+ * service:    output buffer of `servicelen` bytes.
+ * flags:      NI_NUMERICSERV skips the name lookup; NI_DGRAM selects
+ *             the "udp" table instead of "tcp".
+ * Returns 0 on success, EAI_MEMORY when the buffer is too small.
+ */
+static int getservicenameinfo(const struct sockaddr *sa,
+                       char *service,
+                       size_t servicelen,
+                       int flags)
+{
+       int ret = -1;
+       const struct sockaddr_in *sin = (const struct sockaddr_in *)sa;
+       int port = ntohs(sin->sin_port);
+
+       if (!(flags & NI_NUMERICSERV)) {
+               /*
+                * getservbyport() wants the port in network byte order,
+                * so pass sin_port as stored and not the converted
+                * host-order value.
+                */
+               struct servent *se = getservbyport(
+                               sin->sin_port,
+                               (flags & NI_DGRAM) ? "udp" : "tcp");
+               if (se && se->s_name) {
+                       /* Service name looked up successfully. */
+                       ret = snprintf(service, servicelen, "%s", se->s_name);
+                       if (ret < 0 || (size_t)ret >= servicelen) {
+                               return EAI_MEMORY;
+                       }
+                       return 0;
+               }
+               /* Otherwise just fall into the numeric service code... */
+       }
+       ret = snprintf(service, servicelen, "%d", port);
+       if (ret < 0 || (size_t)ret >= servicelen) {
+               return EAI_MEMORY;
+       }
+       return 0;
+}
+
+/*
+ * Convert an ipv4 address to a hostname.
+ *
+ * Bugs:       - No IPv6 support.
+ */
+/*
+ * sa/salen:           IPv4 socket address to translate.
+ * node/nodelen:       optional buffer for the host name or address.
+ * service/servicelen: optional buffer for the service name or port.
+ * flags:              NI_* flags passed on to the two helpers.
+ *
+ * Returns 0 on success or an EAI_* code.  When both buffers are given
+ * both are filled; the host part is produced first and its failure is
+ * reported without touching the service buffer.
+ */
+int rep_getnameinfo(const struct sockaddr *sa, socklen_t salen,
+                       char *node, size_t nodelen,
+                       char *service, size_t servicelen, int flags)
+{
+
+       /* Invalid arguments. */
+       if (sa == NULL || (node == NULL && service == NULL)) {
+               return EAI_FAIL;
+       }
+
+       if (sa->sa_family != AF_INET) {
+               return EAI_FAIL;
+       }
+
+       if (salen < sizeof(struct sockaddr_in)) {
+               return EAI_FAIL;
+       }
+
+       if (node) {
+               /* do not return yet: the service may be wanted as well */
+               int ret = gethostnameinfo(sa, node, nodelen, flags);
+               if (ret != 0) {
+                       return ret;
+               }
+       }
+
+       if (service) {
+               return getservicenameinfo(sa, service, servicelen, flags);
+       }
+       return 0;
+}
diff --git a/ctdb/lib/replace/getaddrinfo.h b/ctdb/lib/replace/getaddrinfo.h
new file mode 100644 (file)
index 0000000..cf040da
--- /dev/null
@@ -0,0 +1,91 @@
+/*
+PostgreSQL Database Management System
+(formerly known as Postgres, then as Postgres95)
+
+Portions Copyright (c) 1996-2005, The PostgreSQL Global Development Group
+
+Portions Copyright (c) 1994, The Regents of the University of California
+
+Permission to use, copy, modify, and distribute this software and its
+documentation for any purpose, without fee, and without a written agreement
+is hereby granted, provided that the above copyright notice and this paragraph
+and the following two paragraphs appear in all copies.
+
+IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR
+DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
+LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION,
+EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGE.
+
+THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
+ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS
+TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+
+*/
+
+/*-------------------------------------------------------------------------
+ *
+ * getaddrinfo.h
+ *       Support getaddrinfo() on platforms that don't have it.
+ *
+ * Note: we use our own routines on platforms that don't HAVE_STRUCT_ADDRINFO,
+ * whether or not the library routine getaddrinfo() can be found.  This
+ * policy is needed because on some platforms a manually installed libbind.a
+ * may provide getaddrinfo(), yet the system headers may not provide the
+ * struct definitions needed to call it.  To avoid conflict with the libbind
+ * definition in such cases, we rename our routines to pg_xxx() via macros.
+ *
+ * In lib/replace the routines are renamed to rep_xxx() instead of
+ * pg_xxx().
+ *
+ * This code will also work on platforms where struct addrinfo is defined
+ * in the system headers but no getaddrinfo() can be located.
+ *
+ * Copyright (c) 2003-2007, PostgreSQL Global Development Group
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef GETADDRINFO_H
+#define GETADDRINFO_H
+
+#ifndef HAVE_GETADDRINFO
+
+/* Rename private copies per comments above */
+/*
+ * Each group below drops any existing macro of that name, points the
+ * standard name at the rep_ implementation and defines the matching
+ * HAVE_ macro so later code sees the function as present.
+ */
+#ifdef getaddrinfo
+#undef getaddrinfo
+#endif
+#define getaddrinfo rep_getaddrinfo
+#define HAVE_GETADDRINFO
+
+#ifdef freeaddrinfo
+#undef freeaddrinfo
+#endif
+#define freeaddrinfo rep_freeaddrinfo
+#define HAVE_FREEADDRINFO
+
+#ifdef gai_strerror
+#undef gai_strerror
+#endif
+#define gai_strerror rep_gai_strerror
+#define HAVE_GAI_STRERROR
+
+#ifdef getnameinfo
+#undef getnameinfo
+#endif
+#define getnameinfo rep_getnameinfo
+/*
+ * NOTE(review): only this HAVE_ macro is guarded against redefinition;
+ * presumably HAVE_GETNAMEINFO can already be set while HAVE_GETADDRINFO
+ * is not - confirm whether the other two need the same guard.
+ */
+#ifndef HAVE_GETNAMEINFO
+#define HAVE_GETNAMEINFO
+#endif
+
+/* Prototypes of the replacement routines the macros above refer to. */
+extern int rep_getaddrinfo(const char *node, const char *service,
+                       const struct addrinfo * hints, struct addrinfo ** res);
+extern void rep_freeaddrinfo(struct addrinfo * res);
+extern const char *rep_gai_strerror(int errcode);
+extern int rep_getnameinfo(const struct sockaddr * sa, socklen_t salen,
+                       char *node, size_t nodelen,
+                       char *service, size_t servicelen, int flags);
+#endif   /* HAVE_GETADDRINFO */
+
+#endif   /* GETADDRINFO_H */
diff --git a/ctdb/lib/replace/getifaddrs.c b/ctdb/lib/replace/getifaddrs.c
new file mode 100644 (file)
index 0000000..84d7906
--- /dev/null
@@ -0,0 +1,352 @@
+/* 
+   Unix SMB/CIFS implementation.
+   Samba utility functions
+   Copyright (C) Andrew Tridgell 1998
+   Copyright (C) Jeremy Allison 2007
+   Copyright (C) Jelmer Vernooij <jelmer@samba.org> 2007
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#define SOCKET_WRAPPER_NOT_REPLACE
+
+#include "replace.h"
+#include "system/network.h"
+
+#include <unistd.h>
+#include <stdio.h>
+#include <sys/types.h>
+
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+
+#ifndef SIOCGIFCONF
+#ifdef HAVE_SYS_SOCKIO_H
+#include <sys/sockio.h>
+#endif
+#endif
+
+#ifdef HAVE_IFACE_GETIFADDRS
+#define _FOUND_IFACE_ANY
+#else
+
+/*
+ * Release one ifaddrs node with its name and address members, and hand
+ * the remainder of the list to freeifaddrs().  NULL is accepted.
+ */
+void rep_freeifaddrs(struct ifaddrs *ifp)
+{
+       struct ifaddrs *rest;
+
+       if (ifp == NULL) {
+               return;
+       }
+
+       /* remember the tail before the node itself goes away */
+       rest = ifp->ifa_next;
+
+       free(ifp->ifa_name);
+       free(ifp->ifa_addr);
+       free(ifp->ifa_netmask);
+       free(ifp->ifa_dstaddr);
+       free(ifp);
+
+       freeifaddrs(rest);
+}
+
+/*
+ * Heap copy of a socket address.
+ *
+ * sa: address to copy; every caller in this file passes the ifr_addr
+ *     member of a struct ifreq, i.e. an object of type struct sockaddr.
+ * Returns a zero-initialised allocation holding the copy (released
+ * with free()), or NULL when the allocation fails.
+ *
+ * With sa_len the length recorded in the address is used.  Without it
+ * the allocation is still sizeof(struct sockaddr_storage), but only
+ * sizeof(struct sockaddr) bytes are copied: reading a full
+ * sockaddr_storage from a struct sockaddr runs past the source object.
+ */
+static struct sockaddr *sockaddr_dup(struct sockaddr *sa)
+{
+       struct sockaddr *ret;
+       socklen_t socklen;
+       socklen_t copylen;
+#ifdef HAVE_SOCKADDR_SA_LEN
+       socklen = sa->sa_len;
+       copylen = socklen;
+#else
+       socklen = sizeof(struct sockaddr_storage);
+       copylen = sizeof(struct sockaddr);
+#endif
+       ret = calloc(1, socklen);
+       if (ret == NULL)
+               return NULL;
+       memcpy(ret, sa, copylen);
+       return ret;
+}
+#endif
+
+#if HAVE_IFACE_IFCONF
+
+/* this works for Linux 2.2, Solaris 2.5, SunOS4, HPUX 10.20, OSF1
+   V4.0, Ultrix 4.4, SCO Unix 3.2, IRIX 6.4 and FreeBSD 3.2.
+
+   It probably also works on any BSD style system.  */
+
+/*
+ * getifaddrs() replacement built on the SIOCGIFCONF ioctl.
+ *
+ * ifap: receives the head of a newly allocated list, to be released
+ *       with freeifaddrs(); it is NULL whenever -1 is returned.
+ * Returns 0 on success, -1 on failure with errno describing the first
+ * error.  The socket used for the queries is closed on every path.
+ */
+int rep_getifaddrs(struct ifaddrs **ifap)
+{
+       struct ifconf ifc;
+       char buff[8192];
+       int fd, i, n;
+       int saved_errno;
+       struct ifreq *ifr=NULL;
+       struct ifaddrs *curif;
+       struct ifaddrs *lastif = NULL;
+
+       *ifap = NULL;
+
+       if ((fd = socket(AF_INET, SOCK_DGRAM, 0)) == -1) {
+               return -1;
+       }
+
+       ifc.ifc_len = sizeof(buff);
+       ifc.ifc_buf = buff;
+
+       if (ioctl(fd, SIOCGIFCONF, &ifc) != 0) {
+               close(fd);
+               return -1;
+       }
+
+       ifr = ifc.ifc_req;
+
+       n = ifc.ifc_len / sizeof(struct ifreq);
+
+       /* Walk the returned records from last to first */
+       for (i=n-1; i>=0; i--) {
+               if (ioctl(fd, SIOCGIFFLAGS, &ifr[i]) == -1) {
+                       goto fail;
+               }
+
+               curif = calloc(1, sizeof(struct ifaddrs));
+               if (curif == NULL) {
+                       goto fail;
+               }
+
+               /* link the node right away so the failure path frees it */
+               if (lastif == NULL) {
+                       *ifap = curif;
+               } else {
+                       lastif->ifa_next = curif;
+               }
+               lastif = curif;
+
+               curif->ifa_name = strdup(ifr[i].ifr_name);
+               if (curif->ifa_name == NULL) {
+                       goto fail;
+               }
+               curif->ifa_flags = ifr[i].ifr_flags;
+               curif->ifa_dstaddr = NULL;
+               curif->ifa_data = NULL;
+               curif->ifa_next = NULL;
+
+               /* address and netmask stay NULL if their query fails */
+               curif->ifa_addr = NULL;
+               if (ioctl(fd, SIOCGIFADDR, &ifr[i]) != -1) {
+                       curif->ifa_addr = sockaddr_dup(&ifr[i].ifr_addr);
+               }
+
+               curif->ifa_netmask = NULL;
+               if (ioctl(fd, SIOCGIFNETMASK, &ifr[i]) != -1) {
+                       curif->ifa_netmask = sockaddr_dup(&ifr[i].ifr_addr);
+               }
+       }
+
+       close(fd);
+
+       return 0;
+
+fail:
+       /* keep the original error visible to the caller */
+       saved_errno = errno;
+       freeifaddrs(*ifap);
+       *ifap = NULL;
+       close(fd);
+       errno = saved_errno;
+       return -1;
+}  
+
+#define _FOUND_IFACE_ANY
+#endif /* HAVE_IFACE_IFCONF */
+#ifdef HAVE_IFACE_IFREQ
+
+#ifndef I_STR
+#include <sys/stropts.h>
+#endif
+
+/****************************************************************************
+this should cover most of the streams based systems
+Thanks to Andrej.Borsenkow@mow.siemens.ru for several ideas in this code
+****************************************************************************/
+/*
+ * getifaddrs() replacement that issues its queries through I_STR
+ * ioctls.
+ *
+ * ifap: receives the head of a newly allocated list, to be released
+ *       with freeifaddrs(); it is NULL whenever -1 is returned.
+ * Returns 0 on success, -1 on failure with errno describing the first
+ * error.  The socket used for the queries is closed on every path.
+ */
+int rep_getifaddrs(struct ifaddrs **ifap)
+{
+       struct ifreq ifreq;
+       struct strioctl strioctl;
+       char buff[8192];
+       int fd, i, n;
+       int saved_errno;
+       struct ifreq *ifr=NULL;
+       struct ifaddrs *curif;
+       struct ifaddrs *lastif = NULL;
+
+       *ifap = NULL;
+
+       if ((fd = socket(AF_INET, SOCK_DGRAM, 0)) == -1) {
+               return -1;
+       }
+
+       strioctl.ic_cmd = SIOCGIFCONF;
+       strioctl.ic_dp  = buff;
+       strioctl.ic_len = sizeof(buff);
+       if (ioctl(fd, I_STR, &strioctl) < 0) {
+               close(fd);
+               return -1;
+       }
+
+       /* we can ignore the possible sizeof(int) here as the resulting
+          number of interface structures won't change */
+       n = strioctl.ic_len / sizeof(struct ifreq);
+
+       /* we will assume that the kernel returns the length as an int
+           at the start of the buffer if the offered size is a
+           multiple of the structure size plus an int */
+       if (n*sizeof(struct ifreq) + sizeof(int) == strioctl.ic_len) {
+               ifr = (struct ifreq *)(buff + sizeof(int));
+       } else {
+               ifr = (struct ifreq *)buff;
+       }
+
+       /* Loop through interfaces */
+
+       for (i = 0; i<n; i++) {
+               /* queries run on a private copy of the record */
+               ifreq = ifr[i];
+
+               curif = calloc(1, sizeof(struct ifaddrs));
+               if (curif == NULL) {
+                       goto fail;
+               }
+               /* link the node right away so the failure path frees it */
+               if (lastif == NULL) {
+                       *ifap = curif;
+               } else {
+                       lastif->ifa_next = curif;
+               }
+
+               strioctl.ic_cmd = SIOCGIFFLAGS;
+               strioctl.ic_dp  = (char *)&ifreq;
+               strioctl.ic_len = sizeof(struct ifreq);
+               if (ioctl(fd, I_STR, &strioctl) != 0) {
+                       goto fail;
+               }
+
+               curif->ifa_flags = ifreq.ifr_flags;
+
+               strioctl.ic_cmd = SIOCGIFADDR;
+               strioctl.ic_dp  = (char *)&ifreq;
+               strioctl.ic_len = sizeof(struct ifreq);
+               if (ioctl(fd, I_STR, &strioctl) != 0) {
+                       goto fail;
+               }
+
+               curif->ifa_name = strdup(ifreq.ifr_name);
+               curif->ifa_addr = sockaddr_dup(&ifreq.ifr_addr);
+               curif->ifa_dstaddr = NULL;
+               curif->ifa_data = NULL;
+               curif->ifa_next = NULL;
+               curif->ifa_netmask = NULL;
+
+               strioctl.ic_cmd = SIOCGIFNETMASK;
+               strioctl.ic_dp  = (char *)&ifreq;
+               strioctl.ic_len = sizeof(struct ifreq);
+               if (ioctl(fd, I_STR, &strioctl) != 0) {
+                       goto fail;
+               }
+
+               curif->ifa_netmask = sockaddr_dup(&ifreq.ifr_addr);
+
+               lastif = curif;
+       }
+
+       close(fd);
+
+       return 0;
+
+fail:
+       /* keep the original error visible to the caller */
+       saved_errno = errno;
+       freeifaddrs(*ifap);
+       *ifap = NULL;
+       close(fd);
+       errno = saved_errno;
+       return -1;
+}
+
+#define _FOUND_IFACE_ANY
+#endif /* HAVE_IFACE_IFREQ */
+#ifdef HAVE_IFACE_AIX
+
+/****************************************************************************
+this one is for AIX (tested on 4.2)
+****************************************************************************/
+/*
+ * getifaddrs() replacement for AIX, where the SIOCGIFCONF records have
+ * variable length.
+ *
+ * ifap: receives the head of a newly allocated list, to be released
+ *       with freeifaddrs(); it is NULL whenever -1 is returned.
+ * Returns 0 on success, -1 on failure with errno describing the first
+ * error.  The socket used for the queries is closed on every path.
+ */
+int rep_getifaddrs(struct ifaddrs **ifap)
+{
+       char buff[8192];
+       int fd, i;
+       int saved_errno;
+       struct ifconf ifc;
+       struct ifreq *ifr=NULL;
+       struct ifaddrs *curif;
+       struct ifaddrs *lastif = NULL;
+
+       *ifap = NULL;
+
+       if ((fd = socket(AF_INET, SOCK_DGRAM, 0)) == -1) {
+               return -1;
+       }
+
+       ifc.ifc_len = sizeof(buff);
+       ifc.ifc_buf = buff;
+
+       if (ioctl(fd, SIOCGIFCONF, &ifc) != 0) {
+               close(fd);
+               return -1;
+       }
+
+       ifr = ifc.ifc_req;
+
+       /* Loop through interfaces */
+       i = ifc.ifc_len;
+
+       while (i > 0) {
+               unsigned int inc;
+
+               /* read the record length before the ioctls below
+                  overwrite the record */
+               inc = ifr->ifr_addr.sa_len;
+
+               if (ioctl(fd, SIOCGIFADDR, ifr) != 0) {
+                       goto fail;
+               }
+
+               curif = calloc(1, sizeof(struct ifaddrs));
+               if (curif == NULL) {
+                       goto fail;
+               }
+               /* link the node right away so the failure path frees it */
+               if (lastif == NULL) {
+                       *ifap = curif;
+               } else {
+                       lastif->ifa_next = curif;
+               }
+
+               curif->ifa_name = strdup(ifr->ifr_name);
+               curif->ifa_addr = sockaddr_dup(&ifr->ifr_addr);
+               curif->ifa_dstaddr = NULL;
+               curif->ifa_data = NULL;
+               curif->ifa_netmask = NULL;
+               curif->ifa_next = NULL;
+
+               if (ioctl(fd, SIOCGIFFLAGS, ifr) != 0) {
+                       goto fail;
+               }
+
+               curif->ifa_flags = ifr->ifr_flags;
+
+               if (ioctl(fd, SIOCGIFNETMASK, ifr) != 0) {
+                       goto fail;
+               }
+
+               curif->ifa_netmask = sockaddr_dup(&ifr->ifr_addr);
+
+               lastif = curif;
+
+               /*
+                * Patch from Archie Cobbs (archie@whistle.com).  The
+                * addresses in the SIOCGIFCONF interface list have a
+                * minimum size. Usually this doesn't matter, but if
+                * your machine has tunnel interfaces, etc. that have
+                * a zero length "link address", this does matter.  */
+
+               if (inc < sizeof(ifr->ifr_addr))
+                       inc = sizeof(ifr->ifr_addr);
+               inc += IFNAMSIZ;
+
+               ifr = (struct ifreq*) (((char*) ifr) + inc);
+               i -= inc;
+       }
+
+       close(fd);
+       return 0;
+
+fail:
+       /* the list holds ifaddrs nodes, so it must be released with
+          freeifaddrs() and not freeaddrinfo() */
+       saved_errno = errno;
+       freeifaddrs(*ifap);
+       *ifap = NULL;
+       close(fd);
+       errno = saved_errno;
+       return -1;
+}
+
+#define _FOUND_IFACE_ANY
+#endif /* HAVE_IFACE_AIX */
+#ifndef _FOUND_IFACE_ANY
+/*
+ * Stub used when none of the variants above was compiled in
+ * (_FOUND_IFACE_ANY undefined): always fails with ENOSYS and leaves
+ * *ifap untouched.
+ */
+int rep_getifaddrs(struct ifaddrs **ifap)
+{
+       const int unsupported = ENOSYS;
+
+       errno = unsupported;
+       return -1;
+}
+#endif
diff --git a/ctdb/lib/replace/hdr_replace.h b/ctdb/lib/replace/hdr_replace.h
new file mode 100644 (file)
index 0000000..6cfa50f
--- /dev/null
@@ -0,0 +1,2 @@
+/* this is a replacement header for a missing system header */
+#include "replace.h"
diff --git a/ctdb/lib/replace/inet_aton.c b/ctdb/lib/replace/inet_aton.c
new file mode 100644 (file)
index 0000000..c6b3bb1
--- /dev/null
@@ -0,0 +1,33 @@
+/*
+ * Unix SMB/CIFS implementation.
+ * replacement functions
+ * Copyright (C) Michael Adam <obnox@samba.org> 2008
+ *
+ *  ** NOTE! The following LGPL license applies to the replace
+ *  ** library. This does NOT imply that all of Samba is released
+ *  ** under the LGPL
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "replace.h"
+#include "system/network.h"
+
+/**
+ * Parse the IPv4 address text in src into *dst by delegating to
+ * inet_pton(), which earlier libreplace checks guarantee is available.
+ *
+ * Returns 1 when inet_pton() reports success and 0 otherwise.
+ */
+int rep_inet_aton(const char *src, struct in_addr *dst)
+{
+       if (inet_pton(AF_INET, src, dst) > 0) {
+               return 1;
+       }
+       return 0;
+}
diff --git a/ctdb/lib/replace/inet_ntoa.c b/ctdb/lib/replace/inet_ntoa.c
new file mode 100644 (file)
index 0000000..e3b80eb
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * Unix SMB/CIFS implementation.
+ * replacement routines for broken systems
+ * Copyright (C) Andrew Tridgell 2003
+ * Copyright (C) Michael Adam 2008
+ *
+ *  ** NOTE! The following LGPL license applies to the replace
+ *  ** library. This does NOT imply that all of Samba is released
+ *  ** under the LGPL
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "replace.h"
+#include "system/network.h"
+
+/**
+ * Format ip as "a.b.c.d" into a static buffer and return that buffer.
+ *
+ * NOTE: this is not thread safe, and cannot be: every call overwrites
+ * the same static storage that the previous call returned.
+ */
+char *rep_inet_ntoa(struct in_addr ip)
+{
+       static char buf[18];
+       const uint8_t *octet = (const uint8_t *)&ip.s_addr;
+       int a = octet[0];
+       int b = octet[1];
+       int c = octet[2];
+       int d = octet[3];
+
+       /* bytes are printed in memory order, i.e. as stored in s_addr */
+       slprintf(buf, 17, "%d.%d.%d.%d", a, b, c, d);
+       return buf;
+}
diff --git a/ctdb/lib/replace/inet_ntop.c b/ctdb/lib/replace/inet_ntop.c
new file mode 100644 (file)
index 0000000..fb3d8e9
--- /dev/null
@@ -0,0 +1,191 @@
+/*
+ * Copyright (C) 1996-2001  Internet Software Consortium.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM
+ * DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
+ * INTERNET SOFTWARE CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT,
+ * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
+ * FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
+ * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
+ * WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+
+#include "replace.h"
+#include "system/network.h"
+
+#define NS_INT16SZ      2
+#define NS_IN6ADDRSZ   16
+
+/*
+ * WARNING: Don't even consider trying to compile this on a system where
+ * sizeof(int) < 4.  sizeof(int) > 4 is fine; all the world's not a VAX.
+ */
+
+static const char *inet_ntop4(const unsigned char *src, char *dst,
+                             socklen_t size);
+
+#ifdef AF_INET6
+static const char *inet_ntop6(const unsigned char *src, char *dst,
+                             socklen_t size);
+#endif
+
+/* const char *
+ * rep_inet_ntop(af, src, dst, size)
+ *     convert a network format address to presentation format by
+ *     dispatching on the address family.
+ * return:
+ *     pointer to presentation format address (`dst'), or NULL (see errno).
+ *     An unsupported `af' yields NULL with errno = EAFNOSUPPORT.
+ * author:
+ *     Paul Vixie, 1996.
+ */
+const char *
+rep_inet_ntop(int af, const void *src, char *dst, socklen_t size)
+{
+       if (af == AF_INET) {
+               return inet_ntop4(src, dst, size);
+       }
+#ifdef AF_INET6
+       if (af == AF_INET6) {
+               return inet_ntop6(src, dst, size);
+       }
+#endif
+       /* no formatter for this address family */
+       errno = EAFNOSUPPORT;
+       return NULL;
+}
+
+/* const char *
+ * inet_ntop4(src, dst, size)
+ *     format the four bytes at `src' as a dotted-decimal IPv4 address
+ * return:
+ *     `dst' (as a const), or NULL with errno = ENOSPC when the text and
+ *     its terminating NUL do not fit into `size' bytes
+ * notes:
+ *     (1) keeps no state between calls
+ *     (2) takes a unsigned char* not an in_addr as input
+ * author:
+ *     Paul Vixie, 1996.
+ */
+static const char *
+inet_ntop4(const unsigned char *src, char *dst, socklen_t size)
+{
+       char quad[sizeof "255.255.255.255"];
+       size_t used;
+
+       /* render into scratch space first so `dst' is untouched on failure */
+       used = snprintf(quad, sizeof quad, "%u.%u.%u.%u",
+                       src[0], src[1], src[2], src[3]);
+       if (used < size) {
+               memcpy(dst, quad, used + 1);    /* + 1 copies the NUL */
+               return (dst);
+       }
+
+       errno = ENOSPC;
+       return (NULL);
+}
+
+/* const char *
+ * inet_ntop6(src, dst, size)
+ *     convert IPv6 binary address into presentation (printable) format
+ * return:
+ *     `dst', or NULL when the formatted text (including its NUL) needs
+ *     more than `size' bytes (errno = ENOSPC) or when formatting an
+ *     embedded IPv4 tail fails.
+ * notes:
+ *     the longest run of two or more zero words is written as "::";
+ *     the text is built in a local buffer and copied to `dst' only on
+ *     success.
+ * author:
+ *     Paul Vixie, 1996.
+ */
+#ifdef AF_INET6
+static const char *
+inet_ntop6(const unsigned char *src, char *dst, socklen_t size)
+{
+       /*
+        * Note that int32_t and int16_t need only be "at least" large enough
+        * to contain a value of the specified size.  On some systems, like
+        * Crays, there is no such thing as an integer variable with 16 bits.
+        * Keep this in mind if you think this function should have been coded
+        * to use pointer overlays.  All the world's not a VAX.
+        */
+       char tmp[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"], *tp;
+       /* best/cur: longest and current run of zero words (-1 base = none) */
+       struct { int base, len; } best, cur;
+       unsigned int words[NS_IN6ADDRSZ / NS_INT16SZ];
+       int i, inc;
+
+       /*
+        * Preprocess:
+        *      Copy the input (bytewise) array into a wordwise array.
+        *      Find the longest run of 0x00's in src[] for :: shorthanding.
+        */
+       memset(words, '\0', sizeof words);
+       /* even bytes land in the high half of a word, odd bytes in the low */
+       for (i = 0; i < NS_IN6ADDRSZ; i++)
+               words[i / 2] |= (src[i] << ((1 - (i % 2)) << 3));
+       best.base = -1;
+       best.len = 0;
+       cur.base = -1;
+       cur.len = 0;
+       for (i = 0; i < (NS_IN6ADDRSZ / NS_INT16SZ); i++) {
+               if (words[i] == 0) {
+                       if (cur.base == -1)
+                               cur.base = i, cur.len = 1;
+                       else
+                               cur.len++;
+               } else {
+                       /* a non-zero word closes the current run, if any */
+                       if (cur.base != -1) {
+                               if (best.base == -1 || cur.len > best.len)
+                                       best = cur;
+                               cur.base = -1;
+                       }
+               }
+       }
+       /* account for a run that extends to the last word */
+       if (cur.base != -1) {
+               if (best.base == -1 || cur.len > best.len)
+                       best = cur;
+       }
+       /* a single zero word is not worth abbreviating */
+       if (best.base != -1 && best.len < 2)
+               best.base = -1;
+
+       /*
+        * Format the result.
+        */
+       tp = tmp;
+       for (i = 0; i < (NS_IN6ADDRSZ / NS_INT16SZ); i++) {
+               /* Are we inside the best run of 0x00's? */
+               if (best.base != -1 && i >= best.base &&
+                   i < (best.base + best.len)) {
+                       if (i == best.base)
+                               *tp++ = ':';
+                       continue;
+               }
+               /* Are we following an initial run of 0x00s or any real hex? */
+               if (i != 0)
+                       *tp++ = ':';
+               /* Is this address an encapsulated IPv4? */
+               /* i.e. six leading zero words, or five followed by 0xffff */
+               if (i == 6 && best.base == 0 &&
+                   (best.len == 6 || (best.len == 5 && words[5] == 0xffff))) {
+                       if (!inet_ntop4(src+12, tp, sizeof tmp - (tp - tmp)))
+                               return (NULL);
+                       tp += strlen(tp);
+                       break;
+               }
+               /* each word holds 16 bits, so at most 4 hex digits plus NUL */
+               inc = snprintf(tp, 5, "%x", words[i]);
+               if (inc >= 5) {
+                       abort();
+               }
+               tp += inc;
+       }
+       /* Was it a trailing run of 0x00's? */
+       if (best.base != -1 && (best.base + best.len) ==
+           (NS_IN6ADDRSZ / NS_INT16SZ))
+               *tp++ = ':';
+       *tp++ = '\0';
+
+       /*
+        * Check for overflow, copy, and we're done.
+        */
+       /* tp - tmp counts the terminating NUL written just above */
+       if ((size_t)(tp - tmp) > size) {
+               errno = ENOSPC;
+               return (NULL);
+       }
+       memcpy(dst, tmp, tp - tmp);
+       return (dst);
+}
+#endif /* AF_INET6 */
diff --git a/ctdb/lib/replace/inet_pton.c b/ctdb/lib/replace/inet_pton.c
new file mode 100644 (file)
index 0000000..80e4865
--- /dev/null
@@ -0,0 +1,213 @@
+/*
+ * Copyright (C) 1996-2001  Internet Software Consortium.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM
+ * DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
+ * INTERNET SOFTWARE CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT,
+ * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING
+ * FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
+ * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
+ * WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "replace.h"
+#include "system/network.h"
+
+#define NS_INT16SZ      2
+#define NS_INADDRSZ     4
+#define NS_IN6ADDRSZ   16
+
+/*
+ * WARNING: Don't even consider trying to compile this on a system where
+ * sizeof(int) < 4.  sizeof(int) > 4 is fine; all the world's not a VAX.
+ */
+
+static int inet_pton4(const char *src, unsigned char *dst);
+#ifdef AF_INET6
+static int inet_pton6(const char *src, unsigned char *dst);
+#endif
+
+/* int
+ * rep_inet_pton(af, src, dst)
+ *     convert from presentation format (which usually means ASCII printable)
+ *     to network format (which is usually some kind of binary format),
+ *     dispatching on the address family.
+ * return:
+ *     1 if the address was valid for the specified address family
+ *     0 if the address wasn't valid (`dst' is untouched in this case)
+ *     -1 if `af' is not supported; errno is set to EAFNOSUPPORT
+ *        (`dst' is untouched in this case, too)
+ * author:
+ *     Paul Vixie, 1996.
+ */
+int
+rep_inet_pton(int af, const char *src, void *dst)
+{
+       if (af == AF_INET) {
+               return inet_pton4(src, dst);
+       }
+#ifdef AF_INET6
+       if (af == AF_INET6) {
+               return inet_pton6(src, dst);
+       }
+#endif
+       /* no parser for this address family */
+       errno = EAFNOSUPPORT;
+       return -1;
+}
+
+/* int
+ * inet_pton4(src, dst)
+ *     like inet_aton() but without all the hexadecimal and shorthand.
+ * return:
+ *     1 if `src' is a valid dotted quad, else 0.
+ * notice:
+ *     does not touch `dst' unless it's returning 1.
+ * author:
+ *     Paul Vixie, 1996.
+ */
+static int
+inet_pton4(const char *src, unsigned char *dst)
+{
+       static const char digits[] = "0123456789";
+       int saw_digit, octets, ch;
+       unsigned char tmp[NS_INADDRSZ], *tp;
+
+       saw_digit = 0;
+       octets = 0;
+       *(tp = tmp) = 0;
+       while ((ch = *src++) != '\0') {
+               const char *pch;
+
+               if ((pch = strchr(digits, ch)) != NULL) {
+                       /* accumulate the current octet in decimal */
+                       unsigned int value = *tp * 10 + (pch - digits);
+
+                       if (value > 255)
+                               return (0);
+                       *tp = value;
+                       if (! saw_digit) {
+                               /* first digit of a new octet */
+                               if (++octets > 4)
+                                       return (0);
+                               saw_digit = 1;
+                       }
+               } else if (ch == '.' && saw_digit) {
+                       /* a '.' after the fourth octet would start a fifth */
+                       if (octets == 4)
+                               return (0);
+                       *++tp = 0;
+                       saw_digit = 0;
+               } else
+                       return (0);
+       }
+       if (octets < 4)
+               return (0);
+       /* parsed into scratch space; publish only a complete address */
+       memcpy(dst, tmp, NS_INADDRSZ);
+       return (1);
+}
+
+/* int
+ * inet_pton6(src, dst)
+ *     convert presentation level address to network order binary form.
+ * return:
+ *     1 if `src' is a valid [RFC1884 2.2] address, else 0.
+ * notice:
+ *     (1) does not touch `dst' unless it's returning 1.
+ *     (2) :: in a full address is silently ignored.
+ *     (3) an address ending in a single ':' (e.g. "1::2:") is rejected.
+ * credit:
+ *     inspired by Mark Andrews.
+ * author:
+ *     Paul Vixie, 1996.
+ */
+#ifdef AF_INET6
+static int
+inet_pton6(const char *src, unsigned char *dst)
+{
+       static const char xdigits_l[] = "0123456789abcdef",
+                         xdigits_u[] = "0123456789ABCDEF";
+       unsigned char tmp[NS_IN6ADDRSZ], *tp, *endp, *colonp;
+       const char *xdigits, *curtok;
+       int ch, saw_xdigit;
+       unsigned int val;
+
+       memset((tp = tmp), '\0', NS_IN6ADDRSZ);
+       endp = tp + NS_IN6ADDRSZ;
+       colonp = NULL;          /* position of "::" in tmp, once seen */
+       /* Leading :: requires some special handling. */
+       if (*src == ':')
+               if (*++src != ':')
+                       return (0);
+       curtok = src;           /* start of the current group, for IPv4 tails */
+       saw_xdigit = 0;
+       val = 0;
+       while ((ch = *src++) != '\0') {
+               const char *pch;
+
+               if ((pch = strchr((xdigits = xdigits_l), ch)) == NULL)
+                       pch = strchr((xdigits = xdigits_u), ch);
+               if (pch != NULL) {
+                       val <<= 4;
+                       val |= (pch - xdigits);
+                       if (val > 0xffff)
+                               return (0);
+                       saw_xdigit = 1;
+                       continue;
+               }
+               if (ch == ':') {
+                       curtok = src;
+                       if (!saw_xdigit) {
+                               /* "::" -- allowed only once */
+                               if (colonp)
+                                       return (0);
+                               colonp = tp;
+                               continue;
+                       }
+                       /*
+                        * A group terminated by ':' must be followed by
+                        * more input; a dangling ':' is malformed.
+                        */
+                       if (*src == '\0')
+                               return (0);
+                       if (tp + NS_INT16SZ > endp)
+                               return (0);
+                       *tp++ = (unsigned char) (val >> 8) & 0xff;
+                       *tp++ = (unsigned char) val & 0xff;
+                       saw_xdigit = 0;
+                       val = 0;
+                       continue;
+               }
+               /* a '.' means the current group starts a dotted-quad tail */
+               if (ch == '.' && ((tp + NS_INADDRSZ) <= endp) &&
+                   inet_pton4(curtok, tp) > 0) {
+                       tp += NS_INADDRSZ;
+                       saw_xdigit = 0;
+                       break;  /* '\0' was seen by inet_pton4(). */
+               }
+               return (0);
+       }
+       /* flush the final group, which has no ':' after it */
+       if (saw_xdigit) {
+               if (tp + NS_INT16SZ > endp)
+                       return (0);
+               *tp++ = (unsigned char) (val >> 8) & 0xff;
+               *tp++ = (unsigned char) val & 0xff;
+       }
+       if (colonp != NULL) {
+               /*
+                * Since some memmove()'s erroneously fail to handle
+                * overlapping regions, we'll do the shift by hand.
+                * Everything written after "::" moves to the end of tmp
+                * and the gap it leaves is zeroed.
+                */
+               const int n = tp - colonp;
+               int i;
+
+               for (i = 1; i <= n; i++) {
+                       endp[- i] = colonp[n - i];
+                       colonp[n - i] = 0;
+               }
+               tp = endp;
+       }
+       if (tp != endp)
+               return (0);
+       memcpy(dst, tmp, NS_IN6ADDRSZ);
+       return (1);
+}
+#endif
diff --git a/ctdb/lib/replace/install-sh b/ctdb/lib/replace/install-sh
new file mode 100755 (executable)
index 0000000..5871924
--- /dev/null
@@ -0,0 +1,238 @@
+#! /bin/sh
+#
+# install - install a program, script, or datafile
+# This comes from X11R5.
+#
+# Calling this script install-sh is preferred over install.sh, to prevent
+# `make' implicit rules from creating a file called install from it
+# when there is no Makefile.
+#
+# This script is compatible with the BSD install script, but was written
+# from scratch.
+#
+
+
+# set DOITPROG to echo to test this script
+
+# Don't use :- since 4.3BSD and earlier shells don't like it.
+doit="${DOITPROG-}"
+
+
+# put in absolute paths if you don't have them in your path; or use env. vars.
+
+mvprog="${MVPROG-mv}"
+cpprog="${CPPROG-cp}"
+chmodprog="${CHMODPROG-chmod}"
+chownprog="${CHOWNPROG-chown}"
+chgrpprog="${CHGRPPROG-chgrp}"
+stripprog="${STRIPPROG-strip}"
+rmprog="${RMPROG-rm}"
+mkdirprog="${MKDIRPROG-mkdir}"
+
+# Defaults for everything the option parser may override.  An empty
+# *cmd variable means "skip that step".
+transformbasename=""
+# This must be spelled "transformarg": that is the variable the -t= option
+# sets and the rename step tests.  Clearing it here keeps a value inherited
+# from the caller's environment from being applied as a sed expression.
+transformarg=""
+instcmd="$mvprog"
+chmodcmd="$chmodprog 0755"
+chowncmd=""
+chgrpcmd=""
+stripcmd=""
+rmcmd="$rmprog -f"
+mvcmd="$mvprog"
+src=""
+dst=""
+dir_arg=""
+
+# Parse the command line.  The loop ends at the first empty argument.
+#   -c        install with $cpprog instead of the default $mvprog
+#   -d        directory mode: the operand names a directory to create
+#   -m MODE   run "$chmodprog MODE" on the result
+#   -o OWNER  run "$chownprog OWNER" on the result
+#   -g GROUP  run "$chgrpprog GROUP" on the result
+#   -s        run $stripprog on the result
+#   -t=EXPR   sed expression later applied to the destination base name
+#   -b=SUFFIX suffix stripped before, and re-appended after, that sed step
+# The first remaining word becomes $src; every later one overwrites $dst,
+# so the last one wins.
+while [ x"$1" != x ]; do
+    case $1 in
+       -c) instcmd="$cpprog"
+           shift
+           continue;;
+
+       -d) dir_arg=true
+           shift
+           continue;;
+
+       -m) chmodcmd="$chmodprog $2"
+           shift
+           shift
+           continue;;
+
+       -o) chowncmd="$chownprog $2"
+           shift
+           shift
+           continue;;
+
+       -g) chgrpcmd="$chgrpprog $2"
+           shift
+           shift
+           continue;;
+
+       -s) stripcmd="$stripprog"
+           shift
+           continue;;
+
+       -t=*) transformarg=`echo $1 | sed 's/-t=//'`
+           shift
+           continue;;
+
+       -b=*) transformbasename=`echo $1 | sed 's/-b=//'`
+           shift
+           continue;;
+
+       *)  if [ x"$src" = x ]
+           then
+               src=$1
+           else
+               # this colon is to work around a 386BSD /bin/sh bug
+               :
+               dst=$1
+           fi
+           shift
+           continue;;
+    esac
+done
+
+# An operand is mandatory: the source file, or with -d the directory name.
+if [ x"$src" = x ]
+then
+       echo "install:  no input file specified"
+       exit 1
+else
+       true
+fi
+
+if [ x"$dir_arg" != x ]; then
+       # -d: the single operand is the directory to create, not a source.
+       dst=$src
+       src=""
+
+       if [ -d $dst ]; then
+               # Already there: make the "install" command a no-op.
+               instcmd=:
+       else
+               # Honour the MKDIRPROG override, exactly as the loop that
+               # creates the parent directories does.
+               instcmd=$mkdirprog
+       fi
+else
+
+# Waiting for this to be detected by the "$instcmd $src $dsttmp" command
+# might cause directories to be created, which would be especially bad
+# if $src (and thus $dsttmp) contains '*'.
+
+       if [ -f $src -o -d $src ]
+       then
+               true
+       else
+               echo "install:  $src does not exist"
+               exit 1
+       fi
+
+       if [ x"$dst" = x ]
+       then
+               echo "install:  no destination specified"
+               exit 1
+       else
+               true
+       fi
+
+# If destination is a directory, append the input filename; if your system
+# does not like double slashes in filenames, you may need to add some logic
+
+       if [ -d $dst ]
+       then
+               dst="$dst"/`basename $src`
+       else
+               true
+       fi
+fi
+
+## this sed command emulates the dirname command
+## (drop the last path component, then one trailing slash; an empty
+## result becomes ".")
+dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'`
+
+# Make sure that the destination directory exists.
+#  this part is taken from Noah Friedman's mkinstalldirs script
+
+# Skip lots of stat calls in the usual case.
+if [ ! -d "$dstdir" ]; then
+# Fallback whitespace separators, used only when IFS is unset.
+defaultIFS='   
+'
+IFS="${IFS-${defaultIFS}}"
+
+oIFS="${IFS}"
+# Some sh's can't handle IFS=/ for some reason.
+# So every '/' is rewritten to '%' (a leading one is turned back into '/')
+# and the path is split on '%' into the positional parameters.
+IFS='%'
+set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'`
+IFS="${oIFS}"
+
+pathcomp=''
+
+# Rebuild the path one component at a time, creating each missing prefix.
+while [ $# -ne 0 ] ; do
+       pathcomp="${pathcomp}${1}"
+       shift
+
+       if [ ! -d "${pathcomp}" ] ;
+        then
+               $mkdirprog "${pathcomp}"
+       else
+               true
+       fi
+
+       pathcomp="${pathcomp}/"
+done
+fi
+
+# Final step.  With -d, run $instcmd on the directory and then apply the
+# requested owner/group/strip/mode commands to it.  Otherwise install $src
+# under a temporary name inside $dstdir, apply the same commands to the
+# temporary file, and only then rename it to $dstdir/$dstfile.
+# Everything is one && list ending in "exit 0", so "exit 0" is reached only
+# when every step succeeded.
+if [ x"$dir_arg" != x ]
+then
+       $doit $instcmd $dst &&
+
+       if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi &&
+       if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi &&
+       if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi &&
+       if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi
+else
+
+# If we're going to rename the final executable, determine the name now.
+# Without a -t= expression the name is simply the base name of $dst.
+
+       if [ x"$transformarg" = x ] 
+       then
+               dstfile=`basename $dst`
+       else
+               dstfile=`basename $dst $transformbasename | 
+                       sed $transformarg`$transformbasename
+       fi
+
+# don't allow the sed command to completely eliminate the filename
+
+       if [ x"$dstfile" = x ] 
+       then
+               dstfile=`basename $dst`
+       else
+               true
+       fi
+
+# Make a temp file name in the proper directory.
+# ($$ is the shell's process id, so concurrent runs use different names.)
+
+       dsttmp=$dstdir/#inst.$$#
+
+# Move or copy the file name to the temp name
+
+       $doit $instcmd $src $dsttmp &&
+
+# From here on, remove the temporary file whenever the script exits.
+
+       trap "rm -f ${dsttmp}" 0 &&
+
+# and set any options; do chmod last to preserve setuid bits
+
+# If any of these fail, we abort the whole thing.  If we want to
+# ignore errors from any of these, just make sure not to ignore
+# errors from the above "$doit $instcmd $src $dsttmp" command.
+
+       if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi &&
+       if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi &&
+       if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi &&
+       if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi &&
+
+# Now rename the file to the real destination.
+
+       $doit $rmcmd -f $dstdir/$dstfile &&
+       $doit $mvcmd $dsttmp $dstdir/$dstfile 
+
+fi &&
+
+
+exit 0
diff --git a/ctdb/lib/replace/libreplace.m4 b/ctdb/lib/replace/libreplace.m4
new file mode 100644 (file)
index 0000000..f4ae715
--- /dev/null
@@ -0,0 +1,490 @@
+dnl AC_LIBREPLACE_LOCATION_CHECKS
+dnl Locate the libreplace source directory, substitute it as libreplacedir
+dnl together with the replace.o path in LIBREPLACEOBJ, and run the
+dnl canonical build, host and target checks.
+AC_DEFUN_ONCE(AC_LIBREPLACE_LOCATION_CHECKS,
+[
+echo "LIBREPLACE_LOCATION_CHECKS: START"
+
+dnl find the libreplace sources. This is meant to work both for 
+dnl libreplace standalone builds, and builds of packages using libreplace
+libreplacedir=""
+libreplacepaths="$srcdir $srcdir/lib/replace $srcdir/libreplace $srcdir/../libreplace $srcdir/../replace $srcdir/../lib/replace $srcdir/../../../lib/replace"
+dnl the first candidate directory that contains replace.c wins
+for d in $libreplacepaths; do
+       if test -f "$d/replace.c"; then
+               libreplacedir="$d"              
+               AC_SUBST(libreplacedir)
+               break;
+       fi
+done
+dnl configure cannot continue without the libreplace sources
+if test x"$libreplacedir" = "x"; then
+       AC_MSG_ERROR([cannot find libreplace in $libreplacepaths])
+fi
+LIBREPLACEOBJ="$libreplacedir/replace.o"
+AC_SUBST(LIBREPLACEOBJ)
+
+AC_CANONICAL_BUILD
+AC_CANONICAL_HOST
+AC_CANONICAL_TARGET
+
+echo "LIBREPLACE_LOCATION_CHECKS: END"
+]) dnl end AC_LIBREPLACE_LOCATION_CHECKS
+
+
+AC_DEFUN_ONCE(AC_LIBREPLACE_BROKEN_CHECKS,
+[
+echo "LIBREPLACE_BROKEN_CHECKS: START"
+
+dnl find the libreplace sources. This is meant to work both for 
+dnl libreplace standalone builds, and builds of packages using libreplace
+libreplacedir=""
+libreplacepaths="$srcdir $srcdir/lib/replace $srcdir/libreplace $srcdir/../libreplace $srcdir/../replace $srcdir/../lib/replace $srcdir/../../../lib/replace"
+for d in $libreplacepaths; do
+       if test -f "$d/replace.c"; then
+               libreplacedir="$d"              
+               AC_SUBST(libreplacedir)
+               break;
+       fi
+done
+if test x"$libreplacedir" = "x"; then
+       AC_MSG_ERROR([cannot find libreplace in $libreplacepaths])
+fi
+
+LIBREPLACEOBJ="$libreplacedir/replace.o"
+AC_SUBST(LIBREPLACEOBJ)
+
+LIBREPLACEOBJ="${LIBREPLACEOBJ} $libreplacedir/snprintf.o  $libreplacedir/xattr.o"
+
+AC_TYPE_UID_T
+AC_TYPE_MODE_T
+AC_TYPE_OFF_T
+AC_TYPE_SIZE_T
+AC_TYPE_PID_T
+AC_STRUCT_ST_RDEV
+AC_CHECK_TYPE(ino_t,unsigned)
+AC_CHECK_TYPE(loff_t,off_t)
+AC_CHECK_TYPE(offset_t,loff_t)
+AC_CHECK_TYPE(blksize_t,long)
+AC_CHECK_TYPE(blkcnt_t,long)
+
+AC_FUNC_MEMCMP
+
+AC_CHECK_FUNCS([pipe strftime srandom random srand rand usleep setbuffer lstat getpgrp utime utimes])
+
+AC_CHECK_HEADERS(stdbool.h stdint.h sys/select.h)
+AC_CHECK_HEADERS(setjmp.h utime.h sys/wait.h)
+
+LIBREPLACE_PROVIDE_HEADER([stdint.h])
+LIBREPLACE_PROVIDE_HEADER([stdbool.h])
+
+AC_DEFINE(HAVE_LIBREPLACE, 1, [We have libreplace])
+
+AC_CHECK_TYPE(bool, 
+[AC_DEFINE(HAVE_BOOL, 1, [Whether the bool type is available])],,
+[
+AC_INCLUDES_DEFAULT
+#ifdef HAVE_STDBOOL_H
+#include <stdbool.h>
+#endif]
+)
+
+AC_CHECK_TYPE(_Bool, 
+[AC_DEFINE(HAVE__Bool, 1, [Whether the _Bool type is available])],,
+[
+AC_INCLUDES_DEFAULT
+#ifdef HAVE_STDBOOL_H
+#include <stdbool.h>
+#endif]
+)
+
+AC_CHECK_HEADERS(linux/types.h)
+
+AC_CACHE_CHECK([for working mmap],libreplace_cv_HAVE_MMAP,[
+AC_TRY_RUN([#include "$libreplacedir/test/shared_mmap.c"],
+           libreplace_cv_HAVE_MMAP=yes,libreplace_cv_HAVE_MMAP=no,libreplace_cv_HAVE_MMAP=cross)])
+if test x"$libreplace_cv_HAVE_MMAP" = x"yes"; then
+    AC_DEFINE(HAVE_MMAP,1,[Whether mmap works])
+fi
+
+AC_CACHE_CHECK([for working mremap],libreplace_cv_HAVE_MREMAP,[
+AC_TRY_RUN([#include "$libreplacedir/test/shared_mremap.c"],
+           libreplace_cv_HAVE_MREMAP=yes,libreplace_cv_HAVE_MREMAP=no,libreplace_cv_HAVE_MREMAP=cross)])
+if test x"$libreplace_cv_HAVE_MREMAP" = x"yes"; then
+    AC_DEFINE(HAVE_MREMAP,1,[Whether mremap works])
+fi
+
+AC_CACHE_CHECK([for incoherent mmap],libreplace_cv_HAVE_INCOHERENT_MMAP,[
+AC_TRY_RUN([#include "$libreplacedir/test/incoherent_mmap.c"],
+           libreplace_cv_HAVE_INCOHERENT_MMAP=yes,libreplace_cv_HAVE_INCOHERENT_MMAP=no,libreplace_cv_HAVE_INCOHERENT_MMAP=cross)])
+if test x"$libreplace_cv_HAVE_INCOHERENT_MMAP" = x"yes"; then
+    AC_DEFINE(HAVE_INCOHERENT_MMAP,1,[Whether mmap is incoherent against write])
+fi
+
+
+AC_CHECK_HEADERS(sys/syslog.h syslog.h)
+AC_CHECK_HEADERS(sys/time.h time.h)
+AC_CHECK_HEADERS(stdarg.h vararg.h)
+AC_CHECK_HEADERS(sys/mount.h mntent.h)
+AC_CHECK_HEADERS(stropts.h)
+AC_CHECK_HEADERS(unix.h)
+AC_CHECK_HEADERS(malloc.h)
+AC_CHECK_HEADERS(syscall.h)
+AC_CHECK_HEADERS(sys/syscall.h)
+AC_CHECK_HEADERS(sys/ucontext.h)
+
+AC_CHECK_FUNCS(syscall setuid seteuid setreuid setresuid setgid setegid setregid setresgid setgroups)
+AC_CHECK_FUNCS(chroot bzero strerror strerror_r memalign posix_memalign getpagesize)
+AC_CHECK_FUNCS(vsyslog setlinebuf mktime ftruncate chsize rename)
+AC_CHECK_FUNCS(waitpid wait4 strlcpy strlcat initgroups memmove strdup)
+AC_CHECK_FUNCS(pread pwrite strndup strcasestr strtok_r mkdtemp dup2 dprintf vdprintf)
+AC_CHECK_FUNCS(isatty chown lchown link readlink symlink realpath)
+AC_CHECK_FUNCS(fdatasync,,[
+       # if we didn't find it, look in librt (Solaris hides it there...)
+       AC_CHECK_LIB(rt, fdatasync,
+               [libreplace_cv_HAVE_FDATASYNC_IN_LIBRT=yes
+               AC_DEFINE(HAVE_FDATASYNC, 1, Define to 1 if there is support for fdatasync)])
+])
+AC_HAVE_DECL(fdatasync, [#include <unistd.h>])
+AC_CHECK_FUNCS(clock_gettime,libreplace_cv_have_clock_gettime=yes,[
+       AC_CHECK_LIB(rt, clock_gettime,
+               [libreplace_cv_HAVE_CLOCK_GETTIME_IN_LIBRT=yes
+               libreplace_cv_have_clock_gettime=yes
+               AC_DEFINE(HAVE_CLOCK_GETTIME, 1, Define to 1 if there is support for clock_gettime)])
+])
+
+AC_CHECK_HEADERS(sys/attributes.h attr/xattr.h sys/xattr.h sys/extattr.h sys/uio.h)
+AC_CHECK_HEADERS(sys/ea.h sys/proplist.h)
+
+LIBREPLACE_FILESYS_LIBS=""
+
+############################################
+# Check for EA implementations
+case "$host_os" in
+  *freebsd4* | *dragonfly* )
+       AC_DEFINE(BROKEN_EXTATTR, 1, [Does extattr API work])
+  ;;
+  *)
+       AC_SEARCH_LIBS(getxattr, [attr])
+       AC_CHECK_FUNCS(attr_get attr_getf attr_list attr_listf attropen attr_remove)
+       AC_CHECK_FUNCS(attr_removef attr_set attr_setf extattr_delete_fd extattr_delete_file)
+       AC_CHECK_FUNCS(extattr_get_fd extattr_get_file extattr_list_fd extattr_list_file)
+       AC_CHECK_FUNCS(extattr_set_fd extattr_set_file fgetea fgetxattr flistea flistxattr)
+       AC_CHECK_FUNCS(fremoveea fremovexattr fsetea fsetxattr getea getxattr listea)
+       AC_CHECK_FUNCS(listxattr removeea removexattr setea setxattr)
+
+       AC_CHECK_LIB_EXT(attr, LIBREPLACE_FILESYS_LIBS, flistea)
+       AC_CHECK_LIB_EXT(attr, LIBREPLACE_FILESYS_LIBS, flistxattr)
+       AC_CHECK_LIB_EXT(attr, LIBREPLACE_FILESYS_LIBS, attr_listf)
+       AC_CHECK_LIB_EXT(attr, LIBREPLACE_FILESYS_LIBS, extattr_list_fd)
+
+  ;;
+esac
+
+
+########################################################
+# Do xattr functions take additional options like on Darwin?
+if test x"$ac_cv_func_getxattr" = x"yes" ; then
+       AC_CACHE_CHECK([whether xattr interface takes additional options], smb_attr_cv_xattr_add_opt, [
+               old_LIBS=$LIBS
+               LIBS="$LIBS $LIBREPLACE_FILESYS_LIBS"
+               AC_TRY_COMPILE([
+                       #include <sys/types.h>
+                       #if HAVE_ATTR_XATTR_H
+                       #include <attr/xattr.h>
+                       #elif HAVE_SYS_XATTR_H
+                       #include <sys/xattr.h>
+                       #endif
+               ],[
+                       getxattr(0, 0, 0, 0, 0, 0);
+               ],
+               [smb_attr_cv_xattr_add_opt=yes],
+               [smb_attr_cv_xattr_add_opt=no;LIBS=$old_LIBS])
+       ])
+       if test x"$smb_attr_cv_xattr_add_opt" = x"yes"; then
+               AC_DEFINE(XATTR_ADDITIONAL_OPTIONS, 1, [xattr functions have additional options])
+       fi
+fi
+
+AC_CHECK_FUNCS(get_current_dir_name)
+AC_HAVE_DECL(setresuid, [#include <unistd.h>])
+AC_HAVE_DECL(setresgid, [#include <unistd.h>])
+AC_HAVE_DECL(errno, [#include <errno.h>])
+
+AC_CACHE_CHECK([for secure mkstemp],libreplace_cv_HAVE_SECURE_MKSTEMP,[
+AC_TRY_RUN([#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+main() { 
+  struct stat st;
+  char tpl[20]="/tmp/test.XXXXXX";
+  char tpl2[20]="/tmp/test.XXXXXX";
+  int fd = mkstemp(tpl);
+  int fd2 = mkstemp(tpl2);
+  if (fd == -1) {
+        if (fd2 != -1) {
+                unlink(tpl2);
+        }
+        exit(1);
+  }
+  if (fd2 == -1) exit(1);
+  unlink(tpl);
+  unlink(tpl2);
+  if (fstat(fd, &st) != 0) exit(1);
+  if ((st.st_mode & 0777) != 0600) exit(1);
+  if (strcmp(tpl, "/tmp/test.XXXXXX") == 0) {
+        exit(1);
+  }
+  if (strcmp(tpl, tpl2) == 0) {
+        exit(1);
+  }
+  exit(0);
+}],
+libreplace_cv_HAVE_SECURE_MKSTEMP=yes,
+libreplace_cv_HAVE_SECURE_MKSTEMP=no,
+libreplace_cv_HAVE_SECURE_MKSTEMP=cross)])
+if test x"$libreplace_cv_HAVE_SECURE_MKSTEMP" = x"yes"; then
+    AC_DEFINE(HAVE_SECURE_MKSTEMP,1,[Whether mkstemp is secure])
+fi
+
+dnl Provided by snprintf.c:
+AC_CHECK_HEADERS(stdio.h strings.h)
+AC_CHECK_DECLS([snprintf, vsnprintf, asprintf, vasprintf])
+AC_CHECK_FUNCS(snprintf vsnprintf asprintf vasprintf)
+
+AC_CACHE_CHECK([for C99 vsnprintf],libreplace_cv_HAVE_C99_VSNPRINTF,[
+AC_TRY_RUN([
+#include <sys/types.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+void foo(const char *format, ...) { 
+       va_list ap;
+       int len;
+       char buf[20];
+       long long l = 1234567890;
+       l *= 100;
+
+       va_start(ap, format);
+       len = vsnprintf(buf, 0, format, ap);
+       va_end(ap);
+       if (len != 5) exit(1);
+
+       va_start(ap, format);
+       len = vsnprintf(0, 0, format, ap);
+       va_end(ap);
+       if (len != 5) exit(2);
+
+       if (snprintf(buf, 3, "hello") != 5 || strcmp(buf, "he") != 0) exit(3);
+
+       if (snprintf(buf, 20, "%lld", l) != 12 || strcmp(buf, "123456789000") != 0) exit(4);
+       if (snprintf(buf, 20, "%zu", 123456789) != 9 || strcmp(buf, "123456789") != 0) exit(5);
+       if (snprintf(buf, 20, "%2\$d %1\$d", 3, 4) != 3 || strcmp(buf, "4 3") != 0) exit(6);
+       if (snprintf(buf, 20, "%s", 0) < 3) exit(7);
+
+       exit(0);
+}
+main() { foo("hello"); }
+],
+libreplace_cv_HAVE_C99_VSNPRINTF=yes,libreplace_cv_HAVE_C99_VSNPRINTF=no,libreplace_cv_HAVE_C99_VSNPRINTF=cross)])
+if test x"$libreplace_cv_HAVE_C99_VSNPRINTF" = x"yes"; then
+    AC_DEFINE(HAVE_C99_VSNPRINTF,1,[Whether there is a C99 compliant vsnprintf])
+fi
+
+
+dnl VA_COPY
+AC_CACHE_CHECK([for va_copy],libreplace_cv_HAVE_VA_COPY,[
+AC_TRY_LINK([#include <stdarg.h>
+va_list ap1,ap2;], [va_copy(ap1,ap2);],
+libreplace_cv_HAVE_VA_COPY=yes,libreplace_cv_HAVE_VA_COPY=no)])
+if test x"$libreplace_cv_HAVE_VA_COPY" = x"yes"; then
+    AC_DEFINE(HAVE_VA_COPY,1,[Whether va_copy() is available])
+fi
+
+if test x"$libreplace_cv_HAVE_VA_COPY" != x"yes"; then
+AC_CACHE_CHECK([for __va_copy],libreplace_cv_HAVE___VA_COPY,[
+AC_TRY_LINK([#include <stdarg.h>
+va_list ap1,ap2;], [__va_copy(ap1,ap2);],
+libreplace_cv_HAVE___VA_COPY=yes,libreplace_cv_HAVE___VA_COPY=no)])
+if test x"$libreplace_cv_HAVE___VA_COPY" = x"yes"; then
+    AC_DEFINE(HAVE___VA_COPY,1,[Whether __va_copy() is available])
+fi
+fi
+
+dnl __FUNCTION__ macro
+AC_CACHE_CHECK([for __FUNCTION__ macro],libreplace_cv_HAVE_FUNCTION_MACRO,[
+AC_TRY_COMPILE([#include <stdio.h>], [printf("%s\n", __FUNCTION__);],
+libreplace_cv_HAVE_FUNCTION_MACRO=yes,libreplace_cv_HAVE_FUNCTION_MACRO=no)])
+if test x"$libreplace_cv_HAVE_FUNCTION_MACRO" = x"yes"; then
+    AC_DEFINE(HAVE_FUNCTION_MACRO,1,[Whether there is a __FUNCTION__ macro])
+else
+    dnl __func__ macro
+    AC_CACHE_CHECK([for __func__ macro],libreplace_cv_HAVE_func_MACRO,[
+    AC_TRY_COMPILE([#include <stdio.h>], [printf("%s\n", __func__);],
+    libreplace_cv_HAVE_func_MACRO=yes,libreplace_cv_HAVE_func_MACRO=no)])
+    if test x"$libreplace_cv_HAVE_func_MACRO" = x"yes"; then
+       AC_DEFINE(HAVE_func_MACRO,1,[Whether there is a __func__ macro])
+    fi
+fi
+
+AC_CHECK_HEADERS([sys/param.h limits.h])
+
+AC_CHECK_TYPE(comparison_fn_t, 
+[AC_DEFINE(HAVE_COMPARISON_FN_T, 1,[Whether or not we have comparison_fn_t])])
+
+AC_HAVE_DECL(setenv, [#include <stdlib.h>])
+AC_CHECK_FUNCS(setenv unsetenv)
+AC_HAVE_DECL(environ, [#include <unistd.h>])
+
+AC_CHECK_FUNCS(strnlen)
+AC_CHECK_FUNCS(strtoull __strtoull strtouq strtoll __strtoll strtoq)
+
+AC_CHECK_FUNCS(memmem)
+
+# NOTE: this test is still run, although we don't actually need __VA_ARGS__ yet
+AC_TRY_CPP([
+#define eprintf(...) fprintf(stderr, __VA_ARGS__)
+eprintf("bla", "bar");
+], AC_DEFINE(HAVE__VA_ARGS__MACRO, 1, [Whether the __VA_ARGS__ macro is available]))
+
+
+AC_CACHE_CHECK([for sig_atomic_t type],libreplace_cv_sig_atomic_t, [
+    AC_TRY_COMPILE([
+#include <sys/types.h>
+#if STDC_HEADERS
+#include <stdlib.h>
+#include <stddef.h>
+#endif
+#include <signal.h>],[sig_atomic_t i = 0],
+       libreplace_cv_sig_atomic_t=yes,libreplace_cv_sig_atomic_t=no)])
+if test x"$libreplace_cv_sig_atomic_t" = x"yes"; then
+   AC_DEFINE(HAVE_SIG_ATOMIC_T_TYPE,1,[Whether we have the atomic_t variable type])
+fi
+
+
+dnl Check if the C compiler understands volatile (it should, being ANSI).
+AC_CACHE_CHECK([that the C compiler understands volatile],libreplace_cv_volatile, [
+       AC_TRY_COMPILE([#include <sys/types.h>],[volatile int i = 0],
+               libreplace_cv_volatile=yes,libreplace_cv_volatile=no)])
+if test x"$libreplace_cv_volatile" = x"yes"; then
+       AC_DEFINE(HAVE_VOLATILE, 1, [Whether the C compiler understands volatile])
+fi
+
+m4_include(system/config.m4)
+
+AC_CACHE_CHECK([for O_DIRECT flag to open(2)],libreplace_cv_HAVE_OPEN_O_DIRECT,[
+AC_TRY_COMPILE([
+#include <unistd.h>
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>
+#endif],
+[int fd = open("/dev/null", O_DIRECT);],
+libreplace_cv_HAVE_OPEN_O_DIRECT=yes,libreplace_cv_HAVE_OPEN_O_DIRECT=no)])
+if test x"$libreplace_cv_HAVE_OPEN_O_DIRECT" = x"yes"; then
+    AC_DEFINE(HAVE_OPEN_O_DIRECT,1,[Whether the open(2) accepts O_DIRECT])
+fi
+
+m4_include(dlfcn.m4)
+m4_include(strptime.m4)
+m4_include(win32.m4)
+m4_include(timegm.m4)
+m4_include(repdir.m4)
+m4_include(crypt.m4)
+
+if test x$libreplace_cv_have_clock_gettime = xyes ; then
+       SMB_CHECK_CLOCK_ID(CLOCK_MONOTONIC)
+       SMB_CHECK_CLOCK_ID(CLOCK_PROCESS_CPUTIME_ID)
+       SMB_CHECK_CLOCK_ID(CLOCK_REALTIME)
+fi
+
+AC_CACHE_CHECK([for struct timespec type],libreplace_cv_struct_timespec, [
+    AC_TRY_COMPILE([
+#include <sys/types.h>
+#if STDC_HEADERS
+#include <stdlib.h>
+#include <stddef.h>
+#endif
+#if TIME_WITH_SYS_TIME
+# include <sys/time.h>
+# include <time.h>
+#else
+# if HAVE_SYS_TIME_H
+#  include <sys/time.h>
+# else
+#  include <time.h>
+# endif
+#endif
+],[struct timespec ts;],
+       libreplace_cv_struct_timespec=yes,libreplace_cv_struct_timespec=no)])
+if test x"$libreplace_cv_struct_timespec" = x"yes"; then
+   AC_DEFINE(HAVE_STRUCT_TIMESPEC,1,[Whether we have struct timespec])
+fi
+
+AC_CACHE_CHECK([for ucontext_t type],libreplace_cv_ucontext_t, [
+    AC_TRY_COMPILE([
+#include <signal.h>
+#if HAVE_SYS_UCONTEXT_H
+#include <sys/ucontext.h>
+# endif
+],[ucontext_t uc; sigaddset(&uc.uc_sigmask, SIGUSR1);],
+       libreplace_cv_ucontext_t=yes,libreplace_cv_ucontext_t=no)])
+if test x"$libreplace_cv_ucontext_t" = x"yes"; then
+   AC_DEFINE(HAVE_UCONTEXT_T,1,[Whether we have ucontext_t])
+fi
+
+AC_CHECK_FUNCS([printf memset memcpy],,[AC_MSG_ERROR([Required function not found])])
+
+echo "LIBREPLACE_BROKEN_CHECKS: END"
+]) dnl end AC_LIBREPLACE_BROKEN_CHECKS
+
+dnl Marker macros expanded at the start and end of AC_LIBREPLACE_ALL_CHECKS.
+dnl Each body is only a shell comment in the generated script; the trailing
+dnl double quote is part of that comment text and has no effect.
+AC_DEFUN_ONCE(AC__LIBREPLACE_ALL_CHECKS_START,
+[
+#LIBREPLACE_ALL_CHECKS: START"
+])
+AC_DEFUN_ONCE(AC__LIBREPLACE_ALL_CHECKS_END,
+[
+#LIBREPLACE_ALL_CHECKS: END"
+])
+dnl AC_LIBREPLACE_ALL_CHECKS
+dnl Umbrella macro: expands the location, compiler and "broken function"
+dnl check groups in that order, bracketed by the START/END markers, and then
+dnl appends -I$libreplacedir to CFLAGS.
+m4_define(AC_LIBREPLACE_ALL_CHECKS,
+[
+AC__LIBREPLACE_ALL_CHECKS_START
+AC_LIBREPLACE_LOCATION_CHECKS
+AC_LIBREPLACE_CC_CHECKS
+AC_LIBREPLACE_BROKEN_CHECKS
+AC__LIBREPLACE_ALL_CHECKS_END
+CFLAGS="$CFLAGS -I$libreplacedir"
+])
+
+m4_include(libreplace_cc.m4)
+m4_include(libreplace_ld.m4)
+m4_include(libreplace_network.m4)
+m4_include(libreplace_macros.m4)
+
+
+dnl SMB_CHECK_CLOCK_ID(clockid)
+dnl Test whether the specified clock_gettime clock ID is available. If it
+dnl is, we define HAVE_clockid (for example HAVE_CLOCK_MONOTONIC).
+dnl The probe includes the time headers selected by TIME_WITH_SYS_TIME /
+dnl HAVE_SYS_TIME_H and tries to link a program that assigns the ID to a
+dnl clockid_t variable.  The result is reported but not cached.
+AC_DEFUN([SMB_CHECK_CLOCK_ID],
+[
+    AC_MSG_CHECKING(for $1)
+    AC_TRY_LINK([
+#if TIME_WITH_SYS_TIME
+# include <sys/time.h>
+# include <time.h>
+#else
+# if HAVE_SYS_TIME_H
+#  include <sys/time.h>
+# else
+#  include <time.h>
+# endif
+#endif
+    ],
+    [
+clockid_t clk = $1;
+    ],
+    [
+       AC_MSG_RESULT(yes)
+       AC_DEFINE(HAVE_$1, 1,
+           [Whether the clock_gettime clock ID $1 is available])
+    ],
+    [
+       AC_MSG_RESULT(no)
+    ])
+])
+m4_ifndef([AC_USE_SYSTEM_EXTENSIONS],[m4_include(autoconf-2.60.m4)])
diff --git a/ctdb/lib/replace/libreplace_cc.m4 b/ctdb/lib/replace/libreplace_cc.m4
new file mode 100644 (file)
index 0000000..50cb735
--- /dev/null
@@ -0,0 +1,197 @@
+
+dnl Marker macros expanded at the start and end of AC_LIBREPLACE_CC_CHECKS.
+dnl Each one echoes a banner line when the generated configure script runs.
+AC_DEFUN_ONCE(AC__LIBREPLACE_ONLY_CC_CHECKS_START,
+[
+echo "LIBREPLACE_CC_CHECKS: START"
+])
+
+AC_DEFUN_ONCE(AC__LIBREPLACE_ONLY_CC_CHECKS_END,
+[
+echo "LIBREPLACE_CC_CHECKS: END"
+])
+
+dnl
+dnl AC_LIBREPLACE_CC_CHECKS
+dnl
+dnl Compiler-level checks for libreplace: selects the C compiler, probes C99
+dnl designated initializers, adds large-file-support flags, applies per-OS
+dnl workarounds keyed on host_os, and checks for the fixed-width integer
+dnl types and their sizes.  Aborts configure if no 32-bit type or no
+dnl long long of at least 8 bytes is found.
+dnl
+dnl Note: we need to use m4_define instead of AC_DEFUN because
+dnl       of the ordering of tests
+dnl
+m4_define(AC_LIBREPLACE_CC_CHECKS,
+[
+AC__LIBREPLACE_ONLY_CC_CHECKS_START
+
+dnl stop the C89 attempt by autoconf - if autoconf detects -Ae it will enable it
+dnl which conflicts with C99 on HPUX
+ac_cv_prog_cc_Ae=no
+
+savedCFLAGS=$CFLAGS
+AC_PROG_CC
+CFLAGS=$savedCFLAGS
+
+dnl don't try for C99 if we are using gcc, as otherwise we
+dnl lose immediate structure constants
+if test x"$GCC" != x"yes" ; then
+AC_PROG_CC_C99
+fi
+
+if test x"$GCC" = x"yes" ; then
+       AC_MSG_CHECKING([for version of gcc])
+       GCC_VERSION=`$CC -dumpversion`
+       AC_MSG_RESULT(${GCC_VERSION})
+fi
+AC_USE_SYSTEM_EXTENSIONS
+AC_C_BIGENDIAN
+AC_C_INLINE
+LIBREPLACE_C99_STRUCT_INIT([],[AC_MSG_WARN([c99 structure initializer are not supported])])
+
+AC_PROG_INSTALL
+
+AC_ISC_POSIX
+AC_N_DEFINE(_XOPEN_SOURCE_EXTENDED)
+
+AC_MSG_CHECKING(checking getconf LFS_CFLAGS for large file support flags)
+LFS_CFLAGS=`(getconf LFS_CFLAGS) 2>/dev/null` || LFS_CFLAGS=""
+
+AC_MSG_RESULT(${LFS_CFLAGS})
+dnl Only append the getconf output when it is neither of the two sentinel
+dnl values; both conditions must hold, so they are joined with a logical AND
+dnl (an OR of two inequalities against different constants is always true).
+if test "x$LFS_CFLAGS" != 'x-1' && test "x$LFS_CFLAGS" != "xundefined"; then
+   CFLAGS="$CFLAGS $LFS_CFLAGS"
+fi
+
+AC_SYS_LARGEFILE
+
+dnl Add #include for broken IRIX header files
+case "$host_os" in
+       *irix6*) AC_ADD_INCLUDE(<standards.h>)
+               AC_N_DEFINE(_XOPEN_SOURCE,600)
+               AC_N_DEFINE(_BSD_TYPES)
+               ;;
+       *hpux*)
+               # mmap on HPUX is completely broken...
+               AC_DEFINE(MMAP_BLACKLIST, 1, [Whether MMAP is broken])
+               if test "`uname -r`" = "B.11.00" -o "`uname -r`" = "B.11.11"; then
+                       AC_MSG_WARN([Enabling HPUX 11.00/11.11 header bug workaround])
+                       CFLAGS="$CFLAGS -Dpread=pread64 -Dpwrite=pwrite64"
+               fi
+               if test "`uname -r`" = "B.11.23"; then
+                       AC_MSG_WARN([Enabling HPUX 11.23 machine/sys/getppdp.h bug workaround])
+                       CFLAGS="$CFLAGS -D_MACHINE_SYS_GETPPDP_INCLUDED"
+               fi
+               ;;
+       *aix*)
+               AC_DEFINE(BROKEN_STRNDUP, 1, [Whether strndup is broken])
+               AC_DEFINE(BROKEN_STRNLEN, 1, [Whether strnlen is broken])
+               if test "${GCC}" != "yes"; then
+                       ## for funky AIX compiler using strncpy()
+                       CFLAGS="$CFLAGS -D_LINUX_SOURCE_COMPAT -qmaxmem=32000"
+               fi
+               ;;
+       *osf*)
+               # this brings in socklen_t
+               AC_N_DEFINE(_XOPEN_SOURCE,600)
+               AC_N_DEFINE(_OSF_SOURCE)
+               ;;
+       #
+       # VOS may need to have POSIX support and System V compatibility enabled.
+       #
+       *vos*)
+               case "$CFLAGS" in
+                       *-D_POSIX_C_SOURCE*);;
+                       *)
+                               CFLAGS="$CFLAGS -D_POSIX_C_SOURCE=200112L"
+                               AC_DEFINE(_POSIX_C_SOURCE, 200112L, [Whether to enable POSIX support])
+                               ;;
+               esac
+               case "$CFLAGS" in
+                       *-D_SYSV*|*-D_SVID_SOURCE*);;
+                       *)
+                               CFLAGS="$CFLAGS -D_SYSV"
+                               AC_DEFINE(_SYSV, 1, [Whether to enable System V compatibility])
+                               ;;
+               esac
+               ;;
+esac
+
+# Do not check for standards.h on darwin, we get nasty warnings on
+# OS/X Lion. Probably a positive-list of OS'es like IRIX and AIX
+# would be the better choice, but this seems to work fine
+
+case "$host_os" in
+     *darwin*)
+       ;;
+     *)
+        AC_CHECK_HEADERS([standards.h])
+       ;;
+esac
+
+# Solaris needs HAVE_LONG_LONG defined
+AC_CHECK_TYPES(long long)
+
+AC_CHECK_SIZEOF(int)
+AC_CHECK_SIZEOF(char)
+AC_CHECK_SIZEOF(short)
+AC_CHECK_SIZEOF(long)
+AC_CHECK_SIZEOF(long long)
+
+AC_CHECK_TYPE(int8_t, char)
+AC_CHECK_TYPE(uint8_t, unsigned char)
+AC_CHECK_TYPE(int16_t, short)
+AC_CHECK_TYPE(uint16_t, unsigned short)
+
+dnl Pick the 32-bit base type from the sizes measured above.  The fallback
+dnl branch must read ac_cv_sizeof_long, the cache variable filled in by the
+dnl sizeof check for long.
+if test $ac_cv_sizeof_int -eq 4 ; then
+AC_CHECK_TYPE(int32_t, int)
+AC_CHECK_TYPE(uint32_t, unsigned int)
+elif test $ac_cv_sizeof_long -eq 4 ; then
+AC_CHECK_TYPE(int32_t, long)
+AC_CHECK_TYPE(uint32_t, unsigned long)
+else
+AC_MSG_ERROR([LIBREPLACE no 32-bit type found])
+fi
+
+AC_CHECK_TYPE(int64_t, long long)
+AC_CHECK_TYPE(uint64_t, unsigned long long)
+
+AC_CHECK_TYPE(size_t, unsigned int)
+AC_CHECK_TYPE(ssize_t, int)
+
+AC_CHECK_SIZEOF(off_t)
+AC_CHECK_SIZEOF(size_t)
+AC_CHECK_SIZEOF(ssize_t)
+
+AC_CHECK_TYPES([intptr_t, uintptr_t, ptrdiff_t])
+
+if test x"$ac_cv_type_long_long" != x"yes";then
+       AC_MSG_ERROR([LIBREPLACE needs type 'long long'])
+fi
+if test $ac_cv_sizeof_long_long -lt 8;then
+       AC_MSG_ERROR([LIBREPLACE needs sizeof(long long) >= 8])
+fi
+
+############################################
+# check if the compiler can do immediate structures
+AC_SUBST(libreplace_cv_immediate_structures)
+AC_CACHE_CHECK([for immediate structures],libreplace_cv_immediate_structures,[
+       AC_TRY_COMPILE([
+               #include <stdio.h>
+       ],[
+               typedef struct {unsigned x;} FOOBAR;
+               #define X_FOOBAR(x) ((FOOBAR) { x })
+               #define FOO_ONE X_FOOBAR(1)
+               FOOBAR f = FOO_ONE;
+               static const struct {
+                       FOOBAR y;
+               } f2[] = {
+                       {FOO_ONE}
+               };
+               static const FOOBAR f3[] = {FOO_ONE};
+       ],
+       libreplace_cv_immediate_structures=yes,
+       libreplace_cv_immediate_structures=no,
+       libreplace_cv_immediate_structures=cross)
+])
+if test x"$libreplace_cv_immediate_structures" = x"yes"; then
+       AC_DEFINE(HAVE_IMMEDIATE_STRUCTURES,1,[Whether the compiler supports immediate structures])
+fi
+
+AC__LIBREPLACE_ONLY_CC_CHECKS_END
+]) dnl end AC_LIBREPLACE_CC_CHECKS
diff --git a/ctdb/lib/replace/libreplace_ld.m4 b/ctdb/lib/replace/libreplace_ld.m4
new file mode 100644 (file)
index 0000000..bf0df61
--- /dev/null
@@ -0,0 +1,337 @@
+#
+# This offers a nice overview how to build shared libraries on all platforms
+#        http://www.fortran-2000.com/ArnaudRecipes/sharedlib.html
+#
+
+dnl AC_LIBREPLACE_STLD
+dnl Locate the ar program (stored in PROG_AR) and substitute it as STLD,
+dnl the tool used to build static libraries.
+AC_DEFUN([AC_LIBREPLACE_STLD],
+[
+       AC_PATH_PROG(PROG_AR, ar)
+
+       STLD=${PROG_AR}
+
+       AC_SUBST(STLD)
+])
+
+dnl AC_LIBREPLACE_STLD_FLAGS
+dnl Substitute STLD_FLAGS, fixed to "-rcs" on every platform.
+AC_DEFUN([AC_LIBREPLACE_STLD_FLAGS],
+[
+       STLD_FLAGS="-rcs"
+       AC_SUBST(STLD_FLAGS)
+])
+
+dnl AC_LD_EXPORT_DYNAMIC
+dnl Substitute LD_EXPORT_DYNAMIC.  If the compiler driver reports "GNU ld" for
+dnl -Wl,--version the flag is -Wl,-export-dynamic; otherwise it is -Wl,-E on
+dnl hpux and empty elsewhere.  LDFLAGS is saved and restored around the
+dnl probe, although nothing in this macro modifies it.
+AC_DEFUN([AC_LD_EXPORT_DYNAMIC],
+[
+saved_LDFLAGS="$LDFLAGS"
+if AC_TRY_COMMAND([${CC-cc} $CFLAGS -Wl,--version 2>&1 | grep "GNU ld" >/dev/null]); then
+       LD_EXPORT_DYNAMIC="-Wl,-export-dynamic"
+else
+       case "$host_os" in
+               hpux* )
+                 LD_EXPORT_DYNAMIC="-Wl,-E"
+                 ;;
+               *)
+                 LD_EXPORT_DYNAMIC=""
+                 ;;
+         esac
+fi
+AC_SUBST(LD_EXPORT_DYNAMIC)
+LDFLAGS="$saved_LDFLAGS"
+])
+
+dnl AC_LD_PICFLAG
+dnl Substitute PICFLAG, the compiler flag used for position independent
+dnl code, chosen from host_os (and GCC / host_cpu where the choice depends on
+dnl the compiler).  PICFLAG is left untouched for host_os values that are not
+dnl listed below.
+AC_DEFUN([AC_LD_PICFLAG],
+[
+case "$host_os" in
+       *linux*|*gnu*)
+               PICFLAG="-fPIC"
+               ;;
+       *solaris*)
+               if test "${GCC}" = "yes"; then
+                       PICFLAG="-fPIC"
+               else
+                       PICFLAG="-KPIC"
+               fi
+               ;;
+       *sunos*)
+               PICFLAG="-KPIC"   # Is this correct for SunOS
+               ;;
+       *netbsd* | *freebsd* | *dragonfly* )
+               PICFLAG="-fPIC -DPIC"
+               ;;
+       *openbsd*)
+               PICFLAG="-fPIC"
+               ;;
+       *irix*)
+               if test "${GCC}" = "yes"; then
+                       PICFLAG="-fPIC"
+               else
+                       PICFLAG="-KPIC"
+               fi
+               ;;
+       *aix*)
+               # as AIX code is always position independent...
+               PICFLAG="-O2"
+               ;;
+       *hpux*)
+               # The cache variable is quoted so that an unset or empty
+               # value compares as "not yes" instead of breaking test.
+               if test "${GCC}" = "yes"; then
+                       PICFLAG="-fPIC"
+               elif test "$host_cpu" = "ia64"; then
+                       PICFLAG="+z"
+               elif test x"$ac_cv_prog_cc_Ae" = x"yes"; then
+                       PICFLAG="+z +ESnolit"
+               fi
+               ;;
+       *osf*)
+               PICFLAG="-fPIC"
+               ;;
+       *unixware*)
+               PICFLAG="-KPIC"
+               ;;
+       *darwin*)
+               PICFLAG="-fno-common"
+               ;;
+esac
+AC_SUBST(PICFLAG)
+])
+
+dnl AC_LIBREPLACE_LD_SHLIB_LINKER
+dnl Substitute LD_SHLIB_LINKER, the program used to link shared libraries:
+dnl the C compiler everywhere except irix, where PROG_LD is used.
+dnl NOTE(review): PROG_LD is not set in this file - presumably set by the
+dnl caller; confirm.
+AC_DEFUN([AC_LIBREPLACE_LD_SHLIB_LINKER],
+[
+       LD_SHLIB_LINKER="${CC}"
+
+       case "$host_os" in
+               *irix*)
+                       LD_SHLIB_LINKER="${PROG_LD}"
+                       ;;
+       esac
+
+       AC_SUBST(LD_SHLIB_LINKER)
+])
+
+dnl AC_LIBREPLACE_LD_SHLIB_FLAGS
+dnl Substitute LD_SHLIB_FLAGS, the linker flags that request a shared
+dnl library.  Defaults to -shared and is overridden per host_os below.
+AC_DEFUN([AC_LIBREPLACE_LD_SHLIB_FLAGS],
+[
+       LD_SHLIB_FLAGS="-shared"
+
+       case "$host_os" in
+               *linux*|*gnu*)
+                       LD_SHLIB_FLAGS="-shared -Wl,-Bsymbolic"
+                       ;;
+               *solaris*)
+                       LD_SHLIB_FLAGS="-G"
+                       # GCC is "yes" for GNU compilers and empty otherwise,
+                       # so the non-GNU case is detected with != "yes".
+                       if test "${GCC}" != "yes"; then
+                               ## ${CFLAGS} added for building 64-bit shared
+                               ## libs using Sun's Compiler
+                               LD_SHLIB_FLAGS="-G \${CFLAGS}"
+                       fi
+                       ;;
+               *sunos*)
+                       LD_SHLIB_FLAGS="-G"
+                       ;;
+               *irix*)
+                       LD_SHLIB_FLAGS="-shared"
+                       ;;
+               *aix*)
+                       LD_SHLIB_FLAGS="-Wl,-G,-bexpall,-bbigtoc"
+                       ;;
+               *hpux*)
+                       if test "${GCC}" = "yes"; then
+                               LD_SHLIB_FLAGS="-shared"
+                       else
+                               LD_SHLIB_FLAGS="-b"
+                       fi
+                       ;;
+               *osf*)
+                       LD_SHLIB_FLAGS="-shared"
+                       ;;
+               *darwin*)
+                       LD_SHLIB_FLAGS="-dynamiclib -Wl,-search_paths_first"
+                       ;;
+       esac
+
+       AC_SUBST(LD_SHLIB_FLAGS)
+])
+
+dnl AC_LIBREPLACE_LD_SHLIB_DISALLOW_UNDEF_FLAG
+dnl Substitute LD_SHLIB_DISALLOW_UNDEF_FLAG, the linker flag that complains
+dnl about undefined symbols in a shared library.  Only set for osf and
+dnl darwin; empty everywhere else.
+AC_DEFUN([AC_LIBREPLACE_LD_SHLIB_DISALLOW_UNDEF_FLAG],
+[
+       LD_SHLIB_DISALLOW_UNDEF_FLAG=""
+
+       #
+       # TODO: enforce error not only warnings
+       #
+       # NOTE: -Wl,--no-allow-shlib-undefined isn't what we want...
+       #       as it bails out on broken system libraries
+       #
+       case "$host_os" in
+               *osf*)
+                       LD_SHLIB_DISALLOW_UNDEF_FLAG="-warning_unresolved"
+                       ;;
+               *darwin*)
+                       LD_SHLIB_DISALLOW_UNDEF_FLAG="-undefined error"
+                       ;;
+       esac
+
+       AC_SUBST(LD_SHLIB_DISALLOW_UNDEF_FLAG)
+])
+
+dnl AC_LIBREPLACE_SHLD
+dnl Substitute SHLD as a copy of LD_SHLIB_LINKER (whose macro is required
+dnl first).
+AC_DEFUN([AC_LIBREPLACE_SHLD],
+[
+       AC_REQUIRE([AC_LIBREPLACE_LD_SHLIB_LINKER])
+       SHLD="$LD_SHLIB_LINKER"
+       AC_SUBST(SHLD)
+])
+
+dnl AC_LIBREPLACE_SHLD_FLAGS
+dnl Substitute SHLD_FLAGS: the shared-library flags followed by the
+dnl disallow-undefined flag.
+AC_DEFUN([AC_LIBREPLACE_SHLD_FLAGS],
+[
+       AC_REQUIRE([AC_LIBREPLACE_LD_SHLIB_FLAGS])
+       AC_REQUIRE([AC_LIBREPLACE_LD_SHLIB_DISALLOW_UNDEF_FLAG])
+       SHLD_FLAGS="$LD_SHLIB_FLAGS $LD_SHLIB_DISALLOW_UNDEF_FLAG"
+       AC_SUBST(SHLD_FLAGS)
+])
+
+dnl AC_LD_SHLIBEXT
+dnl Substitute SHLIBEXT, the shared library file extension: "so" by default,
+dnl "sl" on hpux unless host_cpu is ia64, and "dylib" on darwin.
+AC_DEFUN([AC_LD_SHLIBEXT],
+[
+       SHLIBEXT="so"
+       case "$host_os" in
+               *hpux*)
+                       if test "$host_cpu" = "ia64"; then
+                               SHLIBEXT="so"
+                       else
+                               SHLIBEXT="sl"
+                       fi
+               ;;
+               *darwin*)
+                       SHLIBEXT="dylib"
+               ;;
+       esac
+       AC_SUBST(SHLIBEXT)
+])
+
+dnl AC_LD_SONAMEFLAG
+dnl Substitute SONAMEFLAG, the flag prefix that sets a shared library's
+dnl soname, chosen from host_os.  It is empty on unlisted platforms and "#"
+dnl on darwin and aix.
+dnl NOTE(review): "#" presumably comments out the rest of the link command
+dnl where sonames are unsupported - confirm against the Makefile that uses it.
+AC_DEFUN([AC_LD_SONAMEFLAG],
+[
+       AC_SUBST(SONAMEFLAG)
+       SONAMEFLAG=""
+       case "$host_os" in 
+               *linux*|*gnu*|*qnx*)
+                       SONAMEFLAG="-Wl,-soname="
+                       ;;
+               *solaris*)
+                       SONAMEFLAG="-h "
+                       if test "${GCC}" = "yes"; then
+                               SONAMEFLAG="-Wl,-soname="
+                       fi
+                       ;;
+               *sunos*)
+                       SONAMEFLAG="-Wl,-h,"
+                       ;;
+               *netbsd* | *freebsd* | *dragonfly* )
+                       SONAMEFLAG="-Wl,-soname,"
+                       ;;
+               *openbsd*)
+                       SONAMEFLAG="-Wl,-soname,"
+                       ;;
+               *irix*)
+                       SONAMEFLAG="-Wl,-soname,"
+                       ;;
+               *hpux*)
+                       SONAMEFLAG="-Wl,+h,"
+                       ;;
+               *osf*)
+                       SONAMEFLAG="-Wl,-soname,"
+                       ;;
+               *unixware*)
+                       SONAMEFLAG="-Wl,-soname,"
+                       ;;
+               *darwin*)
+                       SONAMEFLAG="#"
+                       ;;
+               *aix*)
+                       # Not supported
+                       SONAMEFLAG="#"
+                       ;;
+               esac
+])
+
+dnl AC_LD_VERSIONSCRIPT
+dnl Substitute VERSIONSCRIPT, the flag that passes a linker version script.
+dnl Set to -Wl,--version-script on linux/gnu and on solaris with gcc;
+dnl empty otherwise.
+AC_DEFUN([AC_LD_VERSIONSCRIPT],
+[
+       AC_SUBST(VERSIONSCRIPT)
+       VERSIONSCRIPT=""
+       case "$host_os" in 
+               *linux*|*gnu*)
+                       VERSIONSCRIPT="-Wl,--version-script"
+                       ;;
+               *solaris*)
+                       if test "${GCC}" = "yes"; then
+                               VERSIONSCRIPT="-Wl,--version-script"
+                       fi
+                       ;;
+               esac
+])
+
+dnl AC_LIBREPLACE_MDLD
+dnl Substitute MDLD as a copy of LD_SHLIB_LINKER (whose macro is required
+dnl first).
+dnl NOTE(review): MDLD is presumably the linker for loadable modules - confirm.
+AC_DEFUN([AC_LIBREPLACE_MDLD],
+[
+       AC_REQUIRE([AC_LIBREPLACE_LD_SHLIB_LINKER])
+       MDLD="$LD_SHLIB_LINKER"
+       AC_SUBST(MDLD)
+])
+
+dnl AC_LIBREPLACE_LD_SHLIB_ALLOW_UNDEF_FLAG
+dnl Substitute LD_SHLIB_ALLOW_UNDEF_FLAG, the linker flag that permits
+dnl undefined symbols in a shared object.  The variable is reset to empty
+dnl first so that unlisted platforms get an empty substitution.
+AC_DEFUN([AC_LIBREPLACE_LD_SHLIB_ALLOW_UNDEF_FLAG],
+[
+       LD_SHLIB_ALLOW_UNDEF_FLAG=""
+
+       case "$host_os" in
+               *linux*|*gnu*)
+                       LD_SHLIB_ALLOW_UNDEF_FLAG="-Wl,--allow-shlib-undefined"
+                       ;;
+               *osf*)
+                       LD_SHLIB_ALLOW_UNDEF_FLAG="-Wl,-expect_unresolved,\"*\""
+                       ;;
+               *darwin*)
+                       LD_SHLIB_ALLOW_UNDEF_FLAG="-undefined dynamic_lookup"
+                       ;;
+               *aix*)
+                       LD_SHLIB_ALLOW_UNDEF_FLAG="-Wl,-bnoentry"
+                       ;;
+       esac
+
+       AC_SUBST(LD_SHLIB_ALLOW_UNDEF_FLAG)
+])
+
+dnl AC_LIBREPLACE_MDLD_FLAGS
+dnl Substitute MDLD_FLAGS: the shared-library flags followed by the
+dnl allow-undefined flag.
+AC_DEFUN([AC_LIBREPLACE_MDLD_FLAGS],
+[
+       AC_REQUIRE([AC_LIBREPLACE_LD_SHLIB_FLAGS])
+       AC_REQUIRE([AC_LIBREPLACE_LD_SHLIB_ALLOW_UNDEF_FLAG])
+       MDLD_FLAGS="$LD_SHLIB_FLAGS $LD_SHLIB_ALLOW_UNDEF_FLAG"
+       AC_SUBST(MDLD_FLAGS)
+])
+
+dnl AC_LIBREPLACE_RUNTIME_LIB_PATH_VAR
+dnl Substitute LIB_PATH_VAR, the name of the environment variable holding the
+dnl runtime library search path: SHLIB_PATH on hpux, LIBPATH on aix,
+dnl DYLD_LIBRARY_PATH on darwin and LD_LIBRARY_PATH everywhere else.
+AC_DEFUN([AC_LIBREPLACE_RUNTIME_LIB_PATH_VAR],
+[
+       case "$host_os" in
+               *linux*|*gnu*)
+                       LIB_PATH_VAR=LD_LIBRARY_PATH
+               ;;
+               *bsd*)
+                       LIB_PATH_VAR=LD_LIBRARY_PATH
+               ;;
+               *solaris*)
+                       LIB_PATH_VAR=LD_LIBRARY_PATH
+               ;;
+               *hpux*)
+                       LIB_PATH_VAR=SHLIB_PATH
+               ;;
+               *osf*)
+                       LIB_PATH_VAR=LD_LIBRARY_PATH
+               ;;
+               *aix*)
+                       LIB_PATH_VAR=LIBPATH
+                       ;;
+               *irix*)
+                       LIB_PATH_VAR=LD_LIBRARY_PATH
+                       ;;
+               *darwin*)
+                       LIB_PATH_VAR=DYLD_LIBRARY_PATH
+                       ;;
+               *)
+                       LIB_PATH_VAR=LD_LIBRARY_PATH
+                       ;;
+       esac
+
+       AC_SUBST(LIB_PATH_VAR)
+])
diff --git a/ctdb/lib/replace/libreplace_macros.m4 b/ctdb/lib/replace/libreplace_macros.m4
new file mode 100644 (file)
index 0000000..f3753c4
--- /dev/null
@@ -0,0 +1,347 @@
+#
+# This is a collection of useful autoconf macros
+#
+
+############################################
+# Check if the compiler handles c99 struct initialization, and if not try the
+# -AC99, -qlanglvl=extc99, -qlanglvl=stdc99 and -c99 flags in that order.
+# Usage: LIBREPLACE_C99_STRUCT_INIT(success-action,failure-action)
+# On success CFLAGS keeps whichever flag made the test compile; on failure
+# CFLAGS is restored to its original value.  On HP-UX with a non-GNU compiler
+# -AC99 is appended and success is assumed regardless of the test results.
+AC_DEFUN([LIBREPLACE_C99_STRUCT_INIT],
+[
+saved_CFLAGS="$CFLAGS";
+c99_init=no
+if test x"$c99_init" = x"no"; then
+    AC_MSG_CHECKING(for C99 designated initializers)
+    CFLAGS="$saved_CFLAGS";
+    AC_TRY_COMPILE([#include <stdio.h>],
+     [ struct foo {int x;char y;};
+       struct foo bar = { .y = 'X', .x = 1 };   
+     ],
+     [AC_MSG_RESULT(yes); c99_init=yes],[AC_MSG_RESULT(no)])
+fi
+if test x"$c99_init" = x"no"; then
+    AC_MSG_CHECKING(for C99 designated initializers with -AC99)
+    CFLAGS="$saved_CFLAGS -AC99";
+    AC_TRY_COMPILE([#include <stdio.h>],
+     [ struct foo {int x;char y;};
+       struct foo bar = { .y = 'X', .x = 1 };   
+     ],
+     [AC_MSG_RESULT(yes); c99_init=yes],[AC_MSG_RESULT(no)])
+fi
+if test x"$c99_init" = x"no"; then
+    AC_MSG_CHECKING(for C99 designated initializers with -qlanglvl=extc99)
+    CFLAGS="$saved_CFLAGS -qlanglvl=extc99";
+    AC_TRY_COMPILE([#include <stdio.h>],
+     [ struct foo {int x;char y;};
+       struct foo bar = { .y = 'X', .x = 1 };   
+     ],
+     [AC_MSG_RESULT(yes); c99_init=yes],[AC_MSG_RESULT(no)])
+fi
+if test x"$c99_init" = x"no"; then
+    AC_MSG_CHECKING(for C99 designated initializers with -qlanglvl=stdc99)
+    CFLAGS="$saved_CFLAGS -qlanglvl=stdc99";
+    AC_TRY_COMPILE([#include <stdio.h>],
+     [ struct foo {int x;char y;};
+       struct foo bar = { .y = 'X', .x = 1 };   
+     ],
+     [AC_MSG_RESULT(yes); c99_init=yes],[AC_MSG_RESULT(no)])
+fi
+if test x"$c99_init" = x"no"; then
+    AC_MSG_CHECKING(for C99 designated initializers with -c99)
+    CFLAGS="$saved_CFLAGS -c99"
+    AC_TRY_COMPILE([#include <stdio.h>],
+     [ struct foo {int x;char y;};
+       struct foo bar = { .y = 'X', .x = 1 };   
+     ],
+     [AC_MSG_RESULT(yes); c99_init=yes],[AC_MSG_RESULT(no)])
+fi
+
+if test "`uname`" = "HP-UX"; then
+  if test "$ac_cv_c_compiler_gnu" = no; then
+       # special override for broken HP-UX compiler - I can't find a way to test
+       # this properly (its a compiler bug)
+       CFLAGS="$CFLAGS -AC99";
+       c99_init=yes;
+  fi
+fi
+
+if test x"$c99_init" = x"yes"; then
+    saved_CFLAGS=""
+    $1
+else
+    CFLAGS="$saved_CFLAGS"
+    saved_CFLAGS=""
+    $2
+fi
+])
+
+dnl AC_PROG_CC_FLAG(flag)
+dnl Check whether the compiler accepts -flag.  A one-function conftest.c is
+dnl compiled with the flag; any compiler output at all (stdout or stderr)
+dnl counts as rejection.  The yes/no result is cached in ac_cv_prog_cc_flag.
+AC_DEFUN(AC_PROG_CC_FLAG,
+[AC_CACHE_CHECK(whether ${CC-cc} accepts -$1, ac_cv_prog_cc_$1,
+[echo 'void f(){}' > conftest.c
+if test -z "`${CC-cc} -$1 -c conftest.c 2>&1`"; then
+  ac_cv_prog_cc_$1=yes
+else
+  ac_cv_prog_cc_$1=no
+fi
+rm -f conftest*
+])])
+
+dnl see if a declaration exists for a function or variable
+dnl defines HAVE_<NAME>_DECL (name upper-cased) if it exists
+dnl AC_HAVE_DECL(var, includes)
+dnl The probe compiles "int i = (int)var" after the given includes; the
+dnl result is cached in ac_cv_have_<var>_decl.
+AC_DEFUN(AC_HAVE_DECL,
+[
+ AC_CACHE_CHECK([for $1 declaration],ac_cv_have_$1_decl,[
+    AC_TRY_COMPILE([$2],[int i = (int)$1],
+        ac_cv_have_$1_decl=yes,ac_cv_have_$1_decl=no)])
+ if test x"$ac_cv_have_$1_decl" = x"yes"; then
+    AC_DEFINE([HAVE_]translit([$1], [a-z], [A-Z])[_DECL],1,[Whether $1() is available])
+ fi
+])
+
+
+# AC_CHECK_LIB_EXT(LIBRARY, [EXT_LIBS], [FUNCTION],
+#              [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND],
+#              [ADD-ACTION-IF-FOUND],[OTHER-LIBRARIES])
+# ------------------------------------------------------
+#
+# Use a cache variable name containing both the library and function name,
+# because the test really is for library $1 defining function $3, not
+# just for library $1.  Separate tests with the same $1 and different $3s
+# may have different results.
+#
+# Note that using directly AS_VAR_PUSHDEF([ac_Lib], [ac_cv_lib_$1_$3])
+# is asking for trouble, since AC_CHECK_LIB($lib, fun) would give
+# ac_cv_lib_$lib_fun, which is definitely not what was meant.  Hence
+# the AS_LITERAL_IF indirection.
+#
+# FIXME: This macro is extremely suspicious.  It DEFINEs unconditionally,
+# whatever the FUNCTION, in addition to not being a *S macro.  Note
+# that the cache does depend upon the function we are looking for.
+#
+# It is on purpose we used `ac_check_lib_ext_save_LIBS' and not just
+# `ac_save_LIBS': there are many macros which don't want to see `LIBS'
+# changed but still want to use AC_CHECK_LIB_EXT, so they save `LIBS'.
+# And ``ac_save_LIBS' is too tempting a name, so let's leave them some
+# freedom.
+dnl Argument summary, as used by the body below:
+dnl   1 LIBRARY       library name, linked as -lLIBRARY
+dnl   2 EXT_LIBS      NAME of a shell variable holding extra libraries; its
+dnl                   value is put on the link line and, by default, the new
+dnl                   -lLIBRARY is prepended to it when the check succeeds
+dnl   3 FUNCTION      function to look for; when empty, linking main is tried
+dnl   4 ACTION-IF-FOUND replaces the default action described above
+dnl   5 ACTION-IF-NOT-FOUND
+dnl   6 ADD-ACTION-IF-FOUND run after argument 4 / the default action
+dnl   7 OTHER-LIBRARIES additional libraries for the link test only
+dnl LIBS itself is restored after the test.
+AC_DEFUN([AC_CHECK_LIB_EXT],
+[
+AH_CHECK_LIB_EXT([$1])
+ac_check_lib_ext_save_LIBS=$LIBS
+LIBS="-l$1 $$2 $7 $LIBS"
+AS_LITERAL_IF([$1],
+      [AS_VAR_PUSHDEF([ac_Lib_ext], [ac_cv_lib_ext_$1])],
+      [AS_VAR_PUSHDEF([ac_Lib_ext], [ac_cv_lib_ext_$1''])])dnl
+
+m4_ifval([$3],
+ [
+    AH_CHECK_FUNC_EXT([$3])
+    AS_LITERAL_IF([$1],
+              [AS_VAR_PUSHDEF([ac_Lib_func], [ac_cv_lib_ext_$1_$3])],
+              [AS_VAR_PUSHDEF([ac_Lib_func], [ac_cv_lib_ext_$1''_$3])])dnl
+    AC_CACHE_CHECK([for $3 in -l$1], ac_Lib_func,
+       [AC_TRY_LINK_FUNC($3,
+                 [AS_VAR_SET(ac_Lib_func, yes);
+                 AS_VAR_SET(ac_Lib_ext, yes)],
+                 [AS_VAR_SET(ac_Lib_func, no);
+                 AS_VAR_SET(ac_Lib_ext, no)])
+       ])
+    AS_IF([test AS_VAR_GET(ac_Lib_func) = yes],
+        [AC_DEFINE_UNQUOTED(AS_TR_CPP(HAVE_$3))])dnl
+    AS_VAR_POPDEF([ac_Lib_func])dnl
+ ],[
+    AC_CACHE_CHECK([for -l$1], ac_Lib_ext,
+       [AC_TRY_LINK_FUNC([main],
+                 [AS_VAR_SET(ac_Lib_ext, yes)],
+                 [AS_VAR_SET(ac_Lib_ext, no)])
+       ])
+ ])
+LIBS=$ac_check_lib_ext_save_LIBS
+
+AS_IF([test AS_VAR_GET(ac_Lib_ext) = yes],
+    [m4_default([$4], 
+        [AC_DEFINE_UNQUOTED(AS_TR_CPP(HAVE_LIB$1))
+               case "$$2" in
+                   *-l$1*)
+                       ;;
+                   *)
+                       $2="-l$1 $$2"
+                       ;;
+               esac])
+               [$6]
+           ],
+           [$5])dnl
+AS_VAR_POPDEF([ac_Lib_ext])dnl
+])# AC_CHECK_LIB_EXT
+
+# AH_CHECK_LIB_EXT(LIBNAME)
+# ---------------------
+# Register the config header template for the HAVE_LIB<LIBNAME> symbol.
+m4_define([AH_CHECK_LIB_EXT],
+[AH_TEMPLATE(AS_TR_CPP(HAVE_LIB$1),
+             [Define to 1 if you have the `]$1[' library (-l]$1[).])])
+
+dnl AC_SEARCH_LIBS_EXT(FUNCTION, SEARCH-LIBS, EXT_LIBS,
+dnl                    [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND],
+dnl                    [OTHER-LIBRARIES])
+dnl --------------------------------------------------------
+dnl Search for a library defining FUNC, if it's not already available.
+dnl EXT_LIBS is the NAME of a shell variable: its value is used on the link
+dnl line and the library that was found is prepended to it.  The result is
+dnl cached in ac_cv_search_ext_FUNCTION ("no", "none required" or "-lLIB").
+dnl While each candidate is probed, the LIBS value saved on entry is kept
+dnl on the link line; LIBS is restored afterwards.
+AC_DEFUN([AC_SEARCH_LIBS_EXT],
+[AC_CACHE_CHECK([for library containing $1], [ac_cv_search_ext_$1],
+[
+ac_func_search_ext_save_LIBS=$LIBS
+ac_cv_search_ext_$1=no
+AC_LINK_IFELSE([AC_LANG_CALL([], [$1])],
+              [ac_cv_search_ext_$1="none required"])
+if test "$ac_cv_search_ext_$1" = no; then
+  for ac_lib in $2; do
+    LIBS="-l$ac_lib $$3 $6 $ac_func_search_ext_save_LIBS"
+    AC_LINK_IFELSE([AC_LANG_CALL([], [$1])],
+                  [ac_cv_search_ext_$1="-l$ac_lib"
+break])
+  done
+fi
+LIBS=$ac_func_search_ext_save_LIBS])
+AS_IF([test "$ac_cv_search_ext_$1" != no],
+  [test "$ac_cv_search_ext_$1" = "none required" || $3="$ac_cv_search_ext_$1 $$3"
+  $4],
+      [$5])dnl
+])
+
+dnl check for a function in a $LIBS and $OTHER_LIBS libraries variable.
+dnl AC_CHECK_FUNC_EXT(func,OTHER_LIBS,IF-TRUE,IF-FALSE)
+dnl OTHER_LIBS is put in front of LIBS for the link test only; LIBS is
+dnl restored afterwards.  On success HAVE_<FUNC> is defined before IF-TRUE
+dnl runs.  The result is cached in ac_cv_func_ext_<func>.
+AC_DEFUN([AC_CHECK_FUNC_EXT],
+[
+    AH_CHECK_FUNC_EXT($1)      
+    ac_check_func_ext_save_LIBS=$LIBS
+    LIBS="$2 $LIBS"
+    AS_VAR_PUSHDEF([ac_var], [ac_cv_func_ext_$1])dnl
+    AC_CACHE_CHECK([for $1], ac_var,
+       [AC_LINK_IFELSE([AC_LANG_FUNC_LINK_TRY([$1])],
+                [AS_VAR_SET(ac_var, yes)],
+                [AS_VAR_SET(ac_var, no)])])
+    LIBS=$ac_check_func_ext_save_LIBS
+    AS_IF([test AS_VAR_GET(ac_var) = yes], 
+           [AC_DEFINE_UNQUOTED(AS_TR_CPP([HAVE_$1])) $3], 
+           [$4])dnl
+AS_VAR_POPDEF([ac_var])dnl
+])# AC_CHECK_FUNC_EXT
+
+# AH_CHECK_FUNC_EXT(FUNCNAME)
+# ---------------------
+# Register the config header template for the HAVE_<FUNCNAME> symbol.
+m4_define([AH_CHECK_FUNC_EXT],
+[AH_TEMPLATE(AS_TR_CPP(HAVE_$1),
+             [Define to 1 if you have the `]$1[' function.])])
+
+dnl Define an AC_DEFINE with ifndef guard.
+dnl AC_N_DEFINE(VARIABLE [, VALUE])
+dnl Registers a verbatim #ifndef-guarded template for the config header and
+dnl appends the guarded #define to confdefs.h.  VALUE defaults to 1 when the
+dnl macro is called with a single argument.
+AC_DEFUN([AC_N_DEFINE],
+[
+AH_VERBATIM([$1], [
+#ifndef $1
+# undef $1
+#endif
+])
+
+ cat >>confdefs.h <<\EOF
+#ifndef $1
+[#define] $1 m4_if($#, 1, 1, [$2])
+#endif
+EOF
+])
+
+dnl Add an #include
+dnl AC_ADD_INCLUDE(VARIABLE)
+dnl Appends "#include VARIABLE" to confdefs.h.  VARIABLE must carry its own
+dnl delimiters, e.g. <standards.h>.
+define(AC_ADD_INCLUDE,
+[cat >> confdefs.h <<\EOF
+[#include] $1
+EOF
+])
+
+dnl remove an #include
+dnl AC_REMOVE_INCLUDE(VARIABLE)
+dnl Filters every line matching "#include VARIABLE" out of confdefs.h using
+dnl grep -v and a temporary file.
+define(AC_REMOVE_INCLUDE,
+[
+grep -v '[#include] $1' confdefs.h >confdefs.h.tmp
+cat confdefs.h.tmp > confdefs.h
+rm confdefs.h.tmp
+])
+
+dnl remove an #define
+dnl AC_REMOVE_DEFINE(VARIABLE)
+dnl Filters out of confdefs.h the lines defining VARIABLE, both with a value
+dnl (name followed by a space) and without one (name at end of line).
+define(AC_REMOVE_DEFINE,
+[
+grep -v '[#define] $1 ' confdefs.h |grep -v '[#define] $1[$]'>confdefs.h.tmp
+cat confdefs.h.tmp > confdefs.h
+rm confdefs.h.tmp
+])
+
+dnl AS_HELP_STRING is not available in autoconf 2.57, and AC_HELP_STRING is deprecated
+dnl in autoconf 2.59, so define AS_HELP_STRING to be AC_HELP_STRING unless it is already
+dnl defined.
+m4_ifdef([AS_HELP_STRING], , [m4_define([AS_HELP_STRING], m4_defn([AC_HELP_STRING]))])
+
+dnl check if the prototype in the header matches the given one
+dnl AC_VERIFY_C_PROTOTYPE(prototype,functionbody,[IF-TRUE],[IF-FALSE],[extraheaders])
+dnl Compiles the default includes plus extraheaders, followed by a function
+dnl definition built from prototype and functionbody.  The result is cached
+dnl in ac_cv_c_prototype_<prototype> (name made shell-safe).
+AC_DEFUN(AC_VERIFY_C_PROTOTYPE,
+[AC_CACHE_CHECK([for prototype $1], AS_TR_SH([ac_cv_c_prototype_$1]),
+       AC_COMPILE_IFELSE([AC_LANG_SOURCE([
+               AC_INCLUDES_DEFAULT
+               $5
+               $1
+               {
+                       $2
+               }
+       ])],[
+               eval AS_TR_SH([ac_cv_c_prototype_$1])=yes
+       ],[
+               eval AS_TR_SH([ac_cv_c_prototype_$1])=no
+       ])
+)
+AS_IF([eval test $AS_TR_SH([ac_cv_c_prototype_$1]) = yes],[$3],[$4])
+])
+
+dnl LIBREPLACE_PROVIDE_HEADER(header)
+dnl If the system has the header, register a config command that removes any
+dnl stub of that name from $libreplacedir; otherwise register one that
+dnl creates a stub there which just includes replace.h.
+AC_DEFUN(LIBREPLACE_PROVIDE_HEADER, 
+[AC_CHECK_HEADER([$1], 
+               [ AC_CONFIG_COMMANDS(rm-$1, [rm -f $libreplacedir/$1], [libreplacedir=$libreplacedir]) ],
+               [ AC_CONFIG_COMMANDS(mk-$1, [echo "#include \"replace.h\"" > $libreplacedir/$1], [libreplacedir=$libreplacedir]) ]
+       )
+])
+
+dnl AC_HAVE_TYPE(TYPE,INCLUDES)
+dnl Check whether TYPE can be used to declare a variable after the default
+dnl includes plus INCLUDES.  The result is cached in ac_cv_type_<TYPE> with
+dnl ". / - space" mapped to "_" and "+" mapped to "p"; on success
+dnl HAVE_<TYPE> (upper-cased, ". / - space" mapped to "_") is defined.
+dnl The AC_CHECK_TYPES call sits inside "if false" and is never executed.
+dnl NOTE(review): it is presumably there only so that the config header
+dnl template for the symbol gets registered - confirm.
+AC_DEFUN([AC_HAVE_TYPE], [
+AC_REQUIRE([AC_HEADER_STDC])
+cv=`echo "$1" | sed 'y%./+- %__p__%'`
+AC_MSG_CHECKING(for $1)
+AC_CACHE_VAL([ac_cv_type_$cv],
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+AC_INCLUDES_DEFAULT
+$2]],
+[[$1 foo;]])],
+[eval "ac_cv_type_$cv=yes"],
+[eval "ac_cv_type_$cv=no"]))dnl
+ac_foo=`eval echo \\$ac_cv_type_$cv`
+AC_MSG_RESULT($ac_foo)
+if test "$ac_foo" = yes; then
+  ac_tr_hdr=HAVE_`echo $1 | sed 'y%abcdefghijklmnopqrstuvwxyz./- %ABCDEFGHIJKLMNOPQRSTUVWXYZ____%'`
+if false; then
+       AC_CHECK_TYPES($1)
+fi
+  AC_DEFINE_UNQUOTED($ac_tr_hdr, 1, [Define if you have type `$1'])
+fi
+])
+
+# AC_CHECK_VALUEOF(TYPE, [INCLUDES = DEFAULT-INCLUDES])
+# ---------------------------------------------------------------
+# Compute the integer value of the expression given as the first argument
+# (cast to long int), cache it in ac_cv_valueof_<expr> and define the
+# preprocessor symbol derived from valueof_<expr> to that value.
+# The argument must be a literal; anything else is a fatal m4 error.
+AC_DEFUN([AC_CHECK_VALUEOF],
+[AS_LITERAL_IF(m4_translit([[$1]], [*], [p]), [],
+              [m4_fatal([$0: requires literal arguments])])]dnl
+[
+_AC_CACHE_CHECK_INT([value of $1], [AS_TR_SH([ac_cv_valueof_$1])],
+  [(long int) ($1)],
+  [AC_INCLUDES_DEFAULT([$2])],
+  [])
+
+AC_DEFINE_UNQUOTED(AS_TR_CPP(valueof_$1), $AS_TR_SH([ac_cv_valueof_$1]),
+                  [The value of `$1'.])
+])# AC_CHECK_VALUEOF
diff --git a/ctdb/lib/replace/libreplace_network.m4 b/ctdb/lib/replace/libreplace_network.m4
new file mode 100644 (file)
index 0000000..bb2a843
--- /dev/null
@@ -0,0 +1,503 @@
+AC_DEFUN_ONCE(AC_LIBREPLACE_NETWORK_CHECKS,
+[
+echo "LIBREPLACE_NETWORK_CHECKS: START"
+
+AC_DEFINE(LIBREPLACE_NETWORK_CHECKS, 1, [LIBREPLACE_NETWORK_CHECKS were used])
+LIBREPLACE_NETWORK_OBJS=""
+LIBREPLACE_NETWORK_LIBS=""
+
+AC_CHECK_HEADERS(sys/socket.h netinet/in.h netdb.h arpa/inet.h)
+AC_CHECK_HEADERS(netinet/in_systm.h)
+AC_CHECK_HEADERS([netinet/ip.h], [], [],[
+       #include <sys/types.h>
+       #ifdef HAVE_NETINET_IN_H
+       #include <netinet/in.h>
+       #endif
+       #ifdef HAVE_NETINET_IN_SYSTM_H
+       #include <netinet/in_systm.h>
+       #endif
+])
+AC_CHECK_HEADERS(netinet/tcp.h netinet/in_ip.h)
+AC_CHECK_HEADERS(sys/sockio.h sys/un.h)
+AC_CHECK_HEADERS(sys/uio.h)
+
+dnl we need to check that net/if.h really can be used, to cope with hpux
+dnl where including it always fails
+AC_CACHE_CHECK([for usable net/if.h],libreplace_cv_USABLE_NET_IF_H,[
+       AC_COMPILE_IFELSE([AC_LANG_SOURCE([
+               AC_INCLUDES_DEFAULT
+               #if HAVE_SYS_SOCKET_H
+               # include <sys/socket.h>
+               #endif
+               #include <net/if.h>
+               int main(void) {return 0;}])],
+               [libreplace_cv_USABLE_NET_IF_H=yes],
+               [libreplace_cv_USABLE_NET_IF_H=no]
+       )
+])
+if test x"$libreplace_cv_USABLE_NET_IF_H" = x"yes";then
+       AC_DEFINE(HAVE_NET_IF_H, 1, usability of net/if.h)
+fi
+
+AC_HAVE_TYPE([socklen_t],[#include <sys/socket.h>])
+AC_HAVE_TYPE([sa_family_t],[#include <sys/socket.h>])
+AC_HAVE_TYPE([struct addrinfo], [#include <netdb.h>])
+AC_HAVE_TYPE([struct sockaddr], [#include <sys/socket.h>])
+AC_HAVE_TYPE([struct sockaddr_storage], [
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+])
+AC_HAVE_TYPE([struct sockaddr_in6], [
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+])
+
+if test x"$ac_cv_type_struct_sockaddr_storage" = x"yes"; then
+AC_CHECK_MEMBER(struct sockaddr_storage.ss_family,
+                AC_DEFINE(HAVE_SS_FAMILY, 1, [Defined if struct sockaddr_storage has ss_family field]),,
+                [
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+               ])
+
+if test x"$ac_cv_member_struct_sockaddr_storage_ss_family" != x"yes"; then
+AC_CHECK_MEMBER(struct sockaddr_storage.__ss_family,
+                AC_DEFINE(HAVE___SS_FAMILY, 1, [Defined if struct sockaddr_storage has __ss_family field]),,
+                [
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+               ])
+fi
+fi
+
+AC_CACHE_CHECK([for sin_len in sock],libreplace_cv_HAVE_SOCK_SIN_LEN,[
+       AC_TRY_COMPILE(
+               [
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+               ],[
+struct sockaddr_in sock; sock.sin_len = sizeof(sock);
+               ],[
+               libreplace_cv_HAVE_SOCK_SIN_LEN=yes
+               ],[
+               libreplace_cv_HAVE_SOCK_SIN_LEN=no
+               ])
+])
+if test x"$libreplace_cv_HAVE_SOCK_SIN_LEN" = x"yes"; then
+       AC_DEFINE(HAVE_SOCK_SIN_LEN,1,[Whether the sockaddr_in struct has a sin_len property])
+fi
+
+############################################
+# check for unix domain sockets
+AC_CACHE_CHECK([for unix domain sockets],libreplace_cv_HAVE_UNIXSOCKET,[
+       AC_TRY_COMPILE([
+#include <sys/types.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+               ],[
+struct sockaddr_un sunaddr;
+sunaddr.sun_family = AF_UNIX;
+               ],[
+               libreplace_cv_HAVE_UNIXSOCKET=yes
+               ],[
+               libreplace_cv_HAVE_UNIXSOCKET=no
+               ])
+])
+if test x"$libreplace_cv_HAVE_UNIXSOCKET" = x"yes"; then
+       AC_DEFINE(HAVE_UNIXSOCKET,1,[If we need to build with unixscoket support])
+fi
+
+dnl The following test is roughly taken from the cvs sources.
+dnl
+dnl If we can't find connect, try looking in -lsocket, -lnsl, and -linet.
+dnl The Irix 5 libc.so has connect and gethostbyname, but Irix 5 also has
+dnl libsocket.so which has a bad implementation of gethostbyname (it
+dnl only looks in /etc/hosts), so we only look for -lsocket if we need
+dnl it.
+AC_CHECK_FUNCS(connect)
+if test x"$ac_cv_func_connect" = x"no"; then
+       AC_CHECK_LIB_EXT(nsl_s, LIBREPLACE_NETWORK_LIBS, connect)
+       AC_CHECK_LIB_EXT(nsl, LIBREPLACE_NETWORK_LIBS, connect)
+       AC_CHECK_LIB_EXT(socket, LIBREPLACE_NETWORK_LIBS, connect)
+       AC_CHECK_LIB_EXT(inet, LIBREPLACE_NETWORK_LIBS, connect)
+       dnl We can't just call AC_CHECK_FUNCS(connect) here,
+       dnl because the value has been cached.
+       if test x"$ac_cv_lib_ext_nsl_s_connect" = x"yes" ||
+               test x"$ac_cv_lib_ext_nsl_connect" = x"yes" ||
+               test x"$ac_cv_lib_ext_socket_connect" = x"yes" ||
+               test x"$ac_cv_lib_ext_inet_connect" = x"yes"
+       then
+               AC_DEFINE(HAVE_CONNECT,1,[Whether the system has connect()])
+       fi
+fi
+
+AC_CHECK_FUNCS(gethostbyname)
+if test x"$ac_cv_func_gethostbyname" = x"no"; then
+       AC_CHECK_LIB_EXT(nsl_s, LIBREPLACE_NETWORK_LIBS, gethostbyname)
+       AC_CHECK_LIB_EXT(nsl, LIBREPLACE_NETWORK_LIBS, gethostbyname)
+       AC_CHECK_LIB_EXT(socket, LIBREPLACE_NETWORK_LIBS, gethostbyname)
+       dnl We can't just call AC_CHECK_FUNCS(gethostbyname) here,
+       dnl because the value has been cached.
+       if test x"$ac_cv_lib_ext_nsl_s_gethostbyname" = x"yes" ||
+               test x"$ac_cv_lib_ext_nsl_gethostbyname" = x"yes" ||
+               test x"$ac_cv_lib_ext_socket_gethostbyname" = x"yes"
+       then
+               AC_DEFINE(HAVE_GETHOSTBYNAME,1,
+                         [Whether the system has gethostbyname()])
+       fi
+fi
+
+dnl HP-UX has if_nametoindex in -lipv6
+AC_CHECK_FUNCS(if_nametoindex)
+if test x"$ac_cv_func_if_nametoindex" = x"no"; then
+       AC_CHECK_LIB_EXT(ipv6, LIBREPLACE_NETWORK_LIBS, if_nametoindex)
+       dnl We can't just call AC_CHECK_FUNCS(if_nametoindex) here,
+       dnl because the value has been cached.
+       if test x"$ac_cv_lib_ext_ipv6_if_nametoindex" = x"yes"
+       then
+               AC_DEFINE(HAVE_IF_NAMETOINDEX, 1,
+                         [Whether the system has if_nametoindex()])
+       fi
+fi
+
+# The following tests need LIBS="${LIBREPLACE_NETWORK_LIBS}"
+old_LIBS=$LIBS
+LIBS="${LIBREPLACE_NETWORK_LIBS}"
+libreplace_SAVE_CPPFLAGS="$CPPFLAGS"
+CPPFLAGS="$CPPFLAGS -I$libreplacedir"
+
+AC_CHECK_FUNCS(socketpair,[],[LIBREPLACE_NETWORK_OBJS="${LIBREPLACE_NETWORK_OBJS} $libreplacedir/socketpair.o"])
+
+dnl Run-time probe for a broken inet_ntoa.  The test program exits with
+dnl status 0, which selects the replacement, when the string produced for
+dnl 0x12345678 matches neither byte order.  The program declares
+dnl everything it uses: without string.h, stdlib.h and an explicit return
+dnl type for main, a compiler that rejects implicit declarations fails
+dnl the build of the probe and a broken inet_ntoa is reported as fine.
+AC_CACHE_CHECK([for broken inet_ntoa],libreplace_cv_REPLACE_INET_NTOA,[
+AC_TRY_RUN([
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#ifdef HAVE_ARPA_INET_H
+#include <arpa/inet.h>
+#endif
+int main(void) { struct in_addr ip; ip.s_addr = 0x12345678;
+if (strcmp(inet_ntoa(ip),"18.52.86.120") &&
+    strcmp(inet_ntoa(ip),"120.86.52.18")) { exit(0); }
+exit(1);}],
+           libreplace_cv_REPLACE_INET_NTOA=yes,libreplace_cv_REPLACE_INET_NTOA=no,libreplace_cv_REPLACE_INET_NTOA=cross)])
+
+AC_CHECK_FUNCS(inet_ntoa,[],[libreplace_cv_REPLACE_INET_NTOA=yes])
+if test x"$libreplace_cv_REPLACE_INET_NTOA" = x"yes"; then
+    AC_DEFINE(REPLACE_INET_NTOA,1,[Whether inet_ntoa should be replaced])
+    LIBREPLACE_NETWORK_OBJS="${LIBREPLACE_NETWORK_OBJS} $libreplacedir/inet_ntoa.o"
+fi
+
+AC_CHECK_FUNCS(inet_aton,[],[LIBREPLACE_NETWORK_OBJS="${LIBREPLACE_NETWORK_OBJS} $libreplacedir/inet_aton.o"])
+
+AC_CHECK_FUNCS(inet_ntop,[],[LIBREPLACE_NETWORK_OBJS="${LIBREPLACE_NETWORK_OBJS} $libreplacedir/inet_ntop.o"])
+
+AC_CHECK_FUNCS(inet_pton,[],[LIBREPLACE_NETWORK_OBJS="${LIBREPLACE_NETWORK_OBJS} $libreplacedir/inet_pton.o"])
+
+dnl test for getaddrinfo/getnameinfo
+AC_CACHE_CHECK([for getaddrinfo],libreplace_cv_HAVE_GETADDRINFO,[
+AC_TRY_LINK([
+#include <sys/types.h>
+#if STDC_HEADERS
+#include <stdlib.h>
+#include <stddef.h>
+#endif
+#include <sys/socket.h>
+#include <netdb.h>],
+[
+struct sockaddr sa;
+struct addrinfo *ai = NULL;
+int ret = getaddrinfo(NULL, NULL, NULL, &ai);
+if (ret != 0) {
+       const char *es = gai_strerror(ret);
+}
+freeaddrinfo(ai);
+ret = getnameinfo(&sa, sizeof(sa),
+               NULL, 0,
+               NULL, 0, 0);
+
+],
+libreplace_cv_HAVE_GETADDRINFO=yes,libreplace_cv_HAVE_GETADDRINFO=no)])
+
+if test x"$libreplace_cv_HAVE_GETADDRINFO" = x"yes"; then
+       # getaddrinfo is broken on some AIX systems
+       # see bug 5910, use our replacements if we detect
+       # a broken system.
+       #
+       # The test program cannot be run when cross-compiling. In that
+       # case the result of the link test above is kept instead of
+       # letting configure abort, matching the other run tests in this
+       # file which all supply a cross-compiling action.
+       AC_TRY_RUN([
+               #include <stddef.h>
+               #include <sys/types.h>
+               #include <sys/socket.h>
+               #include <netdb.h>
+               int main(int argc, const char *argv[])
+               {
+                       struct addrinfo hints = {0,};
+                       struct addrinfo *ppres;
+                       const char hostname1[] = "0.0.0.0";
+                       const char hostname2[] = "127.0.0.1";
+                       const char hostname3[] = "::";
+                       hints.ai_socktype = SOCK_STREAM;
+                       hints.ai_family = AF_UNSPEC;
+                       hints.ai_flags =
+                               AI_NUMERICHOST|AI_PASSIVE|AI_ADDRCONFIG;
+                       /* Test for broken flag combination on AIX. */
+                       if (getaddrinfo(hostname1, NULL, &hints, &ppres) == EAI_BADFLAGS) {
+                               /* This fails on an IPv6-only box, but not with
+                                  the EAI_BADFLAGS error. */
+                               return 1;
+                       }
+                       if (getaddrinfo(hostname2, NULL, &hints, &ppres) == 0) {
+                               /* IPv4 lookup works - good enough. */
+                               return 0;
+                       }
+                       /* Uh-oh, no IPv4. Are we IPv6-only ? */
+                       return getaddrinfo(hostname3, NULL, &hints, &ppres) != 0 ? 1 : 0;
+               }],
+               libreplace_cv_HAVE_GETADDRINFO=yes,
+               libreplace_cv_HAVE_GETADDRINFO=no,
+               [:])
+fi
+
+if test x"$libreplace_cv_HAVE_GETADDRINFO" = x"yes"; then
+       AC_DEFINE(HAVE_GETADDRINFO,1,[Whether the system has getaddrinfo])
+       AC_DEFINE(HAVE_GETNAMEINFO,1,[Whether the system has getnameinfo])
+       AC_DEFINE(HAVE_FREEADDRINFO,1,[Whether the system has freeaddrinfo])
+       AC_DEFINE(HAVE_GAI_STRERROR,1,[Whether the system has gai_strerror])
+else
+       LIBREPLACE_NETWORK_OBJS="${LIBREPLACE_NETWORK_OBJS} $libreplacedir/getaddrinfo.o"
+fi
+
+AC_CHECK_HEADERS([ifaddrs.h])
+
+dnl Used when getifaddrs is not available
+AC_CHECK_MEMBERS([struct sockaddr.sa_len], 
+        [AC_DEFINE(HAVE_SOCKADDR_SA_LEN, 1, [Whether struct sockaddr has a sa_len member])],
+        [],
+        [#include <sys/socket.h>])
+
+dnl test for getifaddrs and freeifaddrs
+AC_CACHE_CHECK([for getifaddrs and freeifaddrs],libreplace_cv_HAVE_GETIFADDRS,[
+AC_TRY_LINK([
+#include <sys/types.h>
+#if STDC_HEADERS
+#include <stdlib.h>
+#include <stddef.h>
+#endif
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <ifaddrs.h>
+#include <netdb.h>],
+[
+struct ifaddrs *ifp = NULL;
+int ret = getifaddrs (&ifp);
+freeifaddrs(ifp);
+],
+libreplace_cv_HAVE_GETIFADDRS=yes,libreplace_cv_HAVE_GETIFADDRS=no)])
+if test x"$libreplace_cv_HAVE_GETIFADDRS" = x"yes"; then
+    AC_DEFINE(HAVE_GETIFADDRS,1,[Whether the system has getifaddrs])
+    AC_DEFINE(HAVE_FREEIFADDRS,1,[Whether the system has freeifaddrs])
+       AC_DEFINE(HAVE_STRUCT_IFADDRS,1,[Whether struct ifaddrs is available])
+fi
+
+##################
+# look for a method of finding the list of network interfaces
+iface=no;
+AC_CACHE_CHECK([for iface getifaddrs],libreplace_cv_HAVE_IFACE_GETIFADDRS,[
+AC_TRY_RUN([
+#define HAVE_IFACE_GETIFADDRS 1
+#define NO_CONFIG_H 1
+#define AUTOCONF_TEST 1
+#define SOCKET_WRAPPER_NOT_REPLACE
+#include "$libreplacedir/replace.c"
+#include "$libreplacedir/inet_ntop.c"
+#include "$libreplacedir/snprintf.c"
+#include "$libreplacedir/getifaddrs.c"
+#define getifaddrs_test main
+#include "$libreplacedir/test/getifaddrs.c"],
+           libreplace_cv_HAVE_IFACE_GETIFADDRS=yes,libreplace_cv_HAVE_IFACE_GETIFADDRS=no,libreplace_cv_HAVE_IFACE_GETIFADDRS=cross)])
+if test x"$libreplace_cv_HAVE_IFACE_GETIFADDRS" = x"yes"; then
+    iface=yes;AC_DEFINE(HAVE_IFACE_GETIFADDRS,1,[Whether iface getifaddrs is available])
+else
+       LIBREPLACE_NETWORK_OBJS="${LIBREPLACE_NETWORK_OBJS} $libreplacedir/getifaddrs.o"
+fi
+
+
+if test $iface = no; then
+AC_CACHE_CHECK([for iface AIX],libreplace_cv_HAVE_IFACE_AIX,[
+AC_TRY_RUN([
+#define HAVE_IFACE_AIX 1
+#define NO_CONFIG_H 1
+#define AUTOCONF_TEST 1
+#undef _XOPEN_SOURCE_EXTENDED
+#define SOCKET_WRAPPER_NOT_REPLACE
+#include "$libreplacedir/replace.c"
+#include "$libreplacedir/inet_ntop.c"
+#include "$libreplacedir/snprintf.c"
+#include "$libreplacedir/getifaddrs.c"
+#define getifaddrs_test main
+#include "$libreplacedir/test/getifaddrs.c"],
+           libreplace_cv_HAVE_IFACE_AIX=yes,libreplace_cv_HAVE_IFACE_AIX=no,libreplace_cv_HAVE_IFACE_AIX=cross)])
+if test x"$libreplace_cv_HAVE_IFACE_AIX" = x"yes"; then
+    iface=yes;AC_DEFINE(HAVE_IFACE_AIX,1,[Whether iface AIX is available])
+fi
+fi
+
+
+if test $iface = no; then
+AC_CACHE_CHECK([for iface ifconf],libreplace_cv_HAVE_IFACE_IFCONF,[
+AC_TRY_RUN([
+#define HAVE_IFACE_IFCONF 1
+#define NO_CONFIG_H 1
+#define AUTOCONF_TEST 1
+#define SOCKET_WRAPPER_NOT_REPLACE
+#include "$libreplacedir/replace.c"
+#include "$libreplacedir/inet_ntop.c"
+#include "$libreplacedir/snprintf.c"
+#include "$libreplacedir/getifaddrs.c"
+#define getifaddrs_test main
+#include "$libreplacedir/test/getifaddrs.c"],
+           libreplace_cv_HAVE_IFACE_IFCONF=yes,libreplace_cv_HAVE_IFACE_IFCONF=no,libreplace_cv_HAVE_IFACE_IFCONF=cross)])
+if test x"$libreplace_cv_HAVE_IFACE_IFCONF" = x"yes"; then
+    iface=yes;AC_DEFINE(HAVE_IFACE_IFCONF,1,[Whether iface ifconf is available])
+fi
+fi
+
+if test $iface = no; then
+AC_CACHE_CHECK([for iface ifreq],libreplace_cv_HAVE_IFACE_IFREQ,[
+AC_TRY_RUN([
+#define HAVE_IFACE_IFREQ 1
+#define NO_CONFIG_H 1
+#define AUTOCONF_TEST 1
+#define SOCKET_WRAPPER_NOT_REPLACE
+#include "$libreplacedir/replace.c"
+#include "$libreplacedir/inet_ntop.c"
+#include "$libreplacedir/snprintf.c"
+#include "$libreplacedir/getifaddrs.c"
+#define getifaddrs_test main
+#include "$libreplacedir/test/getifaddrs.c"],
+           libreplace_cv_HAVE_IFACE_IFREQ=yes,libreplace_cv_HAVE_IFACE_IFREQ=no,libreplace_cv_HAVE_IFACE_IFREQ=cross)])
+if test x"$libreplace_cv_HAVE_IFACE_IFREQ" = x"yes"; then
+    iface=yes;AC_DEFINE(HAVE_IFACE_IFREQ,1,[Whether iface ifreq is available])
+fi
+fi
+
+dnl Some old Linux systems have broken header files and
+dnl miss the IPV6_V6ONLY define in netinet/in.h,
+dnl but have it in linux/in6.h.
+dnl We can't include both files so we just check if the value
+dnl is defined and do the replacement in system/network.h
+AC_CACHE_CHECK([for IPV6_V6ONLY support],libreplace_cv_HAVE_IPV6_V6ONLY,[
+       AC_TRY_COMPILE([
+#include <stdlib.h> /* for NULL */
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netdb.h>
+#include <netinet/in.h>
+               ],
+               [
+#ifndef IPV6_V6ONLY
+#error no IPV6_V6ONLY
+#endif
+               ],[
+               libreplace_cv_HAVE_IPV6_V6ONLY=yes
+               ],[
+               libreplace_cv_HAVE_IPV6_V6ONLY=no
+               ])
+])
+if test x"$libreplace_cv_HAVE_IPV6_V6ONLY" != x"yes"; then
+   dnl test for IPV6_V6ONLY
+   AC_CACHE_CHECK([for IPV6_V6ONLY in linux/in6.h],libreplace_cv_HAVE_LINUX_IPV6_V6ONLY_26,[
+       AC_TRY_COMPILE([
+       #include <linux/in6.h>
+               ],
+               [
+       #if (IPV6_V6ONLY != 26)
+       #error no linux IPV6_V6ONLY
+       #endif
+               ],[
+               libreplace_cv_HAVE_LINUX_IPV6_V6ONLY_26=yes
+               ],[
+               libreplace_cv_HAVE_LINUX_IPV6_V6ONLY_26=no
+               ])
+       ])
+       if test x"$libreplace_cv_HAVE_LINUX_IPV6_V6ONLY_26" = x"yes"; then
+               AC_DEFINE(HAVE_LINUX_IPV6_V6ONLY_26,1,[Whether the system has IPV6_V6ONLY in linux/in6.h])
+       fi
+fi
+
+dnl test for ipv6
+AC_CACHE_CHECK([for ipv6 support],libreplace_cv_HAVE_IPV6,[
+       AC_TRY_LINK([
+#include <stdlib.h> /* for NULL */
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <netdb.h>
+#include <netinet/in.h>
+               ],
+               [
+struct sockaddr_storage sa_store;
+struct addrinfo *ai = NULL;
+struct in6_addr in6addr;
+int idx = if_nametoindex("iface1");
+int s = socket(AF_INET6, SOCK_STREAM, 0);
+int ret = getaddrinfo(NULL, NULL, NULL, &ai);
+if (ret != 0) {
+       const char *es = gai_strerror(ret);
+}
+freeaddrinfo(ai);
+{
+       int val = 1;
+       #ifdef HAVE_LINUX_IPV6_V6ONLY_26
+       #define IPV6_V6ONLY 26
+       #endif
+       ret = setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY,
+                        (const void *)&val, sizeof(val));
+}
+               ],[
+               libreplace_cv_HAVE_IPV6=yes
+               ],[
+               libreplace_cv_HAVE_IPV6=no
+               ])
+])
+if test x"$libreplace_cv_HAVE_IPV6" = x"yes"; then
+    AC_DEFINE(HAVE_IPV6,1,[Whether the system has IPv6 support])
+fi
+
+LIBS=$old_LIBS
+CPPFLAGS="$libreplace_SAVE_CPPFLAGS"
+
+dnl Compile-only probe for SO_PEERCRED and struct ucred.  AC_TRY_COMPILE
+dnl takes just includes, body, action-if-true and action-if-false; it has
+dnl no cross-compiling action, so none is passed.
+AC_CACHE_CHECK([for SO_PEERCRED],libreplace_cv_HAVE_PEERCRED,[
+AC_TRY_COMPILE([#include <sys/types.h>
+#include <sys/socket.h>],
+[struct ucred cred;
+ socklen_t cred_len;
+ int ret = getsockopt(0, SOL_SOCKET, SO_PEERCRED, &cred, &cred_len);
+],
+libreplace_cv_HAVE_PEERCRED=yes,libreplace_cv_HAVE_PEERCRED=no)])
+if test x"$libreplace_cv_HAVE_PEERCRED" = x"yes"; then
+    AC_DEFINE(HAVE_PEERCRED,1,[Whether we can use SO_PEERCRED to get socket credentials])
+fi
+
+AC_CACHE_CHECK([for getpeereid],libreplace_cv_HAVE_GETPEEREID,[
+AC_TRY_LINK([#include <sys/types.h>
+#include <unistd.h>],
+[uid_t uid; gid_t gid; int ret;
+ ret = getpeereid(0, &uid, &gid);
+],
+libreplace_cv_HAVE_GETPEEREID=yes,libreplace_cv_HAVE_GETPEEREID=no)])
+if test x"$libreplace_cv_HAVE_GETPEEREID" = xyes; then
+   AC_DEFINE(HAVE_GETPEEREID,1,
+            [Whether we have getpeereid to get socket credentials])
+fi
+
+LIBREPLACEOBJ="${LIBREPLACEOBJ} ${LIBREPLACE_NETWORK_OBJS}"
+
+echo "LIBREPLACE_NETWORK_CHECKS: END"
+]) dnl end AC_LIBREPLACE_NETWORK_CHECKS
diff --git a/ctdb/lib/replace/poll.c b/ctdb/lib/replace/poll.c
new file mode 100644 (file)
index 0000000..1105617
--- /dev/null
@@ -0,0 +1,139 @@
+/*
+   Unix SMB/CIFS implementation.
+   poll.c - poll wrapper
+
+   This file is based on code from libssh (LGPLv2.1+ at the time it
+   was downloaded), thus the following copyrights:
+
+   Copyright (c) 2009-2010 by Andreas Schneider <mail@cynapses.org>
+   Copyright (c) 2003-2009 by Aris Adamantiadis
+   Copyright (c) 2009 Aleksandar Kanchev
+   Copyright (C) Volker Lendecke 2011
+
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "replace.h"
+#include "system/select.h"
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+#ifdef HAVE_SYS_IOCTL_H
+#include <sys/ioctl.h>
+#endif
+
+
+/*
+ * poll() emulation on top of select().
+ *
+ * fds/nfds describe the descriptors to watch.  timeout is in
+ * milliseconds; a negative value waits without a time limit.
+ *
+ * Returns the number of entries whose revents is non-zero, 0 if the
+ * timeout expired first, or -1 with errno set (EFAULT for a NULL array
+ * with nfds != 0, otherwise whatever select() reported).
+ *
+ * Following the POSIX description of poll():
+ *  - an entry with a negative fd is ignored and gets revents = 0;
+ *  - every entry with a non-zero revents, including POLLHUP and
+ *    POLLNVAL, is counted in the return value.
+ * An entry with fd >= FD_SETSIZE cannot be handed to select(); it is
+ * reported as POLLNVAL and makes the call non-blocking so the caller
+ * sees it right away instead of after the timeout.
+ */
+int rep_poll(struct pollfd *fds, nfds_t nfds, int timeout)
+{
+       fd_set rfds, wfds, efds;
+       struct timeval tv, *ptv;
+       int max_fd;
+       int num_invalid;
+       int rc;
+       nfds_t i;
+
+       if ((fds == NULL) && (nfds != 0)) {
+               errno = EFAULT;
+               return -1;
+       }
+
+       FD_ZERO(&rfds);
+       FD_ZERO(&wfds);
+       FD_ZERO(&efds);
+
+       max_fd = 0;
+       num_invalid = 0;
+
+       /* compute fd_sets and find largest descriptor */
+       for (i = 0; i < nfds; i++) {
+               if (fds[i].fd < 0) {
+                       /* negative descriptors are skipped, not errors */
+                       fds[i].revents = 0;
+                       continue;
+               }
+               if (fds[i].fd >= FD_SETSIZE) {
+                       /* does not fit into an fd_set */
+                       fds[i].revents = POLLNVAL;
+                       num_invalid++;
+                       continue;
+               }
+
+               if (fds[i].events & (POLLIN | POLLRDNORM)) {
+                       FD_SET(fds[i].fd, &rfds);
+               }
+               if (fds[i].events & (POLLOUT | POLLWRNORM | POLLWRBAND)) {
+                       FD_SET(fds[i].fd, &wfds);
+               }
+               if (fds[i].events & (POLLPRI | POLLRDBAND)) {
+                       FD_SET(fds[i].fd, &efds);
+               }
+               if (fds[i].fd > max_fd &&
+                   (fds[i].events & (POLLIN | POLLOUT | POLLPRI |
+                                     POLLRDNORM | POLLRDBAND |
+                                     POLLWRNORM | POLLWRBAND))) {
+                       max_fd = fds[i].fd;
+               }
+       }
+
+       if (num_invalid > 0) {
+               /* something is already reportable: only peek at the rest */
+               tv.tv_sec = 0;
+               tv.tv_usec = 0;
+               ptv = &tv;
+       } else if (timeout < 0) {
+               ptv = NULL;
+       } else {
+               tv.tv_sec = timeout / 1000;
+               tv.tv_usec = (timeout % 1000) * 1000;
+               ptv = &tv;
+       }
+
+       rc = select(max_fd + 1, &rfds, &wfds, &efds, ptv);
+       if (rc < 0) {
+               return -1;
+       }
+
+       /* the POLLNVAL entries found above are part of the result */
+       rc = num_invalid;
+
+       for (i = 0; i < nfds; i++) {
+               if ((fds[i].fd < 0) || (fds[i].fd >= FD_SETSIZE)) {
+                       /* revents was already settled in the first loop */
+                       continue;
+               }
+
+               fds[i].revents = 0;
+
+               if (FD_ISSET(fds[i].fd, &rfds)) {
+                       /* the ioctl below must not leak its errno */
+                       int err = errno;
+                       int available = 0;
+                       int ret;
+
+                       /*
+                        * support for POLLHUP: a readable descriptor
+                        * for which FIONREAD fails or reports no
+                        * pending bytes is treated as hung up
+                        */
+                       ret = ioctl(fds[i].fd, FIONREAD, &available);
+                       if ((ret == -1) || (available == 0)) {
+                               fds[i].revents |= POLLHUP;
+                       } else {
+                               fds[i].revents |= fds[i].events
+                                       & (POLLIN | POLLRDNORM);
+                       }
+
+                       errno = err;
+               }
+               if (FD_ISSET(fds[i].fd, &wfds)) {
+                       fds[i].revents |= fds[i].events
+                               & (POLLOUT | POLLWRNORM | POLLWRBAND);
+               }
+               if (FD_ISSET(fds[i].fd, &efds)) {
+                       fds[i].revents |= fds[i].events
+                               & (POLLPRI | POLLRDBAND);
+               }
+               if (fds[i].revents != 0) {
+                       rc++;
+               }
+       }
+       return rc;
+}
diff --git a/ctdb/lib/replace/repdir.m4 b/ctdb/lib/replace/repdir.m4
new file mode 100644 (file)
index 0000000..682ab44
--- /dev/null
@@ -0,0 +1,81 @@
+#
+# Run test/os2_delete.c, with its test_readdir_os2_delete entry point
+# renamed to main, against the system readdir(). A successful run means
+# no replacement is needed; a failing run sets the cache variable to
+# "yes". When cross-compiling the test cannot run and the value becomes
+# "assuming not", which the later tests against "yes" treat as "no".
+#
+AC_CACHE_CHECK([for broken readdir],libreplace_cv_READDIR_NEEDED,[
+       AC_TRY_RUN([
+#define test_readdir_os2_delete main
+#include "$libreplacedir/test/os2_delete.c"],
+       [libreplace_cv_READDIR_NEEDED=no],
+       [libreplace_cv_READDIR_NEEDED=yes],
+       [libreplace_cv_READDIR_NEEDED="assuming not"])
+])
+
+AC_CHECK_FUNCS(dirfd)
+AC_HAVE_DECL(dirfd, [#include <dirent.h>])
+
+#
+# try to replace with getdirentries() if needed
+#
+if test x"$libreplace_cv_READDIR_NEEDED" = x"yes"; then
+AC_CHECK_FUNCS(getdirentries)
+AC_VERIFY_C_PROTOTYPE([long telldir(const DIR *dir)],
+       [
+       return 0;
+       ],[
+       AC_DEFINE(TELLDIR_TAKES_CONST_DIR, 1, [Whether telldir takes a const pointer])
+       ],[],[
+       #include <dirent.h>
+       ])
+
+AC_VERIFY_C_PROTOTYPE([int seekdir(DIR *dir, long ofs)],
+       [
+       return 0;
+       ],[
+       AC_DEFINE(SEEKDIR_RETURNS_INT, 1, [Whether seekdir returns an int])
+       ],[],[
+       #include <dirent.h>
+       ])
+AC_CACHE_CHECK([for replacing readdir using getdirentries()],libreplace_cv_READDIR_GETDIRENTRIES,[
+       AC_TRY_RUN([
+#define _LIBREPLACE_REPLACE_H
+#include "$libreplacedir/repdir_getdirentries.c"
+#define test_readdir_os2_delete main
+#include "$libreplacedir/test/os2_delete.c"],
+       [libreplace_cv_READDIR_GETDIRENTRIES=yes],
+       [libreplace_cv_READDIR_GETDIRENTRIES=no])
+])
+fi
+if test x"$libreplace_cv_READDIR_GETDIRENTRIES" = x"yes"; then
+       AC_DEFINE(REPLACE_READDIR,1,[replace readdir])
+       AC_DEFINE(REPLACE_READDIR_GETDIRENTRIES,1,[replace readdir using getdirentries()])
+       LIBREPLACEOBJ="${LIBREPLACEOBJ} $libreplacedir/repdir_getdirentries.o"
+       libreplace_cv_READDIR_NEEDED=no
+fi
+
+#
+# try to replace with getdents() if needed
+#
+if test x"$libreplace_cv_READDIR_NEEDED" = x"yes"; then
+AC_CHECK_FUNCS(getdents)
+AC_CACHE_CHECK([for replacing readdir using getdents()],libreplace_cv_READDIR_GETDENTS,[
+       AC_TRY_RUN([
+#define _LIBREPLACE_REPLACE_H
+#error _donot_use_getdents_replacement_anymore
+#include "$libreplacedir/repdir_getdents.c"
+#define test_readdir_os2_delete main
+#include "$libreplacedir/test/os2_delete.c"],
+       [libreplace_cv_READDIR_GETDENTS=yes],
+       [libreplace_cv_READDIR_GETDENTS=no])
+])
+fi
+if test x"$libreplace_cv_READDIR_GETDENTS" = x"yes"; then
+       AC_DEFINE(REPLACE_READDIR,1,[replace readdir])
+       AC_DEFINE(REPLACE_READDIR_GETDENTS,1,[replace readdir using getdents()])
+       LIBREPLACEOBJ="${LIBREPLACEOBJ} $libreplacedir/repdir_getdents.o"
+       libreplace_cv_READDIR_NEEDED=no
+fi
+
+AC_MSG_CHECKING([a usable readdir()])
+if test x"$libreplace_cv_READDIR_NEEDED" = x"yes"; then
+       AC_MSG_RESULT(no)
+       AC_MSG_WARN([the provided readdir() is broken])
+else
+       AC_MSG_RESULT(yes)
+fi
diff --git a/ctdb/lib/replace/repdir_getdents.c b/ctdb/lib/replace/repdir_getdents.c
new file mode 100644 (file)
index 0000000..afc634a
--- /dev/null
@@ -0,0 +1,166 @@
+/* 
+   Unix SMB/CIFS implementation.
+
+   Copyright (C) Andrew Tridgell 2005
+
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+  a replacement for opendir/readdir/telldir/seekdir/closedir for BSD systems
+
+  This is needed because the existing directory handling in FreeBSD
+  and OpenBSD (and possibly NetBSD) doesn't correctly handle unlink()
+  on files in a directory where telldir() has been used. On a block
+  boundary it will occasionally miss a file when seekdir() is used to
+  return to a position previously recorded with telldir().
+
+  This also fixes a severe performance and memory usage problem with
+  telldir() on BSD systems. Each call to telldir() in BSD adds an
+  entry to a linked list, and those entries are cleaned up on
+  closedir(). This means with a large directory closedir() can take an
+  arbitrary amount of time, causing network timeouts as millions of
+  telldir() entries are freed
+
+  Note! This replacement code is not portable. It relies on getdents()
+  always leaving the file descriptor at a seek offset that is a
+  multiple of DIR_BUF_SIZE. If the code detects that this doesn't
+  happen then it will abort(). It also does not handle directories
+  with offsets larger than can be stored in a long.
+
+  This code is available under other free software licenses as
+  well. Contact the author.
+*/
+
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <dirent.h>
+
+#define DIR_BUF_BITS 9
+#define DIR_BUF_SIZE (1<<DIR_BUF_BITS)
+
+/*
+  private state behind the DIR pointer handed out by opendir(); the
+  other functions in this file cast the DIR pointer back to this type
+*/
+struct dir_buf {
+       /* descriptor returned by open() on the directory */
+       int fd;
+       /* nbytes: result of the last getdents() into buf
+          ofs: offset within buf of the next entry to return */
+       int nbytes, ofs;
+       /* descriptor offset recorded just before buf was last filled */
+       off_t seekpos;
+       /* raw directory entries as returned by getdents() */
+       char buf[DIR_BUF_SIZE];
+};
+
+/*
+  open the directory dname and return a handle for the other functions
+  in this file
+
+  returns NULL with errno set on failure: ENOMEM if the state cannot be
+  allocated, ENOTDIR if dname is not a directory, otherwise the errno
+  left by open() or fstat(). That errno is saved across the cleanup
+  calls so that close() and free() cannot overwrite it.
+*/
+DIR *opendir(const char *dname)
+{
+       struct dir_buf *d;
+       struct stat sb;
+       int saved_errno;
+
+       d = malloc(sizeof(*d));
+       if (d == NULL) {
+               errno = ENOMEM;
+               return NULL;
+       }
+       d->fd = open(dname, O_RDONLY);
+       if (d->fd == -1) {
+               saved_errno = errno;
+               free(d);
+               errno = saved_errno;
+               return NULL;
+       }
+       if (fstat(d->fd, &sb) < 0) {
+               saved_errno = errno;
+               goto fail;
+       }
+       if (!S_ISDIR(sb.st_mode)) {
+               saved_errno = ENOTDIR;
+               goto fail;
+       }
+       /* start with an empty buffer so the first readdir() fills it */
+       d->ofs = 0;
+       d->seekpos = 0;
+       d->nbytes = 0;
+       return (DIR *)d;
+
+fail:
+       close(d->fd);
+       free(d);
+       errno = saved_errno;
+       return NULL;
+}
+
+/*
+  return the next entry of the directory, or NULL once getdents() has
+  nothing more to deliver
+*/
+struct dirent *readdir(DIR *dir)
+{
+       struct dir_buf *state = (struct dir_buf *)dir;
+       struct dirent *entry;
+
+       if (state->nbytes <= state->ofs) {
+               /* buffer used up: note where we are, then refill it */
+               state->seekpos = lseek(state->fd, 0, SEEK_CUR);
+               state->nbytes = getdents(state->fd, state->buf, DIR_BUF_SIZE);
+               state->ofs = 0;
+               if (state->nbytes <= 0) {
+                       return NULL;
+               }
+       }
+
+       entry = (struct dirent *)(state->buf + state->ofs);
+       state->ofs += entry->d_reclen;
+       return entry;
+}
+
+/*
+  return a position cookie for seekdir(): the descriptor offset of the
+  current buffer plus the offset of the next entry inside that buffer
+*/
+long telldir(DIR *dir)
+{
+       struct dir_buf *state = (struct dir_buf *)dir;
+
+       if (state->nbytes <= state->ofs) {
+               /* nothing buffered: report the descriptor position itself */
+               state->nbytes = 0;
+               state->ofs = 0;
+               state->seekpos = lseek(state->fd, 0, SEEK_CUR);
+       }
+
+       /* the cookie only works while seekpos is a multiple of
+          DIR_BUF_SIZE; give up loudly if that ever stops being true */
+       if ((state->seekpos & (DIR_BUF_SIZE-1)) != 0) {
+               abort();
+       }
+
+       return state->seekpos + state->ofs;
+}
+
+/*
+  move to a position previously returned by telldir()
+
+  the bits of ofs above DIR_BUF_BITS select the descriptor offset to
+  seek to, the low DIR_BUF_BITS bits are the offset inside the buffer
+  read from there
+*/
+void seekdir(DIR *dir, long ofs)
+{
+       struct dir_buf *d = (struct dir_buf *)dir;
+       /* seek to the start of the buffer-sized block and load it */
+       d->seekpos = lseek(d->fd, ofs & ~(DIR_BUF_SIZE-1), SEEK_SET);
+       d->nbytes = getdents(d->fd, d->buf, DIR_BUF_SIZE);
+       d->ofs = 0;
+       /* step over entries until the in-buffer offset is reached;
+          stop early if readdir() has nothing more to return */
+       while (d->ofs < (ofs & (DIR_BUF_SIZE-1))) {
+               if (readdir(dir) == NULL) break;
+       }
+}
+
+/*
+  go back to the first entry; implemented as seekdir() to position 0
+*/
+void rewinddir(DIR *dir)
+{
+       seekdir(dir, 0);
+}
+
+/*
+  close the directory descriptor and release the handle
+
+  returns the result of close(). The handle is freed even when close()
+  fails, so a failure no longer leaks the struct dir_buf; errno from
+  close() is preserved across the free().
+*/
+int closedir(DIR *dir)
+{
+       struct dir_buf *d = (struct dir_buf *)dir;
+       int r = close(d->fd);
+       int saved_errno = errno;
+
+       free(d);
+       errno = saved_errno;
+       return r;
+}
+
+#ifndef dirfd
+/* darn, this is a macro on some systems. */
+/*
+  return the file descriptor stored in the handle by opendir();
+  only compiled when dirfd is not already defined as a macro
+*/
+int dirfd(DIR *dir)
+{
+       struct dir_buf *d = (struct dir_buf *)dir;
+       return d->fd;
+}
+#endif
diff --git a/ctdb/lib/replace/repdir_getdirentries.c b/ctdb/lib/replace/repdir_getdirentries.c
new file mode 100644 (file)
index 0000000..197e593
--- /dev/null
@@ -0,0 +1,183 @@
+/* 
+   Unix SMB/CIFS implementation.
+
+   Copyright (C) Andrew Tridgell 2005
+
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+  a replacement for opendir/readdir/telldir/seekdir/closedir for BSD
+  systems using getdirentries
+
+  This is needed because the existing directory handling in FreeBSD
+  and OpenBSD (and possibly NetBSD) doesn't correctly handle unlink()
+  on files in a directory where telldir() has been used. On a block
+  boundary it will occasionally miss a file when seekdir() is used to
+  return to a position previously recorded with telldir().
+
+  This also fixes a severe performance and memory usage problem with
+  telldir() on BSD systems. Each call to telldir() in BSD adds an
+  entry to a linked list, and those entries are cleaned up on
+  closedir(). This means with a large directory closedir() can take an
+  arbitrary amount of time, causing network timeouts as millions of
+  telldir() entries are freed
+
+  Note! This replacement code is not portable. It relies on
+  getdirentries() always leaving the file descriptor at a seek offset
+  that is a multiple of DIR_BUF_SIZE. If the code detects that this
+  doesn't happen then it will abort(). It also does not handle
+  directories with offsets larger than can be stored in a long.
+
+  This code is available under other free software licenses as
+  well. Contact the author.
+*/
+
+#include "replace.h"
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <dirent.h>
+
+/* size of the private read buffer: 1<<9 = 512 bytes. telldir()/seekdir()
+   use the low DIR_BUF_BITS bits of a position as the in-buffer offset. */
+#define DIR_BUF_BITS 9
+#define DIR_BUF_SIZE (1<<DIR_BUF_BITS)
+
+/* state hidden behind the DIR * handed back by opendir() */
+struct dir_buf {
+       /* descriptor of the open directory */
+       int fd;
+       /* nbytes: result of the last buffer fill; ofs: cursor inside buf */
+       int nbytes, ofs;
+       /* position associated with the buffered block */
+       off_t seekpos;
+       /* raw directory records */
+       char buf[DIR_BUF_SIZE];
+};
+
+/*
+  open dname read-only, verify that it is a directory and return a
+  freshly allocated dir_buf disguised as a DIR *. Returns NULL on
+  failure; errno is ENOMEM for allocation failure and ENOTDIR when the
+  path is not a directory.
+*/
+DIR *opendir(const char *dname)
+{
+       struct stat st;
+       int err = 0;
+       struct dir_buf *state = malloc(sizeof(*state));
+
+       if (state == NULL) {
+               errno = ENOMEM;
+               return NULL;
+       }
+
+       state->fd = open(dname, O_RDONLY);
+       if (state->fd == -1) {
+               goto fail_free;
+       }
+       if (fstat(state->fd, &st) < 0) {
+               goto fail_close;
+       }
+       if (!S_ISDIR(st.st_mode)) {
+               err = ENOTDIR;
+               goto fail_close;
+       }
+
+       /* start with an empty buffer positioned at the beginning */
+       state->seekpos = 0;
+       state->nbytes = 0;
+       state->ofs = 0;
+       return (DIR *)state;
+
+fail_close:
+       close(state->fd);
+fail_free:
+       free(state);
+       if (err != 0) {
+               /* set last so close()/free() cannot overwrite it */
+               errno = err;
+       }
+       return NULL;
+}
+
+/*
+  return the next directory entry, refilling the private buffer with
+  getdirentries() whenever it has been fully consumed. Returns NULL
+  once getdirentries() yields no more data (end of directory or error).
+*/
+struct dirent *readdir(DIR *dir)
+{
+       struct dir_buf *state = (struct dir_buf *)dir;
+       struct dirent *entry;
+
+       if (state->nbytes <= state->ofs) {
+               long base;
+
+               /* buffer used up: fetch the next block and remember the
+                  base position getdirentries() reports for it */
+               state->nbytes = getdirentries(state->fd, state->buf, DIR_BUF_SIZE, &base);
+               state->seekpos = base;
+               state->ofs = 0;
+               if (state->nbytes <= 0) {
+                       return NULL;
+               }
+       }
+
+       /* hand out the record under the cursor and step past it */
+       entry = (struct dirent *)(state->buf + state->ofs);
+       state->ofs += entry->d_reclen;
+       return entry;
+}
+
+/*
+  report the current position as seekpos (position of the buffered
+  block) plus ofs (byte offset inside the block). The signature follows
+  the platform's prototype via TELLDIR_TAKES_CONST_DIR; note that the
+  const variant still modifies the dir_buf through a cast.
+*/
+#ifdef TELLDIR_TAKES_CONST_DIR
+long telldir(const DIR *dir)
+#else
+long telldir(DIR *dir)
+#endif
+{
+       struct dir_buf *d = (struct dir_buf *)dir;
+       /* buffer fully consumed: take the position straight from the fd */
+       if (d->ofs >= d->nbytes) {
+               d->seekpos = lseek(d->fd, 0, SEEK_CUR);
+               d->ofs = 0;
+               d->nbytes = 0;
+       }
+       /* this relies on seekpos always being a multiple of
+          DIR_BUF_SIZE. Is that always true on BSD systems? */
+       if (d->seekpos & (DIR_BUF_SIZE-1)) {
+               abort();
+       }
+       return d->seekpos + d->ofs;
+}
+
+/*
+  reposition to a value previously returned by telldir(): seek the fd to
+  the DIR_BUF_SIZE-aligned block, reload it, then skip entries until the
+  in-block offset is reached. The return type follows the platform's
+  prototype via SEEKDIR_RETURNS_INT.
+*/
+#ifdef SEEKDIR_RETURNS_INT
+int seekdir(DIR *dir, long ofs)
+#else
+void seekdir(DIR *dir, long ofs)
+#endif
+{
+       struct dir_buf *d = (struct dir_buf *)dir;
+       /* receives the base position from getdirentries(); not used here */
+       long pos;
+       d->seekpos = lseek(d->fd, ofs & ~(DIR_BUF_SIZE-1), SEEK_SET);
+       d->nbytes = getdirentries(d->fd, d->buf, DIR_BUF_SIZE, &pos);
+       d->ofs = 0;
+       /* readdir() advances d->ofs; stop early if the directory ends */
+       while (d->ofs < (ofs & (DIR_BUF_SIZE-1))) {
+               if (readdir(dir) == NULL) break;
+       }
+#ifdef SEEKDIR_RETURNS_INT
+       /* NOTE(review): -1 is returned unconditionally, even when the
+          seek worked - confirm what callers on such platforms expect */
+       return -1;
+#endif
+}
+
+/* restart the directory stream from the beginning; implemented as a
+   seekdir() to position 0 */
+void rewinddir(DIR *dir)
+{
+       seekdir(dir, 0);
+}
+
+/*
+  close the underlying descriptor and release the buffer. If close()
+  fails its result is returned and the buffer is NOT freed.
+*/
+int closedir(DIR *dir)
+{
+       struct dir_buf *state = (struct dir_buf *)dir;
+       int rc = close(state->fd);
+
+       if (rc == 0) {
+               free(state);
+       }
+       return rc;
+}
+
+#ifndef dirfd
+/* darn, this is a macro on some systems. */
+/* return the file descriptor stored in the dir_buf behind this DIR
+   handle; only compiled when dirfd is not already a macro */
+int dirfd(DIR *dir)
+{
+       struct dir_buf *d = (struct dir_buf *)dir;
+       return d->fd;
+}
+#endif
+
+
diff --git a/ctdb/lib/replace/replace-test.h b/ctdb/lib/replace/replace-test.h
new file mode 100644 (file)
index 0000000..ed8e75e
--- /dev/null
@@ -0,0 +1,9 @@
+#ifndef __LIB_REPLACE_REPLACE_TEST_H__
+#define __LIB_REPLACE_REPLACE_TEST_H__
+
+int libreplace_test_strptime(void);
+int test_readdir_os2_delete(void);
+int getifaddrs_test(void);
+
+#endif /* __LIB_REPLACE_REPLACE_TEST_H__ */
+
diff --git a/ctdb/lib/replace/replace-testsuite.h b/ctdb/lib/replace/replace-testsuite.h
new file mode 100644 (file)
index 0000000..b28dbec
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef __LIB_REPLACE_REPLACE_TESTSUITE_H__
+#define __LIB_REPLACE_REPLACE_TESTSUITE_H__
+
+#include <stdbool.h>
+struct torture_context;
+
+bool torture_local_replace(struct torture_context *ctx);
+
+#endif /* __LIB_REPLACE_REPLACE_TESTSUITE_H__ */
+
diff --git a/ctdb/lib/replace/replace.c b/ctdb/lib/replace/replace.c
new file mode 100644 (file)
index 0000000..37edb31
--- /dev/null
@@ -0,0 +1,902 @@
+/* 
+   Unix SMB/CIFS implementation.
+   replacement routines for broken systems
+   Copyright (C) Andrew Tridgell 1992-1998
+   Copyright (C) Jelmer Vernooij 2005-2008
+   Copyright (C) Matthieu Patou  2010
+
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+
+#include "system/filesys.h"
+#include "system/time.h"
+#include "system/network.h"
+#include "system/passwd.h"
+#include "system/syslog.h"
+#include "system/locale.h"
+#include "system/wait.h"
+
+#ifdef _WIN32
+#define mkdir(d,m) _mkdir(d)
+#endif
+
+/* empty function that is always compiled, whatever HAVE_* macros are
+   set; the prototype directly above it avoids a missing-prototype
+   warning */
+void replace_dummy(void);
+void replace_dummy(void) {}
+
+#ifndef HAVE_FTRUNCATE
+ /*******************************************************************
+ftruncate for operating systems that don't have it
+
+f is the descriptor and l the new length. Uses chsize() when
+HAVE_CHSIZE is set, otherwise the F_FREESP fcntl; the build fails if
+neither is available. Returns whatever the chosen primitive returns.
+********************************************************************/
+int rep_ftruncate(int f, off_t l)
+{
+#ifdef HAVE_CHSIZE
+      return chsize(f,l);
+#elif defined(F_FREESP)
+      struct  flock   fl;
+
+      /* region passed to F_FREESP: starts at l, l_len 0, l_whence 0 */
+      fl.l_whence = 0;
+      fl.l_len = 0;
+      fl.l_start = l;
+      fl.l_type = F_WRLCK;
+      return fcntl(f, F_FREESP, &fl);
+#else
+#error "you must have a ftruncate function"
+#endif
+}
+#endif /* HAVE_FTRUNCATE */
+
+
+#ifndef HAVE_STRLCPY
+/* like strncpy but does not 0 fill the buffer and always null
+   terminates. bufsize is the size of the destination buffer.
+
+   Returns strlen(s), i.e. the length the result would have had with
+   unlimited space, so a return value >= bufsize means truncation.
+   With bufsize == 0 nothing is written to d, but the full source
+   length is still returned so that the caller can detect truncation. */
+size_t rep_strlcpy(char *d, const char *s, size_t bufsize)
+{
+       size_t len = strlen(s);
+       size_t ret = len;
+
+       if (bufsize == 0) return ret;
+       if (len >= bufsize) len = bufsize-1;
+       memcpy(d, s, len);
+       d[len] = 0;
+       return ret;
+}
+#endif
+
+#ifndef HAVE_STRLCAT
+/* append s to the string already in d without ever writing past
+   bufsize bytes; the result is null terminated whenever something is
+   appended. Returns strlen(d) + strlen(s) as measured on entry. If d
+   already fills the buffer, d is left untouched. */
+size_t rep_strlcat(char *d, const char *s, size_t bufsize)
+{
+       size_t dlen = strlen(d);
+       size_t slen = strlen(s);
+       size_t total = dlen + slen;
+       size_t ncopy = slen;
+
+       if (total >= bufsize) {
+               if (dlen + 1 > bufsize) {
+                       /* no room at all, not even for the terminator */
+                       return total;
+               }
+               /* only part of s fits */
+               ncopy = bufsize - (dlen + 1);
+       }
+       if (ncopy != 0) {
+               memcpy(d + dlen, s, ncopy);
+               d[dlen + ncopy] = 0;
+       }
+       return total;
+}
+#endif
+
+#ifndef HAVE_MKTIME
+/*******************************************************************
+a mktime() replacement for those who don't have it - contributed by 
+C.A. Lademann <cal@zls.com>
+Corrections by richard.kettlewell@kewill.com
+
+Converts the broken-down time in *t to seconds since 1970. Years
+before 1970 (tm_year < 70) yield (time_t)-1. No time-zone offset is
+applied while computing the result. On success the fields of *t are
+overwritten with what localtime() reports for the computed value.
+********************************************************************/
+
+/* parenthesised so the macros stay correct inside any expression */
+#define  MINUTE  60
+#define  HOUR    (60*MINUTE)
+#define  DAY     (24*HOUR)
+#define  YEAR    (365*DAY)
+time_t rep_mktime(struct tm *t)
+{
+  struct tm       *u;
+  time_t  epoch = 0;
+  int n;
+  int             mon [] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 },
+  y, m, i;
+
+  if(t->tm_year < 70)
+    return((time_t)-1);
+
+  /* whole years since 1970 plus one day per leap year that lies
+     completely in the past. The arithmetic is done in time_t: in plain
+     int the year term overflows for dates from 2039 onwards. */
+  n = t->tm_year + 1900 - 1;
+  epoch = (time_t)(t->tm_year - 70) * YEAR +
+    (time_t)((n / 4 - n / 100 + n / 400) - (1969 / 4 - 1969 / 100 + 1969 / 400)) * DAY;
+
+  y = t->tm_year + 1900;
+  m = 0;
+
+  /* add the completed months of the current year; tm_mon values above
+     11 roll over into the following year(s) */
+  for(i = 0; i < t->tm_mon; i++) {
+    epoch += mon [m] * DAY;
+    if(m == 1 && y % 4 == 0 && (y % 100 != 0 || y % 400 == 0))
+      epoch += DAY;
+
+    if(++m > 11) {
+      m = 0;
+      y++;
+    }
+  }
+
+  epoch += (t->tm_mday - 1) * DAY;
+  epoch += t->tm_hour * HOUR + t->tm_min * MINUTE + t->tm_sec;
+
+  /* normalise the caller's struct from the computed value */
+  if((u = localtime(&epoch)) != NULL) {
+    t->tm_sec = u->tm_sec;
+    t->tm_min = u->tm_min;
+    t->tm_hour = u->tm_hour;
+    t->tm_mday = u->tm_mday;
+    t->tm_mon = u->tm_mon;
+    t->tm_year = u->tm_year;
+    t->tm_wday = u->tm_wday;
+    t->tm_yday = u->tm_yday;
+    t->tm_isdst = u->tm_isdst;
+  }
+
+  return(epoch);
+}
+#endif /* !HAVE_MKTIME */
+
+
+#ifndef HAVE_INITGROUPS
+/****************************************************************************
+ some systems don't have an initgroups call 
+
+ Builds a group list consisting of id followed by every group from the
+ group database that lists name as a member, then installs it with
+ setgroups(). At most NGROUPS_MAX entries are collected. Returns the
+ result of setgroups(), or -1 with errno set to ENOMEM (allocation
+ failure) or ENOSYS (no setgroups() available).
+****************************************************************************/
+int rep_initgroups(char *name, gid_t id)
+{
+#ifndef HAVE_SETGROUPS
+       /* yikes! no SETGROUPS or INITGROUPS? how can this work? */
+       errno = ENOSYS;
+       return -1;
+#else /* HAVE_SETGROUPS */
+
+#include <grp.h>
+
+       gid_t *grouplst = NULL;
+       int max_gr = NGROUPS_MAX;
+       int ret;
+       int    i,j;
+       struct group *g;
+
+       if((grouplst = malloc(sizeof(gid_t) * max_gr)) == NULL) {
+               errno = ENOMEM;
+               return -1;
+       }
+
+       grouplst[0] = id;
+       i = 1;
+       /* start from the first entry even if the caller has already
+          been iterating over the group database */
+       setgrent();
+       while (i < max_gr && (g = getgrent()) != NULL) {
+               if (g->gr_gid == id)
+                       continue;
+               /* the member list ends at a NULL pointer; an empty name
+                  is treated as the end of the list as well */
+               for (j = 0; g->gr_mem[j] != NULL && g->gr_mem[j][0] != '\0'; j++) {
+                       if (strcmp(name, g->gr_mem[j]) == 0) {
+                               grouplst[i] = g->gr_gid;
+                               i++;
+                               break;
+                       }
+               }
+       }
+       endgrent();
+       ret = setgroups(i, grouplst);
+       free(grouplst);
+       return ret;
+#endif /* HAVE_SETGROUPS */
+}
+#endif /* HAVE_INITGROUPS */
+
+
+#ifndef HAVE_MEMMOVE
+/*******************************************************************
+safely copies memory, ensuring no overlap problems.
+this is only used if the machine does not have its own memmove().
+this is not the fastest algorithm in town, but it will do for our
+needs.
+
+copies size bytes from src to dest and returns dest. Non-overlapping
+regions go straight to memcpy(); overlapping regions are copied
+forwards when dest is below src and backwards otherwise, an int at a
+time when both addresses, their distance and size allow it.
+********************************************************************/
+void *rep_memmove(void *dest,const void *src,int size)
+{
+       unsigned long d,s;
+       int i;
+       /* nothing to do for a self copy or an empty copy */
+       if (dest==src || !size) return(dest);
+
+       /* addresses as integers so that the ranges can be compared */
+       d = (unsigned long)dest;
+       s = (unsigned long)src;
+
+       if ((d >= (s+size)) || (s >= (d+size))) {
+               /* no overlap */
+               memcpy(dest,src,size);
+               return(dest);
+       }
+
+       if (d < s) {
+               /* we can forward copy */
+               if (s-d >= sizeof(int) && 
+                   !(s%sizeof(int)) && 
+                   !(d%sizeof(int)) && 
+                   !(size%sizeof(int))) {
+                       /* do it all as words */
+                       int *idest = (int *)dest;
+                       int *isrc = (int *)src;
+                       /* size now counts ints rather than bytes */
+                       size /= sizeof(int);
+                       for (i=0;i<size;i++) idest[i] = isrc[i];
+               } else {
+                       /* simplest */
+                       char *cdest = (char *)dest;
+                       char *csrc = (char *)src;
+                       for (i=0;i<size;i++) cdest[i] = csrc[i];
+               }
+       } else {
+               /* must backward copy */
+               if (d-s >= sizeof(int) && 
+                   !(s%sizeof(int)) && 
+                   !(d%sizeof(int)) && 
+                   !(size%sizeof(int))) {
+                       /* do it all as words */
+                       int *idest = (int *)dest;
+                       int *isrc = (int *)src;
+                       /* size now counts ints rather than bytes */
+                       size /= sizeof(int);
+                       for (i=size-1;i>=0;i--) idest[i] = isrc[i];
+               } else {
+                       /* simplest */
+                       char *cdest = (char *)dest;
+                       char *csrc = (char *)src;
+                       for (i=size-1;i>=0;i--) cdest[i] = csrc[i];
+               }      
+       }
+       return(dest);
+}
+#endif /* HAVE_MEMMOVE */
+
+#ifndef HAVE_STRDUP
+/****************************************************************************
+return a malloc()ed copy of s, or NULL if s is NULL or memory runs out
+****************************************************************************/
+char *rep_strdup(const char *s)
+{
+       char *copy;
+       size_t nbytes;
+
+       if (s == NULL) {
+               return NULL;
+       }
+
+       /* the terminator is part of the copy */
+       nbytes = strlen(s) + 1;
+       copy = (char *)malloc(nbytes);
+       if (copy != NULL) {
+               memcpy(copy, s, nbytes);
+       }
+       return copy;
+}
+#endif /* HAVE_STRDUP */
+
+#ifndef HAVE_SETLINEBUF
+/* switch stream to line buffering via setvbuf(_IOLBF), passing a NULL
+   buffer and size 0 */
+void rep_setlinebuf(FILE *stream)
+{
+       setvbuf(stream, (char *)NULL, _IOLBF, 0);
+}
+#endif /* HAVE_SETLINEBUF */
+
+#ifndef HAVE_VSYSLOG
+#ifdef HAVE_SYSLOG
+/* vsyslog() built on syslog(): the message is formatted into a
+   temporary heap string and logged through a fixed "%s" format. If
+   formatting fails the message is silently dropped. */
+void rep_vsyslog (int facility_priority, const char *format, va_list arglist)
+{
+       char *msg = NULL;
+
+       /* the contents of msg are not reliable when vasprintf() reports
+          an error, so go by its return value first */
+       if (vasprintf(&msg, format, arglist) < 0 || msg == NULL) {
+               return;
+       }
+       syslog(facility_priority, "%s", msg);
+       free(msg);
+}
+#endif /* HAVE_SYSLOG */
+#endif /* HAVE_VSYSLOG */
+
+#ifndef HAVE_STRNLEN
+/**
+ Length of s, looking at no more than max bytes: returns the index of
+ the first '\0', or max if none is found within the first max bytes.
+**/
+ size_t rep_strnlen(const char *s, size_t max)
+{
+        size_t len = 0;
+
+        while (len < max && s[len] != '\0') {
+                len++;
+        }
+        return len;
+}
+#endif
+  
+#ifndef HAVE_STRNDUP
+/**
+ Duplicate at most n bytes of s into a new, always null terminated,
+ malloc()ed string. Returns NULL if the allocation fails.
+**/
+char *rep_strndup(const char *s, size_t n)
+{
+       size_t ncopy = strnlen(s, n);
+       char *copy = malloc(ncopy + 1);
+
+       if (copy == NULL) {
+               return NULL;
+       }
+       memcpy(copy, s, ncopy);
+       copy[ncopy] = 0;
+       return copy;
+}
+#endif
+
+#if !defined(HAVE_WAITPID) && defined(HAVE_WAIT4)
+/* waitpid() expressed through wait4(), passing NULL for the rusage
+   argument */
+int rep_waitpid(pid_t pid,int *status,int options)
+{
+  return wait4(pid, status, options, NULL);
+}
+#endif
+
+#ifndef HAVE_SETEUID
+/* set the effective uid through setresuid(), passing -1 for the other
+   two ids; without setresuid() fail with ENOSYS */
+int rep_seteuid(uid_t euid)
+{
+#ifdef HAVE_SETRESUID
+       return setresuid(-1, euid, -1);
+#else
+       errno = ENOSYS;
+       return -1;
+#endif
+}
+#endif
+
+#ifndef HAVE_SETEGID
+/* set the effective gid through setresgid(), passing -1 for the other
+   two ids; without setresgid() fail with ENOSYS */
+int rep_setegid(gid_t egid)
+{
+#ifdef HAVE_SETRESGID
+       return setresgid(-1, egid, -1);
+#else
+       errno = ENOSYS;
+       return -1;
+#endif
+}
+#endif
+
+/*******************************************************************
+os/2 also doesn't have chroot
+
+stub: dname is ignored, always fails with errno set to ENOSYS
+********************************************************************/
+#ifndef HAVE_CHROOT
+int rep_chroot(const char *dname)
+{
+       errno = ENOSYS;
+       return -1;
+}
+#endif
+
+/*****************************************************************
+ Possibly replace mkstemp if it is broken.
+
+ template is rewritten in place by mktemp(); the resulting name is then
+ opened with O_CREAT|O_EXCL|O_RDWR and mode 0600. Returns the
+ descriptor from open(), or -1 if mktemp() left an empty name.
+*****************************************************************/  
+
+#ifndef HAVE_SECURE_MKSTEMP
+int rep_mkstemp(char *template)
+{
+       /* have a reasonable go at emulating it. Hope that
+          the system mktemp() isn't completely hopeless */
+       mktemp(template);
+       /* an empty name means that mktemp() gave up */
+       if (template[0] == 0)
+               return -1;
+       /* O_EXCL makes the open fail if the name already exists */
+       return open(template, O_CREAT|O_EXCL|O_RDWR, 0600);
+}
+#endif
+
+#ifndef HAVE_MKDTEMP
+/* create a uniquely named directory: mktemp() rewrites template in
+   place and the result is created with mode 0700. Returns the name on
+   success and NULL if either step fails. */
+char *rep_mkdtemp(char *template)
+{
+       char *dname = mktemp(template);
+
+       if (!dname) {
+               return NULL;
+       }
+       if (mkdir(dname, 0700) < 0) {
+               return NULL;
+       }
+       return dname;
+}
+#endif
+
+/*****************************************************************
+ Watch out: this is not thread safe.
+
+ Emulates pread() with lseek() followed by read(), so the file offset
+ of the descriptor is moved by the call. Returns -1 if the seek does
+ not land on the requested offset, otherwise the result of read().
+*****************************************************************/
+
+#ifndef HAVE_PREAD
+ssize_t rep_pread(int __fd, void *__buf, size_t __nbytes, off_t __offset)
+{
+       if (lseek(__fd, __offset, SEEK_SET) != __offset) {
+               return -1;
+       }
+       return read(__fd, __buf, __nbytes);
+}
+#endif
+
+/*****************************************************************
+ Watch out: this is not thread safe.
+
+ Emulates pwrite() with lseek() followed by write(), so the file offset
+ of the descriptor is moved by the call. Returns -1 if the seek does
+ not land on the requested offset, otherwise the result of write().
+*****************************************************************/
+
+#ifndef HAVE_PWRITE
+ssize_t rep_pwrite(int __fd, const void *__buf, size_t __nbytes, off_t __offset)
+{
+       if (lseek(__fd, __offset, SEEK_SET) != __offset) {
+               return -1;
+       }
+       return write(__fd, __buf, __nbytes);
+}
+#endif
+
+#ifndef HAVE_STRCASESTR
+/* case-insensitive strstr(): returns a pointer to the first occurrence
+   of needle inside haystack, ignoring case, or NULL if there is none.
+   An empty needle matches at the start of haystack. */
+char *rep_strcasestr(const char *haystack, const char *needle)
+{
+       const char *s;
+       size_t nlen = strlen(needle);
+       int first;
+
+       if (nlen == 0) {
+               return (char *)((uintptr_t)haystack);
+       }
+
+       /* toupper() must not be handed a negative char value, so go
+          through unsigned char */
+       first = toupper((unsigned char)*needle);
+       for (s=haystack;*s;s++) {
+               /* cheap first-character test before the full compare */
+               if (first == toupper((unsigned char)*s) &&
+                   strncasecmp(s, needle, nlen) == 0) {
+                       return (char *)((uintptr_t)s);
+               }
+       }
+       return NULL;
+}
+#endif
+
+#ifndef HAVE_STRTOK_R
+/* based on GLIBC version, copyright Free Software Foundation */
+/* reentrant tokenizer: pass the string on the first call and NULL
+   afterwards; *save_ptr carries the scan position between calls.
+   Returns the next token (the delimiter that follows it is overwritten
+   with '\0') or NULL when only delimiters remain. */
+char *rep_strtok_r(char *s, const char *delim, char **save_ptr)
+{
+       char *start = (s != NULL) ? s : *save_ptr;
+       char *end;
+
+       /* skip any delimiters in front of the token */
+       start += strspn(start, delim);
+       if (*start == '\0') {
+               *save_ptr = start;
+               return NULL;
+       }
+
+       end = strpbrk(start, delim);
+       if (end != NULL) {
+               /* cut the token off and continue behind the delimiter */
+               *end = '\0';
+               *save_ptr = end + 1;
+       } else {
+               /* last token: park the position on the terminator */
+               *save_ptr = start + strlen(start);
+       }
+       return start;
+}
+#endif
+
+
+#ifndef HAVE_STRTOLL
+/* strtoll() for systems without one: forwards to strtoq(),
+   __strtoll() or, where long and long long have the same size,
+   strtol(). */
+long long int rep_strtoll(const char *str, char **endptr, int base)
+{
+#ifdef HAVE_STRTOQ
+       return strtoq(str, endptr, base);
+#elif defined(HAVE___STRTOLL) 
+       return __strtoll(str, endptr, base);
+#elif SIZEOF_LONG == SIZEOF_LONG_LONG
+       return (long long int) strtol(str, endptr, base);
+#else
+# error "You need a strtoll function"
+#endif
+}
+#else
+#ifdef HAVE_BSD_STRTOLL
+#ifdef HAVE_STRTOQ
+/* wrapper for BSD style strtoll(): hides the EINVAL that is reported
+   when no conversion could be performed although the base was valid */
+long long int rep_strtoll(const char *str, char **endptr, int base)
+{
+       /* remember the caller's errno: it must survive unchanged when
+          the EINVAL below is suppressed */
+       int saved_errno = errno;
+       long long int nb = strtoq(str, endptr, base);
+       /* In linux EINVAL is only returned if base is not ok */
+       if (errno == EINVAL) {
+               if (base == 0 || (base >1 && base <37)) {
+                       /* Base was ok so it's because we were not
+                        * able to make the conversion.
+                        * Let's restore errno.
+                        */
+                       errno = saved_errno;
+               }
+       }
+       return nb;
+}
+#else
+#error "You need the strtoq function"
+#endif /* HAVE_STRTOQ */
+#endif /* HAVE_BSD_STRTOLL */
+#endif /* HAVE_STRTOLL */
+
+
+#ifndef HAVE_STRTOULL
+/* strtoull() for systems without one: forwards to strtouq(),
+   __strtoull() or, where long and long long have the same size,
+   strtoul(). */
+unsigned long long int rep_strtoull(const char *str, char **endptr, int base)
+{
+#ifdef HAVE_STRTOUQ
+       return strtouq(str, endptr, base);
+#elif defined(HAVE___STRTOULL) 
+       return __strtoull(str, endptr, base);
+#elif SIZEOF_LONG == SIZEOF_LONG_LONG
+       return (unsigned long long int) strtoul(str, endptr, base);
+#else
+# error "You need a strtoull function"
+#endif
+}
+#else
+#ifdef HAVE_BSD_STRTOLL
+#ifdef HAVE_STRTOUQ
+/* wrapper for BSD style strtoull(): hides the EINVAL that is reported
+   when no conversion could be performed although the base was valid */
+unsigned long long int rep_strtoull(const char *str, char **endptr, int base)
+{
+       /* remember the caller's errno: it must survive unchanged when
+          the EINVAL below is suppressed */
+       int saved_errno = errno;
+       unsigned long long int nb = strtouq(str, endptr, base);
+       /* In linux EINVAL is only returned if base is not ok */
+       if (errno == EINVAL) {
+               if (base == 0 || (base >1 && base <37)) {
+                       /* Base was ok so it's because we were not
+                        * able to make the conversion.
+                        * Let's restore errno.
+                        */
+                       errno = saved_errno;
+               }
+       }
+       return nb;
+}
+#else
+#error "You need the strtouq function"
+#endif /* HAVE_STRTOUQ */
+#endif /* HAVE_BSD_STRTOLL */
+#endif /* HAVE_STRTOULL */
+
+#ifndef HAVE_SETENV
+/* setenv() on top of putenv(): builds a malloc()ed "name=value" string
+   and hands it to putenv(). With overwrite == 0 an existing variable
+   is left alone and 0 is returned. Returns -1 if the allocation fails,
+   otherwise the result of putenv(); the string is only freed when
+   putenv() rejects it. */
+int rep_setenv(const char *name, const char *value, int overwrite)
+{
+       size_t nlen, vlen;
+       char *entry;
+       int rc;
+
+       if (!overwrite && getenv(name) != NULL) {
+               return 0;
+       }
+
+       nlen = strlen(name);
+       vlen = strlen(value);
+
+       /* room for name, '=', value and the terminator */
+       entry = malloc(nlen + vlen + 2);
+       if (entry == NULL) {
+               return -1;
+       }
+       memcpy(entry, name, nlen);
+       entry[nlen] = '=';
+       memcpy(entry + nlen + 1, value, vlen);
+       entry[nlen + vlen + 1] = 0;
+
+       rc = putenv(entry);
+       if (rc != 0) {
+               free(entry);
+       }
+       return rc;
+}
+#endif
+
+#ifndef HAVE_UNSETENV
+/* remove every "name=..." entry from environ by shifting the rest of
+   the array down over it. Always returns 0, also when the variable was
+   not set in the first place. */
+int rep_unsetenv(const char *name)
+{
+       extern char **environ;
+       size_t len = strlen(name);
+       size_t i, count;
+
+       if (environ == NULL || getenv(name) == NULL) {
+               return 0;
+       }
+
+       /* count the entries in front of the terminating NULL */
+       for (i=0;environ[i];i++) /* noop */ ;
+
+       count=i;
+       
+       for (i=0;i<count;) {
+               if (strncmp(environ[i], name, len) == 0 && environ[i][len] == '=') {
+                       /* note: we do _not_ free the old variable here. It is unsafe to 
+                          do so, as the pointer may not have come from malloc */
+                       /* count-i pointers are moved, which includes the
+                          terminating NULL; i is not advanced because slot
+                          i now holds the following entry */
+                       memmove(&environ[i], &environ[i+1], (count-i)*sizeof(char *));
+                       count--;
+               } else {
+                       i++;
+               }
+       }
+
+       return 0;
+}
+#endif
+
+#ifndef HAVE_UTIME
+/* stub: arguments are ignored, always fails with errno = ENOSYS */
+int rep_utime(const char *filename, const struct utimbuf *buf)
+{
+       errno = ENOSYS;
+       return -1;
+}
+#endif
+
+#ifndef HAVE_UTIMES
+/* utimes() on top of utime(): tv[0] is the access time and tv[1] the
+   modification time. utime() only takes whole seconds, so each value
+   is rounded up by one second when its microsecond part is above
+   500000. Returns the result of utime(). */
+int rep_utimes(const char *filename, const struct timeval tv[2])
+{
+       struct utimbuf times;
+
+       times.actime = tv[0].tv_sec;
+       times.modtime = tv[1].tv_sec;
+
+       if (tv[0].tv_usec > 500000) {
+               times.actime += 1;
+       }
+       if (tv[1].tv_usec > 500000) {
+               times.modtime += 1;
+       }
+
+       return utime(filename, &times);
+}
+#endif
+
+#ifndef HAVE_DUP2
+/* stub: arguments are ignored, always fails with errno = ENOSYS */
+int rep_dup2(int oldfd, int newfd) 
+{
+       errno = ENOSYS;
+       return -1;
+}
+#endif
+
+#ifndef HAVE_CHOWN
+/**
+chown isn't used much but OS/2 doesn't have it
+
+stub: arguments are ignored, always fails with errno = ENOSYS
+**/
+int rep_chown(const char *fname, uid_t uid, gid_t gid)
+{
+       errno = ENOSYS;
+       return -1;
+}
+#endif
+
+#ifndef HAVE_LINK
+/* stub: arguments are ignored, always fails with errno = ENOSYS */
+int rep_link(const char *oldpath, const char *newpath)
+{
+       errno = ENOSYS;
+       return -1;
+}
+#endif
+
+#ifndef HAVE_READLINK
+/* stub: arguments are ignored, always fails with errno = ENOSYS */
+int rep_readlink(const char *path, char *buf, size_t bufsiz)
+{
+       errno = ENOSYS;
+       return -1;
+}
+#endif
+
+#ifndef HAVE_SYMLINK
+/* stub: arguments are ignored, always fails with errno = ENOSYS */
+int rep_symlink(const char *oldpath, const char *newpath)
+{
+       errno = ENOSYS;
+       return -1;
+}
+#endif
+
+#ifndef HAVE_LCHOWN
+/* stub: arguments are ignored, always fails with errno = ENOSYS */
+int rep_lchown(const char *fname,uid_t uid,gid_t gid)
+{
+       errno = ENOSYS;
+       return -1;
+}
+#endif
+
+#ifndef HAVE_REALPATH
+/* stub: arguments are ignored, always returns NULL with errno = EINVAL */
+char *rep_realpath(const char *path, char *resolved_path)
+{
+       /* As realpath is not a system call we can't return ENOSYS. */
+       errno = EINVAL;
+       return NULL;
+}
+#endif
+
+
+#ifndef HAVE_MEMMEM
+/* find the first occurrence of the needlelen bytes at needle inside
+   the haystacklen bytes at haystack. Returns a pointer to the match,
+   haystack itself for an empty needle, or NULL if there is no match.
+   Bytes beyond haystack + haystacklen are never examined. */
+void *rep_memmem(const void *haystack, size_t haystacklen,
+                const void *needle, size_t needlelen)
+{
+       if (needlelen == 0) {
+               /* same const-dropping cast as rep_strcasestr() uses */
+               return (void *)((uintptr_t)haystack);
+       }
+       while (haystacklen >= needlelen) {
+               const char *start = (const char *)haystack;
+               /* only look for the first byte where a complete needle
+                  would still fit */
+               char *p = (char *)memchr(start, *(const char *)needle,
+                                        haystacklen-(needlelen-1));
+               if (!p) return NULL;
+               if (memcmp(p, needle, needlelen) == 0) {
+                       return p;
+               }
+               /* account for the skipped bytes BEFORE moving haystack:
+                  with the opposite order the difference is always -1,
+                  the length never shrinks and the search runs off the
+                  end of the buffer */
+               haystacklen -= (size_t)(p - start) + 1;
+               haystack = p+1;
+       }
+       return NULL;
+}
+#endif
+
+#if !defined(HAVE_VDPRINTF) || !defined(HAVE_C99_VSNPRINTF)
+/* vdprintf() replacement: formats into a temporary heap string and
+   writes it to fd with a single write(). Returns the result of
+   write(), or -1 with errno = ENOMEM if the string could not be
+   built. */
+int rep_vdprintf(int fd, const char *format, va_list ap)
+{
+       char *s = NULL;
+       int len;
+       int ret;
+
+       /* the contents of s are not reliable when vasprintf() reports
+          an error, so go by its return value first */
+       len = vasprintf(&s, format, ap);
+       if (len < 0 || s == NULL) {
+               errno = ENOMEM;
+               return -1;
+       }
+       /* use the formatted length rather than strlen() so output that
+          contains a '\0' byte is written completely */
+       ret = write(fd, s, len);
+       free(s);
+       return ret;
+}
+#endif
+
+#if !defined(HAVE_DPRINTF) || !defined(HAVE_C99_VSNPRINTF)
+/* printf() to a file descriptor: collects the variable arguments and
+   forwards them to vdprintf(), whose result is returned */
+int rep_dprintf(int fd, const char *format, ...)
+{
+       int ret;
+       va_list ap;
+
+       va_start(ap, format);
+       ret = vdprintf(fd, format, ap);
+       va_end(ap);
+
+       return ret;
+}
+#endif
+
+#ifndef HAVE_GET_CURRENT_DIR_NAME
+/* return the current working directory as a strdup()ed string, or
+   NULL if getcwd() fails; the path is fetched into a PATH_MAX+1 byte
+   stack buffer first */
+char *rep_get_current_dir_name(void)
+{
+       char cwd[PATH_MAX+1];
+
+       if (getcwd(cwd, sizeof(cwd)) == NULL) {
+               return NULL;
+       }
+       return strdup(cwd);
+}
+#endif
+
+#ifndef HAVE_STRERROR_R
+/* copy the strerror() text for errnum into buf. Returns 0 on success;
+   if the text plus terminator does not fit into buflen bytes, buf is
+   left untouched and -1 is returned with errno = ERANGE. */
+int rep_strerror_r(int errnum, char *buf, size_t buflen)
+{
+       char *msg = strerror(errnum);
+       size_t needed = strlen(msg) + 1;
+
+       if (needed > buflen) {
+               errno = ERANGE;
+               return -1;
+       }
+       strncpy(buf, msg, buflen);
+       return 0;
+}
+#endif
+
+#ifndef HAVE_CLOCK_GETTIME
+/* clock_gettime() on top of gettimeofday(). Only clock id 0 is
+   handled; any other id fails with -1 and errno = EINVAL. The
+   microseconds are scaled to nanoseconds and the return value of
+   gettimeofday() is not checked. */
+int rep_clock_gettime(clockid_t clk_id, struct timespec *tp)
+{
+       struct timeval tval;
+       switch (clk_id) {
+               case 0: /* CLOCK_REALTIME :*/
+#ifdef HAVE_GETTIMEOFDAY_TZ
+                       gettimeofday(&tval,NULL);
+#else
+                       gettimeofday(&tval);
+#endif
+                       tp->tv_sec = tval.tv_sec;
+                       tp->tv_nsec = tval.tv_usec * 1000;
+                       break;
+               default:
+                       errno = EINVAL;
+                       return -1;
+       }
+       return 0;
+}
+#endif
+
+#ifndef HAVE_MEMALIGN
+/* memalign() replacement. With posix_memalign() available it is used
+   directly and NULL is returned if it fails. Otherwise align is not
+   looked at: the request is raised to at least one page and passed to
+   malloc(). Fails with errno = ENOSYS if the page size is unknown. */
+void *rep_memalign( size_t align, size_t size )
+{
+#if defined(HAVE_POSIX_MEMALIGN)
+       void *p = NULL;
+       int ret = posix_memalign( &p, align, size );
+       if ( ret == 0 )
+               return p;
+
+       return NULL;
+#else
+       /* On *BSD systems memaligns doesn't exist, but memory will
+        * be aligned on allocations of > pagesize. */
+#if defined(SYSCONF_SC_PAGESIZE)
+       size_t pagesize = (size_t)sysconf(_SC_PAGESIZE);
+#elif defined(HAVE_GETPAGESIZE)
+       size_t pagesize = (size_t)getpagesize();
+#else
+       size_t pagesize = (size_t)-1;
+#endif
+       /* (size_t)-1 marks "no way to find out the page size" */
+       if (pagesize == (size_t)-1) {
+               errno = ENOSYS;
+               return NULL;
+       }
+       if (size < pagesize) {
+               size = pagesize;
+       }
+       return malloc(size);
+#endif
+}
+#endif
+
+#ifndef HAVE_GETPEEREID
+/* fetch the uid and gid of the peer of socket s through the
+   SO_PEERCRED socket option. Returns 0 and fills *uid / *gid on
+   success, -1 on failure; errno is EINVAL if the option returns an
+   unexpected size and ENOSYS when HAVE_PEERCRED is not set. */
+int rep_getpeereid(int s, uid_t *uid, gid_t *gid)
+{
+#if defined(HAVE_PEERCRED)
+       struct ucred cred;
+       socklen_t cred_len = sizeof(struct ucred);
+       int ret;
+
+       /* drop any macro named getsockopt so the call below is a plain
+          function call */
+#undef getsockopt
+       ret = getsockopt(s, SOL_SOCKET, SO_PEERCRED, (void *)&cred, &cred_len);
+       if (ret != 0) {
+               return -1;
+       }
+
+       if (cred_len != sizeof(struct ucred)) {
+               errno = EINVAL;
+               return -1;
+       }
+
+       *uid = cred.uid;
+       *gid = cred.gid;
+       return 0;
+#else
+       errno = ENOSYS;
+       return -1;
+#endif
+}
+#endif
+
+#ifndef HAVE_USLEEP
+/* usleep() replacement: sleeps by calling select() with no descriptors
+   and a timeout. Despite its name, the parameter "sec" is a count of
+   MICROseconds, as for usleep(). Always returns 0; the result of
+   select() is not checked. */
+int rep_usleep(useconds_t sec)
+{
+       struct timeval tval;
+       /*
+        * Fake it with select...
+        * Split the microsecond count into whole seconds and the
+        * remaining microseconds.
+        */
+       tval.tv_sec = sec / 1000000;
+       tval.tv_usec = sec % 1000000;
+       select(0,NULL,NULL,NULL,&tval);
+       return 0;
+}
+#endif /* HAVE_USLEEP */
+
+#ifndef HAVE_SETPROCTITLE
+/* no-op stand-in for setproctitle(): the arguments are ignored */
+void rep_setproctitle(const char *fmt, ...)
+{
+}
+#endif
diff --git a/ctdb/lib/replace/replace.h b/ctdb/lib/replace/replace.h
new file mode 100644 (file)
index 0000000..c0b7997
--- /dev/null
@@ -0,0 +1,902 @@
+/*
+   Unix SMB/CIFS implementation.
+
+   macros to go along with the lib/replace/ portability layer code
+
+   Copyright (C) Andrew Tridgell 2005
+   Copyright (C) Jelmer Vernooij 2006-2008
+   Copyright (C) Jeremy Allison 2007.
+
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _LIBREPLACE_REPLACE_H
+#define _LIBREPLACE_REPLACE_H
+
+#ifndef NO_CONFIG_H
+#include "config.h"
+#endif
+
+#ifdef HAVE_STANDARDS_H
+#include <standards.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <errno.h>
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include "win32_replace.h"
+#endif
+
+
+#ifdef HAVE_INTTYPES_H
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#elif HAVE_STDINT_H
+#include <stdint.h>
+/* force off HAVE_INTTYPES_H so that roken doesn't try to include both,
+   which causes a warning storm on irix */
+#undef HAVE_INTTYPES_H
+#endif
+
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+
+#ifndef __PRI64_PREFIX
+# if __WORDSIZE == 64 && ! defined __APPLE__
+#  define __PRI64_PREFIX       "l"
+# else
+#  define __PRI64_PREFIX       "ll"
+# endif
+#endif
+
+/* Decimal notation.  */
+#ifndef PRId8
+# define PRId8         "d"
+#endif
+#ifndef PRId16
+# define PRId16                "d"
+#endif
+#ifndef PRId32
+# define PRId32                "d"
+#endif
+#ifndef PRId64
+# define PRId64                __PRI64_PREFIX "d"
+#endif
+
+#ifndef PRIi8
+# define PRIi8         "i"
+#endif
+#ifndef PRIi16
+# define PRIi16                "i"
+#endif
+#ifndef PRIi32
+# define PRIi32                "i"
+#endif
+#ifndef PRIi64
+# define PRIi64                __PRI64_PREFIX "i"
+#endif
+
+#ifndef PRIu8
+# define PRIu8         "u"
+#endif
+#ifndef PRIu16
+# define PRIu16                "u"
+#endif
+#ifndef PRIu32
+# define PRIu32                "u"
+#endif
+#ifndef PRIu64
+# define PRIu64                __PRI64_PREFIX "u"
+#endif
+
+#ifndef SCNd8
+# define SCNd8         "hhd"
+#endif
+#ifndef SCNd16
+# define SCNd16                "hd"
+#endif
+#ifndef SCNd32
+# define SCNd32                "d"
+#endif
+#ifndef SCNd64
+# define SCNd64                __PRI64_PREFIX "d"
+#endif
+
+#ifndef SCNi8
+# define SCNi8         "hhi"
+#endif
+#ifndef SCNi16
+# define SCNi16                "hi"
+#endif
+#ifndef SCNi32
+# define SCNi32                "i"
+#endif
+#ifndef SCNi64
+# define SCNi64                __PRI64_PREFIX "i"
+#endif
+
+#ifndef SCNu8
+# define SCNu8         "hhu"
+#endif
+#ifndef SCNu16
+# define SCNu16                "hu"
+#endif
+#ifndef SCNu32
+# define SCNu32                "u"
+#endif
+#ifndef SCNu64
+# define SCNu64                __PRI64_PREFIX "u"
+#endif
+
+#ifdef HAVE_BSD_STRING_H
+#include <bsd/string.h>
+#endif
+
+#ifdef HAVE_BSD_UNISTD_H
+#include <bsd/unistd.h>
+#endif
+
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+
+#ifdef HAVE_SYS_TYPES_H
+#include <sys/types.h>
+#endif
+
+#ifdef HAVE_SETPROCTITLE_H
+#include <setproctitle.h>
+#endif
+
+#if STDC_HEADERS
+#include <stdlib.h>
+#include <stddef.h>
+#endif
+
+#ifdef HAVE_LINUX_TYPES_H
+/*
+ * This is needed as some broken header files require this to be included early
+ */
+#include <linux/types.h>
+#endif
+
+#ifndef HAVE_STRERROR
+extern char *sys_errlist[];
+#define strerror(i) sys_errlist[i]
+#endif
+
+#ifndef HAVE_ERRNO_DECL
+extern int errno;
+#endif
+
+#ifndef HAVE_STRDUP
+#define strdup rep_strdup
+char *rep_strdup(const char *s);
+#endif
+
+#ifndef HAVE_MEMMOVE
+#define memmove rep_memmove
+void *rep_memmove(void *dest,const void *src,int size);
+#endif
+
+#ifndef HAVE_MEMMEM
+#define memmem rep_memmem
+void *rep_memmem(const void *haystack, size_t haystacklen,
+                const void *needle, size_t needlelen);
+#endif
+
+#ifndef HAVE_MEMALIGN
+#define memalign rep_memalign
+void *rep_memalign(size_t boundary, size_t size);
+#endif
+
+#ifndef HAVE_MKTIME
+#define mktime rep_mktime
+/* prototype is in "system/time.h" */
+#endif
+
+#ifndef HAVE_TIMEGM
+#define timegm rep_timegm
+/* prototype is in "system/time.h" */
+#endif
+
+#ifndef HAVE_UTIME
+#define utime rep_utime
+/* prototype is in "system/time.h" */
+#endif
+
+#ifndef HAVE_UTIMES
+#define utimes rep_utimes
+/* prototype is in "system/time.h" */
+#endif
+
+#ifndef HAVE_STRLCPY
+#define strlcpy rep_strlcpy
+size_t rep_strlcpy(char *d, const char *s, size_t bufsize);
+#endif
+
+#ifndef HAVE_STRLCAT
+#define strlcat rep_strlcat
+size_t rep_strlcat(char *d, const char *s, size_t bufsize);
+#endif
+
+#if (defined(BROKEN_STRNDUP) || !defined(HAVE_STRNDUP))
+#undef HAVE_STRNDUP
+#define strndup rep_strndup
+char *rep_strndup(const char *s, size_t n);
+#endif
+
+#if (defined(BROKEN_STRNLEN) || !defined(HAVE_STRNLEN))
+#undef HAVE_STRNLEN
+#define strnlen rep_strnlen
+size_t rep_strnlen(const char *s, size_t n);
+#endif
+
+#if !HAVE_DECL_ENVIRON
+#ifdef __APPLE__
+#include <crt_externs.h>
+#define environ (*_NSGetEnviron())
+#else
+extern char **environ;
+#endif
+#endif
+
+#ifndef HAVE_SETENV
+#define setenv rep_setenv
+int rep_setenv(const char *name, const char *value, int overwrite);
+#else
+#ifndef HAVE_SETENV_DECL
+int setenv(const char *name, const char *value, int overwrite);
+#endif
+#endif
+
+#ifndef HAVE_UNSETENV
+#define unsetenv rep_unsetenv
+int rep_unsetenv(const char *name);
+#endif
+
+#ifndef HAVE_SETEUID
+#define seteuid rep_seteuid
+int rep_seteuid(uid_t);
+#endif
+
+#ifndef HAVE_SETEGID
+#define setegid rep_setegid
+int rep_setegid(gid_t);
+#endif
+
+#if (defined(USE_SETRESUID) && !defined(HAVE_SETRESUID_DECL))
+/* stupid glibc */
+int setresuid(uid_t ruid, uid_t euid, uid_t suid);
+#endif
+#if (defined(USE_SETRESUID) && !defined(HAVE_SETRESGID_DECL))
+int setresgid(gid_t rgid, gid_t egid, gid_t sgid);
+#endif
+
+#ifndef HAVE_CHOWN
+#define chown rep_chown
+int rep_chown(const char *path, uid_t uid, gid_t gid);
+#endif
+
+#ifndef HAVE_CHROOT
+#define chroot rep_chroot
+int rep_chroot(const char *dirname);
+#endif
+
+#ifndef HAVE_LINK
+#define link rep_link
+int rep_link(const char *oldpath, const char *newpath);
+#endif
+
+#ifndef HAVE_READLINK
+#define readlink rep_readlink
+ssize_t rep_readlink(const char *path, char *buf, size_t bufsize);
+#endif
+
+#ifndef HAVE_SYMLINK
+#define symlink rep_symlink
+int rep_symlink(const char *oldpath, const char *newpath);
+#endif
+
+#ifndef HAVE_REALPATH
+#define realpath rep_realpath
+char *rep_realpath(const char *path, char *resolved_path);
+#endif
+
+#ifndef HAVE_LCHOWN
+#define lchown rep_lchown
+int rep_lchown(const char *fname,uid_t uid,gid_t gid);
+#endif
+
+#ifdef HAVE_UNIX_H
+#include <unix.h>
+#endif
+
+#ifndef HAVE_SETLINEBUF
+#define setlinebuf rep_setlinebuf
+void rep_setlinebuf(FILE *);
+#endif
+
+#ifndef HAVE_STRCASESTR
+#define strcasestr rep_strcasestr
+char *rep_strcasestr(const char *haystack, const char *needle);
+#endif
+
+#ifndef HAVE_STRTOK_R
+#define strtok_r rep_strtok_r
+char *rep_strtok_r(char *s, const char *delim, char **save_ptr);
+#endif
+
+
+
+#ifndef HAVE_STRTOLL
+#define strtoll rep_strtoll
+long long int rep_strtoll(const char *str, char **endptr, int base);
+#else
+#ifdef HAVE_BSD_STRTOLL
+#define strtoll rep_strtoll
+long long int rep_strtoll(const char *str, char **endptr, int base);
+#endif
+#endif
+
+#ifndef HAVE_STRTOULL
+#define strtoull rep_strtoull
+unsigned long long int rep_strtoull(const char *str, char **endptr, int base);
+#else
+#ifdef HAVE_BSD_STRTOLL /* yes, it's not HAVE_BSD_STRTOULL */
+#define strtoull rep_strtoull
+unsigned long long int rep_strtoull(const char *str, char **endptr, int base);
+#endif
+#endif
+
+#ifndef HAVE_FTRUNCATE
+#define ftruncate rep_ftruncate
+int rep_ftruncate(int,off_t);
+#endif
+
+#ifndef HAVE_INITGROUPS
+#define initgroups rep_initgroups
+int rep_initgroups(char *name, gid_t id);
+#endif
+
+#if !defined(HAVE_BZERO) && defined(HAVE_MEMSET)
+#define bzero(a,b) memset((a),'\0',(b))
+#endif
+
+#ifndef HAVE_DLERROR
+#define dlerror rep_dlerror
+char *rep_dlerror(void);
+#endif
+
+#ifndef HAVE_DLOPEN
+#define dlopen rep_dlopen
+#ifdef DLOPEN_TAKES_UNSIGNED_FLAGS
+void *rep_dlopen(const char *name, unsigned int flags);
+#else
+void *rep_dlopen(const char *name, int flags);
+#endif
+#endif
+
+#ifndef HAVE_DLSYM
+#define dlsym rep_dlsym
+void *rep_dlsym(void *handle, const char *symbol);
+#endif
+
+#ifndef HAVE_DLCLOSE
+#define dlclose rep_dlclose
+int rep_dlclose(void *handle);
+#endif
+
+#ifndef HAVE_SOCKETPAIR
+#define socketpair rep_socketpair
+/* prototype is in system/network.h */
+#endif
+
+#ifndef PRINTF_ATTRIBUTE
+/* True for gcc 3.1 and everything newer.  Major and minor are compared
+ * as a pair so that x.0 releases (4.0, 5.0, ...), whose minor number is
+ * 0, are not excluded. */
+#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))
+/** Use gcc attribute to check printf fns.  a1 is the 1-based index of
+ * the parameter containing the format, and a2 the index of the first
+ * argument. Note that some gcc 2.x versions don't handle this
+ * properly **/
+#define PRINTF_ATTRIBUTE(a1, a2) __attribute__ ((format (__printf__, a1, a2)))
+#else
+#define PRINTF_ATTRIBUTE(a1, a2)
+#endif
+#endif
+
+#ifndef _DEPRECATED_
+/* gcc 3.1 or newer; major/minor are compared as a pair so that x.0
+ * releases (minor number 0) still get the attribute */
+#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))
+#define _DEPRECATED_ __attribute__ ((deprecated))
+#else
+#define _DEPRECATED_
+#endif
+#endif
+
+#if !defined(HAVE_VDPRINTF) || !defined(HAVE_C99_VSNPRINTF)
+#define vdprintf rep_vdprintf
+int rep_vdprintf(int fd, const char *format, va_list ap) PRINTF_ATTRIBUTE(2,0);
+#endif
+
+#if !defined(HAVE_DPRINTF) || !defined(HAVE_C99_VSNPRINTF)
+#define dprintf rep_dprintf
+int rep_dprintf(int fd, const char *format, ...) PRINTF_ATTRIBUTE(2,3);
+#endif
+
+#if !defined(HAVE_VASPRINTF) || !defined(HAVE_C99_VSNPRINTF)
+#define vasprintf rep_vasprintf
+int rep_vasprintf(char **ptr, const char *format, va_list ap) PRINTF_ATTRIBUTE(2,0);
+#endif
+
+#if !defined(HAVE_SNPRINTF) || !defined(HAVE_C99_VSNPRINTF)
+#define snprintf rep_snprintf
+int rep_snprintf(char *,size_t ,const char *, ...) PRINTF_ATTRIBUTE(3,4);
+#endif
+
+#if !defined(HAVE_VSNPRINTF) || !defined(HAVE_C99_VSNPRINTF)
+#define vsnprintf rep_vsnprintf
+int rep_vsnprintf(char *,size_t ,const char *, va_list ap) PRINTF_ATTRIBUTE(3,0);
+#endif
+
+#if !defined(HAVE_ASPRINTF) || !defined(HAVE_C99_VSNPRINTF)
+#define asprintf rep_asprintf
+int rep_asprintf(char **,const char *, ...) PRINTF_ATTRIBUTE(2,3);
+#endif
+
+#if !defined(HAVE_C99_VSNPRINTF)
+#ifdef REPLACE_BROKEN_PRINTF
+/*
+ * We do not redefine printf by default
+ * as it breaks the build if system headers
+ * use __attribute__((format(printf, 3, 0)))
+ * instead of __attribute__((format(__printf__, 3, 0)))
+ */
+#define printf rep_printf
+#endif
+int rep_printf(const char *, ...) PRINTF_ATTRIBUTE(1,2);
+#endif
+
+#if !defined(HAVE_C99_VSNPRINTF)
+#define fprintf rep_fprintf
+int rep_fprintf(FILE *stream, const char *, ...) PRINTF_ATTRIBUTE(2,3);
+#endif
+
+#ifndef HAVE_VSYSLOG
+#ifdef HAVE_SYSLOG
+#define vsyslog rep_vsyslog
+void rep_vsyslog (int facility_priority, const char *format, va_list arglist) PRINTF_ATTRIBUTE(2,0);
+#endif
+#endif
+
+/* we used to use these fns, but now we have good replacements
+   for snprintf and vsnprintf */
+#define slprintf snprintf
+
+
+#ifndef HAVE_VA_COPY
+#undef va_copy
+#ifdef HAVE___VA_COPY
+#define va_copy(dest, src) __va_copy(dest, src)
+#else
+#define va_copy(dest, src) (dest) = (src)
+#endif
+#endif
+
+#ifndef HAVE_VOLATILE
+#define volatile
+#endif
+
+#ifndef HAVE_COMPARISON_FN_T
+typedef int (*comparison_fn_t)(const void *, const void *);
+#endif
+
+#ifndef HAVE_WORKING_STRPTIME
+#define strptime rep_strptime
+struct tm;
+char *rep_strptime(const char *buf, const char *format, struct tm *tm);
+#endif
+
+#ifndef HAVE_DUP2
+#define dup2 rep_dup2
+int rep_dup2(int oldfd, int newfd);
+#endif
+
+/* Load header file for dynamic linking stuff */
+#ifdef HAVE_DLFCN_H
+#include <dlfcn.h>
+#endif
+
+#ifndef RTLD_LAZY
+#define RTLD_LAZY 0
+#endif
+#ifndef RTLD_NOW
+#define RTLD_NOW 0
+#endif
+#ifndef RTLD_GLOBAL
+#define RTLD_GLOBAL 0
+#endif
+
+#ifndef HAVE_SECURE_MKSTEMP
+#define mkstemp(path) rep_mkstemp(path)
+int rep_mkstemp(char *temp);
+#endif
+
+#ifndef HAVE_MKDTEMP
+#define mkdtemp rep_mkdtemp
+char *rep_mkdtemp(char *template);
+#endif
+
+#ifndef HAVE_PREAD
+#define pread rep_pread
+ssize_t rep_pread(int __fd, void *__buf, size_t __nbytes, off_t __offset);
+#define LIBREPLACE_PREAD_REPLACED 1
+#else
+#define LIBREPLACE_PREAD_NOT_REPLACED 1
+#endif
+
+#ifndef HAVE_PWRITE
+#define pwrite rep_pwrite
+ssize_t rep_pwrite(int __fd, const void *__buf, size_t __nbytes, off_t __offset);
+#define LIBREPLACE_PWRITE_REPLACED 1
+#else
+#define LIBREPLACE_PWRITE_NOT_REPLACED 1
+#endif
+
+#if !defined(HAVE_INET_NTOA) || defined(REPLACE_INET_NTOA)
+#define inet_ntoa rep_inet_ntoa
+/* prototype is in "system/network.h" */
+#endif
+
+#ifndef HAVE_INET_PTON
+#define inet_pton rep_inet_pton
+/* prototype is in "system/network.h" */
+#endif
+
+#ifndef HAVE_INET_NTOP
+#define inet_ntop rep_inet_ntop
+/* prototype is in "system/network.h" */
+#endif
+
+#ifndef HAVE_INET_ATON
+#define inet_aton rep_inet_aton
+/* prototype is in "system/network.h" */
+#endif
+
+#ifndef HAVE_CONNECT
+#define connect rep_connect
+/* prototype is in "system/network.h" */
+#endif
+
+#ifndef HAVE_GETHOSTBYNAME
+#define gethostbyname rep_gethostbyname
+/* prototype is in "system/network.h" */
+#endif
+
+#ifndef HAVE_GETIFADDRS
+#define getifaddrs rep_getifaddrs
+/* prototype is in "system/network.h" */
+#endif
+
+#ifndef HAVE_FREEIFADDRS
+#define freeifaddrs rep_freeifaddrs
+/* prototype is in "system/network.h" */
+#endif
+
+#ifndef HAVE_GET_CURRENT_DIR_NAME
+#define get_current_dir_name rep_get_current_dir_name
+char *rep_get_current_dir_name(void);
+#endif
+
+#ifndef HAVE_STRERROR_R
+#define strerror_r rep_strerror_r
+int rep_strerror_r(int errnum, char *buf, size_t buflen);
+#endif
+
+#if !defined(HAVE_CLOCK_GETTIME)
+#define clock_gettime rep_clock_gettime
+#endif
+
+#ifdef HAVE_LIMITS_H
+#include <limits.h>
+#endif
+
+#ifdef HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+
+/* The extra casts work around common compiler bugs.  */
+#define _TYPE_SIGNED(t) (! ((t) 0 < (t) -1))
+/* The outer cast is needed to work around a bug in Cray C 5.0.3.0.
+   It is necessary at least when t == time_t.  */
+#define _TYPE_MINIMUM(t) ((t) (_TYPE_SIGNED (t) \
+                             ? ~ (t) 0 << (sizeof (t) * CHAR_BIT - 1) : (t) 0))
+#define _TYPE_MAXIMUM(t) ((t) (~ (t) 0 - _TYPE_MINIMUM (t)))
+
+#ifndef UINT16_MAX
+#define UINT16_MAX 65535
+#endif
+
+#ifndef UINT32_MAX
+#define UINT32_MAX (4294967295U)
+#endif
+
+#ifndef UINT64_MAX
+#define UINT64_MAX ((uint64_t)-1)
+#endif
+
+#ifndef INT64_MAX
+#define INT64_MAX 9223372036854775807LL
+#endif
+
+#ifndef CHAR_BIT
+#define CHAR_BIT 8
+#endif
+
+#ifndef INT32_MAX
+#define INT32_MAX _TYPE_MAXIMUM(int32_t)
+#endif
+
+#ifdef HAVE_STDBOOL_H
+#include <stdbool.h>
+#endif
+
+#if !defined(HAVE_BOOL)
+#ifdef HAVE__Bool
+#define bool _Bool
+#else
+typedef int bool;
+#endif
+#endif
+
+#if !defined(HAVE_INTPTR_T)
+typedef long long intptr_t ;
+#endif
+
+#if !defined(HAVE_UINTPTR_T)
+typedef unsigned long long uintptr_t ;
+#endif
+
+#if !defined(HAVE_PTRDIFF_T)
+/* fallback when the system provides no ptrdiff_t; it has to be a signed
+ * type because the difference of two pointers can be negative */
+typedef long long ptrdiff_t ;
+#endif
+
+/*
+ * to prevent <rpcsvc/yp_prot.h> from doing a redefine of 'bool'
+ *
+ * IRIX, HPUX, MacOS 10 and Solaris need BOOL_DEFINED
+ * Tru64 needs _BOOL_EXISTS
+ * AIX needs _BOOL,_TRUE,_FALSE
+ */
+#ifndef BOOL_DEFINED
+#define BOOL_DEFINED
+#endif
+#ifndef _BOOL_EXISTS
+#define _BOOL_EXISTS
+#endif
+#ifndef _BOOL
+#define _BOOL
+#endif
+
+#ifndef __bool_true_false_are_defined
+#define __bool_true_false_are_defined
+#endif
+
+#ifndef true
+#define true (1)
+#endif
+#ifndef false
+#define false (0)
+#endif
+
+#ifndef _TRUE
+#define _TRUE true
+#endif
+#ifndef _FALSE
+#define _FALSE false
+#endif
+
+#ifndef HAVE_FUNCTION_MACRO
+#ifdef HAVE_func_MACRO
+#define __FUNCTION__ __func__
+#else
+#define __FUNCTION__ ("")
+#endif
+#endif
+
+
+#ifndef MIN
+#define MIN(a,b) ((a)<(b)?(a):(b))
+#endif
+
+#ifndef MAX
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#endif
+
+#if !defined(HAVE_VOLATILE)
+#define volatile
+#endif
+
+/**
+  this is a warning hack. The idea is to use this everywhere that we
+  get the "discarding const" warning from gcc. That doesn't actually
+  fix the problem of course, but it means that when we do get to
+  cleaning them up we can do it by searching the code for
+  discard_const.
+
+  It also means that other error types aren't as swamped by the noise
+  of hundreds of const warnings, so we are more likely to notice when
+  we get new errors.
+
+  Please only add more uses of this macro when you find it
+  _really_ hard to fix const warnings. Our aim is to eventually use
+  this function in only a very few places.
+
+  Also, please call this via the discard_const_p() macro interface, as that
+  makes the return type safe.
+*/
+#define discard_const(ptr) ((void *)((uintptr_t)(ptr)))
+
+/** Type-safe version of discard_const */
+#define discard_const_p(type, ptr) ((type *)discard_const(ptr))
+
+#ifndef __STRING
+#define __STRING(x)    #x
+#endif
+
+#ifndef __STRINGSTRING
+#define __STRINGSTRING(x) __STRING(x)
+#endif
+
+#ifndef __LINESTR__
+#define __LINESTR__ __STRINGSTRING(__LINE__)
+#endif
+
+#ifndef __location__
+#define __location__ __FILE__ ":" __LINESTR__
+#endif
+
+/** 
+ * zero a structure 
+ */
+#define ZERO_STRUCT(x) memset((char *)&(x), 0, sizeof(x))
+
+/** 
+ * zero a structure given a pointer to the structure 
+ */
+#define ZERO_STRUCTP(x) do { if ((x) != NULL) memset((char *)(x), 0, sizeof(*(x))); } while(0)
+
+/** 
+ * zero a structure given a pointer to the structure - no zero check 
+ */
+#define ZERO_STRUCTPN(x) memset((char *)(x), 0, sizeof(*(x)))
+
+/* zero an array - note that sizeof(array) must work - ie. it must not be a
+   pointer */
+#define ZERO_ARRAY(x) memset((char *)(x), 0, sizeof(x))
+
+/**
+ * work out how many elements there are in a static array 
+ */
+#define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0]))
+
+/** 
+ * pointer difference macro 
+ */
+#define PTR_DIFF(p1,p2) ((ptrdiff_t)(((const char *)(p1)) - (const char *)(p2)))
+
+#if MMAP_BLACKLIST
+#undef HAVE_MMAP
+#endif
+
+#ifdef __COMPAR_FN_T
+#define QSORT_CAST (__compar_fn_t)
+#endif
+
+#ifndef QSORT_CAST
+#define QSORT_CAST (int (*)(const void *, const void *))
+#endif
+
+#ifndef PATH_MAX
+#define PATH_MAX 1024
+#endif
+
+#ifndef MAX_DNS_NAME_LENGTH
+#define MAX_DNS_NAME_LENGTH 256 /* Actually 255 but +1 for terminating null. */
+#endif
+
+#ifndef HAVE_CRYPT
+char *ufc_crypt(const char *key, const char *salt);
+#define crypt ufc_crypt
+#else
+#ifdef HAVE_CRYPT_H
+#include <crypt.h>
+#endif
+#endif
+
+/* these macros gain us a few percent of speed on gcc */
+#if (__GNUC__ >= 3)
+/* the strange !! is to ensure that __builtin_expect() takes either 0 or 1
+   as its first argument */
+#ifndef likely
+#define likely(x)   __builtin_expect(!!(x), 1)
+#endif
+#ifndef unlikely
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#endif
+#else
+#ifndef likely
+#define likely(x) (x)
+#endif
+#ifndef unlikely
+#define unlikely(x) (x)
+#endif
+#endif
+
+#ifndef HAVE_FDATASYNC
+#define fdatasync(fd) fsync(fd)
+#elif !defined(HAVE_DECL_FDATASYNC)
+int fdatasync(int );
+#endif
+
+/* these are used to mark symbols as local to a shared lib, or
+ * publicly available via the shared lib API */
+#ifndef _PUBLIC_
+#ifdef HAVE_VISIBILITY_ATTR
+#define _PUBLIC_ __attribute__((visibility("default")))
+#else
+#define _PUBLIC_
+#endif
+#endif
+
+#ifndef _PRIVATE_
+#ifdef HAVE_VISIBILITY_ATTR
+#  define _PRIVATE_ __attribute__((visibility("hidden")))
+#else
+#  define _PRIVATE_
+#endif
+#endif
+
+#ifndef HAVE_POLL
+#define poll rep_poll
+/* prototype is in "system/network.h" */
+#endif
+
+#ifndef HAVE_GETPEEREID
+#define getpeereid rep_getpeereid
+int rep_getpeereid(int s, uid_t *uid, gid_t *gid);
+#endif
+
+#ifndef HAVE_USLEEP
+#define usleep rep_usleep
+typedef long useconds_t;
+int usleep(useconds_t);
+#endif
+
+#ifndef HAVE_SETPROCTITLE
+#define setproctitle rep_setproctitle
+void rep_setproctitle(const char *fmt, ...) PRINTF_ATTRIBUTE(1, 2);
+#endif
+
+#endif /* _LIBREPLACE_REPLACE_H */
diff --git a/ctdb/lib/replace/snprintf.c b/ctdb/lib/replace/snprintf.c
new file mode 100644 (file)
index 0000000..6b4a711
--- /dev/null
@@ -0,0 +1,1530 @@
+/*
+ * NOTE: If you change this file, please merge it into rsync, samba, etc.
+ */
+
+/*
+ * Copyright Patrick Powell 1995
+ * This code is based on code written by Patrick Powell (papowell@astart.com)
+ * It may be used for any purpose as long as this notice remains intact
+ * on all source code distributions
+ */
+
+/**************************************************************
+ * Original:
+ * Patrick Powell Tue Apr 11 09:48:21 PDT 1995
+ * A bombproof version of doprnt (dopr) included.
+ * Sigh.  This sort of thing is always nasty to deal with.  Note that
+ * the version here does not include floating point...
+ *
+ * snprintf() is used instead of sprintf() as it does limit checks
+ * for string length.  This covers a nasty loophole.
+ *
+ * The other functions are there to prevent NULL pointers from
+ * causing nasty effects.
+ *
+ * More Recently:
+ *  Brandon Long <blong@fiction.net> 9/15/96 for mutt 0.43
+ *  This was ugly.  It is still ugly.  I opted out of floating point
+ *  numbers, but the formatter understands just about everything
+ *  from the normal C string format, at least as far as I can tell from
+ *  the Solaris 2.5 printf(3S) man page.
+ *
+ *  Brandon Long <blong@fiction.net> 10/22/97 for mutt 0.87.1
+ *    Ok, added some minimal floating point support, which means this
+ *    probably requires libm on most operating systems.  Don't yet
+ *    support the exponent (e,E) and sigfig (g,G).  Also, fmtint()
+ *    was pretty badly broken, it just wasn't being exercised in ways
+ *    which showed it, so that's been fixed.  Also, formatted the code
+ *    to mutt conventions, and removed dead code left over from the
+ *    original.  Also, there is now a builtin-test, just compile with:
+ *           gcc -DTEST_SNPRINTF -o snprintf snprintf.c -lm
+ *    and run snprintf for results.
+ * 
+ *  Thomas Roessler <roessler@guug.de> 01/27/98 for mutt 0.89i
+ *    The PGP code was using unsigned hexadecimal formats. 
+ *    Unfortunately, unsigned formats simply didn't work.
+ *
+ *  Michael Elkins <me@cs.hmc.edu> 03/05/98 for mutt 0.90.8
+ *    The original code assumed that both snprintf() and vsnprintf() were
+ *    missing.  Some systems only have snprintf() but not vsnprintf(), so
+ *    the code is now broken down under HAVE_SNPRINTF and HAVE_VSNPRINTF.
+ *
+ *  Andrew Tridgell (tridge@samba.org) Oct 1998
+ *    fixed handling of %.0f
+ *    added test for HAVE_LONG_DOUBLE
+ *
+ * tridge@samba.org, idra@samba.org, April 2001
+ *    got rid of fcvt code (twas buggy and made testing harder)
+ *    added C99 semantics
+ *
+ * date: 2002/12/19 19:56:31;  author: herb;  state: Exp;  lines: +2 -0
+ * actually print args for %g and %e
+ * 
+ * date: 2002/06/03 13:37:52;  author: jmcd;  state: Exp;  lines: +8 -0
+ * Since includes.h isn't included here, VA_COPY has to be defined here.  I don't
+ * see any include file that is guaranteed to be here, so I'm defining it
+ * locally.  Fixes AIX and Solaris builds.
+ * 
+ * date: 2002/06/03 03:07:24;  author: tridge;  state: Exp;  lines: +5 -13
+ * put the ifdef for HAVE_VA_COPY in one place rather than in lots of
+ * functions
+ * 
+ * date: 2002/05/17 14:51:22;  author: jmcd;  state: Exp;  lines: +21 -4
+ * Fix usage of va_list passed as an arg.  Use __va_copy before using it
+ * when it exists.
+ * 
+ * date: 2002/04/16 22:38:04;  author: idra;  state: Exp;  lines: +20 -14
+ * Fix incorrect zpadlen handling in fmtfp.
+ * Thanks to Ollie Oldham <ollie.oldham@metro-optix.com> for spotting it.
+ * few mods to make it easier to compile the tests.
+ * added the "Ollie" test to the floating point ones.
+ *
+ * Martin Pool (mbp@samba.org) April 2003
+ *    Remove NO_CONFIG_H so that the test case can be built within a source
+ *    tree with less trouble.
+ *    Remove unnecessary SAFE_FREE() definition.
+ *
+ * Martin Pool (mbp@samba.org) May 2003
+ *    Put in a prototype for dummy_snprintf() to quiet compiler warnings.
+ *
+ *    Move #endif to make sure VA_COPY, LDOUBLE, etc are defined even
+ *    if the C library has some snprintf functions already.
+ *
+ * Darren Tucker (dtucker@zip.com.au) 2005
+ *    Fix bug allowing read overruns of the source string with "%.*s"
+ *    Usually harmless unless the read runs outside the process' allocation
+ *    (eg if your malloc does guard pages) in which case it will segfault.
+ *    From OpenSSH.  Also added test for same.
+ *
+ * Simo Sorce (idra@samba.org) Jan 2006
+ * 
+ *    Add support for position independent parameters 
+ *    fix fmtstr now it conforms to sprintf wrt min.max
+ *
+ **************************************************************/
+
+#include "replace.h"
+#include "system/locale.h"
+
+#ifdef TEST_SNPRINTF /* need math library headers for testing */
+
+/* In test mode, we pretend that this system doesn't have any snprintf
+ * functions, regardless of what config.h says. */
+#  undef HAVE_SNPRINTF
+#  undef HAVE_VSNPRINTF
+#  undef HAVE_C99_VSNPRINTF
+#  undef HAVE_ASPRINTF
+#  undef HAVE_VASPRINTF
+#  include <math.h>
+#endif /* TEST_SNPRINTF */
+
+#if defined(HAVE_SNPRINTF) && defined(HAVE_VSNPRINTF) && defined(HAVE_C99_VSNPRINTF)
+/* only include stdio.h if we are not re-defining snprintf or vsnprintf */
+#include <stdio.h>
+ /* make the compiler happy with an empty file */
+ void dummy_snprintf(void);
+ void dummy_snprintf(void) {} 
+#endif /* HAVE_SNPRINTF, etc */
+
+/* yes this really must be a ||. Don't muck with this (tridge) */
+#if !defined(HAVE_VSNPRINTF) || !defined(HAVE_C99_VSNPRINTF)
+
+#ifdef HAVE_LONG_DOUBLE
+#define LDOUBLE long double
+#else
+#define LDOUBLE double
+#endif
+
+#ifdef HAVE_LONG_LONG
+#define LLONG long long
+#else
+#define LLONG long
+#endif
+
+#ifndef VA_COPY
+#ifdef HAVE_VA_COPY
+#define VA_COPY(dest, src) va_copy(dest, src)
+#else
+#ifdef HAVE___VA_COPY
+#define VA_COPY(dest, src) __va_copy(dest, src)
+#else
+#define VA_COPY(dest, src) (dest) = (src)
+#endif
+#endif
+
+/*
+ * dopr(): poor man's version of doprintf
+ */
+
+/* format read states */
+#define DP_S_DEFAULT 0
+#define DP_S_FLAGS   1
+#define DP_S_MIN     2
+#define DP_S_DOT     3
+#define DP_S_MAX     4
+#define DP_S_MOD     5
+#define DP_S_CONV    6
+#define DP_S_DONE    7
+
+/* format flags - Bits */
+#define DP_F_MINUS     (1 << 0)
+#define DP_F_PLUS      (1 << 1)
+#define DP_F_SPACE     (1 << 2)
+#define DP_F_NUM       (1 << 3)
+#define DP_F_ZERO      (1 << 4)
+#define DP_F_UP        (1 << 5)
+#define DP_F_UNSIGNED  (1 << 6)
+
+/* Conversion Flags */
+#define DP_C_CHAR    1
+#define DP_C_SHORT   2
+#define DP_C_LONG    3
+#define DP_C_LDOUBLE 4
+#define DP_C_LLONG   5
+#define DP_C_SIZET   6
+
+/* Chunk types */
+#define CNK_FMT_STR 0
+#define CNK_INT     1
+#define CNK_OCTAL   2
+#define CNK_UINT    3
+#define CNK_HEX     4
+#define CNK_FLOAT   5
+#define CNK_CHAR    6
+#define CNK_STRING  7
+#define CNK_PTR     8
+#define CNK_NUM     9
+#define CNK_PRCNT   10
+
+#define char_to_int(p) ((p)- '0')
+#ifndef MAX
+#define MAX(p,q) (((p) >= (q)) ? (p) : (q))
+#endif
+
+/*
+ * One parsed piece of a format string.  dopr() allocates these with
+ * new_chunk() while scanning the format and links them through "next".
+ */
+struct pr_chunk {
+       int type; /* chunk type */
+       int num; /* parameter number */
+       int min; /* number accumulated from the digits read in the DP_S_MIN state */
+       int max; /* NOTE(review): presumably the number read in the DP_S_MAX state - confirm */
+       int flags; /* DP_F_* bits collected in the DP_S_FLAGS state */
+       int cflags; /* NOTE(review): presumably one of the DP_C_* values - confirm */
+       int start; /* for CNK_FMT_STR: offset of the literal text from the start of the format */
+       int len; /* for CNK_FMT_STR: length of that literal text */
+       LLONG value; /* NOTE(review): presumably the integer argument - confirm against dopr() */
+       LDOUBLE fvalue; /* NOTE(review): presumably the floating point argument - confirm */
+       char *strvalue; /* NOTE(review): presumably the string argument - confirm */
+       void *pnum; /* NOTE(review): use not visible in this part of the file - confirm */
+       struct pr_chunk *min_star; /* chunk of type CNK_INT created when '*' is seen in the DP_S_MIN state */
+       struct pr_chunk *max_star; /* NOTE(review): presumably the '*' counterpart for "max" - confirm */
+       struct pr_chunk *next; /* next chunk in the list, in format order */
+};
+
+/*
+ * An array of chunk pointers together with a count.
+ * NOTE(review): dopr() keeps a "clist" of these and fills it through
+ * add_cnk_list_entry(); how entries are indexed is not visible here -
+ * confirm against the rest of the file.
+ */
+struct pr_chunk_x {
+       struct pr_chunk **chunks; /* array of chunk pointers */
+       int num; /* presumably the number of entries in "chunks" - confirm */
+};
+
+static int dopr(char *buffer, size_t maxlen, const char *format, 
+                  va_list args_in);
+static void fmtstr(char *buffer, size_t *currlen, size_t maxlen,
+                   char *value, int flags, int min, int max);
+static void fmtint(char *buffer, size_t *currlen, size_t maxlen,
+                   LLONG value, int base, int min, int max, int flags);
+static void fmtfp(char *buffer, size_t *currlen, size_t maxlen,
+                  LDOUBLE fvalue, int min, int max, int flags);
+static void dopr_outch(char *buffer, size_t *currlen, size_t maxlen, char c);
+static struct pr_chunk *new_chunk(void);
+static int add_cnk_list_entry(struct pr_chunk_x **list,
+                               int max_num, struct pr_chunk *chunk);
+
+static int dopr(char *buffer, size_t maxlen, const char *format, va_list args_in)
+{
+       char ch;
+       int state;
+       int pflag;
+       int pnum;
+       int pfirst;
+       size_t currlen;
+       va_list args;
+       const char *base;
+       struct pr_chunk *chunks = NULL;
+       struct pr_chunk *cnk = NULL;
+       struct pr_chunk_x *clist = NULL;
+       int max_pos;
+       int ret = -1;
+
+       VA_COPY(args, args_in);
+
+       state = DP_S_DEFAULT;
+       pfirst = 1;
+       pflag = 0;
+       pnum = 0;
+
+       max_pos = 0;
+       base = format;
+       ch = *format++;
+       
+       /* retrieve the string structure as chunks */
+       while (state != DP_S_DONE) {
+               if (ch == '\0') 
+                       state = DP_S_DONE;
+
+               switch(state) {
+               case DP_S_DEFAULT:
+                       
+                       if (cnk) {
+                               cnk->next = new_chunk();
+                               cnk = cnk->next;
+                       } else {
+                               cnk = new_chunk();
+                       }
+                       if (!cnk) goto done;
+                       if (!chunks) chunks = cnk;
+                       
+                       if (ch == '%') {
+                               state = DP_S_FLAGS;
+                               ch = *format++;
+                       } else {
+                               cnk->type = CNK_FMT_STR;
+                               cnk->start = format - base -1;
+                               while ((ch != '\0') && (ch != '%')) ch = *format++;
+                               cnk->len = format - base - cnk->start -1;
+                       }
+                       break;
+               case DP_S_FLAGS:
+                       switch (ch) {
+                       case '-':
+                               cnk->flags |= DP_F_MINUS;
+                               ch = *format++;
+                               break;
+                       case '+':
+                               cnk->flags |= DP_F_PLUS;
+                               ch = *format++;
+                               break;
+                       case ' ':
+                               cnk->flags |= DP_F_SPACE;
+                               ch = *format++;
+                               break;
+                       case '#':
+                               cnk->flags |= DP_F_NUM;
+                               ch = *format++;
+                               break;
+                       case '0':
+                               cnk->flags |= DP_F_ZERO;
+                               ch = *format++;
+                               break;
+                       case 'I':
+                               /* internationalization not supported yet */
+                               ch = *format++;
+                               break;
+                       default:
+                               state = DP_S_MIN;
+                               break;
+                       }
+                       break;
+               case DP_S_MIN:
+                       if (isdigit((unsigned char)ch)) {
+                               cnk->min = 10 * cnk->min + char_to_int (ch);
+                               ch = *format++;
+                       } else if (ch == '$') {
+                               if (!pfirst && !pflag) {
+                                       /* parameters must be all positioned or none */
+                                       goto done;
+                               }
+                               if (pfirst) {
+                                       pfirst = 0;
+                                       pflag = 1;
+                               }
+                               if (cnk->min == 0) /* what ?? */
+                                       goto done;
+                               cnk->num = cnk->min;
+                               cnk->min = 0;
+                               ch = *format++;
+                       } else if (ch == '*') {
+                               if (pfirst) pfirst = 0;
+                               cnk->min_star = new_chunk();
+                               if (!cnk->min_star) /* out of memory :-( */
+                                       goto done;
+                               cnk->min_star->type = CNK_INT;
+                               if (pflag) {
+                                       int num;
+                                       ch = *format++;
+                                       if (!isdigit((unsigned char)ch)) {
+                                               /* parameters must be all positioned or none */
+                                               goto done;
+                                       }
+                                       for (num = 0; isdigit((unsigned char)ch); ch = *format++) {
+                                               num = 10 * num + char_to_int(ch);
+                                       }
+                                       cnk->min_star->num = num;
+                                       if (ch != '$') /* what ?? */
+                                               goto done;
+                               } else {
+                                       cnk->min_star->num = ++pnum;
+                               }
+                               max_pos = add_cnk_list_entry(&clist, max_pos, cnk->min_star);
+                               if (max_pos == 0) /* out of memory :-( */
+                                       goto done;
+                               ch = *format++;
+                               state = DP_S_DOT;
+                       } else {
+                               if (pfirst) pfirst = 0;
+                               state = DP_S_DOT;
+                       }
+                       break;
+               case DP_S_DOT:
+                       if (ch == '.') {
+                               state = DP_S_MAX;
+                               ch = *format++;
+                       } else { 
+                               state = DP_S_MOD;
+                       }
+                       break;
+               case DP_S_MAX:
+                       if (isdigit((unsigned char)ch)) {
+                               if (cnk->max < 0)
+                                       cnk->max = 0;
+                               cnk->max = 10 * cnk->max + char_to_int (ch);
+                               ch = *format++;
+                       } else if (ch == '$') {
+                               if (!pfirst && !pflag) {
+                                       /* parameters must be all positioned or none */
+                                       goto done;
+                               }
+                               if (cnk->max <= 0) /* what ?? */
+                                       goto done;
+                               cnk->num = cnk->max;
+                               cnk->max = -1;
+                               ch = *format++;
+                       } else if (ch == '*') {
+                               cnk->max_star = new_chunk();
+                               if (!cnk->max_star) /* out of memory :-( */
+                                       goto done;
+                               cnk->max_star->type = CNK_INT;
+                               if (pflag) {
+                                       int num;
+                                       ch = *format++;
+                                       if (!isdigit((unsigned char)ch)) {
+                                               /* parameters must be all positioned or none */
+                                               goto done;
+                                       }
+                                       for (num = 0; isdigit((unsigned char)ch); ch = *format++) {
+                                               num = 10 * num + char_to_int(ch);
+                                       }
+                                       cnk->max_star->num = num;
+                                       if (ch != '$') /* what ?? */
+                                               goto done;
+                               } else {
+                                       cnk->max_star->num = ++pnum;
+                               }
+                               max_pos = add_cnk_list_entry(&clist, max_pos, cnk->max_star);
+                               if (max_pos == 0) /* out of memory :-( */
+                                       goto done;
+
+                               ch = *format++;
+                               state = DP_S_MOD;
+                       } else {
+                               state = DP_S_MOD;
+                       }
+                       break;
+               case DP_S_MOD:
+                       switch (ch) {
+                       case 'h':
+                               cnk->cflags = DP_C_SHORT;
+                               ch = *format++;
+                               if (ch == 'h') {
+                                       cnk->cflags = DP_C_CHAR;
+                                       ch = *format++;
+                               }
+                               break;
+                       case 'l':
+                               cnk->cflags = DP_C_LONG;
+                               ch = *format++;
+                               if (ch == 'l') {        /* It's a long long */
+                                       cnk->cflags = DP_C_LLONG;
+                                       ch = *format++;
+                               }
+                               break;
+                       case 'L':
+                               cnk->cflags = DP_C_LDOUBLE;
+                               ch = *format++;
+                               break;
+                       case 'z':
+                               cnk->cflags = DP_C_SIZET;
+                               ch = *format++;
+                               break;
+                       default:
+                               break;
+                       }
+                       state = DP_S_CONV;
+                       break;
+               case DP_S_CONV:
+                       if (cnk->num == 0) cnk->num = ++pnum;
+                       max_pos = add_cnk_list_entry(&clist, max_pos, cnk);
+                       if (max_pos == 0) /* out of memory :-( */
+                               goto done;
+                       
+                       switch (ch) {
+                       case 'd':
+                       case 'i':
+                               cnk->type = CNK_INT;
+                               break;
+                       case 'o':
+                               cnk->type = CNK_OCTAL;
+                               cnk->flags |= DP_F_UNSIGNED;
+                               break;
+                       case 'u':
+                               cnk->type = CNK_UINT;
+                               cnk->flags |= DP_F_UNSIGNED;
+                               break;
+                       case 'X':
+                               cnk->flags |= DP_F_UP;
+                       case 'x':
+                               cnk->type = CNK_HEX;
+                               cnk->flags |= DP_F_UNSIGNED;
+                               break;
+                       case 'A':
+                               /* hex float not supported yet */
+                       case 'E':
+                       case 'G':
+                       case 'F':
+                               cnk->flags |= DP_F_UP;
+                       case 'a':
+                               /* hex float not supported yet */
+                       case 'e':
+                       case 'f':
+                       case 'g':
+                               cnk->type = CNK_FLOAT;
+                               break;
+                       case 'c':
+                               cnk->type = CNK_CHAR;
+                               break;
+                       case 's':
+                               cnk->type = CNK_STRING;
+                               break;
+                       case 'p':
+                               cnk->type = CNK_PTR;
+                               cnk->flags |= DP_F_UNSIGNED;
+                               break;
+                       case 'n':
+                               cnk->type = CNK_NUM;
+                               break;
+                       case '%':
+                               cnk->type = CNK_PRCNT;
+                               break;
+                       default:
+                               /* Unknown, bail out*/
+                               goto done;
+                       }
+                       ch = *format++;
+                       state = DP_S_DEFAULT;
+                       break;
+               case DP_S_DONE:
+                       break;
+               default:
+                       /* hmm? */
+                       break; /* some picky compilers need this */
+               }
+       }
+
+       /* retrieve the format arguments */
+       for (pnum = 0; pnum < max_pos; pnum++) {
+               int i;
+
+               if (clist[pnum].num == 0) {
+                       /* ignoring a parameter should not be permitted
+                        * all parameters must be matched at least once
+                        * BUT seem some system ignore this rule ...
+                        * at least my glibc based system does --SSS
+                        */
+#ifdef DEBUG_SNPRINTF
+                       printf("parameter at position %d not used\n", pnum+1);
+#endif
+                       /* eat the parameter */
+                       va_arg (args, int);
+                       continue;
+               }
+               for (i = 1; i < clist[pnum].num; i++) {
+                       if (clist[pnum].chunks[0]->type != clist[pnum].chunks[i]->type) {
+                               /* nooo noo no!
+                                * all the references to a parameter
+                                * must be of the same type
+                                */
+                               goto done;
+                       }
+               }
+               cnk = clist[pnum].chunks[0];
+               switch (cnk->type) {
+               case CNK_INT:
+                       if (cnk->cflags == DP_C_SHORT) 
+                               cnk->value = va_arg (args, int);
+                       else if (cnk->cflags == DP_C_LONG)
+                               cnk->value = va_arg (args, long int);
+                       else if (cnk->cflags == DP_C_LLONG)
+                               cnk->value = va_arg (args, LLONG);
+                       else if (cnk->cflags == DP_C_SIZET)
+                               cnk->value = va_arg (args, ssize_t);
+                       else
+                               cnk->value = va_arg (args, int);
+
+                       for (i = 1; i < clist[pnum].num; i++) {
+                               clist[pnum].chunks[i]->value = cnk->value;
+                       }
+                       break;
+
+               case CNK_OCTAL:
+               case CNK_UINT:
+               case CNK_HEX:
+                       if (cnk->cflags == DP_C_SHORT)
+                               cnk->value = va_arg (args, unsigned int);
+                       else if (cnk->cflags == DP_C_LONG)
+                               cnk->value = (unsigned long int)va_arg (args, unsigned long int);
+                       else if (cnk->cflags == DP_C_LLONG)
+                               cnk->value = (LLONG)va_arg (args, unsigned LLONG);
+                       else if (cnk->cflags == DP_C_SIZET)
+                               cnk->value = (size_t)va_arg (args, size_t);
+                       else
+                               cnk->value = (unsigned int)va_arg (args, unsigned int);
+
+                       for (i = 1; i < clist[pnum].num; i++) {
+                               clist[pnum].chunks[i]->value = cnk->value;
+                       }
+                       break;
+
+               case CNK_FLOAT:
+                       if (cnk->cflags == DP_C_LDOUBLE)
+                               cnk->fvalue = va_arg (args, LDOUBLE);
+                       else
+                               cnk->fvalue = va_arg (args, double);
+
+                       for (i = 1; i < clist[pnum].num; i++) {
+                               clist[pnum].chunks[i]->fvalue = cnk->fvalue;
+                       }
+                       break;
+
+               case CNK_CHAR:
+                       cnk->value = va_arg (args, int);
+
+                       for (i = 1; i < clist[pnum].num; i++) {
+                               clist[pnum].chunks[i]->value = cnk->value;
+                       }
+                       break;
+
+               case CNK_STRING:
+                       cnk->strvalue = va_arg (args, char *);
+                       if (!cnk->strvalue) cnk->strvalue = "(NULL)";
+
+                       for (i = 1; i < clist[pnum].num; i++) {
+                               clist[pnum].chunks[i]->strvalue = cnk->strvalue;
+                       }
+                       break;
+
+               case CNK_PTR:
+                       cnk->strvalue = va_arg (args, void *);
+                       for (i = 1; i < clist[pnum].num; i++) {
+                               clist[pnum].chunks[i]->strvalue = cnk->strvalue;
+                       }
+                       break;
+
+               case CNK_NUM:
+                       if (cnk->cflags == DP_C_CHAR)
+                               cnk->pnum = va_arg (args, char *);
+                       else if (cnk->cflags == DP_C_SHORT)
+                               cnk->pnum = va_arg (args, short int *);
+                       else if (cnk->cflags == DP_C_LONG)
+                               cnk->pnum = va_arg (args, long int *);
+                       else if (cnk->cflags == DP_C_LLONG)
+                               cnk->pnum = va_arg (args, LLONG *);
+                       else if (cnk->cflags == DP_C_SIZET)
+                               cnk->pnum = va_arg (args, ssize_t *);
+                       else
+                               cnk->pnum = va_arg (args, int *);
+
+                       for (i = 1; i < clist[pnum].num; i++) {
+                               clist[pnum].chunks[i]->pnum = cnk->pnum;
+                       }
+                       break;
+
+               case CNK_PRCNT:
+                       break;
+
+               default:
+                       /* what ?? */
+                       goto done;
+               }
+       }
+       /* print out the actual string from chunks */
+       currlen = 0;
+       cnk = chunks;
+       while (cnk) {
+               int len, min, max;
+
+               if (cnk->min_star) min = cnk->min_star->value;
+               else min = cnk->min;
+               if (cnk->max_star) max = cnk->max_star->value;
+               else max = cnk->max;
+
+               switch (cnk->type) {
+
+               case CNK_FMT_STR:
+                       if (maxlen != 0 && maxlen > currlen) {
+                               if (maxlen > (currlen + cnk->len)) len = cnk->len;
+                               else len = maxlen - currlen;
+
+                               memcpy(&(buffer[currlen]), &(base[cnk->start]), len);
+                       }
+                       currlen += cnk->len;
+                               
+                       break;
+
+               case CNK_INT:
+               case CNK_UINT:
+                       fmtint (buffer, &currlen, maxlen, cnk->value, 10, min, max, cnk->flags);
+                       break;
+
+               case CNK_OCTAL:
+                       fmtint (buffer, &currlen, maxlen, cnk->value, 8, min, max, cnk->flags);
+                       break;
+
+               case CNK_HEX:
+                       fmtint (buffer, &currlen, maxlen, cnk->value, 16, min, max, cnk->flags);
+                       break;
+
+               case CNK_FLOAT:
+                       fmtfp (buffer, &currlen, maxlen, cnk->fvalue, min, max, cnk->flags);
+                       break;
+
+               case CNK_CHAR:
+                       dopr_outch (buffer, &currlen, maxlen, cnk->value);
+                       break;
+
+               case CNK_STRING:
+                       if (max == -1) {
+                               max = strlen(cnk->strvalue);
+                       }
+                       fmtstr (buffer, &currlen, maxlen, cnk->strvalue, cnk->flags, min, max);
+                       break;
+
+               case CNK_PTR:
+                       fmtint (buffer, &currlen, maxlen, (long)(cnk->strvalue), 16, min, max, cnk->flags);
+                       break;
+
+               case CNK_NUM:
+                       if (cnk->cflags == DP_C_CHAR)
+                               *((char *)(cnk->pnum)) = (char)currlen;
+                       else if (cnk->cflags == DP_C_SHORT)
+                               *((short int *)(cnk->pnum)) = (short int)currlen;
+                       else if (cnk->cflags == DP_C_LONG)
+                               *((long int *)(cnk->pnum)) = (long int)currlen;
+                       else if (cnk->cflags == DP_C_LLONG)
+                               *((LLONG *)(cnk->pnum)) = (LLONG)currlen;
+                       else if (cnk->cflags == DP_C_SIZET)
+                               *((ssize_t *)(cnk->pnum)) = (ssize_t)currlen;
+                       else
+                               *((int *)(cnk->pnum)) = (int)currlen;
+                       break;
+
+               case CNK_PRCNT:
+                       dopr_outch (buffer, &currlen, maxlen, '%');
+                       break;
+
+               default:
+                       /* what ?? */
+                       goto done;
+               }
+               cnk = cnk->next;
+       }
+       if (maxlen != 0) {
+               if (currlen < maxlen - 1) 
+                       buffer[currlen] = '\0';
+               else if (maxlen > 0) 
+                       buffer[maxlen - 1] = '\0';
+       }
+       ret = currlen;
+
+done:
+       va_end(args);
+
+       while (chunks) {
+               cnk = chunks->next;
+               free(chunks);
+               chunks = cnk;
+       }
+       if (clist) {
+               for (pnum = 0; pnum < max_pos; pnum++) {
+                       if (clist[pnum].chunks) free(clist[pnum].chunks);
+               }
+               free(clist);
+       }
+       return ret;
+}
+
+/*
+ * Emit a string conversion: at most max characters of value, padded with
+ * blanks up to a field width of min.  The blanks go in front of the text
+ * unless DP_F_MINUS is set in flags, in which case they follow it.
+ * A null value is printed as "<NULL>".
+ */
+static void fmtstr(char *buffer, size_t *currlen, size_t maxlen,
+                   char *value, int flags, int min, int max)
+{
+       int shown = 0;  /* characters of value that will be printed */
+       int fill = 0;   /* blanks needed to reach the field width */
+       int i;
+
+#ifdef DEBUG_SNPRINTF
+       printf("fmtstr min=%d max=%d s=[%s]\n", min, max, value);
+#endif
+       if (!value)
+               value = "<NULL>";
+
+       /* bounded strlen: never look further than max characters */
+       while (shown < max && value[shown])
+               shown++;
+
+       if (min > shown)
+               fill = min - shown;
+
+       if (!(flags & DP_F_MINUS)) {
+               /* right justified: blanks come first */
+               for (i = 0; i < fill; i++)
+                       dopr_outch (buffer, currlen, maxlen, ' ');
+               fill = 0;
+       }
+
+       for (i = 0; i < shown; i++)
+               dopr_outch (buffer, currlen, maxlen, value[i]);
+
+       /* left justified: blanks come last (fill is zero otherwise) */
+       for (i = 0; i < fill; i++)
+               dopr_outch (buffer, currlen, maxlen, ' ');
+}
+
+/* Have to handle DP_F_NUM (ie 0x and 0 alternates) */
+
+/*
+ * Format an integer conversion into the output buffer.
+ *
+ * buffer/currlen/maxlen: output state, handed on to dopr_outch().
+ * value: the number; treated as unsigned when DP_F_UNSIGNED is set.
+ * base:  radix used to produce the digits.
+ * min:   minimum field width.
+ * max:   minimum number of digits (precision); negative means none.
+ * flags: DP_F_UNSIGNED, DP_F_PLUS, DP_F_SPACE, DP_F_UP, DP_F_ZERO and
+ *        DP_F_MINUS are honoured.
+ */
+static void fmtint(char *buffer, size_t *currlen, size_t maxlen,
+                   LLONG value, int base, int min, int max, int flags)
+{
+       int signvalue = 0;
+       unsigned LLONG uvalue;
+       /*
+        * A 64-bit value needs 22 digits in octal and 20 in decimal, plus
+        * the terminating \0.  The former 20 byte buffer dropped the most
+        * significant digit of 20-digit numbers and truncated octal output.
+        */
+       char convert[22+1];
+       int place = 0;
+       int spadlen = 0; /* amount to space pad */
+       int zpadlen = 0; /* amount to zero pad */
+       int caps = 0;
+
+       if (max < 0)
+               max = 0;
+
+       uvalue = value;
+
+       if (!(flags & DP_F_UNSIGNED)) {
+               if (value < 0) {
+                       signvalue = '-';
+                       /*
+                        * Negate in unsigned arithmetic: -value overflows
+                        * for the most negative LLONG.
+                        */
+                       uvalue = 0 - (unsigned LLONG)value;
+               } else if (flags & DP_F_PLUS) {
+                       signvalue = '+';
+               } else if (flags & DP_F_SPACE) {
+                       signvalue = ' ';
+               }
+       }
+
+       if (flags & DP_F_UP) caps = 1; /* Should characters be upper case? */
+
+       /* digits are generated least significant first */
+       do {
+               convert[place++] =
+                       (caps? "0123456789ABCDEF":"0123456789abcdef")
+                       [uvalue % (unsigned)base  ];
+               uvalue = (uvalue / (unsigned)base );
+       } while (uvalue && (place < (int)sizeof(convert) - 1));
+       convert[place] = 0;
+
+       zpadlen = max - place;
+       spadlen = min - MAX (max, place) - (signvalue ? 1 : 0);
+       if (zpadlen < 0) zpadlen = 0;
+       if (spadlen < 0) spadlen = 0;
+       if (flags & DP_F_ZERO) {
+               /* zero padding replaces the blanks entirely */
+               zpadlen = MAX(zpadlen, spadlen);
+               spadlen = 0;
+       }
+       if (flags & DP_F_MINUS)
+               spadlen = -spadlen; /* Left Justify */
+
+#ifdef DEBUG_SNPRINTF
+       printf("zpad: %d, spad: %d, min: %d, max: %d, place: %d\n",
+              zpadlen, spadlen, min, max, place);
+#endif
+
+       /* Spaces */
+       while (spadlen > 0) {
+               dopr_outch (buffer, currlen, maxlen, ' ');
+               --spadlen;
+       }
+
+       /* Sign */
+       if (signvalue)
+               dopr_outch (buffer, currlen, maxlen, signvalue);
+
+       /* Zeros */
+       while (zpadlen > 0) {
+               dopr_outch (buffer, currlen, maxlen, '0');
+               --zpadlen;
+       }
+
+       /* Digits, most significant first */
+       while (place > 0)
+               dopr_outch (buffer, currlen, maxlen, convert[--place]);
+
+       /* Left Justified spaces */
+       while (spadlen < 0) {
+               dopr_outch (buffer, currlen, maxlen, ' ');
+               ++spadlen;
+       }
+}
+
+/* return value with its sign removed when it compares below zero */
+static LDOUBLE abs_val(LDOUBLE value)
+{
+       return (value < 0) ? -value : value;
+}
+
+/*
+ * Return 10 raised to the power exp, computed by repeated multiplication.
+ * A zero or negative exp yields 1; testing for "> 0" keeps a negative
+ * argument from counting down through the whole int range.
+ */
+static LDOUBLE POW10(int exp)
+{
+       LDOUBLE result = 1;
+
+       while (exp > 0) {
+               result *= 10;
+               exp--;
+       }
+
+       return result;
+}
+
+/*
+ * Convert value to an integer: truncate it, then add one when the part
+ * cut off is at least one half.
+ */
+static LLONG ROUND(LDOUBLE value)
+{
+       LLONG whole = (LLONG)value;
+       LDOUBLE rest = value - whole;
+
+       return (rest >= 0.5) ? whole + 1 : whole;
+}
+
+/* a replacement for modf that doesn't need the math library. Should
+   be portable, but slow */
+/*
+ * Split x0 into an integral part, stored through iptr, and a fractional
+ * part, which is returned.  Values too large for the integer cast are
+ * scaled down by powers of ten first and the remainder is handled by a
+ * recursive call.  After 100 scaling steps the function gives up and
+ * reports 0 for both parts.
+ */
+static double my_modf(double x0, double *iptr)
+{
+       LLONG whole = 0;
+       double scaled = x0;
+       double scale = 1.0;
+       int steps = 0;
+       double rest_int;
+       double rest_frac;
+
+       while (steps < 100) {
+               whole = (long)scaled;
+               /* the cast is usable once it lands within 1 of scaled */
+               if (whole <= (scaled+1) && whole >= (scaled-1)) break;
+               scaled *= 0.1;
+               scale *= 10.0;
+               steps++;
+       }
+
+       if (steps == 100) {
+               /* the number is beyond what can be handled here */
+               (*iptr) = 0;
+               return 0;
+       }
+
+       if (steps == 0) {
+               /* no scaling was needed */
+               (*iptr) = whole;
+               return scaled - (*iptr);
+       }
+
+       /* peel off the leading digits and split what is left */
+       rest_frac = my_modf(x0-whole*scale, &rest_int);
+       (*iptr) = whole*scale + rest_int;
+       return rest_frac;
+}
+
+
+/*
+ * Format a floating point value in plain decimal notation: optional sign,
+ * integer digits and, when the precision is non-zero, a '.' followed by
+ * exactly that many fractional digits.  No exponent form is produced here.
+ *
+ * buffer/currlen/maxlen: output state, handed on to dopr_outch().
+ * fvalue: value to print (narrowed to double for the conversion).
+ * min:    minimum field width.
+ * max:    precision; negative selects the default of 6, values above 9
+ *         are clamped to 9.
+ * flags:  DP_F_PLUS, DP_F_SPACE, DP_F_ZERO and DP_F_MINUS are honoured.
+ *
+ * NOTE(review): nothing here special-cases NaN or infinity.
+ */
+static void fmtfp (char *buffer, size_t *currlen, size_t maxlen,
+                  LDOUBLE fvalue, int min, int max, int flags)
+{
+       int signvalue = 0;
+       double ufvalue;
+       char iconvert[311];
+       char fconvert[311];
+       int iplace = 0;
+       int fplace = 0;
+       int padlen = 0; /* amount to pad */
+       int zpadlen = 0;
+       int caps = 0;   /* stays 0: the DP_F_UP check below is compiled out */
+       int idx;
+       double intpart;
+       double fracpart;
+       double temp;
+
+       /*
+        * AIX manpage says the default is 0, but Solaris says the default
+        * is 6, and sprintf on AIX defaults to 6
+        */
+       if (max < 0)
+               max = 6;
+
+       ufvalue = abs_val (fvalue);
+
+       if (fvalue < 0) {
+               signvalue = '-';
+       } else {
+               if (flags & DP_F_PLUS) { /* Do a sign (+/i) */
+                       signvalue = '+';
+               } else {
+                       if (flags & DP_F_SPACE)
+                               signvalue = ' ';
+               }
+       }
+
+#if 0
+       if (flags & DP_F_UP) caps = 1; /* Should characters be upper case? */
+#endif
+
+#if 0
+        if (max == 0) ufvalue += 0.5; /* if max = 0 we must round */
+#endif
+
+       /*
+        * Sorry, we only support 9 digits past the decimal because of our
+        * conversion method
+        */
+       if (max > 9)
+               max = 9;
+
+       /* We "cheat" by converting the fractional part to integer by
+        * multiplying by a factor of 10
+        */
+
+       temp = ufvalue;
+       my_modf(temp, &intpart);
+
+       fracpart = ROUND((POW10(max)) * (ufvalue - intpart));
+
+       /* rounding the fraction may carry into the integer part */
+       if (fracpart >= POW10(max)) {
+               intpart++;
+               fracpart -= POW10(max);
+       }
+
+
+       /* Convert integer part, least significant digit first */
+       do {
+               temp = intpart*0.1;
+               my_modf(temp, &intpart);
+               idx = (int) ((temp -intpart +0.05)* 10.0);
+               /* idx = (int) (((double)(temp*0.1) -intpart +0.05) *10.0); */
+               /* printf ("%llf, %f, %x\n", temp, intpart, idx); */
+               iconvert[iplace++] =
+                       (caps? "0123456789ABCDEF":"0123456789abcdef")[idx];
+       } while (intpart && (iplace < 311));
+       if (iplace == 311) iplace--;
+       iconvert[iplace] = 0;
+
+       /* Convert fractional part, least significant digit first */
+       if (fracpart)
+       {
+               do {
+                       temp = fracpart*0.1;
+                       my_modf(temp, &fracpart);
+                       idx = (int) ((temp -fracpart +0.05)* 10.0);
+                       /* idx = (int) ((((temp/10) -fracpart) +0.05) *10); */
+                       /* printf ("%lf, %lf, %ld\n", temp, fracpart, idx ); */
+                       fconvert[fplace++] =
+                       (caps? "0123456789ABCDEF":"0123456789abcdef")[idx];
+               } while(fracpart && (fplace < 311));
+               if (fplace == 311) fplace--;
+       }
+       fconvert[fplace] = 0;
+
+       /*
+        * Everything that gets printed counts against the field width: the
+        * integer digits, the sign if there is one and, only when a
+        * fractional part is printed at all (max > 0), the decimal point
+        * plus max fractional digits.
+        */
+       padlen = min - iplace - ((max > 0) ? (max + 1) : 0) - ((signvalue) ? 1 : 0);
+       zpadlen = max - fplace;
+       if (zpadlen < 0) zpadlen = 0;
+       if (padlen < 0)
+               padlen = 0;
+       if (flags & DP_F_MINUS)
+               padlen = -padlen; /* Left Justify */
+
+       if ((flags & DP_F_ZERO) && (padlen > 0)) {
+               if (signvalue) {
+                       /*
+                        * The sign goes in front of the zeros.  padlen has
+                        * already been reduced for it above, so it must not
+                        * be decremented a second time here.
+                        */
+                       dopr_outch (buffer, currlen, maxlen, signvalue);
+                       signvalue = 0;
+               }
+               while (padlen > 0) {
+                       dopr_outch (buffer, currlen, maxlen, '0');
+                       --padlen;
+               }
+       }
+       while (padlen > 0) {
+               dopr_outch (buffer, currlen, maxlen, ' ');
+               --padlen;
+       }
+       if (signvalue)
+               dopr_outch (buffer, currlen, maxlen, signvalue);
+
+       while (iplace > 0)
+               dopr_outch (buffer, currlen, maxlen, iconvert[--iplace]);
+
+#ifdef DEBUG_SNPRINTF
+       printf("fmtfp: fplace=%d zpadlen=%d\n", fplace, zpadlen);
+#endif
+
+       /*
+        * Decimal point.  This should probably use locale to find the correct
+        * char to print out.
+        */
+       if (max > 0) {
+               dopr_outch (buffer, currlen, maxlen, '.');
+
+               /* leading zeros of the fraction that the conversion dropped */
+               while (zpadlen > 0) {
+                       dopr_outch (buffer, currlen, maxlen, '0');
+                       --zpadlen;
+               }
+
+               while (fplace > 0)
+                       dopr_outch (buffer, currlen, maxlen, fconvert[--fplace]);
+       }
+
+       /* Left justified: the blanks follow the number */
+       while (padlen < 0) {
+               dopr_outch (buffer, currlen, maxlen, ' ');
+               ++padlen;
+       }
+}
+
+/*
+ * Append one character to the output.  The character is stored only while
+ * the current position is below maxlen, but the position is advanced in
+ * every case.
+ */
+static void dopr_outch(char *buffer, size_t *currlen, size_t maxlen, char c)
+{
+       size_t pos = *currlen;
+
+       if (pos < maxlen)
+               buffer[pos] = c;
+
+       *currlen = pos + 1;
+}
+
+/*
+ * Allocate a pr_chunk and give every field its initial value: all zero
+ * or NULL, except max which starts at -1.  Returns NULL if malloc fails.
+ */
+static struct pr_chunk *new_chunk(void) {
+       struct pr_chunk *fresh;
+
+       fresh = (struct pr_chunk *)malloc(sizeof(struct pr_chunk));
+       if (fresh) {
+               /* what kind of chunk this is and which argument it uses */
+               fresh->type = 0;
+               fresh->num = 0;
+               fresh->flags = 0;
+               fresh->cflags = 0;
+
+               /* width and precision */
+               fresh->min = 0;
+               fresh->min_star = NULL;
+               fresh->max = -1;
+               fresh->max_star = NULL;
+
+               /* span inside the format string */
+               fresh->start = 0;
+               fresh->len = 0;
+
+               /* argument storage */
+               fresh->value = 0;
+               fresh->fvalue = 0;
+               fresh->strvalue = NULL;
+               fresh->pnum = NULL;
+
+               fresh->next = NULL;
+       }
+
+       return fresh;
+}
+
+/*
+ * Record that chunk refers to argument number chunk->num.
+ *
+ * *list is an array with one entry per argument position (entry
+ * chunk->num - 1 belongs to argument chunk->num); every entry holds an
+ * array of the chunks referring to that argument.  The list is grown when
+ * chunk->num is larger than max_num, the number of entries it has so far.
+ *
+ * Returns the number of entries now in the list, or 0 on failure (out of
+ * memory, or an argument number below 1).  On failure the per-entry chunk
+ * arrays have been released; the list itself is still owned by the caller
+ * and *list always points at the current allocation.
+ */
+static int add_cnk_list_entry(struct pr_chunk_x **list,
+                               int max_num, struct pr_chunk *chunk) {
+       struct pr_chunk_x *l = *list;
+       struct pr_chunk **c;
+       int max = max_num;      /* initialised entries in l */
+       int cnum;
+       int i, pos;
+
+       /* argument numbers start at 1; anything else would index l[-1] */
+       if (chunk->num <= 0)
+               goto fail;
+
+       if (chunk->num > max_num) {
+               struct pr_chunk_x *grown;
+
+               if (l == NULL) {
+                       grown = (struct pr_chunk_x *)malloc(sizeof(struct pr_chunk_x) * chunk->num);
+                       pos = 0;
+               } else {
+                       grown = (struct pr_chunk_x *)realloc(l, sizeof(struct pr_chunk_x) * chunk->num);
+                       pos = max_num;
+               }
+               if (grown == NULL) {
+                       /* the old block, if any, is untouched and still holds max_num entries */
+                       goto fail;
+               }
+
+               /*
+                * Publish the new block right away: realloc may have moved
+                * it, and the caller must not be left with a stale pointer
+                * if the allocation further down fails.
+                */
+               l = grown;
+               *list = l;
+               max = chunk->num;
+
+               for (i = pos; i < max; i++) {
+                       l[i].chunks = NULL;
+                       l[i].num = 0;
+               }
+       }
+
+       if (l == NULL)
+               goto fail;
+
+       i = chunk->num - 1;
+       cnum = l[i].num + 1;
+       if (l[i].chunks == NULL) {
+               c = (struct pr_chunk **)malloc(sizeof(struct pr_chunk *) * cnum);
+       } else {
+               c = (struct pr_chunk **)realloc(l[i].chunks, sizeof(struct pr_chunk *) * cnum);
+       }
+       if (c == NULL)
+               goto fail;
+
+       c[l[i].num] = chunk;
+       l[i].chunks = c;
+       l[i].num = cnum;
+
+       *list = l;
+       return max;
+
+fail:
+       /* release the per-entry arrays, staying inside the initialised part */
+       if (l != NULL) {
+               for (i = 0; i < max; i++) {
+                       if (l[i].chunks) free(l[i].chunks);
+                       l[i].chunks = NULL;
+                       l[i].num = 0;
+               }
+       }
+       return 0;
+}
+
+ int rep_vsnprintf (char *str, size_t count, const char *fmt, va_list args)
+{
+       /* Thin wrapper: the formatting itself is done entirely by dopr(). */
+       int written = dopr(str, count, fmt, args);
+
+       return written;
+}
+#endif
+
+/* yes this really must be a ||. Don't muck with this (tridge)
+ *
+ * The logic for these two is that we need our own definition if the
+ * OS *either* has no definition of *sprintf, or if it does have one
+ * that doesn't work properly according to the autoconf test.
+ */
+#if !defined(HAVE_SNPRINTF) || !defined(HAVE_C99_VSNPRINTF)
+/*
+ * Variadic front end for vsnprintf(): formats FMT and its arguments into
+ * STR (at most COUNT bytes) and returns whatever vsnprintf() returned.
+ */
+ int rep_snprintf(char *str,size_t count,const char *fmt,...)
+{
+       /*
+        * Keep the result in an int, the type vsnprintf() returns, so a
+        * negative error value is never routed through an unsigned type.
+        */
+       int ret;
+       va_list ap;
+
+       va_start(ap, fmt);
+       ret = vsnprintf(str, count, fmt, ap);
+       va_end(ap);
+       return ret;
+}
+#endif
+
+#ifndef HAVE_C99_VSNPRINTF
+/*
+ * printf() replacement: formats into a temporary heap buffer with
+ * vasprintf() and writes that buffer to stdout.
+ *
+ * Returns the value reported by vasprintf(), or -1 if the formatted text
+ * could not be written completely.
+ */
+ int rep_printf(const char *fmt, ...)
+{
+       va_list ap;
+       int ret;
+       char *s;
+
+       s = NULL;
+       va_start(ap, fmt);
+       ret = vasprintf(&s, fmt, ap);
+       va_end(ap);
+
+       /*
+        * Write exactly the formatted length rather than strlen(s), so output
+        * containing an embedded NUL (e.g. "%c" with 0) is not cut short, and
+        * write nothing at all when formatting failed.
+        */
+       if (ret >= 0 && s != NULL) {
+               if (fwrite(s, 1, (size_t)ret, stdout) != (size_t)ret) {
+                       ret = -1;
+               }
+       }
+       free(s);
+
+       return ret;
+}
+#endif
+
+#ifndef HAVE_C99_VSNPRINTF
+/*
+ * fprintf() replacement: formats into a temporary heap buffer with
+ * vasprintf() and writes that buffer to STREAM.
+ *
+ * Returns the value reported by vasprintf(), or -1 if the formatted text
+ * could not be written completely.
+ */
+ int rep_fprintf(FILE *stream, const char *fmt, ...)
+{
+       va_list ap;
+       int ret;
+       char *s;
+
+       s = NULL;
+       va_start(ap, fmt);
+       ret = vasprintf(&s, fmt, ap);
+       va_end(ap);
+
+       /*
+        * Write exactly the formatted length rather than strlen(s), so output
+        * containing an embedded NUL (e.g. "%c" with 0) is not cut short, and
+        * write nothing at all when formatting failed.
+        */
+       if (ret >= 0 && s != NULL) {
+               if (fwrite(s, 1, (size_t)ret, stream) != (size_t)ret) {
+                       ret = -1;
+               }
+       }
+       free(s);
+
+       return ret;
+}
+#endif
+
+#endif 
+
+#if !defined(HAVE_VASPRINTF) || !defined(HAVE_C99_VSNPRINTF)
+/*
+ * vasprintf() replacement built on vsnprintf().
+ *
+ * A first vsnprintf() pass with a NULL buffer measures the output, then a
+ * buffer of that size (plus the terminator) is allocated and filled by a
+ * second pass.  AP is copied before each pass because a va_list cannot be
+ * traversed twice.
+ *
+ * Returns the result of the second vsnprintf() pass with *PTR pointing at
+ * the malloc()ed string.  Returns a negative value on failure; *PTR is left
+ * untouched if measuring failed and is NULL if allocation or formatting
+ * failed.
+ */
+ int rep_vasprintf(char **ptr, const char *format, va_list ap)
+{
+       int ret;
+       int len;
+       va_list ap2;
+
+       VA_COPY(ap2, ap);
+       len = vsnprintf(NULL, 0, format, ap2);
+       va_end(ap2);
+       if (len < 0) return len;
+
+       (*ptr) = (char *)malloc(len+1);
+       if (!*ptr) return -1;
+
+       VA_COPY(ap2, ap);
+       ret = vsnprintf(*ptr, len+1, format, ap2);
+       va_end(ap2);
+
+       if (ret < 0) {
+               /* Do not hand back (or leak) a buffer with unusable contents. */
+               free(*ptr);
+               *ptr = NULL;
+       }
+
+       return ret;
+}
+#endif
+
+#if !defined(HAVE_ASPRINTF) || !defined(HAVE_C99_VSNPRINTF)
+ int rep_asprintf(char **ptr, const char *format, ...)
+{
+       int nbytes;
+       va_list args;
+
+       /* Clear the result pointer before vasprintf() gets a chance to set it. */
+       *ptr = NULL;
+
+       va_start(args, format);
+       nbytes = vasprintf(ptr, format, args);
+       va_end(args);
+
+       return nbytes;
+}
+#endif
+
+#ifdef TEST_SNPRINTF
+
+ int sprintf(char *str,const char *fmt,...);
+ int printf(const char *fmt,...);
+
+/*
+ * Self-test driver, built only when TEST_SNPRINTF is defined.
+ *
+ * Each table of format strings is crossed with a table of values; every
+ * combination is formatted once with snprintf() and once with sprintf(),
+ * and a mismatch in the text or in the returned length is reported and
+ * counted.  The value tables are terminated by a 0 (or NULL) entry, so a
+ * zero value itself is never tested.  A final loop probes how many digits
+ * of a double the two implementations agree on.
+ */
+ int main (void)
+{
+       char buf1[1024];
+       char buf2[1024];
+       char *buf3;
+       char *fp_fmt[] = {
+               "%1.1f",
+               "%-1.5f",
+               "%1.5f",
+               "%123.9f",
+               "%10.5f",
+               "% 10.5f",
+               "%+22.9f",
+               "%+4.9f",
+               "%01.3f",
+               "%4f",
+               "%3.1f",
+               "%3.2f",
+               "%.0f",
+               "%f",
+               "%-8.8f",
+               "%-9.9f",
+               NULL
+       };
+       double fp_nums[] = { 6442452944.1234, -1.5, 134.21, 91340.2, 341.1234, 203.9, 0.96, 0.996,
+                            0.9996, 1.996, 4.136, 5.030201, 0.00205,
+                            /* END LIST */ 0};
+       char *int_fmt[] = {
+               "%-1.5d",
+               "%1.5d",
+               "%123.9d",
+               "%5.5d",
+               "%10.5d",
+               "% 10.5d",
+               "%+22.33d",
+               "%01.3d",
+               "%4d",
+               "%d",
+               NULL
+       };
+       /* int, not long: these values are consumed by %d conversions. */
+       int int_nums[] = { -1, 134, 91340, 341, 0203, 1234567890, 0};
+       char *str_fmt[] = {
+               "%10.5s",
+               "%-10.5s",
+               "%5.10s",
+               "%-5.10s",
+               "%10.1s",
+               "%0.10s",
+               "%10.0s",
+               "%1.10s",
+               "%s",
+               "%.1s",
+               "%.10s",
+               "%10s",
+               NULL
+       };
+       char *str_vals[] = {"hello", "a", "", "a longer string", NULL};
+#ifdef HAVE_LONG_LONG
+       char *ll_fmt[] = {
+               "%llu",
+               NULL
+       };
+       LLONG ll_nums[] = { 134, 91340, 341, 0203, 1234567890, 128006186140000000LL, 0};
+#endif
+       int x, y;
+       int fail = 0;
+       int num = 0;
+       int l1, l2;
+       char *ss_fmt[] = {
+               "%zd",
+               "%zu",
+               NULL
+       };
+       size_t ss_nums[] = {134, 91340, 123456789, 0203, 1234567890, 0};
+
+       printf ("Testing snprintf format codes against system sprintf...\n");
+
+       /* Floating point conversions. */
+       for (x = 0; fp_fmt[x] ; x++) {
+               for (y = 0; fp_nums[y] != 0 ; y++) {
+                       buf1[0] = buf2[0] = '\0';
+                       l1 = snprintf(buf1, sizeof(buf1), fp_fmt[x], fp_nums[y]);
+                       l2 = sprintf (buf2, fp_fmt[x], fp_nums[y]);
+                       buf1[1023] = buf2[1023] = '\0';
+                       if (strcmp (buf1, buf2) || (l1 != l2)) {
+                               printf("snprintf doesn't match Format: %s\n\tsnprintf(%d) = [%s]\n\t sprintf(%d) = [%s]\n",
+                                      fp_fmt[x], l1, buf1, l2, buf2);
+                               fail++;
+                       }
+                       num++;
+               }
+       }
+
+       /* Integer conversions. */
+       for (x = 0; int_fmt[x] ; x++) {
+               for (y = 0; int_nums[y] != 0 ; y++) {
+                       buf1[0] = buf2[0] = '\0';
+                       l1 = snprintf(buf1, sizeof(buf1), int_fmt[x], int_nums[y]);
+                       l2 = sprintf (buf2, int_fmt[x], int_nums[y]);
+                       buf1[1023] = buf2[1023] = '\0';
+                       if (strcmp (buf1, buf2) || (l1 != l2)) {
+                               printf("snprintf doesn't match Format: %s\n\tsnprintf(%d) = [%s]\n\t sprintf(%d) = [%s]\n",
+                                      int_fmt[x], l1, buf1, l2, buf2);
+                               fail++;
+                       }
+                       num++;
+               }
+       }
+
+       /* String conversions. */
+       for (x = 0; str_fmt[x] ; x++) {
+               for (y = 0; str_vals[y] != 0 ; y++) {
+                       buf1[0] = buf2[0] = '\0';
+                       l1 = snprintf(buf1, sizeof(buf1), str_fmt[x], str_vals[y]);
+                       l2 = sprintf (buf2, str_fmt[x], str_vals[y]);
+                       buf1[1023] = buf2[1023] = '\0';
+                       if (strcmp (buf1, buf2) || (l1 != l2)) {
+                               printf("snprintf doesn't match Format: %s\n\tsnprintf(%d) = [%s]\n\t sprintf(%d) = [%s]\n",
+                                      str_fmt[x], l1, buf1, l2, buf2);
+                               fail++;
+                       }
+                       num++;
+               }
+       }
+
+#ifdef HAVE_LONG_LONG
+       /* long long conversions. */
+       for (x = 0; ll_fmt[x] ; x++) {
+               for (y = 0; ll_nums[y] != 0 ; y++) {
+                       buf1[0] = buf2[0] = '\0';
+                       l1 = snprintf(buf1, sizeof(buf1), ll_fmt[x], ll_nums[y]);
+                       l2 = sprintf (buf2, ll_fmt[x], ll_nums[y]);
+                       buf1[1023] = buf2[1023] = '\0';
+                       if (strcmp (buf1, buf2) || (l1 != l2)) {
+                               printf("snprintf doesn't match Format: %s\n\tsnprintf(%d) = [%s]\n\t sprintf(%d) = [%s]\n",
+                                      ll_fmt[x], l1, buf1, l2, buf2);
+                               fail++;
+                       }
+                       num++;
+               }
+       }
+#endif
+
+#define BUFSZ 2048
+
+       /*
+        * Precision must bound how much of an unterminated source buffer is
+        * read: buf3 holds BUFSZ 'a' bytes and no terminating NUL.
+        */
+       buf1[0] = buf2[0] = '\0';
+       if ((buf3 = malloc(BUFSZ)) == NULL) {
+               fail++;
+       } else {
+               num++;
+               memset(buf3, 'a', BUFSZ);
+               snprintf(buf1, sizeof(buf1), "%.*s", 1, buf3);
+               buf1[1023] = '\0';
+               if (strcmp(buf1, "a") != 0) {
+                       printf("length limit buf1 '%s' expected 'a'\n", buf1);
+                       fail++;
+               }
+               free(buf3);
+       }
+
+       /* Positional arguments, width and precision taken from argument 1. */
+       buf1[0] = buf2[0] = '\0';
+       l1 = snprintf(buf1, sizeof(buf1), "%4$*1$d %2$s %3$*1$.*1$f", 3, "pos test", 12.3456, 9);
+       l2 = sprintf(buf2, "%4$*1$d %2$s %3$*1$.*1$f", 3, "pos test", 12.3456, 9);
+       buf1[1023] = buf2[1023] = '\0';
+       if (strcmp(buf1, buf2) || (l1 != l2)) {
+               printf("snprintf doesn't match Format: %s\n\tsnprintf(%d) = [%s]\n\t sprintf(%d) = [%s]\n",
+                               "%4$*1$d %2$s %3$*1$.*1$f", l1, buf1, l2, buf2);
+               fail++;
+       }
+       num++;
+
+       /* Positional arguments, width and precision taken from argument 4. */
+       buf1[0] = buf2[0] = '\0';
+       l1 = snprintf(buf1, sizeof(buf1), "%4$*4$d %2$s %3$*4$.*4$f", 3, "pos test", 12.3456, 9);
+       l2 = sprintf(buf2, "%4$*4$d %2$s %3$*4$.*4$f", 3, "pos test", 12.3456, 9);
+       buf1[1023] = buf2[1023] = '\0';
+       if (strcmp(buf1, buf2)) {
+               printf("snprintf doesn't match Format: %s\n\tsnprintf(%d) = [%s]\n\t sprintf(%d) = [%s]\n",
+                               "%4$*4$d %2$s %3$*4$.*4$f", l1, buf1, l2, buf2);
+               fail++;
+       }
+       num++;
+
+       /* size_t conversions. */
+       for (x = 0; ss_fmt[x] ; x++) {
+               for (y = 0; ss_nums[y] != 0 ; y++) {
+                       buf1[0] = buf2[0] = '\0';
+                       l1 = snprintf(buf1, sizeof(buf1), ss_fmt[x], ss_nums[y]);
+                       l2 = sprintf (buf2, ss_fmt[x], ss_nums[y]);
+                       buf1[1023] = buf2[1023] = '\0';
+                       if (strcmp (buf1, buf2) || (l1 != l2)) {
+                               printf("snprintf doesn't match Format: %s\n\tsnprintf(%d) = [%s]\n\t sprintf(%d) = [%s]\n",
+                                      ss_fmt[x], l1, buf1, l2, buf2);
+                               fail++;
+                       }
+                       num++;
+               }
+       }
+#if 0
+       buf1[0] = buf2[0] = '\0';
+       l1 = snprintf(buf1, sizeof(buf1), "%lld", (LLONG)1234567890);
+       l2 = sprintf(buf2, "%lld", (LLONG)1234567890);
+       buf1[1023] = buf2[1023] = '\0';
+       if (strcmp(buf1, buf2)) {
+               printf("snprintf doesn't match Format: %s\n\tsnprintf(%d) = [%s]\n\t sprintf(%d) = [%s]\n",
+                               "%lld", l1, buf1, l2, buf2);
+               fail++;
+       }
+
+       buf1[0] = buf2[0] = '\0';
+       l1 = snprintf(buf1, sizeof(buf1), "%Lf", (LDOUBLE)890.1234567890123);
+       l2 = sprintf(buf2, "%Lf", (LDOUBLE)890.1234567890123);
+       buf1[1023] = buf2[1023] = '\0';
+       if (strcmp(buf1, buf2)) {
+               printf("snprintf doesn't match Format: %s\n\tsnprintf(%d) = [%s]\n\t sprintf(%d) = [%s]\n",
+                               "%Lf", l1, buf1, l2, buf2);
+               fail++;
+       }
+#endif
+       printf ("%d tests failed out of %d.\n", fail, num);
+
+       printf("seeing how many digits we support\n");
+       {
+               double v0 = 0.12345678901234567890123456789012345678901;
+               /* Scale v0 by growing powers of ten until the outputs diverge. */
+               for (x=0; x<100; x++) {
+                       double p = pow(10, x);
+                       double r = v0*p;
+                       snprintf(buf1, sizeof(buf1), "%1.1f", r);
+                       sprintf(buf2,                "%1.1f", r);
+                       if (strcmp(buf1, buf2)) {
+                               printf("we seem to support %d digits\n", x-1);
+                               break;
+                       }
+               }
+       }
+
+       return 0;
+}
+#endif /* TEST_SNPRINTF */
diff --git a/ctdb/lib/replace/socket.c b/ctdb/lib/replace/socket.c
new file mode 100644 (file)
index 0000000..35e975f
--- /dev/null
@@ -0,0 +1,35 @@
+/*
+ * Unix SMB/CIFS implementation.
+ *
+ * Dummy replacements for socket functions.
+ *
+ * Copyright (C) Michael Adam <obnox@samba.org> 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "replace.h"
+#include "system/network.h"
+
+int rep_connect(int sockfd, const struct sockaddr *serv_addr, socklen_t addrlen)
+{
+       /* Dummy replacement: the arguments are ignored, the call always fails. */
+       (void)sockfd;
+       (void)serv_addr;
+       (void)addrlen;
+
+       errno = ENOSYS;
+       return -1;
+}
+
+struct hostent *rep_gethostbyname(const char *name)
+{
+       /* Dummy replacement: no lookup is attempted, NAME is ignored. */
+       (void)name;
+
+       errno = ENOSYS;
+       return NULL;
+}
diff --git a/ctdb/lib/replace/socketpair.c b/ctdb/lib/replace/socketpair.c
new file mode 100644 (file)
index 0000000..c775730
--- /dev/null
@@ -0,0 +1,46 @@
+/*
+ * Unix SMB/CIFS implementation.
+ * replacement routines for broken systems
+ * Copyright (C) Jelmer Vernooij <jelmer@samba.org> 2006
+ * Copyright (C) Michael Adam <obnox@samba.org> 2008
+ *
+ *  ** NOTE! The following LGPL license applies to the replace
+ *  ** library. This does NOT imply that all of Samba is released
+ *  ** under the LGPL
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "replace.h"
+#include "system/network.h"
+
+int rep_socketpair(int d, int type, int protocol, int sv[2])
+{
+       int unsupported = 0;
+
+       /*
+        * Only AF_UNIX / SOCK_STREAM / protocol 0 is accepted.  The checks are
+        * ordered so that the domain is reported first, then the protocol,
+        * then the socket type.
+        */
+       if (d != AF_UNIX) {
+               unsupported = EAFNOSUPPORT;
+       } else if (protocol != 0) {
+               unsupported = EPROTONOSUPPORT;
+       } else if (type != SOCK_STREAM) {
+               unsupported = EOPNOTSUPP;
+       }
+
+       if (unsupported != 0) {
+               errno = unsupported;
+               return -1;
+       }
+
+       /*
+        * NOTE(review): the descriptor pair comes from pipe(), which is
+        * normally one-directional - confirm callers do not need to write
+        * on both ends.
+        */
+       return pipe(sv);
+}
diff --git a/ctdb/lib/replace/strptime.c b/ctdb/lib/replace/strptime.c
new file mode 100644 (file)
index 0000000..20e5d8c
--- /dev/null
@@ -0,0 +1,993 @@
+/* Convert a string representation of time to a time value.
+   Copyright (C) 1996, 1997, 1998, 1999, 2000 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public License as
+   published by the Free Software Foundation; either version 3 of the
+   License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Library General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; see the file COPYING.LIB.  If not, 
+   see <http://www.gnu.org/licenses/>.  */
+
+/* XXX This version of the implementation is not really complete.
+   Some of the fields cannot add information alone.  But if seeing
+   some of them in the same format (such as year, week and weekday)
+   this is enough information for determining the date.  */
+
+#include "replace.h"
+#include "system/locale.h"
+#include "system/time.h"
+
+#ifndef __P
+# if defined (__GNUC__) || (defined (__STDC__) && __STDC__)
+#  define __P(args) args
+# else
+#  define __P(args) ()
+# endif  /* GCC.  */
+#endif  /* Not __P.  */
+
+#if ! HAVE_LOCALTIME_R && ! defined localtime_r
+# ifdef _LIBC
+#  define localtime_r __localtime_r
+# else
+/* Approximate localtime_r as best we can in its absence.  */
+#  define localtime_r my_localtime_r
+static struct tm *localtime_r __P ((const time_t *, struct tm *));
+static struct tm *
+localtime_r (t, tp)
+     const time_t *t;
+     struct tm *tp;
+{
+  /* Convert with plain localtime() and copy the result into the
+     caller-supplied buffer TP.  */
+  struct tm *converted = localtime (t);
+
+  if (converted == NULL)
+    return NULL;
+
+  *tp = *converted;
+  return tp;
+}
+# endif /* ! _LIBC */
+#endif /* ! HAVE_LOCALTIME_R && ! defined (localtime_r) */
+
+
+/* Make the enclosing function return NULL unless CH1 equals CH2.  Each
+   argument is evaluated exactly once.  Wrapped in do/while (0) with
+   parenthesised arguments so the macro behaves as a single statement
+   (no dangling-else surprise) and is safe with compound arguments.  */
+#define match_char(ch1, ch2) \
+  do { if ((ch1) != (ch2)) return NULL; } while (0)
+/* match_string (cs1, s2): case-insensitively compare CS1 against the start
+   of S2 over strlen (CS1) characters.  Evaluates to nonzero on a match and
+   then advances S2 past the matched text; evaluates to 0 and leaves S2
+   unchanged otherwise.  The GCC variant is a statement expression that
+   computes strlen (CS1) once.  */
+#if defined __GNUC__ && __GNUC__ >= 2
+# define match_string(cs1, s2) \
+  ({ size_t len = strlen (cs1);                                                      \
+     int result = strncasecmp ((cs1), (s2), len) == 0;                       \
+     if (result) (s2) += len;                                                \
+     result; })
+#else
+/* Oh come on.  Get a reasonable compiler.  */
+# define match_string(cs1, s2) \
+  (strncasecmp ((cs1), (s2), strlen (cs1)) ? 0 : ((s2) += strlen (cs1), 1))
+#endif
+/* get_number (from, to, n): parse an unsigned decimal number at `rp' into
+   `val' (both taken from the enclosing scope).  Leading spaces are skipped,
+   then digits are consumed until N digits have been read, the next char is
+   not a digit, or appending another digit could exceed TO.  Makes the
+   enclosing function return NULL if no digit is present or if the value
+   falls outside [FROM, TO].
+
+   We intentionally do not use isdigit() for testing because this will
+   lead to problems with the wide character version.  */
+#define get_number(from, to, n) \
+  do {                                                                       \
+    int __n = n;                                                             \
+    val = 0;                                                                 \
+    while (*rp == ' ')                                                       \
+      ++rp;                                                                  \
+    if (*rp < '0' || *rp > '9')                                                      \
+      return NULL;                                                           \
+    do {                                                                     \
+      val *= 10;                                                             \
+      val += *rp++ - '0';                                                    \
+    } while (--__n > 0 && val * 10 <= to && *rp >= '0' && *rp <= '9');       \
+    if (val < from || val > to)                                                      \
+      return NULL;                                                           \
+  } while (0)
+/* get_alt_number (from, to, n): like get_number, but when _NL_CURRENT is
+   available and *decided is not `raw' it first tries to match the locale's
+   ALT_DIGITS strings at `rp', using the index of the matching entry as the
+   digit value.  If nothing matches before any digit was read and *decided
+   is still `not', it falls back to get_number; a successful match sets
+   *decided to `loc'.  Uses `rp', `val' and `decided' from the enclosing
+   scope and may make the enclosing function return NULL.  Without
+   _NL_CURRENT it is simply get_number.
+   NOTE(review): `any' is never set and `rp' is not advanced past a matched
+   alternative digit in the code visible here -- confirm this is intended.  */
+#ifdef _NL_CURRENT
+# define get_alt_number(from, to, n) \
+  ({                                                                         \
+    __label__ do_normal;                                                     \
+    if (*decided != raw)                                                     \
+      {                                                                              \
+       const char *alts = _NL_CURRENT (LC_TIME, ALT_DIGITS);                 \
+       int __n = n;                                                          \
+       int any = 0;                                                          \
+       while (*rp == ' ')                                                    \
+         ++rp;                                                               \
+       val = 0;                                                              \
+       do {                                                                  \
+         val *= 10;                                                          \
+         while (*alts != '\0')                                               \
+           {                                                                 \
+             size_t len = strlen (alts);                                     \
+             if (strncasecmp (alts, rp, len) == 0)                           \
+               break;                                                        \
+             alts += len + 1;                                                \
+             ++val;                                                          \
+           }                                                                 \
+         if (*alts == '\0')                                                  \
+           {                                                                 \
+             if (*decided == not && ! any)                                   \
+               goto do_normal;                                               \
+             /* If we haven't read anything it's an error.  */               \
+             if (! any)                                                      \
+               return NULL;                                                  \
+             /* Correct the premature multiplication.  */                    \
+             val /= 10;                                                      \
+             break;                                                          \
+           }                                                                 \
+         else                                                                \
+           *decided = loc;                                                   \
+       } while (--__n > 0 && val * 10 <= to);                                \
+       if (val < from || val > to)                                           \
+         return NULL;                                                        \
+      }                                                                              \
+    else                                                                     \
+      {                                                                              \
+       do_normal:                                                            \
+        get_number (from, to, n);                                            \
+      }                                                                              \
+    0;                                                                       \
+  })
+#else
+# define get_alt_number(from, to, n) \
+  /* We don't have the alternate representation.  */                         \
+  get_number(from, to, n)
+#endif
+/* recursive (new_fmt): parse the input at `rp' against the nested format
+   NEW_FMT by calling strptime_internal with the enclosing scope's `tm',
+   `decided' and `era_cnt'.  `rp' is overwritten with the call's result.
+   Evaluates to nonzero only if NEW_FMT is non-empty and that result is
+   not NULL.  */
+#define recursive(new_fmt) \
+  (*(new_fmt) != '\0'                                                        \
+   && (rp = strptime_internal (rp, (new_fmt), tm, decided, era_cnt)) != NULL)
+
+
+#ifdef _LIBC
+/* This is defined in locale/C-time.c in the GNU libc.  */
+extern const struct locale_data _nl_C_LC_TIME;
+extern const unsigned short int __mon_yday[2][13];
+
+# define weekday_name (&_nl_C_LC_TIME.values[_NL_ITEM_INDEX (DAY_1)].string)
+# define ab_weekday_name \
+  (&_nl_C_LC_TIME.values[_NL_ITEM_INDEX (ABDAY_1)].string)
+# define month_name (&_nl_C_LC_TIME.values[_NL_ITEM_INDEX (MON_1)].string)
+# define ab_month_name (&_nl_C_LC_TIME.values[_NL_ITEM_INDEX (ABMON_1)].string)
+# define HERE_D_T_FMT (_nl_C_LC_TIME.values[_NL_ITEM_INDEX (D_T_FMT)].string)
+# define HERE_D_FMT (_nl_C_LC_TIME.values[_NL_ITEM_INDEX (D_FMT)].string)
+# define HERE_AM_STR (_nl_C_LC_TIME.values[_NL_ITEM_INDEX (AM_STR)].string)
+# define HERE_PM_STR (_nl_C_LC_TIME.values[_NL_ITEM_INDEX (PM_STR)].string)
+# define HERE_T_FMT_AMPM \
+  (_nl_C_LC_TIME.values[_NL_ITEM_INDEX (T_FMT_AMPM)].string)
+# define HERE_T_FMT (_nl_C_LC_TIME.values[_NL_ITEM_INDEX (T_FMT)].string)
+
+# define strncasecmp(s1, s2, n) __strncasecmp (s1, s2, n)
+#else
+static char const weekday_name[][10] =
+  {
+    "Sunday", "Monday", "Tuesday", "Wednesday",
+    "Thursday", "Friday", "Saturday"
+  };
+static char const ab_weekday_name[][4] =
+  {
+    "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"
+  };
+static char const month_name[][10] =
+  {
+    "January", "February", "March", "April", "May", "June",
+    "July", "August", "September", "October", "November", "December"
+  };
+static char const ab_month_name[][4] =
+  {
+    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
+    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
+  };
+# define HERE_D_T_FMT "%a %b %e %H:%M:%S %Y"
+# define HERE_D_FMT "%m/%d/%y"
+# define HERE_AM_STR "AM"
+# define HERE_PM_STR "PM"
+# define HERE_T_FMT_AMPM "%I:%M:%S %p"
+# define HERE_T_FMT "%H:%M:%S"
+
+static const unsigned short int __mon_yday[2][13] =
+  {
+    /* Normal years.  */
+    { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365 },
+    /* Leap years.  */
+    { 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366 }
+  };
+#endif
+
+/* Status of lookup: do we use the locale data or the raw data?  */
+enum locale_status { not, loc, raw };
+
+
+#ifndef __isleap
+/* Nonzero if YEAR is a leap year (every 4 years,
+   except every 100th isn't, and every 400th is).  */
+# define __isleap(year)        \
+  ((year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0))
+#endif
+
+/* Compute the day of the week.
+   Reads tm_year, tm_mon and tm_mday from TM and stores the result
+   (0 .. 6) in tm_wday.  */
+static void
+day_of_the_week (struct tm *tm)
+{
+  /* We know that January 1st 1970 was a Thursday (= 4).  Compute the
+     difference between that date and the one in TM and so determine
+     the weekday.  */
+  /* For January and February the leap-day terms below are based on the
+     previous year.  */
+  int corr_year = 1900 + tm->tm_year - (tm->tm_mon < 2);
+  int wday = (-473
+             + (365 * (tm->tm_year - 70))
+             + (corr_year / 4)
+             - ((corr_year / 4) / 25) + ((corr_year / 4) % 25 < 0)
+             + (((corr_year / 4) / 25) / 4)
+             + __mon_yday[0][tm->tm_mon]
+             + tm->tm_mday - 1);
+  /* The double modulo keeps the result in 0 .. 6 even if WDAY is negative.  */
+  tm->tm_wday = ((wday % 7) + 7) % 7;
+}
+
+/* Compute the day of the year: the number of days before the start of
+   month tm_mon (taken from the leap or non-leap row of __mon_yday) plus
+   the zero-based day of the month.  The result is stored in tm_yday.  */
+static void
+day_of_the_year (struct tm *tm)
+{
+  int full_year = 1900 + tm->tm_year;
+  const unsigned short int *days_before = __mon_yday[__isleap (full_year)];
+
+  tm->tm_yday = days_before[tm->tm_mon] + (tm->tm_mday - 1);
+}
+
+static char *
+#ifdef _LIBC
+internal_function
+#endif
+strptime_internal __P ((const char *rp, const char *fmt, struct tm *tm,
+                       enum locale_status *decided, int era_cnt));
+
+static char *
+#ifdef _LIBC
+internal_function
+#endif
+strptime_internal (rp, fmt, tm, decided, era_cnt)
+     const char *rp;
+     const char *fmt;
+     struct tm *tm;
+     enum locale_status *decided;
+     int era_cnt;
+{
+  int cnt;
+  size_t val;
+  int have_I, is_pm;
+  int century, want_century;
+  int want_era;
+  int have_wday, want_xday;
+  int have_yday;
+  int have_mon, have_mday;
+#ifdef _NL_CURRENT
+  const char *rp_backup;
+  size_t num_eras;
+  struct era_entry *era;
+
+  era = NULL;
+#endif
+
+  have_I = is_pm = 0;
+  century = -1;
+  want_century = 0;
+  want_era = 0;
+
+  have_wday = want_xday = have_yday = have_mon = have_mday = 0;
+
+  while (*fmt != '\0')
+    {
+      /* A white space in the format string matches zero or more white
+        space characters in the input string.  */
+      if (isspace (*fmt))
+       {
+         while (isspace (*rp))
+           ++rp;
+         ++fmt;
+         continue;
+       }
+
+      /* Any character but `%' must be matched by the same character
+        in the input string.  */
+      if (*fmt != '%')
+       {
+         match_char (*fmt++, *rp++);
+         continue;
+       }
+
+      ++fmt;
+#ifndef _NL_CURRENT
+      /* We need this for handling the `E' modifier.  */
+    start_over:
+#endif
+
+#ifdef _NL_CURRENT
+      /* Make back up of current processing pointer.  */
+      rp_backup = rp;
+#endif
+
+      switch (*fmt++)
+       {
+       case '%':
+         /* Match the `%' character itself.  */
+         match_char ('%', *rp++);
+         break;
+       case 'a':
+       case 'A':
+         /* Match day of week.  */
+         for (cnt = 0; cnt < 7; ++cnt)
+           {
+#ifdef _NL_CURRENT
+             if (*decided !=raw)
+               {
+                 if (match_string (_NL_CURRENT (LC_TIME, DAY_1 + cnt), rp))
+                   {
+                     if (*decided == not
+                         && strcmp (_NL_CURRENT (LC_TIME, DAY_1 + cnt),
+                                    weekday_name[cnt]))
+                       *decided = loc;
+                     break;
+                   }
+                 if (match_string (_NL_CURRENT (LC_TIME, ABDAY_1 + cnt), rp))
+                   {
+                     if (*decided == not
+                         && strcmp (_NL_CURRENT (LC_TIME, ABDAY_1 + cnt),
+                                    ab_weekday_name[cnt]))
+                       *decided = loc;
+                     break;
+                   }
+               }
+#endif
+             if (*decided != loc
+                 && (match_string (weekday_name[cnt], rp)
+                     || match_string (ab_weekday_name[cnt], rp)))
+               {
+                 *decided = raw;
+                 break;
+               }
+           }
+         if (cnt == 7)
+           /* Does not match a weekday name.  */
+           return NULL;
+         tm->tm_wday = cnt;
+         have_wday = 1;
+         break;
+       case 'b':
+       case 'B':
+       case 'h':
+         /* Match month name.  */
+         for (cnt = 0; cnt < 12; ++cnt)
+           {
+#ifdef _NL_CURRENT
+             if (*decided !=raw)
+               {
+                 if (match_string (_NL_CURRENT (LC_TIME, MON_1 + cnt), rp))
+                   {
+                     if (*decided == not
+                         && strcmp (_NL_CURRENT (LC_TIME, MON_1 + cnt),
+                                    month_name[cnt]))
+                       *decided = loc;
+                     break;
+                   }
+                 if (match_string (_NL_CURRENT (LC_TIME, ABMON_1 + cnt), rp))
+                   {
+                     if (*decided == not
+                         && strcmp (_NL_CURRENT (LC_TIME, ABMON_1 + cnt),
+                                    ab_month_name[cnt]))
+                       *decided = loc;
+                     break;
+                   }
+               }
+#endif
+             if (match_string (month_name[cnt], rp)
+                 || match_string (ab_month_name[cnt], rp))
+               {
+                 *decided = raw;
+                 break;
+               }
+           }
+         if (cnt == 12)
+           /* Does not match a month name.  */
+           return NULL;
+         tm->tm_mon = cnt;
+         want_xday = 1;
+         break;
+       case 'c':
+         /* Match locale's date and time format.  */
+#ifdef _NL_CURRENT
+         if (*decided != raw)
+           {
+             if (!recursive (_NL_CURRENT (LC_TIME, D_T_FMT)))
+               {
+                 if (*decided == loc)
+                   return NULL;
+                 else
+                   rp = rp_backup;
+               }
+             else
+               {
+                 if (*decided == not &&
+                     strcmp (_NL_CURRENT (LC_TIME, D_T_FMT), HERE_D_T_FMT))
+                   *decided = loc;
+                 want_xday = 1;
+                 break;
+               }
+             *decided = raw;
+           }
+#endif
+         if (!recursive (HERE_D_T_FMT))
+           return NULL;
+         want_xday = 1;
+         break;
+       case 'C':
+         /* Match century number.  */
+#ifdef _NL_CURRENT
+       match_century:
+#endif
+         get_number (0, 99, 2);
+         century = val;
+         want_xday = 1;
+         break;
+       case 'd':
+       case 'e':
+         /* Match day of month.  */
+         get_number (1, 31, 2);
+         tm->tm_mday = val;
+         have_mday = 1;
+         want_xday = 1;
+         break;
+       case 'F':
+         if (!recursive ("%Y-%m-%d"))
+           return NULL;
+         want_xday = 1;
+         break;
+       case 'x':
+#ifdef _NL_CURRENT
+         if (*decided != raw)
+           {
+             if (!recursive (_NL_CURRENT (LC_TIME, D_FMT)))
+               {
+                 if (*decided == loc)
+                   return NULL;
+                 else
+                   rp = rp_backup;
+               }
+             else
+               {
+                 if (*decided == not
+                     && strcmp (_NL_CURRENT (LC_TIME, D_FMT), HERE_D_FMT))
+                   *decided = loc;
+                 want_xday = 1;
+                 break;
+               }
+             *decided = raw;
+           }
+#endif
+         /* Fall through.  */
+       case 'D':
+         /* Match standard day format.  */
+         if (!recursive (HERE_D_FMT))
+           return NULL;
+         want_xday = 1;
+         break;
+       case 'k':
+       case 'H':
+         /* Match hour in 24-hour clock.  */
+         get_number (0, 23, 2);
+         tm->tm_hour = val;
+         have_I = 0;
+         break;
+       case 'I':
+         /* Match hour in 12-hour clock.  */
+         get_number (1, 12, 2);
+         tm->tm_hour = val % 12;
+         have_I = 1;
+         break;
+       case 'j':
+         /* Match day number of year.  */
+         get_number (1, 366, 3);
+         tm->tm_yday = val - 1;
+         have_yday = 1;
+         break;
+       case 'm':
+         /* Match number of month.  */
+         get_number (1, 12, 2);
+         tm->tm_mon = val - 1;
+         have_mon = 1;
+         want_xday = 1;
+         break;
+       case 'M':
+         /* Match minute.  */
+         get_number (0, 59, 2);
+         tm->tm_min = val;
+         break;
+       case 'n':
+       case 't':
+         /* Match any white space.  */
+         while (isspace (*rp))
+           ++rp;
+         break;
+       case 'p':
+         /* Match locale's equivalent of AM/PM.  */
+#ifdef _NL_CURRENT
+         if (*decided != raw)
+           {
+             if (match_string (_NL_CURRENT (LC_TIME, AM_STR), rp))
+               {
+                 if (strcmp (_NL_CURRENT (LC_TIME, AM_STR), HERE_AM_STR))
+                   *decided = loc;
+                 break;
+               }
+             if (match_string (_NL_CURRENT (LC_TIME, PM_STR), rp))
+               {
+                 if (strcmp (_NL_CURRENT (LC_TIME, PM_STR), HERE_PM_STR))
+                   *decided = loc;
+                 is_pm = 1;
+                 break;
+               }
+             *decided = raw;
+           }
+#endif
+         if (!match_string (HERE_AM_STR, rp)) {
+           if (match_string (HERE_PM_STR, rp)) {
+             is_pm = 1;
+           } else {
+             return NULL;
+           }
+         }
+         break;
+       case 'r':
+#ifdef _NL_CURRENT
+         if (*decided != raw)
+           {
+             if (!recursive (_NL_CURRENT (LC_TIME, T_FMT_AMPM)))
+               {
+                 if (*decided == loc)
+                   return NULL;
+                 else
+                   rp = rp_backup;
+               }
+             else
+               {
+                 if (*decided == not &&
+                     strcmp (_NL_CURRENT (LC_TIME, T_FMT_AMPM),
+                             HERE_T_FMT_AMPM))
+                   *decided = loc;
+                 break;
+               }
+             *decided = raw;
+           }
+#endif
+         if (!recursive (HERE_T_FMT_AMPM))
+           return NULL;
+         break;
+       case 'R':
+         if (!recursive ("%H:%M"))
+           return NULL;
+         break;
+       case 's':
+         {
+           /* The number of seconds may be very high so we cannot use
+              the `get_number' macro.  Instead read the number
+              character for character and construct the result while
+              doing this.  */
+           time_t secs = 0;
+           if (*rp < '0' || *rp > '9')
+             /* We need at least one digit.  */
+             return NULL;
+
+           do
+             {
+               secs *= 10;
+               secs += *rp++ - '0';
+             }
+           while (*rp >= '0' && *rp <= '9');
+
+           if (localtime_r (&secs, tm) == NULL)
+             /* Error in function.  */
+             return NULL;
+         }
+         break;
+       case 'S':
+         get_number (0, 61, 2);
+         tm->tm_sec = val;
+         break;
+       case 'X':
+#ifdef _NL_CURRENT
+         if (*decided != raw)
+           {
+             if (!recursive (_NL_CURRENT (LC_TIME, T_FMT)))
+               {
+                 if (*decided == loc)
+                   return NULL;
+                 else
+                   rp = rp_backup;
+               }
+             else
+               {
+                 if (strcmp (_NL_CURRENT (LC_TIME, T_FMT), HERE_T_FMT))
+                   *decided = loc;
+                 break;
+               }
+             *decided = raw;
+           }
+#endif
+         /* Fall through.  */
+       case 'T':
+         if (!recursive (HERE_T_FMT))
+           return NULL;
+         break;
+       case 'u':
+         get_number (1, 7, 1);
+         tm->tm_wday = val % 7;
+         have_wday = 1;
+         break;
+       case 'g':
+         get_number (0, 99, 2);
+         /* XXX This cannot determine any field in TM.  */
+         break;
+       case 'G':
+         if (*rp < '0' || *rp > '9')
+           return NULL;
+         /* XXX Ignore the number since we would need some more
+            information to compute a real date.  */
+         do
+           ++rp;
+         while (*rp >= '0' && *rp <= '9');
+         break;
+       case 'U':
+       case 'V':
+       case 'W':
+         get_number (0, 53, 2);
+         /* XXX This cannot determine any field in TM without some
+            information.  */
+         break;
+       case 'w':
+         /* Match number of weekday.  */
+         get_number (0, 6, 1);
+         tm->tm_wday = val;
+         have_wday = 1;
+         break;
+       case 'y':
+#ifdef _NL_CURRENT
+       match_year_in_century:
+#endif
+         /* Match year within century.  */
+         get_number (0, 99, 2);
+         /* The "Year 2000: The Millennium Rollover" paper suggests that
+            values in the range 69-99 refer to the twentieth century.  */
+         tm->tm_year = val >= 69 ? val : val + 100;
+         /* Indicate that we want to use the century, if specified.  */
+         want_century = 1;
+         want_xday = 1;
+         break;
+       case 'Y':
+         /* Match year including century number.  */
+         get_number (0, 9999, 4);
+         tm->tm_year = val - 1900;
+         want_century = 0;
+         want_xday = 1;
+         break;
+       case 'Z':
+         /* XXX How to handle this?  */
+         break;
+       case 'E':
+#ifdef _NL_CURRENT
+         switch (*fmt++)
+           {
+           case 'c':
+             /* Match locale's alternate date and time format.  */
+             if (*decided != raw)
+               {
+                 const char *fmt = _NL_CURRENT (LC_TIME, ERA_D_T_FMT);
+
+                 if (*fmt == '\0')
+                   fmt = _NL_CURRENT (LC_TIME, D_T_FMT);
+
+                 if (!recursive (fmt))
+                   {
+                     if (*decided == loc)
+                       return NULL;
+                     else
+                       rp = rp_backup;
+                   }
+                 else
+                   {
+                     if (strcmp (fmt, HERE_D_T_FMT))
+                       *decided = loc;
+                     want_xday = 1;
+                     break;
+                   }
+                 *decided = raw;
+               }
+             /* The C locale has no era information, so use the
+                normal representation.  */
+             if (!recursive (HERE_D_T_FMT))
+               return NULL;
+             want_xday = 1;
+             break;
+           case 'C':
+             if (*decided != raw)
+               {
+                 if (era_cnt >= 0)
+                   {
+                     era = _nl_select_era_entry (era_cnt);
+                     if (match_string (era->era_name, rp))
+                       {
+                         *decided = loc;
+                         break;
+                       }
+                     else
+                       return NULL;
+                   }
+                 else
+                   {
+                     num_eras = _NL_CURRENT_WORD (LC_TIME,
+                                                  _NL_TIME_ERA_NUM_ENTRIES);
+                     for (era_cnt = 0; era_cnt < (int) num_eras;
+                          ++era_cnt, rp = rp_backup)
+                       {
+                         era = _nl_select_era_entry (era_cnt);
+                         if (match_string (era->era_name, rp))
+                           {
+                             *decided = loc;
+                             break;
+                           }
+                       }
+                     if (era_cnt == (int) num_eras)
+                       {
+                         era_cnt = -1;
+                         if (*decided == loc)
+                           return NULL;
+                       }
+                     else
+                       break;
+                   }
+
+                 *decided = raw;
+               }
+             /* The C locale has no era information, so use the
+                normal representation.  */
+             goto match_century;
+           case 'y':
+             if (*decided == raw)
+               goto match_year_in_century;
+
+             get_number(0, 9999, 4);
+             tm->tm_year = val;
+             want_era = 1;
+             want_xday = 1;
+             break;
+           case 'Y':
+             if (*decided != raw)
+               {
+                 num_eras = _NL_CURRENT_WORD (LC_TIME,
+                                              _NL_TIME_ERA_NUM_ENTRIES);
+                 for (era_cnt = 0; era_cnt < (int) num_eras;
+                      ++era_cnt, rp = rp_backup)
+                   {
+                     era = _nl_select_era_entry (era_cnt);
+                     if (recursive (era->era_format))
+                       break;
+                   }
+                 if (era_cnt == (int) num_eras)
+                   {
+                     era_cnt = -1;
+                     if (*decided == loc)
+                       return NULL;
+                     else
+                       rp = rp_backup;
+                   }
+                 else
+                   {
+                     *decided = loc;
+                     era_cnt = -1;
+                     break;
+                   }
+
+                 *decided = raw;
+               }
+             get_number (0, 9999, 4);
+             tm->tm_year = val - 1900;
+             want_century = 0;
+             want_xday = 1;
+             break;
+           case 'x':
+             if (*decided != raw)
+               {
+                 const char *fmt = _NL_CURRENT (LC_TIME, ERA_D_FMT);
+
+                 if (*fmt == '\0')
+                   fmt = _NL_CURRENT (LC_TIME, D_FMT);
+
+                 if (!recursive (fmt))
+                   {
+                     if (*decided == loc)
+                       return NULL;
+                     else
+                       rp = rp_backup;
+                   }
+                 else
+                   {
+                     if (strcmp (fmt, HERE_D_FMT))
+                       *decided = loc;
+                     break;
+                   }
+                 *decided = raw;
+               }
+             if (!recursive (HERE_D_FMT))
+               return NULL;
+             break;
+           case 'X':
+             if (*decided != raw)
+               {
+                 const char *fmt = _NL_CURRENT (LC_TIME, ERA_T_FMT);
+
+                 if (*fmt == '\0')
+                   fmt = _NL_CURRENT (LC_TIME, T_FMT);
+
+                 if (!recursive (fmt))
+                   {
+                     if (*decided == loc)
+                       return NULL;
+                     else
+                       rp = rp_backup;
+                   }
+                 else
+                   {
+                     if (strcmp (fmt, HERE_T_FMT))
+                       *decided = loc;
+                     break;
+                   }
+                 *decided = raw;
+               }
+             if (!recursive (HERE_T_FMT))
+               return NULL;
+             break;
+           default:
+             return NULL;
+           }
+         break;
+#else
+         /* We have no information about the era format.  Just use
+            the normal format.  */
+         if (*fmt != 'c' && *fmt != 'C' && *fmt != 'y' && *fmt != 'Y'
+             && *fmt != 'x' && *fmt != 'X')
+           /* This is an illegal format.  */
+           return NULL;
+
+         goto start_over;
+#endif
+       case 'O':
+         switch (*fmt++)
+           {
+           case 'd':
+           case 'e':
+             /* Match day of month using alternate numeric symbols.  */
+             get_alt_number (1, 31, 2);
+             tm->tm_mday = val;
+             have_mday = 1;
+             want_xday = 1;
+             break;
+           case 'H':
+             /* Match hour in 24-hour clock using alternate numeric
+                symbols.  */
+             get_alt_number (0, 23, 2);
+             tm->tm_hour = val;
+             have_I = 0;
+             break;
+           case 'I':
+             /* Match hour in 12-hour clock using alternate numeric
+                symbols.  */
+             get_alt_number (1, 12, 2);
+             tm->tm_hour = val - 1;
+             have_I = 1;
+             break;
+           case 'm':
+             /* Match month using alternate numeric symbols.  */
+             get_alt_number (1, 12, 2);
+             tm->tm_mon = val - 1;
+             have_mon = 1;
+             want_xday = 1;
+             break;
+           case 'M':
+             /* Match minutes using alternate numeric symbols.  */
+             get_alt_number (0, 59, 2);
+             tm->tm_min = val;
+             break;
+           case 'S':
+             /* Match seconds using alternate numeric symbols.  */
+             get_alt_number (0, 61, 2);
+             tm->tm_sec = val;
+             break;
+           case 'U':
+           case 'V':
+           case 'W':
+             get_alt_number (0, 53, 2);
+             /* XXX This cannot determine any field in TM without
+                further information.  */
+             break;
+           case 'w':
+             /* Match number of weekday using alternate numeric symbols.  */
+             get_alt_number (0, 6, 1);
+             tm->tm_wday = val;
+             have_wday = 1;
+             break;
+           case 'y':
+             /* Match year within century using alternate numeric symbols.  */
+             get_alt_number (0, 99, 2);
+             tm->tm_year = val >= 69 ? val : val + 100;
+             want_xday = 1;
+             break;
+           default:
+             return NULL;
+           }
+         break;
+       default:
+         return NULL;
+       }
+    }
+
+  if (have_I && is_pm)
+    tm->tm_hour += 12;
+
+  if (century != -1)
+    {
+      if (want_century)
+       tm->tm_year = tm->tm_year % 100 + (century - 19) * 100;
+      else
+       /* Only the century, but not the year.  Strange, but so be it.  */
+       tm->tm_year = (century - 19) * 100;
+    }
+
+#ifdef _NL_CURRENT
+  if (era_cnt != -1)
+    {
+      era = _nl_select_era_entry(era_cnt);
+      if (want_era)
+       tm->tm_year = (era->start_date[0]
+                      + ((tm->tm_year - era->offset)
+                         * era->absolute_direction));
+      else
+       /* Era start year assumed.  */
+       tm->tm_year = era->start_date[0];
+    }
+  else
+#endif
+    if (want_era)
+      return NULL;
+
+  if (want_xday && !have_wday)
+    {
+      if ( !(have_mon && have_mday) && have_yday)
+       {
+         /* We don't have tm_mon and/or tm_mday, compute them.  */
+         int t_mon = 0;
+         while (__mon_yday[__isleap(1900 + tm->tm_year)][t_mon] <= tm->tm_yday)
+             t_mon++;
+         if (!have_mon)
+             tm->tm_mon = t_mon - 1;
+         if (!have_mday)
+             tm->tm_mday =
+               (tm->tm_yday
+                - __mon_yday[__isleap(1900 + tm->tm_year)][t_mon - 1] + 1);
+       }
+      day_of_the_week (tm);
+    }
+  if (want_xday && !have_yday)
+    day_of_the_year (tm);
+
+  return discard_const_p(char, rp);
+}
+
+
+/* Replacement for strptime(): parse BUF according to FORMAT into TM by
+   handing the work to strptime_internal.  The final argument is -1;
+   NOTE(review): presumably the era counter, which strptime_internal
+   compares against -1 - confirm against its signature.  */
+char *rep_strptime(const char *buf, const char *format, struct tm *tm)
+{
+#ifdef _NL_CURRENT
+  /* Locale data present: start with no decision taken yet.  */
+  enum locale_status decided = not;
+#else
+  /* No locale data: start in the raw state, which skips the
+     locale-specific branches of strptime_internal.  */
+  enum locale_status decided = raw;
+#endif
+
+  return strptime_internal (buf, format, tm, &decided, -1);
+}
diff --git a/ctdb/lib/replace/strptime.m4 b/ctdb/lib/replace/strptime.m4
new file mode 100644 (file)
index 0000000..8ac22f6
--- /dev/null
@@ -0,0 +1,16 @@
+dnl Decide whether libreplace has to supply its own strptime.
+dnl Start by recording whether the function and its declaration exist.
+AC_CHECK_FUNCS(strptime)
+AC_CHECK_DECLS(strptime, [], [], [#include <time.h>])
+dnl Compile and run test/strptime.c from the libreplace directory.  The
+dnl outcome is cached in libreplace_cv_STRPTIME_OK as yes or no, or as
+dnl "assuming not" when the program cannot be run (cross-compiling).
+AC_CACHE_CHECK([whether strptime is available and works],libreplace_cv_STRPTIME_OK,[
+       AC_TRY_RUN([
+               #define LIBREPLACE_CONFIGURE_TEST_STRPTIME
+               #include "$libreplacedir/test/strptime.c"
+               ],
+               [libreplace_cv_STRPTIME_OK=yes],
+               [libreplace_cv_STRPTIME_OK=no],
+               [libreplace_cv_STRPTIME_OK="assuming not"])
+])
+dnl Any result other than yes adds the replacement object to the build;
+dnl yes defines HAVE_WORKING_STRPTIME instead.
+if test x"$libreplace_cv_STRPTIME_OK" != x"yes"; then
+        LIBREPLACEOBJ="${LIBREPLACEOBJ} $libreplacedir/strptime.o"
+else
+        AC_DEFINE(HAVE_WORKING_STRPTIME,1,[Whether strptime is working correct])
+fi
diff --git a/ctdb/lib/replace/system/README b/ctdb/lib/replace/system/README
new file mode 100644 (file)
index 0000000..69a2b80
--- /dev/null
@@ -0,0 +1,4 @@
+This directory contains wrappers around logical groups of system
+include files. The idea is to avoid #ifdef blocks in the main code,
+and instead put all the necessary conditional includes in subsystem
+specific header files in this directory.
diff --git a/ctdb/lib/replace/system/aio.h b/ctdb/lib/replace/system/aio.h
new file mode 100644 (file)
index 0000000..784d77f
--- /dev/null
@@ -0,0 +1,32 @@
+#ifndef _system_aio_h
+#define _system_aio_h
+/* 
+   Unix SMB/CIFS implementation.
+
+   AIO system include wrappers
+
+   Copyright (C) Andrew Tridgell 2006
+   
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifdef HAVE_LIBAIO_H
+#include <libaio.h>
+#endif
+
+#endif
diff --git a/ctdb/lib/replace/system/capability.h b/ctdb/lib/replace/system/capability.h
new file mode 100644 (file)
index 0000000..a7b78f0
--- /dev/null
@@ -0,0 +1,55 @@
+#ifndef _system_capability_h
+#define _system_capability_h
+/* 
+   Unix SMB/CIFS implementation.
+
+   capability system include wrappers
+
+   Copyright (C) Andrew Tridgell 2004
+   
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* Wrapper for <sys/capability.h>; inert unless HAVE_SYS_CAPABILITY_H is set. */
+#ifdef HAVE_SYS_CAPABILITY_H
+
+/* Red Hat 7 workaround: define _I386_STATFS_H and _PPC_STATFS_H for the
+   duration of the include below, but only if neither was defined already.
+   NOTE(review): these look like the include guards of the arch statfs
+   headers, so defining them presumably suppresses those headers - confirm. */
+#if defined(BROKEN_REDHAT_7_SYSTEM_HEADERS) && !defined(_I386_STATFS_H) && !defined(_PPC_STATFS_H)
+#define _I386_STATFS_H
+#define _PPC_STATFS_H
+#define BROKEN_REDHAT_7_STATFS_WORKAROUND
+#endif
+
+/* RHEL5 workaround: remember that _LINUX_TYPES_H was not defined before the
+   include, so that it can be undefined again afterwards. */
+#if defined(BROKEN_RHEL5_SYS_CAP_HEADER) && !defined(_LINUX_TYPES_H)
+#define BROKEN_RHEL5_SYS_CAP_HEADER_WORKAROUND
+#endif
+
+#include <sys/capability.h>
+
+/* Undo the RHEL5 workaround.  NOTE(review): presumably the broken header
+   defines _LINUX_TYPES_H itself, which would block a later <linux/types.h>. */
+#ifdef BROKEN_RHEL5_SYS_CAP_HEADER_WORKAROUND
+#undef _LINUX_TYPES_H
+#undef BROKEN_RHEL5_SYS_CAP_HEADER_WORKAROUND
+#endif
+
+/* Undo the Red Hat 7 workaround so the temporary guards do not leak out. */
+#ifdef BROKEN_REDHAT_7_STATFS_WORKAROUND
+#undef _PPC_STATFS_H
+#undef _I386_STATFS_H
+#undef BROKEN_REDHAT_7_STATFS_WORKAROUND
+#endif
+
+#endif
+
+#endif
diff --git a/ctdb/lib/replace/system/config.m4 b/ctdb/lib/replace/system/config.m4
new file mode 100644 (file)
index 0000000..b7cdf14
--- /dev/null
@@ -0,0 +1,145 @@
+# filesys
+AC_HEADER_DIRENT 
+AC_CHECK_HEADERS(fcntl.h sys/fcntl.h sys/resource.h sys/ioctl.h sys/mode.h sys/filio.h sys/fs/s5param.h sys/filsys.h)
+AC_CHECK_HEADERS(sys/acl.h acl/libacl.h sys/file.h)
+
+# select
+AC_CHECK_HEADERS(sys/select.h)
+
+# poll
+AC_CHECK_HEADERS(poll.h)
+AC_CHECK_FUNCS(poll,[],[LIBREPLACEOBJ="${LIBREPLACEOBJ} $libreplacedir/poll.o"])
+
+# time
+AC_CHECK_HEADERS(sys/time.h utime.h)
+AC_HEADER_TIME
+AC_CHECK_FUNCS(utime utimes)
+
+# Does gettimeofday accept a second timezone argument?  The probe program
+# has to be valid C99: an implicit int main or an undeclared exit is
+# rejected by current compilers and would wrongly record "no".
+# When the program cannot be run (cross-compiling) the answer defaults to yes.
+AC_CACHE_CHECK([if gettimeofday takes TZ argument],libreplace_cv_HAVE_GETTIMEOFDAY_TZ,[
+AC_TRY_RUN([
+#include <sys/time.h>
+#include <unistd.h>
+int main(void) { struct timeval tv; return gettimeofday(&tv, NULL); }],
+           libreplace_cv_HAVE_GETTIMEOFDAY_TZ=yes,libreplace_cv_HAVE_GETTIMEOFDAY_TZ=no,libreplace_cv_HAVE_GETTIMEOFDAY_TZ=yes)])
+if test x"$libreplace_cv_HAVE_GETTIMEOFDAY_TZ" = x"yes"; then
+    AC_DEFINE(HAVE_GETTIMEOFDAY_TZ,1,[Whether gettimeofday() takes a TZ argument])
+fi
+
+# wait
+AC_HEADER_SYS_WAIT
+
+# capability
+AC_CHECK_HEADERS(sys/capability.h)
+
+# Linux only: two compile probes for header conflicts involving
+# sys/capability.h.  Each result is cached and turned into a define.
+case "$host_os" in
+*linux*)
+# Broken when sys/vfs.h and sys/capability.h do not compile together.
+AC_CACHE_CHECK([for broken RedHat 7.2 system header files],libreplace_cv_BROKEN_REDHAT_7_SYSTEM_HEADERS,[
+AC_TRY_COMPILE([
+       #ifdef HAVE_SYS_VFS_H
+       #include <sys/vfs.h>
+       #endif
+       #ifdef HAVE_SYS_CAPABILITY_H
+       #include <sys/capability.h>
+       #endif
+       ],[
+       int i;
+       ],
+       libreplace_cv_BROKEN_REDHAT_7_SYSTEM_HEADERS=no,
+       libreplace_cv_BROKEN_REDHAT_7_SYSTEM_HEADERS=yes
+)])
+if test x"$libreplace_cv_BROKEN_REDHAT_7_SYSTEM_HEADERS" = x"yes"; then
+       AC_DEFINE(BROKEN_REDHAT_7_SYSTEM_HEADERS,1,[Broken RedHat 7.2 system header files])
+fi
+
+# Broken when linux/types.h included after sys/capability.h does not
+# make the __s8 type usable.
+AC_CACHE_CHECK([for broken RHEL5 sys/capability.h],libreplace_cv_BROKEN_RHEL5_SYS_CAP_HEADER,[
+AC_TRY_COMPILE([
+       #ifdef HAVE_SYS_CAPABILITY_H
+       #include <sys/capability.h>
+       #endif
+       #include <linux/types.h>
+       ],[
+       __s8 i;
+       ],
+       libreplace_cv_BROKEN_RHEL5_SYS_CAP_HEADER=no,
+       libreplace_cv_BROKEN_RHEL5_SYS_CAP_HEADER=yes
+)])
+if test x"$libreplace_cv_BROKEN_RHEL5_SYS_CAP_HEADER" = x"yes"; then
+       AC_DEFINE(BROKEN_RHEL5_SYS_CAP_HEADER,1,[Broken RHEL5 sys/capability.h])
+fi
+;;
+esac
+
+# passwd
+# Headers and reentrant lookup functions for the user and group databases.
+AC_CHECK_HEADERS(grp.h sys/id.h compat.h shadow.h sys/priv.h pwd.h sys/security.h)
+AC_CHECK_FUNCS(getpwnam_r getpwuid_r getpwent_r)
+# Declaration probe for getpwent_r.  The prototype checks below test
+# HAVE_GETPWENT_R_DECL and stop with an error when it is not defined.
+AC_HAVE_DECL(getpwent_r, [
+       #include <unistd.h>
+       #include <pwd.h>
+       ])
+# Variant returning a struct passwd pointer with an int buffer length.
+AC_VERIFY_C_PROTOTYPE([struct passwd *getpwent_r(struct passwd *src, char *buf, int buflen)],
+       [
+       #ifndef HAVE_GETPWENT_R_DECL
+       #error missing getpwent_r prototype
+       #endif
+       return NULL;
+       ],[
+       AC_DEFINE(SOLARIS_GETPWENT_R, 1, [getpwent_r solaris function prototype])
+       ],[],[
+       #include <unistd.h>
+       #include <pwd.h>
+       ])
+# Same shape with a size_t buffer length.  It sets the same
+# SOLARIS_GETPWENT_R define as the int variant above.
+AC_VERIFY_C_PROTOTYPE([struct passwd *getpwent_r(struct passwd *src, char *buf, size_t buflen)],
+       [
+       #ifndef HAVE_GETPWENT_R_DECL
+       #error missing getpwent_r prototype
+       #endif
+       return NULL;
+       ],[
+       AC_DEFINE(SOLARIS_GETPWENT_R, 1, [getpwent_r irix (similar to solaris) function prototype])
+       ],[],[
+       #include <unistd.h>
+       #include <pwd.h>
+       ])
+# The group database gets the same treatment as the passwd database above.
+AC_CHECK_FUNCS(getgrnam_r getgrgid_r getgrent_r)
+AC_HAVE_DECL(getgrent_r, [
+       #include <unistd.h>
+       #include <grp.h>
+       ])
+# Variant returning a struct group pointer with an int buffer length.
+AC_VERIFY_C_PROTOTYPE([struct group *getgrent_r(struct group *src, char *buf, int buflen)],
+       [
+       #ifndef HAVE_GETGRENT_R_DECL
+       #error missing getgrent_r prototype
+       #endif
+       return NULL;
+       ],[
+       AC_DEFINE(SOLARIS_GETGRENT_R, 1, [getgrent_r solaris function prototype])
+       ],[],[
+       #include <unistd.h>
+       #include <grp.h>
+       ])
+
+# Same shape with a size_t buffer length.  It sets the same
+# SOLARIS_GETGRENT_R define as the int variant above.
+AC_VERIFY_C_PROTOTYPE([struct group *getgrent_r(struct group *src, char *buf, size_t buflen)],
+       [
+       #ifndef HAVE_GETGRENT_R_DECL
+       #error missing getgrent_r prototype
+       #endif
+       return NULL;
+       ],[
+       AC_DEFINE(SOLARIS_GETGRENT_R, 1, [getgrent_r irix (similar to solaris)  function prototype])
+       ],[],[
+       #include <unistd.h>
+       #include <grp.h>
+       ])
+AC_CHECK_FUNCS(getgrouplist)
+
+# locale
+AC_CHECK_HEADERS(ctype.h locale.h langinfo.h)
+
+# glob
+AC_CHECK_HEADERS(fnmatch.h)
+
+# shmem
+AC_CHECK_HEADERS(sys/ipc.h sys/mman.h sys/shm.h )
+
+# terminal
+AC_CHECK_HEADERS(termios.h termio.h sys/termio.h )
diff --git a/ctdb/lib/replace/system/dir.h b/ctdb/lib/replace/system/dir.h
new file mode 100644 (file)
index 0000000..dec2d54
--- /dev/null
@@ -0,0 +1,67 @@
+#ifndef _system_dir_h
+#define _system_dir_h
+/* 
+   Unix SMB/CIFS implementation.
+
+   directory system include wrappers
+
+   Copyright (C) Andrew Tridgell 2004
+   
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#if HAVE_DIRENT_H
+# include <dirent.h>
+# define NAMLEN(dirent) strlen((dirent)->d_name)
+#else
+# define dirent direct
+# define NAMLEN(dirent) (dirent)->d_namlen
+# if HAVE_SYS_NDIR_H
+#  include <sys/ndir.h>
+# endif
+# if HAVE_SYS_DIR_H
+#  include <sys/dir.h>
+# endif
+# if HAVE_NDIR_H
+#  include <ndir.h>
+# endif
+#endif
+
+/* Without HAVE_MKDIR_MODE every mkdir(dir, mode) call is rewritten to the
+   one-argument form, i.e. the mode is dropped (presumably because the
+   platform mkdir() takes only the path). */
+#ifndef HAVE_MKDIR_MODE
+#define mkdir(dir, mode) mkdir(dir)
+#endif
+
+/* Predicates for the two special directory entries: ISDOT is true when
+ * PATH is exactly "." and ISDOTDOT when it is exactly "..".  PATH is
+ * expanded more than once, so pass an expression without side effects.
+ * These really should be inline functions.
+ */
+#ifndef ISDOT
+#define ISDOT(path) \
+       (((const char *)(path))[0] == '.' && \
+        ((const char *)(path))[1] == '\0')
+#endif
+
+#ifndef ISDOTDOT
+#define ISDOTDOT(path) \
+       (((const char *)(path))[0] == '.' && \
+        ((const char *)(path))[1] == '.' && \
+        ((const char *)(path))[2] == '\0')
+#endif
+
+#endif
diff --git a/ctdb/lib/replace/system/filesys.h b/ctdb/lib/replace/system/filesys.h
new file mode 100644 (file)
index 0000000..c8ac2b4
--- /dev/null
@@ -0,0 +1,277 @@
+#ifndef _system_filesys_h
+#define _system_filesys_h
+/*
+   Unix SMB/CIFS implementation.
+
+   filesystem system include wrappers
+
+   Copyright (C) Andrew Tridgell 2004
+
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+#include <unistd.h>
+#include <sys/stat.h>
+
+#ifdef HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+
+#ifdef HAVE_SYS_MOUNT_H
+#include <sys/mount.h>
+#endif
+
+#ifdef HAVE_MNTENT_H
+#include <mntent.h>
+#endif
+
+#ifdef HAVE_SYS_VFS_H
+#include <sys/vfs.h>
+#endif
+
+#ifdef HAVE_SYS_ACL_H
+#include <sys/acl.h>
+#endif
+
+#ifdef HAVE_ACL_LIBACL_H
+#include <acl/libacl.h>
+#endif
+
+#ifdef HAVE_SYS_FS_S5PARAM_H
+#include <sys/fs/s5param.h>
+#endif
+
+#if defined (HAVE_SYS_FILSYS_H) && !defined (_CRAY)
+#include <sys/filsys.h>
+#endif
+
+#ifdef HAVE_SYS_STATFS_H
+# include <sys/statfs.h>
+#endif
+
+#ifdef HAVE_DUSTAT_H
+#include <sys/dustat.h>
+#endif
+
+#ifdef HAVE_SYS_STATVFS_H
+#include <sys/statvfs.h>
+#endif
+
+#ifdef HAVE_SYS_FILIO_H
+#include <sys/filio.h>
+#endif
+
+#ifdef HAVE_SYS_FILE_H
+#include <sys/file.h>
+#endif
+
+#ifdef HAVE_FCNTL_H
+#include <fcntl.h>
+#else
+#ifdef HAVE_SYS_FCNTL_H
+#include <sys/fcntl.h>
+#endif
+#endif
+
+#ifdef HAVE_SYS_MODE_H
+/* apparently AIX needs this for S_ISLNK */
+#ifndef S_ISLNK
+#include <sys/mode.h>
+#endif
+#endif
+
+#ifdef HAVE_SYS_IOCTL_H
+#include <sys/ioctl.h>
+#endif
+
+#ifdef HAVE_SYS_UIO_H
+#include <sys/uio.h>
+#endif
+
+/*
+ * Veritas File System.  Often in addition to native.
+ * Quotas different.
+ */
+#if defined(HAVE_SYS_FS_VX_QUOTA_H)
+#define VXFS_QUOTA
+#endif
+
+#if HAVE_SYS_ATTRIBUTES_H
+#include <sys/attributes.h>
+#elif HAVE_ATTR_ATTRIBUTES_H
+#include <attr/attributes.h>
+#endif
+
+/* mutually exclusive (SuSE 8.2) */
+#if HAVE_ATTR_XATTR_H
+#include <attr/xattr.h>
+#elif HAVE_SYS_XATTR_H
+#include <sys/xattr.h>
+#endif
+
+#ifdef HAVE_SYS_EA_H
+#include <sys/ea.h>
+#endif
+
+#ifdef HAVE_SYS_EXTATTR_H
+#include <sys/extattr.h>
+#endif
+
+#ifdef HAVE_SYS_RESOURCE_H
+#include <sys/resource.h>
+#endif
+
+#ifndef XATTR_CREATE
+#define XATTR_CREATE  0x1       /* set value, fail if attr already exists */
+#endif
+
+#ifndef XATTR_REPLACE
+#define XATTR_REPLACE 0x2       /* set value, fail if attr does not exist */
+#endif
+
+/* Some POSIX definitions for those without */
+
+/* Fallbacks for platforms whose headers lack the directory type bits. */
+#ifndef S_IFDIR
+#define S_IFDIR         0x4000
+#endif
+#ifndef S_ISDIR
+/* The argument is parenthesised so that a compound expression such as
+   S_ISDIR(perm | S_IFDIR) is masked as a whole rather than in part. */
+#define S_ISDIR(mode)   (((mode) & 0xF000) == S_IFDIR)
+#endif
+#ifndef S_IRWXU
+#define S_IRWXU 00700           /* read, write, execute: owner */
+#endif
+#ifndef S_IRUSR
+#define S_IRUSR 00400           /* read permission: owner */
+#endif
+#ifndef S_IWUSR
+#define S_IWUSR 00200           /* write permission: owner */
+#endif
+#ifndef S_IXUSR
+#define S_IXUSR 00100           /* execute permission: owner */
+#endif
+#ifndef S_IRWXG
+#define S_IRWXG 00070           /* read, write, execute: group */
+#endif
+#ifndef S_IRGRP
+#define S_IRGRP 00040           /* read permission: group */
+#endif
+#ifndef S_IWGRP
+#define S_IWGRP 00020           /* write permission: group */
+#endif
+#ifndef S_IXGRP
+#define S_IXGRP 00010           /* execute permission: group */
+#endif
+#ifndef S_IRWXO
+#define S_IRWXO 00007           /* read, write, execute: other */
+#endif
+#ifndef S_IROTH
+#define S_IROTH 00004           /* read permission: other */
+#endif
+#ifndef S_IWOTH
+#define S_IWOTH 00002           /* write permission: other */
+#endif
+#ifndef S_IXOTH
+#define S_IXOTH 00001           /* execute permission: other */
+#endif
+
+#ifndef O_ACCMODE
+#define O_ACCMODE (O_RDONLY | O_WRONLY | O_RDWR)
+#endif
+
+#ifndef MAXPATHLEN
+#define MAXPATHLEN 256
+#endif
+
+#ifndef SEEK_SET
+#define SEEK_SET 0
+#endif
+
+#ifdef _WIN32
+#define mkdir(d,m) _mkdir(d)
+#endif
+
+/* uid_wrapper hook-up.  With UID_WRAPPER defined and UID_WRAPPER_DISABLE
+   not defined, the wrapper header is included, and UID_WRAPPER_REPLACE is
+   requested first unless UID_WRAPPER_NOT_REPLACE is defined.  Without
+   UID_WRAPPER, uwrap_enabled() is the constant 0.  Nothing is defined here
+   when both UID_WRAPPER and UID_WRAPPER_DISABLE are set. */
+#ifdef UID_WRAPPER
+# ifndef UID_WRAPPER_DISABLE
+#  ifndef UID_WRAPPER_NOT_REPLACE
+#   define UID_WRAPPER_REPLACE
+#  endif /* UID_WRAPPER_NOT_REPLACE */
+#  include "../uid_wrapper/uid_wrapper.h"
+# endif /* UID_WRAPPER_DISABLE */
+#else /* UID_WRAPPER */
+# define uwrap_enabled() 0
+#endif /* UID_WRAPPER */
+
+/*
+   this allows us to use a uniform error handling for our xattr
+   wrappers
+*/
+#ifndef ENOATTR
+#define ENOATTR ENODATA
+#endif
+
+
+/* Extended-attribute wrappers.  For each call whose HAVE_ macro is not
+   defined, or for all of them when XATTR_ADDITIONAL_OPTIONS is defined,
+   the standard name is redirected to the rep_ prefixed function declared
+   alongside it.  Path variants take a file name, f-variants a descriptor. */
+#if !defined(HAVE_GETXATTR) || defined(XATTR_ADDITIONAL_OPTIONS)
+ssize_t rep_getxattr (const char *path, const char *name, void *value, size_t size);
+#define getxattr(path, name, value, size) rep_getxattr(path, name, value, size)
+/* define is in "replace.h" */
+#endif
+
+#if !defined(HAVE_FGETXATTR) || defined(XATTR_ADDITIONAL_OPTIONS)
+ssize_t rep_fgetxattr (int filedes, const char *name, void *value, size_t size);
+#define fgetxattr(filedes, name, value, size) rep_fgetxattr(filedes, name, value, size)
+/* define is in "replace.h" */
+#endif
+
+#if !defined(HAVE_LISTXATTR) || defined(XATTR_ADDITIONAL_OPTIONS)
+ssize_t rep_listxattr (const char *path, char *list, size_t size);
+#define listxattr(path, list, size) rep_listxattr(path, list, size)
+/* define is in "replace.h" */
+#endif
+
+#if !defined(HAVE_FLISTXATTR) || defined(XATTR_ADDITIONAL_OPTIONS)
+ssize_t rep_flistxattr (int filedes, char *list, size_t size);
+#define flistxattr(filedes, value, size) rep_flistxattr(filedes, value, size)
+/* define is in "replace.h" */
+#endif
+
+#if !defined(HAVE_REMOVEXATTR) || defined(XATTR_ADDITIONAL_OPTIONS)
+int rep_removexattr (const char *path, const char *name);
+#define removexattr(path, name) rep_removexattr(path, name)
+/* define is in "replace.h" */
+#endif
+
+#if !defined(HAVE_FREMOVEXATTR) || defined(XATTR_ADDITIONAL_OPTIONS)
+int rep_fremovexattr (int filedes, const char *name);
+#define fremovexattr(filedes, name) rep_fremovexattr(filedes, name)
+/* define is in "replace.h" */
+#endif
+
+/* The set calls also carry a flags argument; see XATTR_CREATE and
+   XATTR_REPLACE. */
+#if !defined(HAVE_SETXATTR) || defined(XATTR_ADDITIONAL_OPTIONS)
+int rep_setxattr (const char *path, const char *name, const void *value, size_t size, int flags);
+#define setxattr(path, name, value, size, flags) rep_setxattr(path, name, value, size, flags)
+/* define is in "replace.h" */
+#endif
+
+#if !defined(HAVE_FSETXATTR) || defined(XATTR_ADDITIONAL_OPTIONS)
+int rep_fsetxattr (int filedes, const char *name, const void *value, size_t size, int flags);
+#define fsetxattr(filedes, name, value, size, flags) rep_fsetxattr(filedes, name, value, size, flags)
+/* define is in "replace.h" */
+#endif
+
+#endif
diff --git a/ctdb/lib/replace/system/glob.h b/ctdb/lib/replace/system/glob.h
new file mode 100644 (file)
index 0000000..3e23db6
--- /dev/null
@@ -0,0 +1,37 @@
+#ifndef _system_glob_h
+#define _system_glob_h
+/* 
+   Unix SMB/CIFS implementation.
+
+   glob system include wrappers
+
+   Copyright (C) Andrew Tridgell 2004
+   
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+#ifdef HAVE_GLOB_H
+#include <glob.h>
+#endif
+
+#ifdef HAVE_FNMATCH_H
+#include <fnmatch.h>
+#endif
+
+#endif
diff --git a/ctdb/lib/replace/system/gssapi.h b/ctdb/lib/replace/system/gssapi.h
new file mode 100644 (file)
index 0000000..6386c7b
--- /dev/null
@@ -0,0 +1,53 @@
+#ifndef _system_gssapi_h
+#define _system_gssapi_h
+
+/*
+   Unix SMB/CIFS implementation.
+
+   GSSAPI system include wrappers
+
+   Copyright (C) Andrew Tridgell 2004
+
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+#ifdef HAVE_GSSAPI
+
+#ifdef HAVE_GSSAPI_GSSAPI_EXT_H
+#include <gssapi/gssapi_ext.h>
+#elif HAVE_GSSAPI_GSSAPI_H
+#include <gssapi/gssapi.h>
+#elif HAVE_GSSAPI_GSSAPI_GENERIC_H
+#include <gssapi/gssapi_generic.h>
+#elif HAVE_GSSAPI_H
+#include <gssapi.h>
+#endif
+
+#if HAVE_GSSAPI_GSSAPI_KRB5_H
+#include <gssapi/gssapi_krb5.h>
+#endif
+
+#if HAVE_GSSAPI_GSSAPI_SPNEGO_H
+#include <gssapi/gssapi_spnego.h>
+#elif HAVE_GSSAPI_SPNEGO_H
+#include <gssapi_spnego.h>
+#endif
+
+#endif
+#endif
diff --git a/ctdb/lib/replace/system/iconv.h b/ctdb/lib/replace/system/iconv.h
new file mode 100644 (file)
index 0000000..3c8a71f
--- /dev/null
@@ -0,0 +1,57 @@
+#ifndef _system_iconv_h
+#define _system_iconv_h
+/* 
+   Unix SMB/CIFS implementation.
+
+   iconv system include wrappers
+
+   Copyright (C) Andrew Tridgell 2004
+   
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+#if !defined(HAVE_ICONV) && defined(HAVE_ICONV_H)
+#define HAVE_ICONV
+#endif
+
+#if !defined(HAVE_GICONV) && defined(HAVE_GICONV_H)
+#define HAVE_GICONV
+#endif
+
+#if !defined(HAVE_BICONV) && defined(HAVE_BICONV_H)
+#define HAVE_BICONV
+#endif
+
+#ifdef HAVE_NATIVE_ICONV
+#if defined(HAVE_ICONV)
+#include <iconv.h>
+#elif defined(HAVE_GICONV)
+#include <giconv.h>
+#elif defined(HAVE_BICONV)
+#include <biconv.h>
+#endif
+#endif /* HAVE_NATIVE_ICONV */
+
+/* needed for some systems without iconv. Doesn't really matter
+   what error code we use */
+#ifndef EILSEQ
+#define EILSEQ EIO
+#endif
+
+#endif
diff --git a/ctdb/lib/replace/system/kerberos.h b/ctdb/lib/replace/system/kerberos.h
new file mode 100644 (file)
index 0000000..636ce0f
--- /dev/null
@@ -0,0 +1,41 @@
+#ifndef _system_kerberos_h
+#define _system_kerberos_h
+
+/* 
+   Unix SMB/CIFS implementation.
+
+   kerberos system include wrappers
+
+   Copyright (C) Andrew Tridgell 2004
+
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+#ifdef HAVE_KRB5
+
+#if HAVE_KRB5_H
+#include <krb5.h>
+#endif
+
+#if HAVE_COM_ERR_H
+#include <com_err.h>
+#endif
+
+#endif
+#endif
diff --git a/ctdb/lib/replace/system/locale.h b/ctdb/lib/replace/system/locale.h
new file mode 100644 (file)
index 0000000..504a3bb
--- /dev/null
@@ -0,0 +1,42 @@
+#ifndef _system_locale_h
+#define _system_locale_h
+
+/* 
+   Unix SMB/CIFS implementation.
+
+   locale include wrappers
+
+   Copyright (C) Andrew Tridgell 2004
+   
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+#ifdef HAVE_CTYPE_H
+#include <ctype.h>
+#endif
+
+#ifdef HAVE_LOCALE_H
+#include <locale.h>
+#endif
+
+#ifdef HAVE_LANGINFO_H
+#include <langinfo.h>
+#endif
+
+#endif
diff --git a/ctdb/lib/replace/system/network.h b/ctdb/lib/replace/system/network.h
new file mode 100644 (file)
index 0000000..7cb8d7b
--- /dev/null
@@ -0,0 +1,390 @@
+#ifndef _system_network_h
+#define _system_network_h
+/* 
+   Unix SMB/CIFS implementation.
+
+   networking system include wrappers
+
+   Copyright (C) Andrew Tridgell 2004
+   Copyright (C) Jelmer Vernooij 2007
+   
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+#ifndef LIBREPLACE_NETWORK_CHECKS
+#error "AC_LIBREPLACE_NETWORK_CHECKS missing in configure"
+#endif
+
+#include <unistd.h>
+
+#ifdef HAVE_SYS_SOCKET_H
+#include <sys/socket.h>
+#endif
+
+#ifdef HAVE_UNIXSOCKET
+#include <sys/un.h>
+#endif
+
+#ifdef HAVE_NETINET_IN_H
+#include <netinet/in.h>
+#endif
+#ifdef HAVE_ARPA_INET_H
+#include <arpa/inet.h>
+#endif
+
+#ifdef HAVE_NETDB_H
+#include <netdb.h>
+#endif
+
+#ifdef HAVE_NETINET_TCP_H
+#include <netinet/tcp.h>
+#endif
+
+/*
+ * The next three includes are needed to access the IPTOS_* options
+ * on some systems.
+ */
+
+#ifdef HAVE_NETINET_IN_SYSTM_H
+#include <netinet/in_systm.h>
+#endif
+
+#ifdef HAVE_NETINET_IN_IP_H
+#include <netinet/in_ip.h>
+#endif
+
+#ifdef HAVE_NETINET_IP_H
+#include <netinet/ip.h>
+#endif
+
+#ifdef HAVE_NET_IF_H
+#include <net/if.h>
+#endif
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+
+#ifdef HAVE_SYS_IOCTL_H
+#include <sys/ioctl.h>
+#endif
+
+#ifdef HAVE_SYS_UIO_H
+#include <sys/uio.h>
+#endif
+
+#ifdef HAVE_STROPTS_H
+#include <stropts.h>
+#endif
+
+#ifndef HAVE_SOCKLEN_T
+#define HAVE_SOCKLEN_T
+typedef int socklen_t;
+#endif
+
+#if !defined (HAVE_INET_NTOA) || defined(REPLACE_INET_NTOA)
+/* define is in "replace.h" */
+char *rep_inet_ntoa(struct in_addr ip);
+#endif
+
+#ifndef HAVE_INET_PTON
+/* define is in "replace.h" */
+int rep_inet_pton(int af, const char *src, void *dst);
+#endif
+
+#ifndef HAVE_INET_NTOP
+/* define is in "replace.h" */
+const char *rep_inet_ntop(int af, const void *src, char *dst, socklen_t size);
+#endif
+
+#ifndef HAVE_INET_ATON
+/* define is in "replace.h" */
+int rep_inet_aton(const char *src, struct in_addr *dst);
+#endif
+
+#ifndef HAVE_CONNECT
+/* define is in "replace.h" */
+int rep_connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
+#endif
+
+#ifndef HAVE_GETHOSTBYNAME
+/* define is in "replace.h" */
+struct hostent *rep_gethostbyname(const char *name);
+#endif
+
+#ifdef HAVE_IFADDRS_H
+#include <ifaddrs.h>
+#endif
+
+#ifndef HAVE_STRUCT_IFADDRS
+struct ifaddrs {
+       struct ifaddrs   *ifa_next;         /* Pointer to next struct */
+       char             *ifa_name;         /* Interface name */
+       unsigned int     ifa_flags;         /* Interface flags */
+       struct sockaddr  *ifa_addr;         /* Interface address */
+       struct sockaddr  *ifa_netmask;      /* Interface netmask */
+#undef ifa_dstaddr
+       struct sockaddr  *ifa_dstaddr;      /* P2P interface destination */
+       void             *ifa_data;         /* Address specific data */
+};
+#endif
+
+#ifndef HAVE_GETIFADDRS
+int rep_getifaddrs(struct ifaddrs **);
+#endif
+
+#ifndef HAVE_FREEIFADDRS
+void rep_freeifaddrs(struct ifaddrs *);
+#endif
+
+#ifndef HAVE_SOCKETPAIR
+/* define is in "replace.h" */
+int rep_socketpair(int d, int type, int protocol, int sv[2]);
+#endif
+
+/*
+ * Some systems have getaddrinfo but not the
+ * defines needed to use it.
+ */
+
+/* Various macros that ought to be in <netdb.h>, but might not be */
+
+#ifndef EAI_FAIL
+#define EAI_BADFLAGS   (-1)
+#define EAI_NONAME     (-2)
+#define EAI_AGAIN      (-3)
+#define EAI_FAIL       (-4)
+#define EAI_FAMILY     (-6)
+#define EAI_SOCKTYPE   (-7)
+#define EAI_SERVICE    (-8)
+#define EAI_MEMORY     (-10)
+#define EAI_SYSTEM     (-11)
+#endif   /* !EAI_FAIL */
+
+#ifndef AI_PASSIVE
+#define AI_PASSIVE     0x0001
+#endif
+
+#ifndef AI_CANONNAME
+#define AI_CANONNAME   0x0002
+#endif
+
+#ifndef AI_NUMERICHOST
+/*
+ * some platforms don't support AI_NUMERICHOST; define as zero if using
+ * the system version of getaddrinfo...
+ */
+#if defined(HAVE_STRUCT_ADDRINFO) && defined(HAVE_GETADDRINFO)
+#define AI_NUMERICHOST 0
+#else
+#define AI_NUMERICHOST 0x0004
+#endif
+#endif
+
+/*
+ * Some of the functions in source3/lib/util_sock.c use AI_ADDRCONFIG. On QNX
+ * 6.3.0, this macro is defined but, if it's used, getaddrinfo will fail. This
+ * prevents smbd from opening any sockets.
+ *
+ * If I undefine AI_ADDRCONFIG on such systems and define it to be 0,
+ * this works around the issue.
+ */
+#ifdef __QNX__
+#include <sys/neutrino.h>
+#if _NTO_VERSION == 630
+#undef AI_ADDRCONFIG
+#endif
+#endif
+#ifndef AI_ADDRCONFIG
+/*
+ * logic copied from AI_NUMERICHOST
+ */
+#if defined(HAVE_STRUCT_ADDRINFO) && defined(HAVE_GETADDRINFO)
+#define AI_ADDRCONFIG  0
+#else
+#define AI_ADDRCONFIG  0x0020
+#endif
+#endif
+
+#ifndef AI_NUMERICSERV
+/*
+ * logic copied from AI_NUMERICHOST
+ */
+#if defined(HAVE_STRUCT_ADDRINFO) && defined(HAVE_GETADDRINFO)
+#define AI_NUMERICSERV 0
+#else
+#define AI_NUMERICSERV 0x0400
+#endif
+#endif
+
+#ifndef NI_NUMERICHOST
+#define NI_NUMERICHOST 1
+#endif
+
+#ifndef NI_NUMERICSERV
+#define NI_NUMERICSERV 2
+#endif
+
+#ifndef NI_NOFQDN
+#define NI_NOFQDN      4
+#endif
+
+#ifndef NI_NAMEREQD
+#define NI_NAMEREQD    8
+#endif
+
+#ifndef NI_DGRAM
+#define NI_DGRAM       16
+#endif
+
+
+#ifndef NI_MAXHOST
+#define NI_MAXHOST     1025
+#endif
+
+#ifndef NI_MAXSERV
+#define NI_MAXSERV     32
+#endif
+
+/*
+ * glibc on linux doesn't seem to have MSG_WAITALL
+ * defined. I think the kernel has it though..
+ */
+#ifndef MSG_WAITALL
+#define MSG_WAITALL 0
+#endif
+
+#ifndef INADDR_LOOPBACK
+#define INADDR_LOOPBACK 0x7f000001
+#endif
+
+#ifndef INADDR_NONE
+#define INADDR_NONE 0xffffffff
+#endif
+
+#ifndef EAFNOSUPPORT
+#define EAFNOSUPPORT EINVAL
+#endif
+
+#ifndef INET_ADDRSTRLEN
+#define INET_ADDRSTRLEN 16
+#endif
+
+#ifndef INET6_ADDRSTRLEN
+#define INET6_ADDRSTRLEN 46
+#endif
+
+#ifndef HOST_NAME_MAX
+#define HOST_NAME_MAX 255
+#endif
+
+#ifndef MAXHOSTNAMELEN
+#define MAXHOSTNAMELEN HOST_NAME_MAX
+#endif
+
+#ifndef HAVE_SA_FAMILY_T
+#define HAVE_SA_FAMILY_T
+typedef unsigned short int sa_family_t;
+#endif
+
+#ifndef HAVE_STRUCT_SOCKADDR_STORAGE
+#define HAVE_STRUCT_SOCKADDR_STORAGE
+#ifdef HAVE_STRUCT_SOCKADDR_IN6
+#define sockaddr_storage sockaddr_in6
+#define ss_family sin6_family
+#define HAVE_SS_FAMILY 1
+#else /*HAVE_STRUCT_SOCKADDR_IN6*/
+#define sockaddr_storage sockaddr_in
+#define ss_family sin_family
+#define HAVE_SS_FAMILY 1
+#endif /*HAVE_STRUCT_SOCKADDR_IN6*/
+#endif /*HAVE_STRUCT_SOCKADDR_STORAGE*/
+
+#ifndef HAVE_SS_FAMILY
+#ifdef HAVE___SS_FAMILY
+#define ss_family __ss_family
+#define HAVE_SS_FAMILY 1
+#endif
+#endif
+
+#ifndef IOV_MAX
+# ifdef UIO_MAXIOV
+#  define IOV_MAX UIO_MAXIOV
+# else
+#  ifdef __sgi
+    /*
+     * IRIX 6.5 has sysconf(_SC_IOV_MAX)
+     * which might return 512 or bigger
+     */
+#   define IOV_MAX 512
+#  endif
+# endif
+#endif
+
+#ifndef HAVE_STRUCT_ADDRINFO
+#define HAVE_STRUCT_ADDRINFO
+struct addrinfo {
+       int                     ai_flags;
+       int                     ai_family;
+       int                     ai_socktype;
+       int                     ai_protocol;
+       socklen_t               ai_addrlen;
+       struct sockaddr         *ai_addr;
+       char                    *ai_canonname;
+       struct addrinfo         *ai_next;
+};
+#endif   /* HAVE_STRUCT_ADDRINFO */
+
+#if !defined(HAVE_GETADDRINFO)
+#include "getaddrinfo.h"
+#endif
+
+/* Needed for some systems that don't define it (Solaris). */
+#ifndef ifr_netmask
+#define ifr_netmask ifr_addr
+#endif
+
+/* Some old Linux systems have broken header files */
+#ifdef HAVE_IPV6
+#ifdef HAVE_LINUX_IPV6_V6ONLY_26
+#define IPV6_V6ONLY 26
+#endif /* HAVE_LINUX_IPV6_V6ONLY_26 */
+#endif /* HAVE_IPV6 */
+
+#ifdef SOCKET_WRAPPER
+#ifndef SOCKET_WRAPPER_DISABLE
+#ifndef SOCKET_WRAPPER_NOT_REPLACE
+#define SOCKET_WRAPPER_REPLACE
+#endif /* SOCKET_WRAPPER_NOT_REPLACE */
+#include "../socket_wrapper/socket_wrapper.h"
+#endif /* SOCKET_WRAPPER_DISABLE */
+#endif /* SOCKET_WRAPPER */
+
+#ifdef UID_WRAPPER
+# ifndef UID_WRAPPER_DISABLE
+#  ifndef UID_WRAPPER_NOT_REPLACE
+#   define UID_WRAPPER_REPLACE
+#  endif /* UID_WRAPPER_NOT_REPLACE */
+#  include "../uid_wrapper/uid_wrapper.h"
+# endif /* UID_WRAPPER_DISABLE */
+#else /* UID_WRAPPER */
+# define uwrap_enabled() 0
+#endif /* UID_WRAPPER */
+
+#endif
diff --git a/ctdb/lib/replace/system/passwd.h b/ctdb/lib/replace/system/passwd.h
new file mode 100644 (file)
index 0000000..8257e06
--- /dev/null
@@ -0,0 +1,114 @@
+#ifndef _system_passwd_h
+#define _system_passwd_h
+
+/*
+   Unix SMB/CIFS implementation.
+
+   passwd system include wrappers
+
+   Copyright (C) Andrew Tridgell 2004
+
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+/* this needs to be included before nss_wrapper.h on some systems */
+#include <unistd.h>
+
+#ifdef HAVE_PWD_H
+#include <pwd.h>
+#endif
+#ifdef HAVE_GRP_H
+#include <grp.h>
+#endif
+#ifdef HAVE_SYS_PRIV_H
+#include <sys/priv.h>
+#endif
+#ifdef HAVE_SYS_ID_H
+#include <sys/id.h>
+#endif
+
+#ifdef HAVE_CRYPT_H
+#include <crypt.h>
+#endif
+
+#ifdef HAVE_SHADOW_H
+#include <shadow.h>
+#endif
+
+#ifdef HAVE_SYS_SECURITY_H
+#include <sys/security.h>
+#include <prot.h>
+#define PASSWORD_LENGTH 16
+#endif  /* HAVE_SYS_SECURITY_H */
+
+#ifdef HAVE_GETPWANAM
+#include <sys/label.h>
+#include <sys/audit.h>
+#include <pwdadj.h>
+#endif
+
+#ifdef HAVE_COMPAT_H
+#include <compat.h>
+#endif
+
+#ifndef NGROUPS_MAX
+#define NGROUPS_MAX 32 /* Guess... */
+#endif
+
+/* what is the longest significant password available on your system?
+ Knowing this speeds up password searches a lot */
+#ifndef PASSWORD_LENGTH
+#define PASSWORD_LENGTH 8
+#endif
+
+#if defined(HAVE_PUTPRPWNAM) && defined(AUTH_CLEARTEXT_SEG_CHARS)
+#define OSF1_ENH_SEC 1
+#endif
+
+#ifndef ALLOW_CHANGE_PASSWORD
+#if (defined(HAVE_TERMIOS_H) && defined(HAVE_DUP2) && defined(HAVE_SETSID))
+#define ALLOW_CHANGE_PASSWORD 1
+#endif
+#endif
+
+#if defined(HAVE_CRYPT16) && defined(HAVE_GETAUTHUID)
+#define ULTRIX_AUTH 1
+#endif
+
+#ifdef NSS_WRAPPER
+#ifndef NSS_WRAPPER_DISABLE
+#ifndef NSS_WRAPPER_NOT_REPLACE
+#define NSS_WRAPPER_REPLACE
+#endif /* NSS_WRAPPER_NOT_REPLACE */
+#include "../nss_wrapper/nss_wrapper.h"
+#endif /* NSS_WRAPPER_DISABLE */
+#endif /* NSS_WRAPPER */
+
+#ifdef UID_WRAPPER
+# ifndef UID_WRAPPER_DISABLE
+#  ifndef UID_WRAPPER_NOT_REPLACE
+#   define UID_WRAPPER_REPLACE
+#  endif /* UID_WRAPPER_NOT_REPLACE */
+#  include "../uid_wrapper/uid_wrapper.h"
+# endif /* UID_WRAPPER_DISABLE */
+#else /* UID_WRAPPER */
+# define uwrap_enabled() 0
+#endif /* UID_WRAPPER */
+
+#endif
diff --git a/ctdb/lib/replace/system/readline.h b/ctdb/lib/replace/system/readline.h
new file mode 100644 (file)
index 0000000..e6b8fb9
--- /dev/null
@@ -0,0 +1,58 @@
+#ifndef _system_readline_h
+#define _system_readline_h
+/* 
+   Unix SMB/CIFS implementation.
+
+   Readline wrappers
+   
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+#ifdef HAVE_LIBREADLINE
+#  ifdef HAVE_READLINE_READLINE_H
+#    include <readline/readline.h>
+#    ifdef HAVE_READLINE_HISTORY_H
+#      include <readline/history.h>
+#    endif
+#  else
+#    ifdef HAVE_READLINE_H
+#      include <readline.h>
+#      ifdef HAVE_HISTORY_H
+#        include <history.h>
+#      endif
+#    else
+#      undef HAVE_LIBREADLINE
+#    endif
+#  endif
+#endif
+
+#ifdef HAVE_NEW_LIBREADLINE
+#ifdef HAVE_CPPFUNCTION
+#  define RL_COMPLETION_CAST (CPPFunction *)
+#elif HAVE_RL_COMPLETION_T
+#  define RL_COMPLETION_CAST (rl_completion_t *)
+#else
+#  define RL_COMPLETION_CAST
+#endif
+#else
+/* This type is missing from libreadline<4.0  (approximately) */
+#  define RL_COMPLETION_CAST
+#endif /* HAVE_NEW_LIBREADLINE */
+
+#endif
diff --git a/ctdb/lib/replace/system/select.h b/ctdb/lib/replace/system/select.h
new file mode 100644 (file)
index 0000000..11c5390
--- /dev/null
@@ -0,0 +1,77 @@
+#ifndef _system_select_h
+#define _system_select_h
+/* 
+   Unix SMB/CIFS implementation.
+
+   select system include wrappers
+
+   Copyright (C) Andrew Tridgell 2004
+   
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+#ifdef HAVE_SYS_SELECT_H
+#include <sys/select.h>
+#endif
+
+#ifdef HAVE_SYS_EPOLL_H
+#include <sys/epoll.h>
+#endif
+
+#ifndef SELECT_CAST
+#define SELECT_CAST
+#endif
+
+#ifdef HAVE_POLL
+
+#include <poll.h>
+
+#else
+
+/* Type used for the number of file descriptors.  */
+typedef unsigned long int nfds_t;
+
+/* Data structure describing a polling request.  */
+struct pollfd
+{
+       int fd;            /* File descriptor to poll.  */
+       short int events;  /* Types of events poller cares about.  */
+       short int revents; /* Types of events that actually occurred.  */
+};
+
+/* Event types that can be polled for.  These bits may be set in `events'
+   to indicate the interesting event types; they will appear in `revents'
+   to indicate the status of the file descriptor.  */
+#define POLLIN         0x001           /* There is data to read.  */
+#define POLLPRI                0x002           /* There is urgent data to read.  */
+#define POLLOUT                0x004           /* Writing now will not block.  */
+#define POLLRDNORM     0x040           /* Normal data may be read.  */
+#define POLLRDBAND     0x080           /* Priority data may be read.  */
+#define POLLWRNORM     0x100           /* Writing now will not block.  */
+#define POLLWRBAND     0x200           /* Priority data may be written.  */
+#define POLLERR                0x008           /* Error condition.  */
+#define POLLHUP                0x010           /* Hung up.  */
+#define POLLNVAL       0x020           /* Invalid polling request.  */
+
+/* define is in "replace.h" */
+int rep_poll(struct pollfd *fds, nfds_t nfds, int timeout);
+
+#endif
+
+#endif
diff --git a/ctdb/lib/replace/system/shmem.h b/ctdb/lib/replace/system/shmem.h
new file mode 100644 (file)
index 0000000..64fe39b
--- /dev/null
@@ -0,0 +1,59 @@
+#ifndef _system_shmem_h
+#define _system_shmem_h
+/* 
+   Unix SMB/CIFS implementation.
+
+   shared memory system include wrappers
+
+   Copyright (C) Andrew Tridgell 2004
+   
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+#if defined(HAVE_SYS_IPC_H)
+#include <sys/ipc.h>
+#endif /* HAVE_SYS_IPC_H */
+
+#if defined(HAVE_SYS_SHM_H)
+#include <sys/shm.h>
+#endif /* HAVE_SYS_SHM_H */
+
+#ifdef HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+
+/* NetBSD doesn't have these */
+#ifndef SHM_R
+#define SHM_R 0400
+#endif
+
+#ifndef SHM_W
+#define SHM_W 0200
+#endif
+
+
+#ifndef MAP_FILE
+#define MAP_FILE 0
+#endif
+
+#ifndef MAP_FAILED
+#define MAP_FAILED ((void *)-1)
+#endif
+
+#endif
diff --git a/ctdb/lib/replace/system/syslog.h b/ctdb/lib/replace/system/syslog.h
new file mode 100644 (file)
index 0000000..104be1d
--- /dev/null
@@ -0,0 +1,70 @@
+#ifndef _system_syslog_h
+#define _system_syslog_h
+/* 
+   Unix SMB/CIFS implementation.
+
+   syslog system include wrappers
+
+   Copyright (C) Andrew Tridgell 2004
+   
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+#ifdef HAVE_SYSLOG_H
+#include <syslog.h>
+#else
+#ifdef HAVE_SYS_SYSLOG_H
+#include <sys/syslog.h>
+#endif
+#endif
+
+/* For sys_adminlog(). */
+#ifndef LOG_EMERG
+#define LOG_EMERG       0       /* system is unusable */
+#endif
+
+#ifndef LOG_ALERT
+#define LOG_ALERT       1       /* action must be taken immediately */
+#endif
+
+#ifndef LOG_CRIT
+#define LOG_CRIT        2       /* critical conditions */
+#endif
+
+#ifndef LOG_ERR
+#define LOG_ERR         3       /* error conditions */
+#endif
+
+#ifndef LOG_WARNING
+#define LOG_WARNING     4       /* warning conditions */
+#endif
+
+#ifndef LOG_NOTICE
+#define LOG_NOTICE      5       /* normal but significant condition */
+#endif
+
+#ifndef LOG_INFO
+#define LOG_INFO        6       /* informational */
+#endif
+
+#ifndef LOG_DEBUG
+#define LOG_DEBUG       7       /* debug-level messages */
+#endif
+
+#endif
diff --git a/ctdb/lib/replace/system/terminal.h b/ctdb/lib/replace/system/terminal.h
new file mode 100644 (file)
index 0000000..9ad601a
--- /dev/null
@@ -0,0 +1,46 @@
+#ifndef _system_terminal_h
+#define _system_terminal_h
+/* 
+   Unix SMB/CIFS implementation.
+
+   terminal system include wrappers
+
+   Copyright (C) Andrew Tridgell 2004
+   
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+#ifdef SUNOS4
+/* on SUNOS4 termios.h conflicts with sys/ioctl.h */
+#undef HAVE_TERMIOS_H
+#endif
+
+
+#if defined(HAVE_TERMIOS_H)
+/* POSIX terminal handling. */
+#include <termios.h>
+#elif defined(HAVE_TERMIO_H)
+/* Older SYSV terminal handling - don't use if we can avoid it. */
+#include <termio.h>
+#elif defined(HAVE_SYS_TERMIO_H)
+/* Older SYSV terminal handling - don't use if we can avoid it. */
+#include <sys/termio.h>
+#endif
+
+#endif
diff --git a/ctdb/lib/replace/system/time.h b/ctdb/lib/replace/system/time.h
new file mode 100644 (file)
index 0000000..b6d2609
--- /dev/null
@@ -0,0 +1,91 @@
+#ifndef _system_time_h
+#define _system_time_h
+/* 
+   Unix SMB/CIFS implementation.
+
+   time system include wrappers
+
+   Copyright (C) Andrew Tridgell 2004
+
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+#ifdef TIME_WITH_SYS_TIME
+#include <sys/time.h>
+#include <time.h>
+#else
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#else
+#include <time.h>
+#endif
+#endif
+
+#ifdef HAVE_UTIME_H
+#include <utime.h>
+#else
+struct utimbuf {
+       time_t actime;       /* access time */
+       time_t modtime;      /* modification time */
+};
+#endif
+
+#ifndef HAVE_STRUCT_TIMESPEC
+struct timespec {
+       time_t tv_sec;            /* Seconds.  */
+       long tv_nsec;           /* Nanoseconds.  */
+};
+#endif
+
+#ifndef HAVE_MKTIME
+/* define is in "replace.h" */
+time_t rep_mktime(struct tm *t);
+#endif
+
+#ifndef HAVE_TIMEGM
+/* define is in "replace.h" */
+time_t rep_timegm(struct tm *tm);
+#endif
+
+#ifndef HAVE_UTIME
+/* define is in "replace.h" */
+int rep_utime(const char *filename, const struct utimbuf *buf);
+#endif
+
+#ifndef HAVE_UTIMES
+/* define is in "replace.h" */
+int rep_utimes(const char *filename, const struct timeval tv[2]);
+#endif
+
+#ifndef HAVE_CLOCK_GETTIME
+/* CLOCK_REALTIME is required by POSIX */
+#define CLOCK_REALTIME 0
+typedef int clockid_t;
+int rep_clock_gettime(clockid_t clk_id, struct timespec *tp);
+#endif
+/* make sure we have a best effort CUSTOM_CLOCK_MONOTONIC we can rely on */
+#if defined(CLOCK_MONOTONIC)
+#define CUSTOM_CLOCK_MONOTONIC CLOCK_MONOTONIC
+#elif defined(CLOCK_HIGHRES)
+#define CUSTOM_CLOCK_MONOTONIC CLOCK_HIGHRES
+#else
+#define CUSTOM_CLOCK_MONOTONIC CLOCK_REALTIME
+#endif
+
+#endif
diff --git a/ctdb/lib/replace/system/wait.h b/ctdb/lib/replace/system/wait.h
new file mode 100644 (file)
index 0000000..146c61a
--- /dev/null
@@ -0,0 +1,55 @@
+#ifndef _system_wait_h
+#define _system_wait_h
+/* 
+   Unix SMB/CIFS implementation.
+
+   waitpid system include wrappers
+
+   Copyright (C) Andrew Tridgell 2004
+
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+#ifdef HAVE_SYS_WAIT_H
+#include <sys/wait.h>
+#endif
+
+#include <signal.h>
+
+#ifndef SIGCLD
+#define SIGCLD SIGCHLD
+#endif
+
+#ifdef HAVE_SETJMP_H
+#include <setjmp.h>
+#endif
+
+#ifdef HAVE_SYS_UCONTEXT_H
+#include <sys/ucontext.h>
+#endif
+
+#if !defined(HAVE_SIG_ATOMIC_T_TYPE)
+typedef int sig_atomic_t;
+#endif
+
+#if !defined(HAVE_WAITPID) && defined(HAVE_WAIT4)
+int rep_waitpid(pid_t pid,int *status,int options); /* replacement prototype */
+#endif /* !HAVE_WAITPID && HAVE_WAIT4 */
+
+#endif
diff --git a/ctdb/lib/replace/system/wscript_configure b/ctdb/lib/replace/system/wscript_configure
new file mode 100644 (file)
index 0000000..2035474
--- /dev/null
@@ -0,0 +1,26 @@
+#!/usr/bin/env python
+
+conf.CHECK_HEADERS('sys/capability.h')
+conf.CHECK_FUNCS('getpwnam_r getpwuid_r getpwent_r')
+
+# solaris variants of getXXent_r
+conf.CHECK_C_PROTOTYPE('getpwent_r',
+                       'struct passwd *getpwent_r(struct passwd *src, char *buf, int buflen)',
+                       define='SOLARIS_GETPWENT_R', headers='pwd.h')
+conf.CHECK_C_PROTOTYPE('getgrent_r',
+                       'struct group *getgrent_r(struct group *src, char *buf, int buflen)',
+                       define='SOLARIS_GETGRENT_R', headers='grp.h')
+
+# the irix variants
+conf.CHECK_C_PROTOTYPE('getpwent_r',
+                       'struct passwd *getpwent_r(struct passwd *src, char *buf, size_t buflen)',
+                       define='SOLARIS_GETPWENT_R', headers='pwd.h')
+conf.CHECK_C_PROTOTYPE('getgrent_r',
+                       'struct group *getgrent_r(struct group *src, char *buf, size_t buflen)',
+                       define='SOLARIS_GETGRENT_R', headers='grp.h')
+
+conf.CHECK_FUNCS('getgrouplist')
+conf.CHECK_HEADERS('ctype.h locale.h langinfo.h')
+conf.CHECK_HEADERS('fnmatch.h locale.h langinfo.h')
+conf.CHECK_HEADERS('sys/ipc.h sys/mman.h sys/shm.h')
+conf.CHECK_HEADERS('termios.h termio.h sys/termio.h')
diff --git a/ctdb/lib/replace/test/getifaddrs.c b/ctdb/lib/replace/test/getifaddrs.c
new file mode 100644 (file)
index 0000000..d325d8b
--- /dev/null
@@ -0,0 +1,101 @@
+/*
+ * Unix SMB/CIFS implementation.
+ *
+ * libreplace getifaddrs test
+ *
+ * Copyright (C) Michael Adam <obnox@samba.org> 2008
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef AUTOCONF_TEST
+#include "replace.h"
+#include "system/network.h"
+#include "replace-test.h"
+#endif
+
+#ifdef HAVE_INET_NTOP
+#define rep_inet_ntop inet_ntop
+#endif
+
+static const char *format_sockaddr(struct sockaddr *addr,
+                                  char *addrstring,
+                                  socklen_t addrlen)
+{
+       const char *result = NULL;
+
+       if (addr->sa_family == AF_INET) {
+               result = rep_inet_ntop(AF_INET,
+                                      &((struct sockaddr_in *)addr)->sin_addr,
+                                      addrstring,
+                                      addrlen);
+#ifdef HAVE_STRUCT_SOCKADDR_IN6
+       } else if (addr->sa_family == AF_INET6) {
+               result = rep_inet_ntop(AF_INET6,
+                                      &((struct sockaddr_in6 *)addr)->sin6_addr,
+                                      addrstring,
+                                      addrlen);
+#endif
+       }
+       return result;
+}
+
+int getifaddrs_test(void)
+{
+       struct ifaddrs *ifs = NULL;
+       struct ifaddrs *ifs_head = NULL;
+       int ret;
+
+       ret = getifaddrs(&ifs);
+       ifs_head = ifs;
+       if (ret != 0) {
+               fprintf(stderr, "getifaddrs() failed: %s\n", strerror(errno));
+               return 1;
+       }
+
+       while (ifs) {
+               printf("%-10s ", ifs->ifa_name);
+               if (ifs->ifa_addr != NULL) {
+                       char addrstring[INET6_ADDRSTRLEN];
+                       const char *result;
+
+                       result = format_sockaddr(ifs->ifa_addr,
+                                                addrstring,
+                                                sizeof(addrstring));
+                       if (result != NULL) {
+                               printf("IP=%s ", addrstring);
+                       }
+
+                       if (ifs->ifa_netmask != NULL) {
+                               result = format_sockaddr(ifs->ifa_netmask,
+                                                        addrstring,
+                                                        sizeof(addrstring));
+                               if (result != NULL) {
+                                       printf("NETMASK=%s", addrstring);
+                               }
+                       } else {
+                               printf("AF=%d ", ifs->ifa_addr->sa_family);
+                       }
+               } else {
+                       printf("<no address>");
+               }
+
+               printf("\n");
+               ifs = ifs->ifa_next;
+       }
+
+       freeifaddrs(ifs_head);
+
+       return 0;
+}
diff --git a/ctdb/lib/replace/test/incoherent_mmap.c b/ctdb/lib/replace/test/incoherent_mmap.c
new file mode 100644 (file)
index 0000000..ee288fd
--- /dev/null
@@ -0,0 +1,83 @@
+/* In OpenBSD, if you write to a file, another process doesn't see it
+ * in its mmap.  Returns with exit status 0 if that is the case, 1 if
+ * it's coherent, and other if there's a problem. */
+#include <err.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+
+#define DATA "coherent.mmap"
+
+int main(int argc, char *argv[])
+{
+       int tochild[2], toparent[2];
+       int fd;
+       volatile unsigned char *map;
+       unsigned char *page;
+        const char *fname = argv[1];
+       char c = 0;
+
+       if (pipe(tochild) != 0 || pipe(toparent) != 0)
+               err(2, "Creating pipe");
+
+       if (!fname)
+               fname = DATA;
+
+       fd = open(fname, O_RDWR|O_CREAT|O_TRUNC, 0600);
+       if (fd < 0)
+               err(2, "opening %s", fname);
+       unlink(fname);
+
+       switch (fork()) {
+       case -1:
+               err(2, "Fork");
+       case 0:
+               close(tochild[1]);
+               close(toparent[0]);
+
+               /* Wait for parent to create file. */
+               if (read(tochild[0], &c, 1) != 1)
+                       err(2, "reading from parent");
+
+               /* Alter first byte. */
+               pwrite(fd, &c, 1, 0);
+
+               if (write(toparent[1], &c, 1) != 1)
+                       err(2, "writing to parent");
+               exit(0);
+
+       default:
+               close(tochild[0]);
+               close(toparent[1]);
+
+               /* Create a file and mmap it. */
+               page = malloc(getpagesize());
+               memset(page, 0x42, getpagesize());
+               if (write(fd, page, getpagesize()) != getpagesize())
+                       err(2, "writing first page");
+               map = mmap(NULL, getpagesize(), PROT_READ|PROT_WRITE,
+                          MAP_SHARED, fd, 0);
+               if (map == MAP_FAILED)
+                       err(2, "mapping file");
+
+               if (*map != 0x42)
+                       errx(2, "first byte isn't 0x42!");
+
+               /* Tell child to alter file. */
+               if (write(tochild[1], &c, 1) != 1)
+                       err(2, "writing to child");
+
+               if (read(toparent[0], &c, 1) != 1)
+                       err(2, "reading from child");
+
+               if (*map)
+                       errx(0, "mmap incoherent: first byte isn't 0.");
+
+               exit(1);
+       }
+}
diff --git a/ctdb/lib/replace/test/main.c b/ctdb/lib/replace/test/main.c
new file mode 100644 (file)
index 0000000..94264d7
--- /dev/null
@@ -0,0 +1,35 @@
+/*
+   Unix SMB/CIFS implementation.
+
+   libreplace tests
+
+   Copyright (C) Jelmer Vernooij 2006
+
+     ** NOTE! The following LGPL license applies to the talloc
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "replace-testsuite.h"
+
+int main(void)
+{
+       bool ret = torture_local_replace(NULL);
+       if (ret)
+               return 0;
+       return -1;
+}
diff --git a/ctdb/lib/replace/test/os2_delete.c b/ctdb/lib/replace/test/os2_delete.c
new file mode 100644 (file)
index 0000000..a11ed3b
--- /dev/null
@@ -0,0 +1,133 @@
+/*
+  test readdir/unlink pattern that OS/2 uses
+  tridge@samba.org July 2005
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include "replace-test.h"
+
+#define NUM_FILES 700
+#define READDIR_SIZE 100
+#define DELETE_SIZE 4
+
+#define TESTDIR "test.dir"
+
+static int test_readdir_os2_delete_ret;
+
+#define FAILED(d) (printf("failure: readdir [\nFailed for %s - %d = %s\n]\n", d, errno, strerror(errno)), test_readdir_os2_delete_ret = 1, 1)
+
+#ifndef MIN
+#define MIN(a,b) ((a)<(b)?(a):(b))
+#endif
+
+#ifdef _WIN32
+#define mkdir(d,m) _mkdir(d)
+#endif
+
+static void cleanup(void)
+{
+       /* I'm a lazy bastard */
+       if (system("rm -rf " TESTDIR)) {
+               FAILED("system");
+       }
+       mkdir(TESTDIR, 0700) == 0 || FAILED("mkdir");
+}
+
+static void create_files(void)
+{
+       int i;
+       for (i=0;i<NUM_FILES;i++) {
+               char fname[40];
+               int fd;
+               snprintf(fname, sizeof(fname), TESTDIR "/test%d.txt", i);
+               fd = open(fname, O_CREAT|O_RDWR, 0600);
+               if (fd < 0) {
+                       /* no descriptor to close when open() failed */
+                       FAILED("open");
+               } else if (close(fd) != 0) {
+                       FAILED("close");
+               }
+       }
+}
+
+static int os2_delete(DIR *d)
+{
+       off_t offsets[READDIR_SIZE];
+       int i, j;
+       struct dirent *de;
+       char names[READDIR_SIZE][30];
+
+       /* scan, remembering offsets; names are copied bounded to the slot size */
+       for (i=0, de=readdir(d);
+            de && i < READDIR_SIZE;
+            de=readdir(d), i++) {
+               offsets[i] = telldir(d);
+               snprintf(names[i], sizeof(names[i]), "%s", de->d_name);
+       }
+
+       if (i == 0) {
+               return 0;
+       }
+
+       /* delete the first few */
+       for (j=0; j<MIN(i, DELETE_SIZE); j++) {
+               char fname[40];
+               snprintf(fname, sizeof(fname), TESTDIR "/%s", names[j]);
+               unlink(fname) == 0 || FAILED("unlink");
+       }
+
+       /* seek to just after the deletion */
+       seekdir(d, offsets[j-1]);
+
+       /* return number deleted */
+       return j;
+}
+
+int test_readdir_os2_delete(void)
+{
+       int total_deleted = 0;
+       DIR *d;
+       struct dirent *de;
+
+       test_readdir_os2_delete_ret = 0;
+
+       cleanup();
+       create_files();
+       /* opendir() on a regular file is expected to fail with ENOTDIR */
+       d = opendir(TESTDIR "/test0.txt");
+       if (d != NULL) FAILED("opendir() on file succeed");
+       if (errno != ENOTDIR) FAILED("opendir() on file didn't give ENOTDIR");
+
+       d = opendir(TESTDIR);
+       if (d == NULL) return FAILED("opendir"); /* FAILED() evaluates to 1 */
+       /* skip past . and .. (a NULL entry counts as a mismatch) */
+       de = readdir(d);
+       (de != NULL && strcmp(de->d_name, ".") == 0) || FAILED("match .");
+       de = readdir(d);
+       (de != NULL && strcmp(de->d_name, "..") == 0) || FAILED("match ..");
+
+       while (1) {
+               int n = os2_delete(d);
+               if (n == 0) break;
+               total_deleted += n;
+       }
+       closedir(d);
+
+       fprintf(stderr, "Deleted %d files of %d\n", total_deleted, NUM_FILES);
+
+       rmdir(TESTDIR) == 0 || FAILED("rmdir");
+
+       if (system("rm -rf " TESTDIR) == -1) {
+               FAILED("system");
+       }
+
+       return test_readdir_os2_delete_ret;
+}
diff --git a/ctdb/lib/replace/test/shared_mmap.c b/ctdb/lib/replace/test/shared_mmap.c
new file mode 100644 (file)
index 0000000..50dad8d
--- /dev/null
@@ -0,0 +1,68 @@
+/* this tests whether we can use a shared writeable mmap on a file -
+   as needed for the mmap variant of FAST_SHARE_MODES */
+
+#if defined(HAVE_UNISTD_H)
+#include <unistd.h>
+#endif
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#define DATA "conftest.mmap"
+
+#ifndef MAP_FILE
+#define MAP_FILE 0
+#endif
+
+main()
+{
+       int *buf;
+       int i; 
+       int fd = open(DATA,O_RDWR|O_CREAT|O_TRUNC,0666);
+       int count=7;
+
+       if (fd == -1) exit(1);
+
+       for (i=0;i<10000;i++) {
+               write(fd,&i,sizeof(i));
+       }
+
+       close(fd);
+
+       if (fork() == 0) {
+               fd = open(DATA,O_RDWR);
+               if (fd == -1) exit(1);
+
+               buf = (int *)mmap(NULL, 10000*sizeof(int), 
+                                  (PROT_READ | PROT_WRITE), 
+                                  MAP_FILE | MAP_SHARED, 
+                                  fd, 0);
+
+               while (count-- && buf[9124] != 55732) sleep(1);
+
+               if (count <= 0) exit(1);
+
+               buf[1763] = 7268;
+               exit(0);
+       }
+
+       fd = open(DATA,O_RDWR);
+       if (fd == -1) exit(1);
+
+       buf = (int *)mmap(NULL, 10000*sizeof(int), 
+                          (PROT_READ | PROT_WRITE), 
+                          MAP_FILE | MAP_SHARED, 
+                          fd, 0);
+
+       if (buf == (int *)-1) exit(1);
+
+       buf[9124] = 55732;
+
+       while (count-- && buf[1763] != 7268) sleep(1);
+
+       unlink(DATA);
+               
+       if (count > 0) exit(0);
+       exit(1);
+}
diff --git a/ctdb/lib/replace/test/shared_mremap.c b/ctdb/lib/replace/test/shared_mremap.c
new file mode 100644 (file)
index 0000000..05032ad
--- /dev/null
@@ -0,0 +1,48 @@
+/* this tests whether we can use mremap */
+
+#if defined(HAVE_UNISTD_H)
+#include <unistd.h>
+#endif
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#define DATA "conftest.mmap"
+
+#ifndef MAP_FILE
+#define MAP_FILE 0
+#endif
+
+#ifndef MAP_FAILED
+#define MAP_FAILED (int *)-1
+#endif
+
+main()
+{
+       int *buf;
+       int fd;
+       int err = 1;
+
+       fd = open(DATA, O_RDWR|O_CREAT|O_TRUNC, 0666);
+       if (fd == -1) {
+               exit(1);
+       }
+
+       buf = (int *)mmap(NULL, 0x1000, PROT_READ | PROT_WRITE,
+                         MAP_FILE | MAP_SHARED, fd, 0);
+       if (buf == MAP_FAILED) {
+               goto done;
+       }
+
+       buf = mremap(buf, 0x1000, 0x2000, MREMAP_MAYMOVE);
+       if (buf == MAP_FAILED) {
+               goto done;
+       }
+
+       err = 0;
+done:
+       close(fd);
+       unlink(DATA);
+       exit(err);
+}
diff --git a/ctdb/lib/replace/test/snprintf.c b/ctdb/lib/replace/test/snprintf.c
new file mode 100644 (file)
index 0000000..d06630b
--- /dev/null
@@ -0,0 +1,29 @@
+void foo(const char *format, ...)
+{
+       va_list ap;
+       int len;
+       char buf[20];
+       long long l = 1234567890;
+       l *= 100;
+
+       va_start(ap, format);
+       len = vsnprintf(buf, 0, format, ap);
+       va_end(ap);
+       if (len != 5) exit(1);
+
+       va_start(ap, format);
+       len = vsnprintf(0, 0, format, ap);
+       va_end(ap);
+       if (len != 5) exit(2);
+
+       if (snprintf(buf, 3, "hello") != 5 || strcmp(buf, "he") != 0) exit(3);
+
+       if (snprintf(buf, 20, "%lld", l) != 12 || strcmp(buf, "123456789000") != 0) exit(4);
+       if (snprintf(buf, 20, "%zu", 123456789) != 9 || strcmp(buf, "123456789") != 0) exit(5);
+       if (snprintf(buf, 20, "%2\$d %1\$d", 3, 4) != 3 || strcmp(buf, "4 3") != 0) exit(6);
+       if (snprintf(buf, 20, "%s", 0) < 3) exit(7);
+
+       printf("1");
+       exit(0);
+}
+main() { foo("hello"); }
diff --git a/ctdb/lib/replace/test/strptime.c b/ctdb/lib/replace/test/strptime.c
new file mode 100644 (file)
index 0000000..5bf03f5
--- /dev/null
@@ -0,0 +1,173 @@
+
+#ifdef LIBREPLACE_CONFIGURE_TEST_STRPTIME
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+#define true 1
+#define false 0
+
+#ifndef __STRING
+#define __STRING(x)    #x
+#endif
+
+/* make printf a no-op */
+#define printf if(0) printf
+
+#else /* LIBREPLACE_CONFIGURE_TEST_STRPTIME */
+
+#include "replace.h"
+#include "system/time.h"
+#include "replace-test.h"
+
+#endif /* LIBREPLACE_CONFIGURE_TEST_STRPTIME */
+
+int libreplace_test_strptime(void)
+{
+       const char *s = "20070414101546Z";
+       char *ret;
+       struct tm t, t2;
+
+       memset(&t, 0, sizeof(t));
+       memset(&t2, 0, sizeof(t2));
+
+       printf("test: strptime\n");
+
+       ret = strptime(s, "%Y%m%d%H%M%S", &t);
+       if ( ret == NULL ) {
+               printf("failure: strptime [\n"
+                      "returned NULL\n"
+                      "]\n");
+               return false;
+       }
+
+       if ( *ret != 'Z' ) {
+               printf("failure: strptime [\n"
+                      "ret doesn't point to 'Z'\n"
+                      "]\n");
+               return false;
+       }
+
+       ret = strptime(s, "%Y%m%d%H%M%SZ", &t2);
+       if ( ret == NULL ) {
+               printf("failure: strptime [\n"
+                      "returned NULL with Z\n"
+                      "]\n");
+               return false;
+       }
+
+       if ( *ret != '\0' ) {
+               printf("failure: strptime [\n"
+                      "ret doesn't point to '\\0'\n"
+                      "]\n");
+               return false;
+       }
+
+#define CMP_TM_ELEMENT(t1,t2,elem) \
+       if (t1.elem != t2.elem) { \
+               printf("failure: strptime [\n" \
+                      "result differs if the format string has a 'Z' at the end\n" \
+                      "element: %s %d != %d\n" \
+                      "]\n", \
+                      __STRING(elem), t1.elem, t2.elem); \
+               return false; \
+       }
+
+       CMP_TM_ELEMENT(t,t2,tm_sec);
+       CMP_TM_ELEMENT(t,t2,tm_min);
+       CMP_TM_ELEMENT(t,t2,tm_hour);
+       CMP_TM_ELEMENT(t,t2,tm_mday);
+       CMP_TM_ELEMENT(t,t2,tm_mon);
+       CMP_TM_ELEMENT(t,t2,tm_year);
+       CMP_TM_ELEMENT(t,t2,tm_wday);
+       CMP_TM_ELEMENT(t,t2,tm_yday);
+       CMP_TM_ELEMENT(t,t2,tm_isdst);
+
+       if (t.tm_sec != 46) {
+               printf("failure: strptime [\n"
+                      "tm_sec: expected: 46, got: %d\n"
+                      "]\n",
+                      t.tm_sec);
+               return false;
+       }
+
+       if (t.tm_min != 15) {
+               printf("failure: strptime [\n"
+                      "tm_min: expected: 15, got: %d\n"
+                      "]\n",
+                      t.tm_min);
+               return false;
+       }
+
+       if (t.tm_hour != 10) {
+               printf("failure: strptime [\n"
+                      "tm_hour: expected: 10, got: %d\n"
+                      "]\n",
+                      t.tm_hour);
+               return false;
+       }
+
+       if (t.tm_mday != 14) {
+               printf("failure: strptime [\n"
+                      "tm_mday: expected: 14, got: %d\n"
+                      "]\n",
+                      t.tm_mday);
+               return false;
+       }
+
+       if (t.tm_mon != 3) {
+               printf("failure: strptime [\n"
+                      "tm_mon: expected: 3, got: %d\n"
+                      "]\n",
+                      t.tm_mon);
+               return false;
+       }
+
+       if (t.tm_year != 107) {
+               printf("failure: strptime [\n"
+                      "tm_year: expected: 107, got: %d\n"
+                      "]\n",
+                      t.tm_year);
+               return false;
+       }
+
+       if (t.tm_wday != 6) { /* saturday */
+               printf("failure: strptime [\n"
+                      "tm_wday: expected: 6, got: %d\n"
+                      "]\n",
+                      t.tm_wday);
+               return false;
+       }
+
+       if (t.tm_yday != 103) {
+               printf("failure: strptime [\n"
+                      "tm_yday: expected: 103, got: %d\n"
+                      "]\n",
+                      t.tm_yday);
+               return false;
+       }
+
+       /* we don't test this as it depends on the host configuration
+       if (t.tm_isdst != 0) {
+               printf("failure: strptime [\n"
+                      "tm_isdst: expected: 0, got: %d\n"
+                      "]\n",
+                      t.tm_isdst);
+               return false;
+       }*/
+
+       printf("success: strptime\n");
+
+       return true;
+}
+
+#ifdef LIBREPLACE_CONFIGURE_TEST_STRPTIME
+int main (void)
+{
+       int ret;
+       ret = libreplace_test_strptime();
+       if (ret == false) return 1;
+       return 0;
+}
+#endif
diff --git a/ctdb/lib/replace/test/testsuite.c b/ctdb/lib/replace/test/testsuite.c
new file mode 100644 (file)
index 0000000..52629ec
--- /dev/null
@@ -0,0 +1,1101 @@
+/* 
+   Unix SMB/CIFS implementation.
+
+   libreplace tests
+
+   Copyright (C) Jelmer Vernooij 2006
+
+     ** NOTE! The following LGPL license applies to the talloc
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "replace-test.h"
+#include "replace-testsuite.h"
+
+/*
+  we include all the system/ include files here so that libreplace tests
+  them in the build farm
+*/
+#include "system/capability.h"
+#include "system/dir.h"
+#include "system/filesys.h"
+#include "system/glob.h"
+#include "system/iconv.h"
+#include "system/locale.h"
+#include "system/network.h"
+#include "system/passwd.h"
+#include "system/readline.h"
+#include "system/select.h"
+#include "system/shmem.h"
+#include "system/syslog.h"
+#include "system/terminal.h"
+#include "system/time.h"
+#include "system/wait.h"
+#include "system/aio.h"
+
+#define TESTFILE "testfile.dat"
+
+
+/*
+  test ftruncate() function
+ */
+static int test_ftruncate(void)
+{
+       struct stat st;
+       int fd;
+       const int size = 1234;
+       printf("test: ftruncate\n");
+       unlink(TESTFILE);
+       fd = open(TESTFILE, O_RDWR|O_CREAT, 0600);
+       if (fd == -1) {
+               printf("failure: ftruncate [\n"
+                          "creating '%s' failed - %s\n]\n", TESTFILE, strerror(errno));
+               return false;
+       }
+       if (ftruncate(fd, size) != 0) {
+               printf("failure: ftruncate [\n%s\n]\n", strerror(errno));
+               return false;
+       }
+       if (fstat(fd, &st) != 0) {
+               printf("failure: ftruncate [\nfstat failed - %s\n]\n", strerror(errno));
+               return false;
+       }
+       if (st.st_size != size) {
+               printf("failure: ftruncate [\ngave wrong size %d - expected %d\n]\n",
+                      (int)st.st_size, size);
+               return false;
+       }
+       unlink(TESTFILE);
+       printf("success: ftruncate\n");
+       return true;
+}
+
+/*
+  test strlcpy() function.
+  see http://www.gratisoft.us/todd/papers/strlcpy.html
+ */
+static int test_strlcpy(void)
+{
+       char buf[4];
+       const struct {
+               const char *src;
+               size_t result;
+       } tests[] = {
+               { "abc", 3 },
+               { "abcdef", 6 },
+               { "abcd", 4 },
+               { "", 0 },
+               { NULL, 0 }
+       };
+       int i;
+       printf("test: strlcpy\n");
+       for (i=0;tests[i].src;i++) {
+               if (strlcpy(buf, tests[i].src, sizeof(buf)) != tests[i].result) {
+                       printf("failure: strlcpy [\ntest %d failed\n]\n", i);
+                       return false;
+               }
+       }
+       printf("success: strlcpy\n");
+       return true;
+}
+
+static int test_strlcat(void)
+{
+       char tmp[10];
+       printf("test: strlcat\n");
+       strlcpy(tmp, "", sizeof(tmp));
+       if (strlcat(tmp, "bla", 3) != 3) {
+               printf("failure: strlcat [\ninvalid return code\n]\n");
+               return false;
+       }
+       if (strcmp(tmp, "bl") != 0) {
+               printf("failure: strlcat [\nexpected \"bl\", got \"%s\"\n]\n", 
+                          tmp);
+               return false;
+       }
+
+       strlcpy(tmp, "da", sizeof(tmp));
+       if (strlcat(tmp, "me", 4) != 4) {
+               printf("failure: strlcat [\nexpected \"dam\", got \"%s\"\n]\n",
+                          tmp);
+               return false;
+       }
+
+       printf("success: strlcat\n");
+       return true;
+}
+
+static int test_mktime(void)
+{
+       /* FIXME */
+       return true;
+}
+
+static int test_initgroups(void)
+{
+       /* FIXME */
+       return true;
+}
+
+static int test_memmove(void)
+{
+       /* FIXME */
+       return true;
+}
+
+static int test_strdup(void)
+{
+       char *x;
+       printf("test: strdup\n");
+       x = strdup("bla");
+       if (strcmp("bla", x) != 0) {
+               printf("failure: strdup [\nfailed: expected \"bla\", got \"%s\"\n]\n",
+                          x);
+               return false;
+       }
+       free(x);
+       printf("success: strdup\n");
+       return true;
+}      
+
+static int test_setlinebuf(void)
+{
+       printf("test: setlinebuf\n");
+       setlinebuf(stdout);
+       printf("success: setlinebuf\n");
+       return true;
+}
+
+static int test_vsyslog(void)
+{
+       /* FIXME */
+       return true;
+}
+
+static int test_timegm(void)
+{
+       /* FIXME */
+       return true;
+}
+
+static int test_setenv(void)
+{
+#define TEST_SETENV(key, value, overwrite, result) do { \
+       int _ret; \
+       char *_v; \
+       _ret = setenv(key, value, overwrite); \
+       if (_ret != 0) { \
+               printf("failure: setenv [\n" \
+                       "setenv(%s, %s, %d) failed\n" \
+                       "]\n", \
+                       key, value, overwrite); \
+               return false; \
+       } \
+       _v=getenv(key); \
+       if (!_v) { \
+               printf("failure: setenv [\n" \
+                       "getenv(%s) returned NULL\n" \
+                       "]\n", \
+                       key); \
+               return false; \
+       } \
+       if (strcmp(result, _v) != 0) { \
+               printf("failure: setenv [\n" \
+                       "getenv(%s): '%s' != '%s'\n" \
+                       "]\n", \
+                       key, result, _v); \
+               return false; \
+       } \
+} while(0)
+
+#define TEST_UNSETENV(key) do { \
+       char *_v; \
+       unsetenv(key); \
+       _v=getenv(key); \
+       if (_v) { \
+               printf("failure: setenv [\n" \
+                       "getenv(%s): NULL != '%s'\n" \
+                       "]\n", \
+                       key, _v); \
+               return false; \
+       } \
+} while (0)
+
+#define SETENVTEST_KEY "SETENVTESTKEY"
+#define SETENVTEST_VAL "SETENVTESTVAL"
+
+       printf("test: setenv\n");
+       TEST_SETENV(SETENVTEST_KEY, SETENVTEST_VAL"1", 0, SETENVTEST_VAL"1");
+       TEST_SETENV(SETENVTEST_KEY, SETENVTEST_VAL"2", 0, SETENVTEST_VAL"1");
+       TEST_SETENV(SETENVTEST_KEY, SETENVTEST_VAL"3", 1, SETENVTEST_VAL"3");
+       TEST_SETENV(SETENVTEST_KEY, SETENVTEST_VAL"4", 1, SETENVTEST_VAL"4");
+       TEST_UNSETENV(SETENVTEST_KEY);
+       TEST_UNSETENV(SETENVTEST_KEY);
+       TEST_SETENV(SETENVTEST_KEY, SETENVTEST_VAL"5", 0, SETENVTEST_VAL"5");
+       TEST_UNSETENV(SETENVTEST_KEY);
+       TEST_UNSETENV(SETENVTEST_KEY);
+       printf("success: setenv\n");
+       return true;
+}
+
+static int test_strndup(void)
+{
+       char *x;
+       printf("test: strndup\n");
+       x = strndup("bla", 0);
+       if (strcmp(x, "") != 0) {
+               printf("failure: strndup [\ninvalid\n]\n");
+               return false;
+       }
+       free(x);
+       x = strndup("bla", 2);
+       if (strcmp(x, "bl") != 0) {
+               printf("failure: strndup [\ninvalid\n]\n");
+               return false;
+       }
+       free(x);
+       x = strndup("bla", 10);
+       if (strcmp(x, "bla") != 0) {
+               printf("failure: strndup [\ninvalid\n]\n");
+               return false;
+       }
+       free(x);
+       printf("success: strndup\n");
+       return true;
+}
+
+static int test_strnlen(void)
+{
+       printf("test: strnlen\n");
+       if (strnlen("bla", 2) != 2) {
+               printf("failure: strnlen [\nunexpected length\n]\n");
+               return false;
+       }
+
+       if (strnlen("some text\n", 0) != 0) {
+               printf("failure: strnlen [\nunexpected length\n]\n");
+               return false;
+       }
+
+       if (strnlen("some text", 20) != 9) {
+               printf("failure: strnlen [\nunexpected length\n]\n");
+               return false;
+       }
+
+       printf("success: strnlen\n");
+       return true;
+}
+
+static int test_waitpid(void)
+{
+       /* FIXME */
+       return true;
+}
+
+static int test_seteuid(void)
+{
+       /* FIXME */
+       return true;
+}
+
+static int test_setegid(void)
+{
+       /* FIXME */
+       return true;
+}
+
+/*
+ * Validate one asprintf() result and release its buffer.
+ *
+ * len      - value returned by asprintf()
+ * str      - buffer asprintf() stored through its first argument
+ * want_len - expected return value
+ * want     - expected string contents
+ *
+ * Returns non-zero when both the length and the contents match.
+ */
+static int asprintf_result_ok(int len, char *str, int want_len, const char *want)
+{
+       int ok;
+
+       if (len < 0) {
+               /* the call failed: the buffer contents are not usable, so
+                  str is neither read nor freed */
+               return 0;
+       }
+       ok = (len == want_len) && (strcmp(str, want) == 0);
+       free(str);
+       return ok;
+}
+
+/*
+ * Check that asprintf() formats an integer and a string correctly and
+ * returns the number of characters written.
+ *
+ * Returns true on success, false on the first failing check.  Every buffer
+ * obtained from asprintf() is freed before returning.
+ */
+static int test_asprintf(void)
+{
+       char *x = NULL;
+       int len;
+
+       printf("test: asprintf\n");
+
+       len = asprintf(&x, "%d", 9);
+       if (!asprintf_result_ok(len, x, 1, "9")) {
+               printf("failure: asprintf [\ngenerate asprintf\n]\n");
+               return false;
+       }
+
+       /* the first buffer has been released; start from a clean pointer */
+       x = NULL;
+       len = asprintf(&x, "dat%s", "a");
+       if (!asprintf_result_ok(len, x, 4, "data")) {
+               printf("failure: asprintf [\ngenerate asprintf\n]\n");
+               return false;
+       }
+
+       printf("success: asprintf\n");
+       return true;
+}
+
+/*
+ * Check snprintf() truncation: formatting "foo9" into a 3 byte window must
+ * return the untruncated length (4) and store the NUL-terminated prefix "fo".
+ */
+static int test_snprintf(void)
+{
+       char tmp[10];
+       int passed = false;
+
+       printf("test: snprintf\n");
+
+       if (snprintf(tmp, 3, "foo%d", 9) != 4) {
+               printf("failure: snprintf [\nsnprintf return code failed\n]\n");
+       } else if (strcmp(tmp, "fo") != 0) {
+               printf("failure: snprintf [\nsnprintf failed\n]\n");
+       } else {
+               printf("success: snprintf\n");
+               passed = true;
+       }
+
+       return passed;
+}
+
+/* Placeholder: no vasprintf() check is implemented; always reports success. */
+static int test_vasprintf(void)
+{
+       /* FIXME: not implemented yet */
+       return true;
+}
+
+/* Placeholder: no vsnprintf() check is implemented; always reports success. */
+static int test_vsnprintf(void)
+{
+       /* FIXME: not implemented yet */
+       return true;
+}
+
+/* Placeholder: no opendir() check is implemented; always reports success. */
+static int test_opendir(void)
+{
+       /* FIXME: not implemented yet */
+       return true;
+}
+
+/*
+ * Run the readdir() check implemented by test_readdir_os2_delete().
+ * A zero result from that helper counts as success; on failure this function
+ * itself prints nothing further and returns false.
+ */
+static int test_readdir(void)
+{
+       int rc;
+
+       printf("test: readdir\n");
+
+       rc = test_readdir_os2_delete();
+       if (rc == 0) {
+               printf("success: readdir\n");
+               return true;
+       }
+
+       return false;
+}
+
+/* Placeholder: no telldir() check is implemented; always reports success. */
+static int test_telldir(void)
+{
+       /* FIXME: not implemented yet */
+       return true;
+}
+
+/* Placeholder: no seekdir() check is implemented; always reports success. */
+static int test_seekdir(void)
+{
+       /* FIXME: not implemented yet */
+       return true;
+}
+
+/* Placeholder for the dynamic-loader checks; always reports success. */
+static int test_dlopen(void)
+{
+       /* FIXME: test dlopen, dlsym, dlclose, dlerror */
+       return true;
+}
+
+
+/* Placeholder: no chroot() check is implemented; always reports success. */
+static int test_chroot(void)
+{
+       /* FIXME: chroot() */
+       return true;
+}
+
+/* Placeholder: no bzero() check is implemented; always reports success. */
+static int test_bzero(void)
+{
+       /* FIXME: bzero */
+       return true;
+}
+
+/* Placeholder: no strerror() check is implemented; always reports success. */
+static int test_strerror(void)
+{
+       /* FIXME: not implemented yet */
+       return true;
+}
+
+/*
+ * Check that errno is an assignable lvalue that reads back the value just
+ * stored in it.  Returns true when the read-back matches.
+ */
+static int test_errno(void)
+{
+       int readback_ok;
+
+       printf("test: errno\n");
+
+       errno = 3;
+       readback_ok = (errno == 3);
+
+       if (readback_ok) {
+               printf("success: errno\n");
+       } else {
+               printf("failure: errno [\nerrno failed\n]\n");
+       }
+
+       return readback_ok ? true : false;
+}
+
+/* Placeholder: no mkdtemp() check is implemented; always reports success. */
+static int test_mkdtemp(void)
+{
+       /* FIXME: not implemented yet */
+       return true;
+}
+
+/* Placeholder: no mkstemp() check is implemented; always reports success. */
+static int test_mkstemp(void)
+{
+       /* FIXME: not implemented yet */
+       return true;
+}
+
+/* Placeholder: no pread() check is implemented; always reports success. */
+static int test_pread(void)
+{
+       /* FIXME: not implemented yet */
+       return true;
+}
+
+/* Placeholder: no pwrite() check is implemented; always reports success. */
+static int test_pwrite(void)
+{
+       /* FIXME: not implemented yet */
+       return true;
+}
+
+/* Placeholder: no inet_ntoa() check is implemented; always reports success. */
+static int test_inet_ntoa(void)
+{
+       /* FIXME: not implemented yet */
+       return true;
+}
+
+/*
+ * Run one strto*()-style conversion and verify errno, result and end pointer.
+ *
+ *   type  - C type of the conversion result
+ *   fmt   - printf conversion used to print a value of that type
+ *   func  - function under test, called as func(string, &endptr, base)
+ *   str   - input string; copied into a local 64 byte buffer with strlcpy()
+ *   base  - numeric base handed to func
+ *   res   - expected return value
+ *   diff  - expected offset of the end pointer from the start of the buffer;
+ *           a negative value means the end pointer is expected to be NULL
+ *   rrnoo - expected errno after the call (errno is reset to 0 beforehand)
+ *
+ * The checks run in the order errno, value, end pointer.  On the first
+ * mismatch a "failure:" record is printed and the ENCLOSING function
+ * executes "return false", so the macro may only be used inside functions
+ * that return such a status.  It relies on __STRING and __location__, which
+ * are defined outside this part of the file.
+ */
+#define TEST_STRTO_X(type,fmt,func,str,base,res,diff,rrnoo) do {\
+       type _v; \
+       char _s[64]; \
+       char *_p = NULL;\
+       char *_ep = NULL; \
+       strlcpy(_s, str, sizeof(_s));\
+       if (diff >= 0) { \
+               _ep = &_s[diff]; \
+       } \
+       errno = 0; \
+       _v = func(_s, &_p, base); \
+       if (errno != rrnoo) { \
+               printf("failure: %s [\n" \
+                      "\t%s\n" \
+                      "\t%s(\"%s\",%d,%d): " fmt " (=/!)= " fmt "\n" \
+                      "\terrno: %d != %d\n" \
+                      "]\n", \
+                       __STRING(func), __location__, __STRING(func), \
+                      str, diff, base, res, _v, rrnoo, errno); \
+               return false; \
+       } else if (_v != res) { \
+               printf("failure: %s [\n" \
+                      "\t%s\n" \
+                      "\t%s(\"%s\",%d,%d): " fmt " != " fmt "\n" \
+                      "]\n", \
+                      __STRING(func), __location__, __STRING(func), \
+                      str, diff, base, res, _v); \
+               return false; \
+       } else if (_p != _ep) { \
+               printf("failure: %s [\n" \
+                      "\t%s\n" \
+                      "\t%s(\"%s\",%d,%d): " fmt " (=/!)= " fmt "\n" \
+                      "\tptr: %p - %p = %d != %d\n" \
+                      "]\n", \
+                      __STRING(func), __location__, __STRING(func), \
+                      str, diff, base, res, _v, _ep, _p, (int)(diff - (_ep - _p)), diff); \
+               return false; \
+       } \
+} while (0)
+
+/*
+ * Table-driven check of strtoll().
+ *
+ * Every TEST_STRTOLL row lists: input string, base, expected value, expected
+ * number of characters consumed (offset of the end pointer) and expected
+ * errno.  A failing row makes TEST_STRTO_X print a failure record and return
+ * false from this function; true is returned once every row has passed.
+ */
+static int test_strtoll(void)
+{
+       printf("test: strtoll\n");
+
+/* NOTE: this helper macro is not #undef'd and stays defined for the rest of the file */
+#define TEST_STRTOLL(str,base,res,diff,errnoo) TEST_STRTO_X(long long int, "%lld", strtoll,str,base,res,diff,errnoo)
+
+       /* decimal digits: leading blanks, explicit sign, base 10 versus base 0 auto-detection */
+       TEST_STRTOLL("15",      10,     15LL,   2, 0);
+       TEST_STRTOLL("  15",    10,     15LL,   4, 0);
+       TEST_STRTOLL("15",      0,      15LL,   2, 0);
+       TEST_STRTOLL(" 15 ",    0,      15LL,   3, 0);
+       TEST_STRTOLL("+15",     10,     15LL,   3, 0);
+       TEST_STRTOLL("  +15",   10,     15LL,   5, 0);
+       TEST_STRTOLL("+15",     0,      15LL,   3, 0);
+       TEST_STRTOLL(" +15 ",   0,      15LL,   4, 0);
+       TEST_STRTOLL("-15",     10,     -15LL,  3, 0);
+       TEST_STRTOLL("  -15",   10,     -15LL,  5, 0);
+       TEST_STRTOLL("-15",     0,      -15LL,  3, 0);
+       TEST_STRTOLL(" -15 ",   0,      -15LL,  4, 0);
+       TEST_STRTOLL("015",     10,     15LL,   3, 0);
+       TEST_STRTOLL("  015",   10,     15LL,   5, 0);
+       TEST_STRTOLL("015",     0,      13LL,   3, 0);
+       TEST_STRTOLL("  015",   0,      13LL,   5, 0);
+       TEST_STRTOLL("0x15",    10,     0LL,    1, 0);
+       TEST_STRTOLL("  0x15",  10,     0LL,    3, 0);
+       TEST_STRTOLL("0x15",    0,      21LL,   4, 0);
+       TEST_STRTOLL("  0x15",  0,      21LL,   6, 0);
+
+       /* hexadecimal, with and without the 0x prefix */
+       TEST_STRTOLL("10",      16,     16LL,   2, 0);
+       TEST_STRTOLL("  10 ",   16,     16LL,   4, 0);
+       TEST_STRTOLL("0x10",    16,     16LL,   4, 0);
+       TEST_STRTOLL("0x10",    0,      16LL,   4, 0);
+       TEST_STRTOLL(" 0x10 ",  0,      16LL,   5, 0);
+       TEST_STRTOLL("+10",     16,     16LL,   3, 0);
+       TEST_STRTOLL("  +10 ",  16,     16LL,   5, 0);
+       TEST_STRTOLL("+0x10",   16,     16LL,   5, 0);
+       TEST_STRTOLL("+0x10",   0,      16LL,   5, 0);
+       TEST_STRTOLL(" +0x10 ", 0,      16LL,   6, 0);
+       TEST_STRTOLL("-10",     16,     -16LL,  3, 0);
+       TEST_STRTOLL("  -10 ",  16,     -16LL,  5, 0);
+       TEST_STRTOLL("-0x10",   16,     -16LL,  5, 0);
+       TEST_STRTOLL("-0x10",   0,      -16LL,  5, 0);
+       TEST_STRTOLL(" -0x10 ", 0,      -16LL,  6, 0);
+       TEST_STRTOLL("010",     16,     16LL,   3, 0);
+       TEST_STRTOLL("  010 ",  16,     16LL,   5, 0);
+       TEST_STRTOLL("-010",    16,     -16LL,  4, 0);
+
+       /* octal, with and without the leading 0 */
+       TEST_STRTOLL("11",      8,      9LL,    2, 0);
+       TEST_STRTOLL("011",     8,      9LL,    3, 0);
+       TEST_STRTOLL("011",     0,      9LL,    3, 0);
+       TEST_STRTOLL("-11",     8,      -9LL,   3, 0);
+       TEST_STRTOLL("-011",    8,      -9LL,   4, 0);
+       TEST_STRTOLL("-011",    0,      -9LL,   4, 0);
+
+       /* these five rows repeat rows of the octal group above */
+       TEST_STRTOLL("011",     8,      9LL,    3, 0);
+       TEST_STRTOLL("011",     0,      9LL,    3, 0);
+       TEST_STRTOLL("-11",     8,      -9LL,   3, 0);
+       TEST_STRTOLL("-011",    8,      -9LL,   4, 0);
+       TEST_STRTOLL("-011",    0,      -9LL,   4, 0);
+
+       /* no digits at all: value 0, nothing consumed, errno untouched */
+       TEST_STRTOLL("Text",    0,      0LL,    0, 0);
+
+       /* upper limit: the largest value converts, one past it yields the maximum plus ERANGE */
+       TEST_STRTOLL("9223372036854775807",     10,     9223372036854775807LL,  19, 0);
+       TEST_STRTOLL("9223372036854775807",     0,      9223372036854775807LL,  19, 0);
+       TEST_STRTOLL("9223372036854775808",     0,      9223372036854775807LL,  19, ERANGE);
+       TEST_STRTOLL("9223372036854775808",     10,     9223372036854775807LL,  19, ERANGE);
+       TEST_STRTOLL("0x7FFFFFFFFFFFFFFF",      0,      9223372036854775807LL,  18, 0);
+       TEST_STRTOLL("0x7FFFFFFFFFFFFFFF",      16,     9223372036854775807LL,  18, 0);
+       TEST_STRTOLL("7FFFFFFFFFFFFFFF",        16,     9223372036854775807LL,  16, 0);
+       TEST_STRTOLL("0x8000000000000000",      0,      9223372036854775807LL,  18, ERANGE);
+       TEST_STRTOLL("0x8000000000000000",      16,     9223372036854775807LL,  18, ERANGE);
+       TEST_STRTOLL("80000000000000000",       16,     9223372036854775807LL,  17, ERANGE);
+       TEST_STRTOLL("0777777777777777777777",  0,      9223372036854775807LL,  22, 0);
+       TEST_STRTOLL("0777777777777777777777",  8,      9223372036854775807LL,  22, 0);
+       TEST_STRTOLL("777777777777777777777",   8,      9223372036854775807LL,  21, 0);
+       TEST_STRTOLL("01000000000000000000000", 0,      9223372036854775807LL,  23, ERANGE);
+       TEST_STRTOLL("01000000000000000000000", 8,      9223372036854775807LL,  23, ERANGE);
+       TEST_STRTOLL("1000000000000000000000",  8,      9223372036854775807LL,  22, ERANGE);
+
+       /* lower limit: the minimum is spelled "-9223372036854775807LL -1" */
+       TEST_STRTOLL("-9223372036854775808",    10,     -9223372036854775807LL -1,      20, 0);
+       TEST_STRTOLL("-9223372036854775808",    0,      -9223372036854775807LL -1,      20, 0);
+       TEST_STRTOLL("-9223372036854775809",    0,      -9223372036854775807LL -1,      20, ERANGE);
+       TEST_STRTOLL("-9223372036854775809",    10,     -9223372036854775807LL -1,      20, ERANGE);
+       TEST_STRTOLL("-0x8000000000000000",     0,      -9223372036854775807LL -1,      19, 0);
+       TEST_STRTOLL("-0x8000000000000000",     16,     -9223372036854775807LL -1,      19, 0);
+       TEST_STRTOLL("-8000000000000000",       16,     -9223372036854775807LL -1,      17, 0);
+       TEST_STRTOLL("-0x8000000000000001",     0,      -9223372036854775807LL -1,      19, ERANGE);
+       TEST_STRTOLL("-0x8000000000000001",     16,     -9223372036854775807LL -1,      19, ERANGE);
+       TEST_STRTOLL("-80000000000000001",      16,     -9223372036854775807LL -1,      18, ERANGE);
+       TEST_STRTOLL("-01000000000000000000000",0,      -9223372036854775807LL -1,      24, 0);
+       TEST_STRTOLL("-01000000000000000000000",8,      -9223372036854775807LL -1,      24, 0);
+       TEST_STRTOLL("-1000000000000000000000", 8,      -9223372036854775807LL -1,      23, 0);
+       TEST_STRTOLL("-01000000000000000000001",0,      -9223372036854775807LL -1,      24, ERANGE);
+       TEST_STRTOLL("-01000000000000000000001",8,      -9223372036854775807LL -1,      24, ERANGE);
+       TEST_STRTOLL("-1000000000000000000001", 8,      -9223372036854775807LL -1,      23, ERANGE);
+
+       printf("success: strtoll\n");
+       return true;
+}
+
+/*
+ * Table-driven check of strtoull().
+ *
+ * Every TEST_STRTOULL row lists: input string, base, expected value, expected
+ * number of characters consumed (offset of the end pointer) and expected
+ * errno.  A failing row makes TEST_STRTO_X print a failure record and return
+ * false from this function; true is returned once every row has passed.
+ */
+static int test_strtoull(void)
+{
+       printf("test: strtoull\n");
+
+/* NOTE: this helper macro is not #undef'd and stays defined for the rest of the file */
+#define TEST_STRTOULL(str,base,res,diff,errnoo) TEST_STRTO_X(long long unsigned int,"%llu",strtoull,str,base,res,diff,errnoo)
+
+       /* decimal digits; negative inputs are expected as the negated value in the unsigned type */
+       TEST_STRTOULL("15",     10,     15LLU,  2, 0);
+       TEST_STRTOULL("  15",   10,     15LLU,  4, 0);
+       TEST_STRTOULL("15",     0,      15LLU,  2, 0);
+       TEST_STRTOULL(" 15 ",   0,      15LLU,  3, 0);
+       TEST_STRTOULL("+15",    10,     15LLU,  3, 0);
+       TEST_STRTOULL("  +15",  10,     15LLU,  5, 0);
+       TEST_STRTOULL("+15",    0,      15LLU,  3, 0);
+       TEST_STRTOULL(" +15 ",  0,      15LLU,  4, 0);
+       TEST_STRTOULL("-15",    10,     18446744073709551601LLU,        3, 0);
+       TEST_STRTOULL("  -15",  10,     18446744073709551601LLU,        5, 0);
+       TEST_STRTOULL("-15",    0,      18446744073709551601LLU,        3, 0);
+       TEST_STRTOULL(" -15 ",  0,      18446744073709551601LLU,        4, 0);
+       TEST_STRTOULL("015",    10,     15LLU,  3, 0);
+       TEST_STRTOULL("  015",  10,     15LLU,  5, 0);
+       TEST_STRTOULL("015",    0,      13LLU,  3, 0);
+       TEST_STRTOULL("  015",  0,      13LLU,  5, 0);
+       TEST_STRTOULL("0x15",   10,     0LLU,   1, 0);
+       TEST_STRTOULL("  0x15", 10,     0LLU,   3, 0);
+       TEST_STRTOULL("0x15",   0,      21LLU,  4, 0);
+       TEST_STRTOULL("  0x15", 0,      21LLU,  6, 0);
+
+       /* hexadecimal, with and without the 0x prefix */
+       TEST_STRTOULL("10",     16,     16LLU,  2, 0);
+       TEST_STRTOULL("  10 ",  16,     16LLU,  4, 0);
+       TEST_STRTOULL("0x10",   16,     16LLU,  4, 0);
+       TEST_STRTOULL("0x10",   0,      16LLU,  4, 0);
+       TEST_STRTOULL(" 0x10 ", 0,      16LLU,  5, 0);
+       TEST_STRTOULL("+10",    16,     16LLU,  3, 0);
+       TEST_STRTOULL("  +10 ", 16,     16LLU,  5, 0);
+       TEST_STRTOULL("+0x10",  16,     16LLU,  5, 0);
+       TEST_STRTOULL("+0x10",  0,      16LLU,  5, 0);
+       TEST_STRTOULL(" +0x10 ",        0,      16LLU,  6, 0);
+       TEST_STRTOULL("-10",    16,     -16LLU, 3, 0);
+       TEST_STRTOULL("  -10 ", 16,     -16LLU, 5, 0);
+       TEST_STRTOULL("-0x10",  16,     -16LLU, 5, 0);
+       TEST_STRTOULL("-0x10",  0,      -16LLU, 5, 0);
+       TEST_STRTOULL(" -0x10 ",        0,      -16LLU, 6, 0);
+       TEST_STRTOULL("010",    16,     16LLU,  3, 0);
+       TEST_STRTOULL("  010 ", 16,     16LLU,  5, 0);
+       TEST_STRTOULL("-010",   16,     -16LLU, 4, 0);
+
+       /* octal, with and without the leading 0 */
+       TEST_STRTOULL("11",     8,      9LLU,   2, 0);
+       TEST_STRTOULL("011",    8,      9LLU,   3, 0);
+       TEST_STRTOULL("011",    0,      9LLU,   3, 0);
+       TEST_STRTOULL("-11",    8,      -9LLU,  3, 0);
+       TEST_STRTOULL("-011",   8,      -9LLU,  4, 0);
+       TEST_STRTOULL("-011",   0,      -9LLU,  4, 0);
+
+       /* these five rows repeat rows of the octal group above */
+       TEST_STRTOULL("011",    8,      9LLU,   3, 0);
+       TEST_STRTOULL("011",    0,      9LLU,   3, 0);
+       TEST_STRTOULL("-11",    8,      -9LLU,  3, 0);
+       TEST_STRTOULL("-011",   8,      -9LLU,  4, 0);
+       TEST_STRTOULL("-011",   0,      -9LLU,  4, 0);
+
+       /* no digits at all: value 0, nothing consumed, errno untouched */
+       TEST_STRTOULL("Text",   0,      0LLU,   0, 0);
+
+       /* values around the signed 64 bit maximum: all representable, no error */
+       TEST_STRTOULL("9223372036854775807",    10,     9223372036854775807LLU, 19, 0);
+       TEST_STRTOULL("9223372036854775807",    0,      9223372036854775807LLU, 19, 0);
+       TEST_STRTOULL("9223372036854775808",    0,      9223372036854775808LLU, 19, 0);
+       TEST_STRTOULL("9223372036854775808",    10,     9223372036854775808LLU, 19, 0);
+       TEST_STRTOULL("0x7FFFFFFFFFFFFFFF",     0,      9223372036854775807LLU, 18, 0);
+       TEST_STRTOULL("0x7FFFFFFFFFFFFFFF",     16,     9223372036854775807LLU, 18, 0);
+       TEST_STRTOULL("7FFFFFFFFFFFFFFF",       16,     9223372036854775807LLU, 16, 0);
+       TEST_STRTOULL("0x8000000000000000",     0,      9223372036854775808LLU, 18, 0);
+       TEST_STRTOULL("0x8000000000000000",     16,     9223372036854775808LLU, 18, 0);
+       TEST_STRTOULL("8000000000000000",       16,     9223372036854775808LLU, 16, 0);
+       TEST_STRTOULL("0777777777777777777777", 0,      9223372036854775807LLU, 22, 0);
+       TEST_STRTOULL("0777777777777777777777", 8,      9223372036854775807LLU, 22, 0);
+       TEST_STRTOULL("777777777777777777777",  8,      9223372036854775807LLU, 21, 0);
+       TEST_STRTOULL("01000000000000000000000",0,      9223372036854775808LLU, 23, 0);
+       TEST_STRTOULL("01000000000000000000000",8,      9223372036854775808LLU, 23, 0);
+       TEST_STRTOULL("1000000000000000000000", 8,      9223372036854775808LLU, 22, 0);
+
+       /* negated values around the signed 64 bit minimum */
+       TEST_STRTOULL("-9223372036854775808",   10,     9223372036854775808LLU, 20, 0);
+       TEST_STRTOULL("-9223372036854775808",   0,      9223372036854775808LLU, 20, 0);
+       TEST_STRTOULL("-9223372036854775809",   0,      9223372036854775807LLU, 20, 0);
+       TEST_STRTOULL("-9223372036854775809",   10,     9223372036854775807LLU, 20, 0);
+       TEST_STRTOULL("-0x8000000000000000",    0,      9223372036854775808LLU, 19, 0);
+       TEST_STRTOULL("-0x8000000000000000",    16,     9223372036854775808LLU, 19, 0);
+       TEST_STRTOULL("-8000000000000000",      16,     9223372036854775808LLU, 17, 0);
+       TEST_STRTOULL("-0x8000000000000001",    0,      9223372036854775807LLU, 19, 0);
+       TEST_STRTOULL("-0x8000000000000001",    16,     9223372036854775807LLU, 19, 0);
+       TEST_STRTOULL("-8000000000000001",      16,     9223372036854775807LLU, 17, 0);
+       TEST_STRTOULL("-01000000000000000000000",0,     9223372036854775808LLU, 24, 0);
+       TEST_STRTOULL("-01000000000000000000000",8,     9223372036854775808LLU, 24, 0);
+       TEST_STRTOULL("-1000000000000000000000",8,      9223372036854775808LLU, 23, 0);
+       TEST_STRTOULL("-01000000000000000000001",0,     9223372036854775807LLU, 24, 0);
+       TEST_STRTOULL("-01000000000000000000001",8,     9223372036854775807LLU, 24, 0);
+       TEST_STRTOULL("-1000000000000000000001",8,      9223372036854775807LLU, 23, 0);
+
+       /* upper limit: the largest value converts, one past it yields the maximum plus ERANGE */
+       TEST_STRTOULL("18446744073709551615",   0,      18446744073709551615LLU,        20, 0);
+       TEST_STRTOULL("18446744073709551615",   10,     18446744073709551615LLU,        20, 0);
+       TEST_STRTOULL("18446744073709551616",   0,      18446744073709551615LLU,        20, ERANGE);
+       TEST_STRTOULL("18446744073709551616",   10,     18446744073709551615LLU,        20, ERANGE);
+       TEST_STRTOULL("0xFFFFFFFFFFFFFFFF",     0,      18446744073709551615LLU,        18, 0);
+       TEST_STRTOULL("0xFFFFFFFFFFFFFFFF",     16,     18446744073709551615LLU,        18, 0);
+       TEST_STRTOULL("FFFFFFFFFFFFFFFF",       16,     18446744073709551615LLU,        16, 0);
+       TEST_STRTOULL("0x10000000000000000",    0,      18446744073709551615LLU,        19, ERANGE);
+       TEST_STRTOULL("0x10000000000000000",    16,     18446744073709551615LLU,        19, ERANGE);
+       TEST_STRTOULL("10000000000000000",      16,     18446744073709551615LLU,        17, ERANGE);
+       TEST_STRTOULL("01777777777777777777777",0,      18446744073709551615LLU,        23, 0);
+       TEST_STRTOULL("01777777777777777777777",8,      18446744073709551615LLU,        23, 0);
+       TEST_STRTOULL("1777777777777777777777", 8,      18446744073709551615LLU,        22, 0);
+       TEST_STRTOULL("02000000000000000000000",0,      18446744073709551615LLU,        23, ERANGE);
+       TEST_STRTOULL("02000000000000000000000",8,      18446744073709551615LLU,        23, ERANGE);
+       TEST_STRTOULL("2000000000000000000000", 8,      18446744073709551615LLU,        22, ERANGE);
+
+       /* the same limits with a leading minus sign */
+       TEST_STRTOULL("-18446744073709551615",  0,      1LLU,                           21, 0);
+       TEST_STRTOULL("-18446744073709551615",  10,     1LLU,                           21, 0);
+       TEST_STRTOULL("-18446744073709551616",  0,      18446744073709551615LLU,        21, ERANGE);
+       TEST_STRTOULL("-18446744073709551616",  10,     18446744073709551615LLU,        21, ERANGE);
+       TEST_STRTOULL("-0xFFFFFFFFFFFFFFFF",    0,      1LLU,                           19, 0);
+       TEST_STRTOULL("-0xFFFFFFFFFFFFFFFF",    16,     1LLU,                           19, 0);
+       TEST_STRTOULL("-FFFFFFFFFFFFFFFF",      16,     1LLU,                           17, 0);
+       TEST_STRTOULL("-0x10000000000000000",   0,      18446744073709551615LLU,        20, ERANGE);
+       TEST_STRTOULL("-0x10000000000000000",   16,     18446744073709551615LLU,        20, ERANGE);
+       TEST_STRTOULL("-10000000000000000",     16,     18446744073709551615LLU,        18, ERANGE);
+       TEST_STRTOULL("-01777777777777777777777",0,     1LLU,                           24, 0);
+       TEST_STRTOULL("-01777777777777777777777",8,     1LLU,                           24, 0);
+       TEST_STRTOULL("-1777777777777777777777",8,      1LLU,                           23, 0);
+       TEST_STRTOULL("-02000000000000000000000",0,     18446744073709551615LLU,        24, ERANGE);
+       TEST_STRTOULL("-02000000000000000000000",8,     18446744073709551615LLU,        24, ERANGE);
+       TEST_STRTOULL("-2000000000000000000000",8,      18446744073709551615LLU,        23, ERANGE);
+
+       printf("success: strtoull\n");
+       return true;
+}
+
+/* 
+FIXME:
+Types:
+bool
+socklen_t
+uint{8,16,32,64}_t
+int{8,16,32,64}_t
+intptr_t
+
+Constants:
+PATH_NAME_MAX
+UINT{16,32,64}_MAX
+INT32_MAX
+*/
+
+/* Placeholder: no va_copy check is implemented; always reports success. */
+static int test_va_copy(void)
+{
+       /* FIXME: not implemented yet */
+       return true;
+}
+
+/*
+ * Check that __FUNCTION__ expands to the name of the enclosing function.
+ * Returns true when it reads "test_FUNCTION".
+ */
+static int test_FUNCTION(void)
+{
+       const char *reported = __FUNCTION__;
+
+       printf("test: FUNCTION\n");
+
+       if (strcmp(reported, "test_FUNCTION") == 0) {
+               printf("success: FUNCTION\n");
+               return true;
+       }
+
+       printf("failure: FUNCTION [\nFUNCTION invalid\n]\n");
+       return false;
+}
+
+/*
+ * Check the MIN() macro with its arguments in both orders.
+ * Returns true when the smaller value is selected each time.
+ */
+static int test_MIN(void)
+{
+       printf("test: MIN\n");
+
+       /* both orders share one failure message, so they are checked together */
+       if (MIN(20, 1) == 1 && MIN(1, 20) == 1) {
+               printf("success: MIN\n");
+               return true;
+       }
+
+       printf("failure: MIN [\nMIN invalid\n]\n");
+       return false;
+}
+
+/*
+ * Check the MAX() macro with its arguments in both orders.
+ * Returns true when the larger value is selected each time.
+ */
+static int test_MAX(void)
+{
+       printf("test: MAX\n");
+
+       /* both orders share one failure message, so they are checked together */
+       if (MAX(20, 1) == 20 && MAX(1, 20) == 20) {
+               printf("success: MAX\n");
+               return true;
+       }
+
+       printf("failure: MAX [\nMAX invalid\n]\n");
+       return false;
+}
+
+/*
+ * Check socketpair(): create a connected AF_UNIX stream pair, send a short
+ * NUL-terminated message through one end and read it back from the other.
+ *
+ * Returns true when the message arrives intact, false otherwise.  Both
+ * descriptors are closed on every path once the pair has been created.
+ */
+static int test_socketpair(void)
+{
+       static const char msg[] = "automatisch";
+       int sock[2];
+       char buf[20];
+       int ok = false;
+
+       printf("test: socketpair\n");
+
+       if (socketpair(AF_UNIX, SOCK_STREAM, 0, sock) == -1) {
+               printf("failure: socketpair [\n"
+                          "socketpair() failed\n"
+                          "]\n");
+               return false;
+       }
+
+       /* sizeof(msg) is 12: the eleven characters plus the terminating NUL */
+       if (write(sock[1], msg, sizeof(msg)) == -1) {
+               printf("failure: socketpair [\n"
+                          "write() failed: %s\n"
+                          "]\n", strerror(errno));
+               goto done;
+       }
+
+       /* pre-zero the buffer so that a short read still leaves a terminated
+          string for the comparison and the diagnostic below */
+       memset(buf, 0, sizeof(buf));
+       if (read(sock[0], buf, sizeof(msg)) == -1) {
+               printf("failure: socketpair [\n"
+                          "read() failed: %s\n"
+                          "]\n", strerror(errno));
+               goto done;
+       }
+
+       if (strcmp(buf, msg) != 0) {
+               printf("failure: socketpair [\n"
+                          "expected: automatisch, got: %s\n"
+                          "]\n", buf);
+               goto done;
+       }
+
+       printf("success: socketpair\n");
+       ok = true;
+
+done:
+       close(sock[0]);
+       close(sock[1]);
+       return ok;
+}
+
+/* strptime() check; implemented outside this part of the file */
+extern int libreplace_test_strptime(void);
+
+/*
+ * Run the strptime() check by delegating to libreplace_test_strptime() and
+ * return that function's result unchanged.
+ */
+static int test_strptime(void)
+{
+       return libreplace_test_strptime();
+}
+
+/* getifaddrs() check; implemented outside this part of the file */
+extern int getifaddrs_test(void);
+
+/*
+ * Run getifaddrs_test() and translate its result: zero means success,
+ * anything else is reported as a failure.
+ */
+static int test_getifaddrs(void)
+{
+       int rc;
+
+       printf("test: getifaddrs\n");
+
+       rc = getifaddrs_test();
+       if (rc == 0) {
+               printf("success: getifaddrs\n");
+               return true;
+       }
+
+       printf("failure: getifaddrs\n");
+       return false;
+}
+
+static int test_utime(void)
+{
+       struct utimbuf u;
+       struct stat st1, st2, st3;
+       int fd;
+
+       printf("test: utime\n");
+       unlink(TESTFILE);
+
+       fd = open(TESTFILE, O_RDWR|O_CREAT, 0600);
+       if (fd == -1) {
+               printf("failure: utime [\n"
+                      "creating '%s' failed - %s\n]\n",
+                      TESTFILE, strerror(errno));
+               return false;
+       }
+
+       if (fstat(fd, &st1) != 0) {
+               printf("failure: utime [\n"
+                      "fstat (1) failed - %s\n]\n",
+                      strerror(errno));
+               return false;
+       }
+
+       u.actime = st1.st_atime + 300;
+       u.modtime = st1.st_mtime - 300;
+       if (utime(TESTFILE, &u) != 0) {
+               printf("failure: utime [\n"
+                      "utime(&u) failed - %s\n]\n",
+                      strerror(errno));
+               return false;
+       }
+
+       if (fstat(fd, &st2) != 0) {
+               printf("failure: utime [\n"
+                      "fstat (2) failed - %s\n]\n",
+                      strerror(errno));
+               return false;
+       }
+
+       if (utime(TESTFILE, NULL) != 0) {
+               printf("failure: utime [\n"
+                      "utime(NULL) failed - %s\n]\n",
+                      strerror(errno));
+               return false;
+       }
+
+       if (fstat(fd, &st3) != 0) {
+               printf("failure: utime [\n"
+                      "fstat (3) failed - %s\n]\n",
+                      strerror(errno));
+               return false;
+       }
+
+#define CMP_VAL(a,c,b) do { \
+       if (a c b) { \
+               printf("failure: utime [\n" \
+                      "%s: %s(%d) %s %s(%d)\n]\n", \
+                      __location__, \
+                      #a, (int)a, #c, #b, (int)b); \
+               return false; \
+       } \
+} while(0)
+#define EQUAL_VAL(a,b) CMP_VAL(a,!=,b)
+#define GREATER_VAL(a,b) CMP_VAL(a,<=,b)
+#define LESSER_VAL(a,b) CMP_VAL(a,>=,b)
+
+       EQUAL_VAL(st2.st_atime, st1.st_atime + 300);
+       EQUAL_VAL(st2.st_mtime, st1.st_mtime - 300);
+       LESSER_VAL(st3.st_atime, st2.st_atime);
+       GREATER_VAL(st3.st_mtime, st2.st_mtime);
+
+#undef CMP_VAL
+#undef EQUAL_VAL
+#undef GREATER_VAL
+#undef LESSER_VAL
+
+       unlink(TESTFILE);
+       printf("success: utime\n");
+       return true;
+}
+
+static int test_utimes(void)
+{
+       struct timeval tv[2];
+       struct stat st1, st2;
+       int fd;
+
+       printf("test: utimes\n");
+       unlink(TESTFILE);
+
+       fd = open(TESTFILE, O_RDWR|O_CREAT, 0600);
+       if (fd == -1) {
+               printf("failure: utimes [\n"
+                      "creating '%s' failed - %s\n]\n",
+                      TESTFILE, strerror(errno));
+               return false;
+       }
+
+       if (fstat(fd, &st1) != 0) {
+               printf("failure: utimes [\n"
+                      "fstat (1) failed - %s\n]\n",
+                      strerror(errno));
+               return false;
+       }
+
+       ZERO_STRUCT(tv);
+       tv[0].tv_sec = st1.st_atime + 300;
+       tv[1].tv_sec = st1.st_mtime - 300;
+       if (utimes(TESTFILE, tv) != 0) {
+               printf("failure: utimes [\n"
+                      "utimes(tv) failed - %s\n]\n",
+                      strerror(errno));
+               return false;
+       }
+
+       if (fstat(fd, &st2) != 0) {
+               printf("failure: utimes [\n"
+                      "fstat (2) failed - %s\n]\n",
+                      strerror(errno));
+               return false;
+       }
+
+#define EQUAL_VAL(a,b) do { \
+       if (a != b) { \
+               printf("failure: utimes [\n" \
+                      "%s: %s(%d) != %s(%d)\n]\n", \
+                      __location__, \
+                      #a, (int)a, #b, (int)b); \
+               return false; \
+       } \
+} while(0)
+
+       EQUAL_VAL(st2.st_atime, st1.st_atime + 300);
+       EQUAL_VAL(st2.st_mtime, st1.st_mtime - 300);
+
+#undef EQUAL_VAL
+
+       unlink(TESTFILE);
+       printf("success: utimes\n");
+       return true;
+}
+
+/*
+ * Check memmem(): a match at the start, an empty needle, a match in the
+ * middle and a match that begins after a partial match of the needle.
+ *
+ * Returns true on success, false on the first failing case.  A NULL result
+ * where a match is required is reported as a failure instead of being
+ * passed on to strcmp().
+ */
+static int test_memmem(void)
+{
+       char *s;
+
+       printf("test: memmem\n");
+
+       s = (char *)memmem("foo", 3, "fo", 2);
+       if (s == NULL || strcmp(s, "foo") != 0) {
+               printf(__location__ ": Failed memmem\n");
+               return false;
+       }
+
+       s = (char *)memmem("foo", 3, "", 0);
+       /* it is allowable for this to return NULL (as happens on
+          FreeBSD) */
+       if (s && strcmp(s, "foo") != 0) {
+               printf(__location__ ": Failed memmem\n");
+               return false;
+       }
+
+       /* the haystack length of 4 includes the terminating NUL */
+       s = (char *)memmem("foo", 4, "o", 1);
+       if (s == NULL || strcmp(s, "oo") != 0) {
+               printf(__location__ ": Failed memmem\n");
+               return false;
+       }
+
+       /* "fo" at offset 0 is only a partial match; the full needle starts later */
+       s = (char *)memmem("foobarfodx", 11, "fod", 3);
+       if (s == NULL || strcmp(s, "fodx") != 0) {
+               printf(__location__ ": Failed memmem\n");
+               return false;
+       }
+
+       printf("success: memmem\n");
+
+       return true;
+}
+
+
+bool torture_local_replace(struct torture_context *ctx)
+{
+       bool ret = true;
+       ret &= test_ftruncate();
+       ret &= test_strlcpy();
+       ret &= test_strlcat();
+       ret &= test_mktime();
+       ret &= test_initgroups();
+       ret &= test_memmove();
+       ret &= test_strdup();
+       ret &= test_setlinebuf();
+       ret &= test_vsyslog();
+       ret &= test_timegm();
+       ret &= test_setenv();
+       ret &= test_strndup();
+       ret &= test_strnlen();
+       ret &= test_waitpid();
+       ret &= test_seteuid();
+       ret &= test_setegid();
+       ret &= test_asprintf();
+       ret &= test_snprintf();
+       ret &= test_vasprintf();
+       ret &= test_vsnprintf();
+       ret &= test_opendir();
+       ret &= test_readdir();
+       ret &= test_telldir();
+       ret &= test_seekdir();
+       ret &= test_dlopen();
+       ret &= test_chroot();
+       ret &= test_bzero();
+       ret &= test_strerror();
+       ret &= test_errno();
+       ret &= test_mkdtemp();
+       ret &= test_mkstemp();
+       ret &= test_pread();
+       ret &= test_pwrite();
+       ret &= test_inet_ntoa();
+       ret &= test_strtoll();
+       ret &= test_strtoull();
+       ret &= test_va_copy();
+       ret &= test_FUNCTION();
+       ret &= test_MIN();
+       ret &= test_MAX();
+       ret &= test_socketpair();
+       ret &= test_strptime();
+       ret &= test_getifaddrs();
+       ret &= test_utime();
+       ret &= test_utimes();
+       ret &= test_memmem();
+
+       return ret;
+}
diff --git a/ctdb/lib/replace/timegm.c b/ctdb/lib/replace/timegm.c
new file mode 100644 (file)
index 0000000..395c684
--- /dev/null
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 1997 Kungliga Tekniska Högskolan
+ * (Royal Institute of Technology, Stockholm, Sweden). 
+ * All rights reserved. 
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions 
+ * are met: 
+ *
+ * 1. Redistributions of source code must retain the above copyright 
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright 
+ *    notice, this list of conditions and the following disclaimer in the 
+ *    documentation and/or other materials provided with the distribution. 
+ *
+ * 3. Neither the name of the Institute nor the names of its contributors 
+ *    may be used to endorse or promote products derived from this software 
+ *    without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND 
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE 
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 
+ * SUCH DAMAGE. 
+ */
+
+/*
+  adapted for Samba4 by Andrew Tridgell
+*/
+
+#include "replace.h"
+#include "system/time.h"
+
+/*
+ * Return non-zero when the given year is a leap year in the Gregorian
+ * calendar.  y uses the struct tm convention: years since 1900.  The
+ * parameter is signed and wide so that years before 1900 and very large
+ * tm_year values are handled without wrap-around.
+ */
+static int is_leap(long long y)
+{
+       y += 1900;
+       return (y % 4) == 0 && ((y % 100) != 0 || (y % 400) == 0);
+}
+
+/*
+ * Replacement for timegm(): convert a broken-down UTC time to seconds since
+ * 1970-01-01 00:00:00.
+ *
+ * Only tm_year, tm_mon, tm_mday, tm_hour, tm_min and tm_sec are read; the
+ * structure is not modified.  Returns 0 when a field fails the range check
+ * below.  Dates before 1970 produce a negative result.
+ */
+time_t rep_timegm(struct tm *tm)
+{
+       static const int ndays[2][12] ={
+               {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
+               {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}};
+       time_t res = 0;
+       int i;
+
+       if (tm->tm_mon > 12 ||
+           tm->tm_mon < 0 ||
+           tm->tm_mday > 31 ||
+           tm->tm_min > 60 ||
+           tm->tm_sec > 60 ||
+           tm->tm_hour > 24) {
+               /* invalid tm structure */
+               return 0;
+       }
+
+       /*
+        * Whole years between the epoch and tm_year, counted in days.  The
+        * index is signed: with an unsigned index a negative tm_year was
+        * converted to a huge value and the loop ran for billions of rounds.
+        */
+       if (tm->tm_year >= 70) {
+               for (i = 70; i < tm->tm_year; ++i)
+                       res += is_leap(i) ? 366 : 365;
+       } else {
+               /* before the epoch: step backwards from 1970 */
+               for (i = tm->tm_year; i < 70; ++i)
+                       res -= is_leap(i) ? 366 : 365;
+       }
+
+       /* whole months of the target year, then days, hours, minutes, seconds */
+       for (i = 0; i < tm->tm_mon; ++i)
+               res += ndays[is_leap(tm->tm_year)][i];
+       res += tm->tm_mday - 1;
+       res *= 24;
+       res += tm->tm_hour;
+       res *= 60;
+       res += tm->tm_min;
+       res *= 60;
+       res += tm->tm_sec;
+       return res;
+}
diff --git a/ctdb/lib/replace/timegm.m4 b/ctdb/lib/replace/timegm.m4
new file mode 100644 (file)
index 0000000..9b76d0c
--- /dev/null
@@ -0,0 +1 @@
+# When the C library has no timegm, append the bundled timegm.o to the
+# list of libreplace objects.
+AC_CHECK_FUNCS(timegm,[],[LIBREPLACEOBJ="${LIBREPLACEOBJ} $libreplacedir/timegm.o"])
diff --git a/ctdb/lib/replace/win32.m4 b/ctdb/lib/replace/win32.m4
new file mode 100644 (file)
index 0000000..eb364e2
--- /dev/null
@@ -0,0 +1,20 @@
+AC_CHECK_HEADERS(direct.h windows.h winsock2.h ws2tcpip.h)
+
+#######################################
+# Check for mkdir mode
+#
+# The probe compiles a two-argument mkdir call. POSIX declares mkdir in
+# sys/stat.h, so that header is included when available. Without the
+# declaration a compiler that rejects implicit function declarations
+# fails the probe even though mkdir accepts a mode.
+# NOTE(review): HAVE_SYS_TYPES_H and HAVE_SYS_STAT_H are expected to come
+# from the default header checks autoconf runs for AC_CHECK_HEADERS.
+# Confirm with the autoconf version in use.
+AC_CACHE_CHECK( [whether mkdir supports mode], libreplace_cv_mkdir_has_mode,
+       AC_TRY_COMPILE([
+               #include <stdio.h>
+               #ifdef HAVE_SYS_TYPES_H
+               #include <sys/types.h>
+               #endif
+               #ifdef HAVE_SYS_STAT_H
+               #include <sys/stat.h>
+               #endif
+               #ifdef HAVE_DIRECT_H
+               #include <direct.h>
+               #endif],[
+                       mkdir("foo",0777);
+                       return 0;
+       ],
+    libreplace_cv_mkdir_has_mode="yes",
+    libreplace_cv_mkdir_has_mode="no") )
+
+# Publish the cached result as a config.h define
+if test "$libreplace_cv_mkdir_has_mode" = "yes"
+then
+    AC_DEFINE(HAVE_MKDIR_MODE, 1, [Define if target mkdir supports mode option])
+fi
diff --git a/ctdb/lib/replace/win32_replace.h b/ctdb/lib/replace/win32_replace.h
new file mode 100644 (file)
index 0000000..9901e72
--- /dev/null
@@ -0,0 +1,159 @@
+#ifndef _WIN32_REPLACE_H
+#define _WIN32_REPLACE_H
+
+#ifdef HAVE_WINSOCK2_H
+#include <winsock2.h>
+#endif
+
+#ifdef HAVE_WS2TCPIP_H
+#include <ws2tcpip.h>
+#endif
+
+#ifdef HAVE_WINDOWS_H
+#include <windows.h>
+#endif
+
+/*
+ * Map BSD socket error codes to the Winsock WSA* error codes (where a
+ * WSA counterpart is used), so code written against the BSD names
+ * compiles unchanged.
+ */
+
+#define EAFNOSUPPORT   WSAEAFNOSUPPORT
+#define ECONNREFUSED    WSAECONNREFUSED 
+#define EINPROGRESS    WSAEINPROGRESS
+#define EMSGSIZE       WSAEMSGSIZE 
+#define ENOBUFS         WSAENOBUFS
+#define ENOTSOCK       WSAENOTSOCK
+#define ENETUNREACH    WSAENETUNREACH
+#define ENOPROTOOPT    WSAENOPROTOOPT
+#define ENOTCONN       WSAENOTCONN 
+/*
+ * ENOTSUP is given a literal number instead of a WSA constant.
+ * NOTE(review): confirm 134 does not clash with a value already
+ * provided by the toolchain's errno.h.
+ */
+#define ENOTSUP                134 
+
+/* We undefine the following constants due to conflicts with the w32api headers
+ * and the Windows Platform SDK/DDK.
+ */
+
+#undef interface
+
+#undef ERROR_INVALID_PARAMETER
+#undef ERROR_INSUFFICIENT_BUFFER
+#undef ERROR_INVALID_DATATYPE
+
+#undef FILE_GENERIC_READ
+#undef FILE_GENERIC_WRITE
+#undef FILE_GENERIC_EXECUTE
+#undef FILE_ATTRIBUTE_READONLY
+#undef FILE_ATTRIBUTE_HIDDEN
+#undef FILE_ATTRIBUTE_SYSTEM
+#undef FILE_ATTRIBUTE_DIRECTORY
+#undef FILE_ATTRIBUTE_ARCHIVE
+#undef FILE_ATTRIBUTE_DEVICE
+#undef FILE_ATTRIBUTE_NORMAL
+#undef FILE_ATTRIBUTE_TEMPORARY
+#undef FILE_ATTRIBUTE_REPARSE_POINT
+#undef FILE_ATTRIBUTE_COMPRESSED
+#undef FILE_ATTRIBUTE_OFFLINE
+#undef FILE_ATTRIBUTE_ENCRYPTED
+#undef FILE_FLAG_WRITE_THROUGH
+#undef FILE_FLAG_NO_BUFFERING
+#undef FILE_FLAG_RANDOM_ACCESS
+#undef FILE_FLAG_SEQUENTIAL_SCAN
+#undef FILE_FLAG_DELETE_ON_CLOSE
+#undef FILE_FLAG_BACKUP_SEMANTICS
+#undef FILE_FLAG_POSIX_SEMANTICS
+#undef FILE_TYPE_DISK
+#undef FILE_TYPE_UNKNOWN
+#undef FILE_CASE_SENSITIVE_SEARCH
+#undef FILE_CASE_PRESERVED_NAMES
+#undef FILE_UNICODE_ON_DISK
+#undef FILE_PERSISTENT_ACLS
+#undef FILE_FILE_COMPRESSION
+#undef FILE_VOLUME_QUOTAS
+#undef FILE_VOLUME_IS_COMPRESSED
+#undef FILE_NOTIFY_CHANGE_FILE_NAME
+#undef FILE_NOTIFY_CHANGE_DIR_NAME
+#undef FILE_NOTIFY_CHANGE_ATTRIBUTES
+#undef FILE_NOTIFY_CHANGE_SIZE
+#undef FILE_NOTIFY_CHANGE_LAST_WRITE
+#undef FILE_NOTIFY_CHANGE_LAST_ACCESS
+#undef FILE_NOTIFY_CHANGE_CREATION
+#undef FILE_NOTIFY_CHANGE_EA
+#undef FILE_NOTIFY_CHANGE_SECURITY
+#undef FILE_NOTIFY_CHANGE_STREAM_NAME
+#undef FILE_NOTIFY_CHANGE_STREAM_SIZE
+#undef FILE_NOTIFY_CHANGE_STREAM_WRITE
+#undef FILE_NOTIFY_CHANGE_NAME
+
+#undef PRINTER_ATTRIBUTE_QUEUED
+#undef PRINTER_ATTRIBUTE_DIRECT
+#undef PRINTER_ATTRIBUTE_DEFAULT
+#undef PRINTER_ATTRIBUTE_SHARED
+#undef PRINTER_ATTRIBUTE_NETWORK
+#undef PRINTER_ATTRIBUTE_HIDDEN
+#undef PRINTER_ATTRIBUTE_LOCAL
+#undef PRINTER_ATTRIBUTE_ENABLE_DEVQ
+#undef PRINTER_ATTRIBUTE_KEEPPRINTEDJOBS
+#undef PRINTER_ATTRIBUTE_DO_COMPLETE_FIRST
+#undef PRINTER_ATTRIBUTE_WORK_OFFLINE
+#undef PRINTER_ATTRIBUTE_ENABLE_BIDI
+#undef PRINTER_ATTRIBUTE_RAW_ONLY
+#undef PRINTER_ATTRIBUTE_PUBLISHED
+#undef PRINTER_ENUM_DEFAULT
+#undef PRINTER_ENUM_LOCAL
+#undef PRINTER_ENUM_CONNECTIONS
+#undef PRINTER_ENUM_FAVORITE
+#undef PRINTER_ENUM_NAME
+#undef PRINTER_ENUM_REMOTE
+#undef PRINTER_ENUM_SHARED
+#undef PRINTER_ENUM_NETWORK
+#undef PRINTER_ENUM_EXPAND
+#undef PRINTER_ENUM_CONTAINER
+#undef PRINTER_ENUM_ICON1
+#undef PRINTER_ENUM_ICON2
+#undef PRINTER_ENUM_ICON3
+#undef PRINTER_ENUM_ICON4
+#undef PRINTER_ENUM_ICON5
+#undef PRINTER_ENUM_ICON6
+#undef PRINTER_ENUM_ICON7
+#undef PRINTER_ENUM_ICON8
+#undef PRINTER_STATUS_PAUSED
+#undef PRINTER_STATUS_ERROR
+#undef PRINTER_STATUS_PENDING_DELETION
+#undef PRINTER_STATUS_PAPER_JAM
+#undef PRINTER_STATUS_PAPER_OUT
+#undef PRINTER_STATUS_MANUAL_FEED
+#undef PRINTER_STATUS_PAPER_PROBLEM
+#undef PRINTER_STATUS_OFFLINE
+#undef PRINTER_STATUS_IO_ACTIVE
+#undef PRINTER_STATUS_BUSY
+#undef PRINTER_STATUS_PRINTING
+#undef PRINTER_STATUS_OUTPUT_BIN_FULL
+#undef PRINTER_STATUS_NOT_AVAILABLE
+#undef PRINTER_STATUS_WAITING
+#undef PRINTER_STATUS_PROCESSING
+#undef PRINTER_STATUS_INITIALIZING
+#undef PRINTER_STATUS_WARMING_UP
+#undef PRINTER_STATUS_TONER_LOW
+#undef PRINTER_STATUS_NO_TONER
+#undef PRINTER_STATUS_PAGE_PUNT
+#undef PRINTER_STATUS_USER_INTERVENTION
+#undef PRINTER_STATUS_OUT_OF_MEMORY
+#undef PRINTER_STATUS_DOOR_OPEN
+#undef PRINTER_STATUS_SERVER_UNKNOWN
+#undef PRINTER_STATUS_POWER_SAVE
+
+#undef DWORD
+#undef HKEY_CLASSES_ROOT
+#undef HKEY_CURRENT_USER
+#undef HKEY_LOCAL_MACHINE
+#undef HKEY_USERS
+#undef HKEY_PERFORMANCE_DATA
+#undef HKEY_CURRENT_CONFIG
+#undef HKEY_DYN_DATA
+#undef REG_DWORD
+#undef REG_QWORD
+
+#undef SERVICE_STATE_ALL
+
+#undef SE_GROUP_MANDATORY
+#undef SE_GROUP_ENABLED_BY_DEFAULT
+#undef SE_GROUP_ENABLED
+
+#endif /* _WIN32_REPLACE_H */
diff --git a/ctdb/lib/replace/wscript b/ctdb/lib/replace/wscript
new file mode 100644 (file)
index 0000000..2117f56
--- /dev/null
@@ -0,0 +1,632 @@
+#!/usr/bin/env python
+
+APPNAME = 'libreplace'
+VERSION = '1.2.1'
+
+blddir = 'bin'
+
+import sys, os, Utils
+
+# find the buildtools directory
+#
+# Starting from '.', keep prepending '../' until a 'buildtools'
+# directory exists below the path or the path has grown to five
+# '/'-separated components (i.e. at most four levels up). The wafsamba
+# directory under the resulting path is then put first on sys.path,
+# whether or not the search succeeded, so the imports that follow can
+# find the wafsamba modules.
+srcdir = '.'
+while not os.path.exists(srcdir+'/buildtools') and len(srcdir.split('/')) < 5:
+    srcdir = '../' + srcdir
+sys.path.insert(0, srcdir + '/buildtools/wafsamba')
+
+import wafsamba, samba_dist
+import Options, os, preproc
+
+samba_dist.DIST_DIRS('lib/replace buildtools:buildtools')
+
+# waf option hook: sets two wafsamba defaults on the option context and
+# then recurses into buildtools/wafsamba.
+def set_options(opt):
+    # NOTE(review): presumably "no library is built in by default" and
+    # "no private library extension" - confirm against the wafsamba
+    # helpers, which are not visible here.
+    opt.BUILTIN_DEFAULT('NONE')
+    opt.PRIVATE_EXTENSION_DEFAULT('')
+    opt.RECURSE('buildtools/wafsamba')
+
+# waf configure hook for libreplace: probes the platform for headers,
+# types, functions and runtime behaviours and records the results as
+# configuration defines, which build() later consults through
+# CONFIG_SET to decide which replacement sources to compile.
+@wafsamba.runonce
+def configure(conf):
+    conf.RECURSE('buildtools/wafsamba')
+
+    conf.env.standalone_replace = conf.IN_LAUNCH_DIR()
+
+    conf.DEFINE('HAVE_LIBREPLACE', 1)
+    conf.DEFINE('LIBREPLACE_NETWORK_CHECKS', 1)
+
+    # on Tru64 certain features are only available with _OSF_SOURCE set to 1
+    # and _XOPEN_SOURCE set to 600
+    if conf.env['SYSTEM_UNAME_SYSNAME'] == 'OSF1':
+        conf.DEFINE('_OSF_SOURCE', 1, add_to_cflags=True)
+        conf.DEFINE('_XOPEN_SOURCE', 600, add_to_cflags=True)
+
+    # SCM_RIGHTS is only available if _XOPEN_SOURCE is defined on IRIX
+    if conf.env['SYSTEM_UNAME_SYSNAME'] == 'IRIX':
+        conf.DEFINE('_XOPEN_SOURCE', 600, add_to_cflags=True)
+        conf.DEFINE('_BSD_TYPES', 1, add_to_cflags=True)
+
+    # Try to find the right extra flags for C99 initialisers
+    # (the first flag that lets the designated-initialiser sample compile wins)
+    for f in ["", "-AC99", "-qlanglvl=extc99", "-qlanglvl=stdc99", "-c99"]:
+        if conf.CHECK_CFLAGS([f], '''
+struct foo {int x;char y;};
+struct foo bar = { .y = 'X', .x = 1 };
+'''):
+            if f != "":
+                conf.ADD_CFLAGS(f)
+            break
+
+    # the stack protector is only enabled when both compiler and linker accept it
+    if conf.CHECK_CFLAGS(['-fstack-protector']) and conf.CHECK_LDFLAGS(['-fstack-protector']):
+        conf.ADD_CFLAGS('-fstack-protector')
+        conf.ADD_LDFLAGS('-fstack-protector')
+
+    # Try to find the right extra flags for -Werror behaviour
+    for f in ["-Werror",       # GCC
+             "-errwarn=%all", # Sun Studio
+             "-qhalt=w",     # IBM xlc
+             "-w2",           # Tru64
+             ]:
+        if conf.CHECK_CFLAGS([f], '''
+'''):
+            if not 'WERROR_CFLAGS' in conf.env:
+                conf.env['WERROR_CFLAGS'] = []
+            conf.env['WERROR_CFLAGS'].extend([f])
+            break
+
+    conf.CHECK_HEADERS('linux/types.h crypt.h locale.h acl/libacl.h compat.h')
+    conf.CHECK_HEADERS('acl/libacl.h attr/xattr.h compat.h ctype.h dustat.h')
+    conf.CHECK_HEADERS('fcntl.h fnmatch.h glob.h history.h krb5.h langinfo.h')
+    conf.CHECK_HEADERS('libaio.h locale.h ndir.h pwd.h')
+    conf.CHECK_HEADERS('shadow.h sys/acl.h')
+    conf.CHECK_HEADERS('sys/attributes.h attr/attributes.h sys/capability.h sys/dir.h sys/epoll.h')
+    conf.CHECK_HEADERS('sys/fcntl.h sys/filio.h sys/filsys.h sys/fs/s5param.h sys/fs/vx/quota.h')
+    conf.CHECK_HEADERS('sys/id.h sys/ioctl.h sys/ipc.h sys/mman.h sys/mode.h sys/ndir.h sys/priv.h')
+    conf.CHECK_HEADERS('sys/resource.h sys/security.h sys/shm.h sys/statfs.h sys/statvfs.h sys/termio.h')
+    conf.CHECK_HEADERS('sys/vfs.h sys/xattr.h termio.h termios.h sys/file.h')
+    conf.CHECK_HEADERS('sys/ucontext.h sys/wait.h sys/stat.h malloc.h grp.h')
+    conf.CHECK_HEADERS('sys/select.h setjmp.h utime.h sys/syslog.h syslog.h')
+    conf.CHECK_HEADERS('stdarg.h vararg.h sys/mount.h mntent.h')
+    conf.CHECK_HEADERS('stropts.h unix.h string.h strings.h sys/param.h limits.h')
+    conf.CHECK_HEADERS('''sys/socket.h netinet/in.h netdb.h arpa/inet.h netinet/in_systm.h
+                          netinet/ip.h netinet/tcp.h netinet/in_ip.h
+                          sys/sockio.h sys/un.h''', together=True)
+    conf.CHECK_HEADERS('sys/uio.h ifaddrs.h direct.h dirent.h')
+    conf.CHECK_HEADERS('windows.h winsock2.h ws2tcpip.h')
+    conf.CHECK_HEADERS('libintl.h errno.h')
+    conf.CHECK_HEADERS('gcrypt.h getopt.h iconv.h')
+    conf.CHECK_HEADERS('sys/inotify.h memory.h nss.h sasl/sasl.h')
+    conf.CHECK_HEADERS('security/pam_appl.h zlib.h asm/unistd.h')
+    conf.CHECK_HEADERS('aio.h sys/unistd.h rpc/rpc.h rpc/nettype.h alloca.h float.h')
+
+    conf.CHECK_HEADERS('rpcsvc/nis.h rpcsvc/ypclnt.h sys/sysctl.h')
+    conf.CHECK_HEADERS('sys/fileio.h sys/filesys.h sys/dustat.h sys/sysmacros.h')
+    conf.CHECK_HEADERS('xfs/libxfs.h netgroup.h')
+
+    conf.CHECK_CODE('', headers='rpc/rpc.h rpcsvc/yp_prot.h', define='HAVE_RPCSVC_YP_PROT_H')
+
+    conf.CHECK_HEADERS('valgrind.h valgrind/valgrind.h valgrind/memcheck.h')
+    conf.CHECK_HEADERS('nss_common.h nsswitch.h ns_api.h')
+    conf.CHECK_HEADERS('sys/extattr.h sys/ea.h sys/proplist.h sys/cdefs.h')
+    conf.CHECK_HEADERS('utmp.h utmpx.h lastlog.h malloc.h')
+    conf.CHECK_HEADERS('syscall.h sys/syscall.h inttypes.h')
+
+    # Check for process set name support
+    # The fragment already defines main, so pass addmain=False like the
+    # other complete-program fragments in this function.
+    conf.CHECK_CODE('''
+                    #include <sys/prctl.h>
+                    int main(void) {
+                        prctl(0);
+                        return 0;
+                    }
+                    ''',
+                    'HAVE_PRCTL',
+                    addmain=False,
+                    headers='sys/prctl.h',
+                    msg='Checking for prctl syscall')
+
+    conf.CHECK_CODE('''
+                    #include <unistd.h>
+                    #ifdef HAVE_FCNTL_H
+                    #include <fcntl.h>
+                    #endif
+                    int main(void) { int fd = open("/dev/null", O_DIRECT); }
+                    ''',
+                    define='HAVE_OPEN_O_DIRECT',
+                    addmain=False,
+                    msg='Checking for O_DIRECT flag to open(2)')
+
+    conf.CHECK_TYPES('"long long" intptr_t uintptr_t ptrdiff_t comparison_fn_t')
+    conf.CHECK_TYPE('_Bool', define='HAVE__Bool')
+    conf.CHECK_TYPE('bool', define='HAVE_BOOL')
+
+    conf.CHECK_TYPE('int8_t', 'char')
+    conf.CHECK_TYPE('uint8_t', 'unsigned char')
+    conf.CHECK_TYPE('int16_t', 'short')
+    conf.CHECK_TYPE('uint16_t', 'unsigned short')
+    conf.CHECK_TYPE('int32_t', 'int')
+    conf.CHECK_TYPE('uint32_t', 'unsigned')
+    conf.CHECK_TYPE('int64_t', 'long long')
+    conf.CHECK_TYPE('uint64_t', 'unsigned long long')
+    conf.CHECK_TYPE('size_t', 'unsigned int')
+    conf.CHECK_TYPE('ssize_t', 'int')
+    conf.CHECK_TYPE('ino_t', 'unsigned')
+    conf.CHECK_TYPE('loff_t', 'off_t')
+    conf.CHECK_TYPE('offset_t', 'loff_t')
+    conf.CHECK_TYPE('volatile int', define='HAVE_VOLATILE')
+    conf.CHECK_TYPE('uint_t', 'unsigned int')
+    conf.CHECK_TYPE('blksize_t', 'long', headers='sys/types.h sys/stat.h unistd.h')
+    conf.CHECK_TYPE('blkcnt_t', 'long', headers='sys/types.h sys/stat.h unistd.h')
+
+    conf.CHECK_SIZEOF('bool char int "long long" long short size_t ssize_t')
+    conf.CHECK_SIZEOF('int8_t uint8_t int16_t uint16_t int32_t uint32_t int64_t uint64_t')
+    conf.CHECK_SIZEOF('void*', define='SIZEOF_VOID_P')
+    conf.CHECK_SIZEOF('off_t dev_t ino_t time_t')
+
+    conf.CHECK_TYPES('socklen_t', headers='sys/socket.h')
+    conf.CHECK_TYPE_IN('struct ifaddrs', 'ifaddrs.h')
+    conf.CHECK_TYPE_IN('struct addrinfo', 'netdb.h')
+    conf.CHECK_TYPE_IN('struct sockaddr', 'sys/socket.h')
+    conf.CHECK_CODE('struct sockaddr_in6 x', define='HAVE_STRUCT_SOCKADDR_IN6',
+                    headers='sys/socket.h netdb.h netinet/in.h')
+    conf.CHECK_TYPE_IN('struct sockaddr_storage', 'sys/socket.h')
+    conf.CHECK_TYPE_IN('sa_family_t', 'sys/socket.h')
+
+    conf.CHECK_TYPE_IN('sig_atomic_t', 'signal.h', define='HAVE_SIG_ATOMIC_T_TYPE')
+
+    conf.CHECK_FUNCS_IN('''inet_ntoa inet_aton inet_ntop inet_pton connect gethostbyname
+                           getaddrinfo getnameinfo freeaddrinfo gai_strerror socketpair''',
+                        'socket nsl', checklibc=True,
+                        headers='sys/socket.h netinet/in.h arpa/inet.h netdb.h')
+
+    # Some old Linux systems have broken header files and
+    # miss the IPV6_V6ONLY define in netinet/in.h,
+    # but have it in linux/in6.h.
+    # We can't include both files so we just check if the value
+    # is defined and do the replacement in system/network.h
+    if not conf.CHECK_VARIABLE('IPV6_V6ONLY',
+                               headers='sys/socket.h netdb.h netinet/in.h'):
+        conf.CHECK_CODE('''
+                        #include <linux/in6.h>
+                        #if (IPV6_V6ONLY != 26)
+                        #error no IPV6_V6ONLY support on linux
+                        #endif
+                        int main(void) { return IPV6_V6ONLY; }
+                        ''',
+                        define='HAVE_LINUX_IPV6_V6ONLY_26',
+                        addmain=False,
+                        msg='Checking for IPV6_V6ONLY in linux/in6.h',
+                        local_include=False)
+
+    conf.CHECK_CODE('''
+                       struct sockaddr_storage sa_store;
+                       struct addrinfo *ai = NULL;
+                       struct in6_addr in6addr;
+                       int idx = if_nametoindex("iface1");
+                       int s = socket(AF_INET6, SOCK_STREAM, 0);
+                       int ret = getaddrinfo(NULL, NULL, NULL, &ai);
+                       if (ret != 0) {
+                           const char *es = gai_strerror(ret);
+                       }
+                       freeaddrinfo(ai);
+                       {
+                          int val = 1;
+                          #ifdef HAVE_LINUX_IPV6_V6ONLY_26
+                          #define IPV6_V6ONLY 26
+                          #endif
+                          ret = setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY,
+                                           (const void *)&val, sizeof(val));
+                       }
+                       ''',
+                    define='HAVE_IPV6',
+                    lib='nsl socket',
+                    headers='sys/socket.h netdb.h netinet/in.h')
+
+    if conf.CONFIG_SET('HAVE_SYS_UCONTEXT_H') and conf.CONFIG_SET('HAVE_SIGNAL_H'):
+        conf.CHECK_CODE('''
+                       ucontext_t uc;
+                       sigaddset(&uc.uc_sigmask, SIGUSR1);
+                       ''',
+                       'HAVE_UCONTEXT_T',
+                       msg="Checking whether we have ucontext_t",
+                       headers='signal.h sys/ucontext.h')
+
+    # these may be builtins, so we need the link=False strategy
+    conf.CHECK_FUNCS('strdup memmem printf memset memcpy memmove strcpy strncpy bzero', link=False)
+
+    # See https://bugzilla.samba.org/show_bug.cgi?id=1097
+    #
+    # Ported in from autoconf where it was added with this commit:
+    # commit 804cfb20a067b4b687089dc72a8271b3abf20f31
+    # Author: Simo Sorce <idra@samba.org>
+    # Date:   Wed Aug 25 14:24:16 2004 +0000
+    #     r2070: Let's try to overload srnlen and strndup for AIX where they are natly broken.
+
+    host_os = sys.platform
+    if host_os.rfind('aix') > -1:
+        conf.DEFINE('BROKEN_STRNLEN', 1)
+        conf.DEFINE('BROKEN_STRNDUP', 1)
+
+    conf.CHECK_FUNCS('shl_load shl_unload shl_findsym')
+    conf.CHECK_FUNCS('pipe strftime srandom random srand rand usleep setbuffer')
+    conf.CHECK_FUNCS('lstat getpgrp utime utimes setuid seteuid setreuid setresuid setgid setegid')
+    conf.CHECK_FUNCS('setregid setresgid chroot strerror vsyslog setlinebuf mktime')
+    conf.CHECK_FUNCS('ftruncate chsize rename waitpid wait4')
+    conf.CHECK_FUNCS('initgroups pread pwrite strndup strcasestr')
+    conf.CHECK_FUNCS('strtok_r mkdtemp dup2 dprintf vdprintf isatty chown lchown')
+    conf.CHECK_FUNCS('link readlink symlink realpath snprintf vsnprintf')
+    conf.CHECK_FUNCS('asprintf vasprintf setenv unsetenv strnlen strtoull __strtoull')
+    conf.CHECK_FUNCS('strtouq strtoll __strtoll strtoq memalign posix_memalign')
+    conf.CHECK_FUNCS('prctl')
+
+    # libbsd on some platforms provides strlcpy and strlcat
+    if not conf.CHECK_FUNCS('strlcpy strlcat'):
+        conf.CHECK_FUNCS_IN('strlcpy strlcat', 'bsd', headers='bsd/string.h',
+                checklibc=True)
+    if not conf.CHECK_FUNCS('getpeereid'):
+        conf.CHECK_FUNCS_IN('getpeereid', 'bsd', headers='sys/types.h bsd/unistd.h')
+    if not conf.CHECK_FUNCS_IN('setproctitle', 'bsd', headers='sys/types.h bsd/unistd.h'):
+        conf.CHECK_FUNCS_IN('setproctitle', 'setproctitle', headers='setproctitle.h')
+
+    conf.CHECK_CODE('''
+                struct ucred cred;
+                socklen_t cred_len;
+                int ret = getsockopt(0, SOL_SOCKET, SO_PEERCRED, &cred, &cred_len);''',
+                'HAVE_PEERCRED',
+                msg="Checking whether we can use SO_PEERCRED to get socket credentials",
+                headers='sys/types.h sys/socket.h')
+
+    # Some OSes (e.g. FreeBSD) return EINVAL if the conversion could not be
+    # done, which is not what we expect. Let's detect those cases.
+    if conf.CONFIG_SET('HAVE_STRTOLL'):
+        conf.CHECK_CODE('''
+                        long long nb = strtoll("Text", NULL, 0);
+                        if (errno == EINVAL) {
+                            return 0;
+                        } else {
+                            return 1;
+                        }
+                        ''',
+                        msg="Checking correct behavior of strtoll",
+                        headers = 'errno.h',
+                        execute = True,
+                        define = 'HAVE_BSD_STRTOLL',
+                        )
+    conf.CHECK_FUNCS('if_nametoindex strerror_r')
+    conf.CHECK_FUNCS('getdirentries getdents syslog')
+    conf.CHECK_FUNCS('gai_strerror get_current_dir_name')
+    conf.CHECK_FUNCS('timegm getifaddrs freeifaddrs mmap setgroups syscall setsid')
+    conf.CHECK_FUNCS('getgrent_r getgrgid_r getgrnam_r getgrouplist getpagesize')
+    conf.CHECK_FUNCS('getpwent_r getpwnam_r getpwuid_r epoll_create')
+
+    conf.SET_TARGET_TYPE('attr', 'EMPTY')
+
+    xattr_headers='sys/attributes.h attr/xattr.h sys/xattr.h'
+
+    conf.CHECK_FUNCS_IN('''
+fgetxattr flistea flistxattr
+fremovexattr fsetxattr getxattr
+listxattr removexattr setxattr
+''', 'attr', checklibc=True, headers=xattr_headers)
+
+    # We need to check for linux xattrs first, as we do not wish to link to -lattr
+    # (the XFS compat API) on Linux systems with the native xattr API
+    if not conf.CONFIG_SET('HAVE_GETXATTR'):
+        conf.CHECK_FUNCS_IN('''
+attr_get attr_getf attr_list attr_listf attropen attr_remove
+attr_removef attr_set attr_setf extattr_delete_fd extattr_delete_file
+extattr_get_fd extattr_get_file extattr_list_fd extattr_list_file
+extattr_set_fd extattr_set_file fgetea
+fremoveea fsetea getea listea
+removeea setea
+''', 'attr', checklibc=True, headers=xattr_headers)
+
+    # any one of the fd-based list calls counts as xattr support
+    if (conf.CONFIG_SET('HAVE_ATTR_LISTF') or
+        conf.CONFIG_SET('HAVE_EXTATTR_LIST_FD') or
+        conf.CONFIG_SET('HAVE_FLISTEA') or
+        conf.CONFIG_SET('HAVE_FLISTXATTR')):
+            conf.DEFINE('HAVE_XATTR_SUPPORT', 1)
+
+    # Darwin has extra options to xattr-family functions
+    conf.CHECK_CODE('getxattr(NULL, NULL, NULL, 0, 0, 0)',
+                    headers=xattr_headers, local_include=False,
+                    define='XATTR_ADDITIONAL_OPTIONS',
+                    msg="Checking whether xattr interface takes additional options")
+
+    conf.CHECK_FUNCS_IN('dlopen dlsym dlerror dlclose', 'dl',
+                        checklibc=True, headers='dlfcn.h dl.h')
+
+    conf.CHECK_C_PROTOTYPE('dlopen', 'void *dlopen(const char* filename, unsigned int flags)',
+                           define='DLOPEN_TAKES_UNSIGNED_FLAGS', headers='dlfcn.h dl.h')
+
+    if conf.CHECK_FUNCS_IN('fdatasync', 'rt', checklibc=True):
+        # some systems are missing the declaration
+        conf.CHECK_DECLS('fdatasync')
+
+    if conf.CHECK_FUNCS_IN('clock_gettime', 'rt', checklibc=True):
+        # one HAVE_<clock id> define per clock id that compiles
+        for c in ['CLOCK_MONOTONIC', 'CLOCK_PROCESS_CPUTIME_ID', 'CLOCK_REALTIME']:
+            conf.CHECK_CODE('''
+                #if TIME_WITH_SYS_TIME
+                # include <sys/time.h>
+                # include <time.h>
+                #else
+                # if HAVE_SYS_TIME_H
+                #  include <sys/time.h>
+                # else
+                #  include <time.h>
+                # endif
+                #endif
+                clockid_t clk = %s''' % c,
+                'HAVE_%s' % c,
+                msg='Checking whether the clock_gettime clock ID %s is available' % c)
+
+    conf.CHECK_TYPE('struct timespec', headers='sys/time.h time.h')
+
+    # these headers need to be tested as a group on freebsd
+    conf.CHECK_HEADERS(headers='sys/socket.h net/if.h', together=True)
+    conf.CHECK_HEADERS(headers='netinet/in.h arpa/nameser.h resolv.h', together=True)
+    conf.CHECK_FUNCS_IN('res_search', 'resolv', checklibc=True,
+                        headers='netinet/in.h arpa/nameser.h resolv.h')
+
+
+    if not conf.CHECK_FUNCS_IN('gettext', 'intl', checklibc=True, headers='libintl.h'):
+        # Some hosts need lib iconv for linking with lib intl
+        # So we try with flags just in case it helps.
+        oldflags = conf.env['LDFLAGS_INTL']
+        conf.env['LDFLAGS_INTL'] = "-liconv"
+        if not conf.CHECK_LIB('intl'):
+            conf.env['LDFLAGS_INTL'] = oldflags
+        else:
+            conf.CHECK_FUNCS_IN('gettext', 'intl', checklibc=True, headers='libintl.h')
+
+    conf.CHECK_FUNCS_IN('dgettext gettext', 'intl', headers='libintl.h')
+    conf.CHECK_FUNCS_IN('pthread_create', 'pthread', checklibc=True, headers='pthread.h')
+
+    conf.CHECK_FUNCS_IN('crypt', 'crypt', checklibc=True)
+
+    conf.CHECK_VARIABLE('rl_event_hook', define='HAVE_DECL_RL_EVENT_HOOK', always=True,
+                        headers='readline.h readline/readline.h readline/history.h')
+
+    conf.CHECK_DECLS('snprintf vsnprintf asprintf vasprintf')
+
+    conf.CHECK_DECLS('errno', headers='errno.h', reverse=True)
+    conf.CHECK_DECLS('environ getgrent_r getpwent_r', reverse=True, headers='pwd.h grp.h')
+    conf.CHECK_DECLS('pread pwrite setenv setresgid setresuid', reverse=True)
+
+    if conf.CONFIG_SET('HAVE_EPOLL_CREATE') and conf.CONFIG_SET('HAVE_SYS_EPOLL_H'):
+        conf.DEFINE('HAVE_EPOLL', 1)
+
+    conf.CHECK_HEADERS('poll.h')
+    conf.CHECK_FUNCS('poll')
+
+    conf.CHECK_FUNCS('strptime')
+    conf.CHECK_DECLS('strptime', headers='time.h')
+    conf.CHECK_CODE('''#define LIBREPLACE_CONFIGURE_TEST_STRPTIME
+                       #include "test/strptime.c"''',
+                       define='HAVE_WORKING_STRPTIME',
+                       execute=True,
+                       addmain=False,
+                       msg='Checking for working strptime')
+
+    conf.CHECK_CODE('gettimeofday(NULL, NULL)', 'HAVE_GETTIMEOFDAY_TZ', execute=False)
+
+    conf.CHECK_CODE('#include "test/snprintf.c"',
+                    define="HAVE_C99_VSNPRINTF",
+                    execute=True,
+                    addmain=False,
+                    msg="Checking for C99 vsnprintf")
+
+    conf.CHECK_CODE('#include "test/shared_mmap.c"',
+                    addmain=False, add_headers=False, execute=True,
+                    define='HAVE_SHARED_MMAP',
+                    msg="Checking for HAVE_SHARED_MMAP")
+
+    conf.CHECK_CODE('#include "test/shared_mremap.c"',
+                    addmain=False, add_headers=False, execute=True,
+                    define='HAVE_MREMAP',
+                    msg="Checking for HAVE_MREMAP")
+
+    # OpenBSD (and I've heard HPUX) doesn't sync between mmap and write.
+    # FIXME: Anything other than a 0 or 1 exit code should abort configure!
+    conf.CHECK_CODE('#include "test/incoherent_mmap.c"',
+                    addmain=False, add_headers=False, execute=True,
+                    define='HAVE_INCOHERENT_MMAP',
+                    msg="Checking for HAVE_INCOHERENT_MMAP")
+
+    conf.SAMBA_BUILD_ENV()
+
+    conf.CHECK_CODE('''
+                    typedef struct {unsigned x;} FOOBAR;
+                    #define X_FOOBAR(x) ((FOOBAR) { x })
+                    #define FOO_ONE X_FOOBAR(1)
+                    FOOBAR f = FOO_ONE;
+                    static const struct {
+                        FOOBAR y;
+                    } f2[] = {
+                        {FOO_ONE}
+                    };
+                    static const FOOBAR f3[] = {FOO_ONE};
+                    ''',
+                    define='HAVE_IMMEDIATE_STRUCTURES')
+
+    conf.CHECK_CODE('mkdir("foo",0777)', define='HAVE_MKDIR_MODE', headers='sys/stat.h')
+
+    conf.CHECK_STRUCTURE_MEMBER('struct stat', 'st_mtim.tv_nsec', define='HAVE_STAT_TV_NSEC',
+                                headers='sys/stat.h')
+    # we need the st_rdev test under two names
+    conf.CHECK_STRUCTURE_MEMBER('struct stat', 'st_rdev',
+                                define='HAVE_STRUCT_STAT_ST_RDEV',
+                                headers='sys/stat.h')
+    conf.CHECK_STRUCTURE_MEMBER('struct stat', 'st_rdev', define='HAVE_ST_RDEV',
+                                headers='sys/stat.h')
+    conf.CHECK_STRUCTURE_MEMBER('struct sockaddr_storage', 'ss_family',
+                                headers='sys/socket.h netinet/in.h')
+    conf.CHECK_STRUCTURE_MEMBER('struct sockaddr_storage', '__ss_family',
+                                headers='sys/socket.h netinet/in.h')
+
+
+    if conf.CHECK_STRUCTURE_MEMBER('struct sockaddr', 'sa_len',
+                                   headers='sys/socket.h netinet/in.h',
+                                   define='HAVE_SOCKADDR_SA_LEN'):
+        # the old build system produced both defines
+        conf.DEFINE('HAVE_STRUCT_SOCKADDR_SA_LEN', 1)
+
+    conf.CHECK_STRUCTURE_MEMBER('struct sockaddr_in', 'sin_len',
+                                headers='sys/socket.h netinet/in.h',
+                                define='HAVE_SOCK_SIN_LEN')
+
+    conf.CHECK_CODE('struct sockaddr_un sunaddr; sunaddr.sun_family = AF_UNIX;',
+                    define='HAVE_UNIXSOCKET', headers='sys/socket.h sys/un.h')
+
+
+    # The mkstemp probe creates two temporary files and requires that both
+    # succeed, that the template was rewritten, that the two names differ
+    # and that the file mode is 0600.
+    conf.CHECK_CODE('''
+                    struct stat st;
+                    char tpl[20]="/tmp/test.XXXXXX";
+                    char tpl2[20]="/tmp/test.XXXXXX";
+                    int fd = mkstemp(tpl);
+                    int fd2 = mkstemp(tpl2);
+                    if (fd == -1) {
+                          if (fd2 != -1) {
+                                  unlink(tpl2);
+                          }
+                          exit(1);
+                    }
+                    if (fd2 == -1) exit(1);
+                    unlink(tpl);
+                    unlink(tpl2);
+                    if (fstat(fd, &st) != 0) exit(1);
+                    if ((st.st_mode & 0777) != 0600) exit(1);
+                    if (strcmp(tpl, "/tmp/test.XXXXXX") == 0) {
+                          exit(1);
+                    }
+                    if (strcmp(tpl, tpl2) == 0) {
+                          exit(1);
+                    }
+                    exit(0);
+                    ''',
+                    define='HAVE_SECURE_MKSTEMP',
+                    execute=True,
+                    mandatory=True) # lets see if we get a mandatory failure for this one
+
+    if conf.CHECK_CFLAGS('-fvisibility=hidden'):
+        conf.env.VISIBILITY_CFLAGS = '-fvisibility=hidden'
+        conf.CHECK_CODE('''void vis_foo1(void) {}
+                           __attribute__((visibility("default"))) void vis_foo2(void) {}''',
+                        cflags=conf.env.VISIBILITY_CFLAGS,
+                        define='HAVE_VISIBILITY_ATTR')
+
+    # look for a method of finding the list of network interfaces
+    # (the first method whose test program builds and runs wins)
+    for method in ['HAVE_IFACE_GETIFADDRS', 'HAVE_IFACE_AIX', 'HAVE_IFACE_IFCONF', 'HAVE_IFACE_IFREQ']:
+        if conf.CHECK_CODE('''
+                           #define %s 1
+                           #define NO_CONFIG_H 1
+                           #define AUTOCONF_TEST 1
+                           #define SOCKET_WRAPPER_NOT_REPLACE
+                           #include "replace.c"
+                           #include "inet_ntop.c"
+                           #include "snprintf.c"
+                           #include "getifaddrs.c"
+                           #define getifaddrs_test main
+                           #include "test/getifaddrs.c"
+                           ''' % method,
+                           method,
+                           lib='nsl socket',
+                           addmain=False,
+                           execute=True):
+            break
+
+    conf.RECURSE('system')
+    conf.SAMBA_CONFIG_H()
+
+
+# Maps each replacement source file to the names that trigger compiling
+# it: build() adds the file as soon as the HAVE_<NAME> define (name
+# upper-cased) is not set for at least one entry in its list.
+REPLACEMENT_FUNCTIONS = {
+    'replace.c': ['ftruncate', 'strlcpy', 'strlcat', 'mktime', 'initgroups',
+                  'memmove', 'strdup', 'setlinebuf', 'vsyslog', 'strnlen',
+                  'strndup', 'waitpid', 'seteuid', 'setegid', 'chroot',
+                  'mkstemp', 'mkdtemp', 'pread', 'pwrite', 'strcasestr',
+                  'strtok_r', 'strtoll', 'strtoull', 'setenv', 'unsetenv',
+                  'utime', 'utimes', 'dup2', 'chown', 'link', 'readlink',
+                  'symlink', 'lchown', 'realpath', 'memmem', 'vdprintf',
+                  'dprintf', 'get_current_dir_name',
+                  'strerror_r', 'clock_gettime'],
+    'timegm.c': ['timegm'],
+    # Note: C99_VSNPRINTF is not a function, but a special condition
+    # for replacement
+    'snprintf.c': ['C99_VSNPRINTF', 'snprintf', 'vsnprintf', 'asprintf', 'vasprintf'],
+    # Note: WORKING_STRPTIME is not a function, but a special condition
+    # for replacement
+    'strptime.c': ['WORKING_STRPTIME', 'strptime'],
+    }
+
+
+# waf build hook: works out which replacement sources this platform
+# needs and declares the libreplace targets.
+def build(bld):
+    bld.RECURSE('buildtools/wafsamba')
+
+    # A file from REPLACEMENT_FUNCTIONS is compiled as soon as one of
+    # its HAVE_<NAME> defines is missing from the configuration.
+    needed_files = []
+    for src_file, triggers in REPLACEMENT_FUNCTIONS.iteritems():
+        if any(not bld.CONFIG_SET('HAVE_%s' % trigger.upper()) for trigger in triggers):
+            needed_files.append(src_file)
+    hostcc_sources = ''.join(' %s' % src_file for src_file in needed_files)
+
+    # depend on libbsd when configure recorded it
+    bsd_deps = ' bsd' if bld.CONFIG_SET('HAVE_LIBBSD') else ''
+
+    # variant built with the host compiler, with the wrappers disabled
+    bld.SAMBA_SUBSYSTEM('LIBREPLACE_HOSTCC',
+        hostcc_sources,
+        use_hostcc=True,
+        use_global_deps=False,
+        cflags='-DSOCKET_WRAPPER_DISABLE=1 -DNSS_WRAPPER_DISABLE=1 -DUID_WRAPPER_DISABLE=1 -D_SAMBA_HOSTCC_',
+        group='compiler_libraries',
+        deps=bsd_deps)
+
+    # Sources that only the target library may need, each paired with
+    # the define whose presence makes the replacement unnecessary.
+    optional_sources = (
+        ('HAVE_CRYPT', 'crypt.c'),
+        ('HAVE_DLOPEN', 'dlfcn.c'),
+        ('HAVE_POLL', 'poll.c'),
+        ('HAVE_SOCKETPAIR', 'socketpair.c'),
+        ('HAVE_CONNECT', 'socket.c'),
+        ('HAVE_GETIFADDRS', 'getifaddrs.c'),
+        ('HAVE_GETADDRINFO', 'getaddrinfo.c'),
+        ('HAVE_INET_NTOA', 'inet_ntoa.c'),
+        ('HAVE_INET_ATON', 'inet_aton.c'),
+        ('HAVE_INET_NTOP', 'inet_ntop.c'),
+        ('HAVE_INET_PTON', 'inet_pton.c'),
+    )
+    target_sources = hostcc_sources
+    for define, src_file in optional_sources:
+        if not bld.CONFIG_SET(define):
+            target_sources += ' %s' % src_file
+    # xattr.c is needed when getxattr is missing or takes additional options
+    if not (bld.CONFIG_SET('HAVE_GETXATTR') and not bld.CONFIG_SET('XATTR_ADDITIONAL_OPTIONS')):
+        target_sources += ' xattr.c'
+
+    # FIXME: the symbols should ideally be hidden so they stay out of the
+    # global namespace when Samba libraries are loaded, but that does not
+    # appear to work at the moment:
+    #   hide_symbols=bld.BUILTIN_LIBRARY('replace'),
+    bld.SAMBA_LIBRARY('replace',
+                      source=target_sources,
+                      group='base_libraries',
+                      private_library=True,
+                      deps='crypt dl nsl socket rt attr' + bsd_deps)
+
+    bld.SAMBA_SUBSYSTEM('replace-test',
+                      source='''test/testsuite.c test/strptime.c
+                      test/os2_delete.c test/getifaddrs.c''',
+                      deps='replace')
+
+    # the test binary is only built for a standalone libreplace build
+    if bld.env.standalone_replace:
+        bld.SAMBA_BINARY('replace_testsuite',
+                         source='test/main.c',
+                         deps='replace replace-test',
+                         install=False)
+
+    # build replacements for stdint.h and stdbool.h if needed
+    for header, have_define in (('stdint', 'HAVE_STDINT_H'),
+                                ('stdbool', 'HAVE_STDBOOL_H')):
+        bld.SAMBA_GENERATOR('replace_%s_h' % header,
+                            rule='cp ${SRC} ${TGT}',
+                            source='hdr_replace.h',
+                            target='%s.h' % header,
+                            enabled=not bld.CONFIG_SET(have_define))
+
+# Delegates to samba_dist.dist(); the directories to package were
+# registered near the top of this file via samba_dist.DIST_DIRS().
+def dist():
+    '''makes a tarball for distribution'''
+    samba_dist.dist()
diff --git a/ctdb/lib/replace/xattr.c b/ctdb/lib/replace/xattr.c
new file mode 100644 (file)
index 0000000..a26ff67
--- /dev/null
@@ -0,0 +1,729 @@
+/* 
+   Unix SMB/CIFS implementation.
+   replacement routines for xattr implementations
+   Copyright (C) Jeremy Allison  1998-2005
+   Copyright (C) Timur Bakeyev        2005
+   Copyright (C) Bjoern Jacke    2006-2007
+   Copyright (C) Herb Lewis           2003
+   Copyright (C) Andrew Bartlett      2012
+
+     ** NOTE! The following LGPL license applies to the replace
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/dir.h"
+
+/******** Solaris EA helper function prototypes ********/
+#ifdef HAVE_ATTROPEN
+#define SOLARIS_ATTRMODE S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP
+static int solaris_write_xattr(int attrfd, const char *value, size_t size);
+static ssize_t solaris_read_xattr(int attrfd, void *value, size_t size);
+static ssize_t solaris_list_xattr(int attrdirfd, char *list, size_t size);
+static int solaris_unlinkat(int attrdirfd, const char *name);
+static int solaris_attropen(const char *path, const char *attrpath, int oflag, mode_t mode);
+static int solaris_openat(int fildes, const char *path, int oflag, mode_t mode);
+#endif
+
+/**************************************************************************
+ Wrappers for extended attribute calls. Based on the Linux package with
+ support for IRIX and (Net|Free)BSD also. Expand as other systems have them.
+****************************************************************************/
+
+ssize_t rep_getxattr (const char *path, const char *name, void *value, size_t size)
+{
+#if defined(HAVE_GETXATTR)
+#ifndef XATTR_ADDITIONAL_OPTIONS
+       return getxattr(path, name, value, size);
+#else
+
+/* So that we do not recursively call this function */
+#undef getxattr
+       int options = 0;
+       return getxattr(path, name, value, size, 0, options);
+#endif
+#elif defined(HAVE_GETEA)
+       return getea(path, name, value, size);
+#elif defined(HAVE_EXTATTR_GET_FILE)
+       char *s;
+       ssize_t retval;
+       int attrnamespace = (strncmp(name, "system", 6) == 0) ? 
+               EXTATTR_NAMESPACE_SYSTEM : EXTATTR_NAMESPACE_USER;
+       const char *attrname = ((s=strchr(name, '.')) == NULL) ? name : s + 1;
+       /*
+        * The BSD implementation has a nasty habit of silently truncating
+        * the returned value to the size of the buffer, so we have to check
+        * that the buffer is large enough to fit the returned value.
+        */
+       if((retval=extattr_get_file(path, attrnamespace, attrname, NULL, 0)) >= 0) {
+               if (size == 0) {
+                       return retval;
+               } else if (retval > size) {
+                       errno = ERANGE;
+                       return -1;
+               }
+               if((retval=extattr_get_file(path, attrnamespace, attrname, value, size)) >= 0)
+                       return retval;
+       }
+
+       return -1;
+#elif defined(HAVE_ATTR_GET)
+       int retval, flags = 0;
+       int valuelength = (int)size;
+       char *attrname = strchr(name,'.') + 1;
+
+       if (strncmp(name, "system", 6) == 0) flags |= ATTR_ROOT;
+
+       retval = attr_get(path, attrname, (char *)value, &valuelength, flags);
+       if (size == 0 && retval == -1 && errno == E2BIG) {
+               return valuelength;
+       }
+
+       return retval ? retval : valuelength;
+#elif defined(HAVE_ATTROPEN)
+       ssize_t ret = -1;
+       int attrfd = solaris_attropen(path, name, O_RDONLY, 0);
+       if (attrfd >= 0) {
+               ret = solaris_read_xattr(attrfd, value, size);
+               close(attrfd);
+       }
+       return ret;
+#else
+       errno = ENOSYS;
+       return -1;
+#endif
+}
+
+ssize_t rep_fgetxattr (int filedes, const char *name, void *value, size_t size)
+{
+#if defined(HAVE_FGETXATTR)
+#ifndef XATTR_ADDITIONAL_OPTIONS
+       return fgetxattr(filedes, name, value, size);
+#else
+
+/* So that we do not recursively call this function */
+#undef fgetxattr
+       int options = 0;
+       return fgetxattr(filedes, name, value, size, 0, options);
+#endif
+#elif defined(HAVE_FGETEA)
+       return fgetea(filedes, name, value, size);
+#elif defined(HAVE_EXTATTR_GET_FD)
+       char *s;
+       ssize_t retval;
+       int attrnamespace = (strncmp(name, "system", 6) == 0) ? 
+               EXTATTR_NAMESPACE_SYSTEM : EXTATTR_NAMESPACE_USER;
+       const char *attrname = ((s=strchr(name, '.')) == NULL) ? name : s + 1;
+
+       if((retval=extattr_get_fd(filedes, attrnamespace, attrname, NULL, 0)) >= 0) {
+               if (size == 0) {
+                       return retval;
+               } else if (retval > size) {
+                       errno = ERANGE;
+                       return -1;
+               }
+               if((retval=extattr_get_fd(filedes, attrnamespace, attrname, value, size)) >= 0)
+                       return retval;
+       }
+
+       return -1;
+#elif defined(HAVE_ATTR_GETF)
+       int retval, flags = 0;
+       int valuelength = (int)size;
+       char *attrname = strchr(name,'.') + 1;
+
+       if (strncmp(name, "system", 6) == 0) flags |= ATTR_ROOT;
+
+       retval = attr_getf(filedes, attrname, (char *)value, &valuelength, flags);
+       if (size == 0 && retval == -1 && errno == E2BIG) {
+               return valuelength;
+       }
+       return retval ? retval : valuelength;
+#elif defined(HAVE_ATTROPEN)
+       ssize_t ret = -1;
+       int attrfd = solaris_openat(filedes, name, O_RDONLY|O_XATTR, 0);
+       if (attrfd >= 0) {
+               ret = solaris_read_xattr(attrfd, value, size);
+               close(attrfd);
+       }
+       return ret;
+#else
+       errno = ENOSYS;
+       return -1;
+#endif
+}
+
+#if defined(HAVE_EXTATTR_LIST_FILE)
+
+#define EXTATTR_PREFIX(s)      (s), (sizeof((s))-1)
+
+static struct {
+        int space;
+       const char *name;
+       size_t len;
+} 
+extattr[] = {
+       { EXTATTR_NAMESPACE_SYSTEM, EXTATTR_PREFIX("system.") },
+        { EXTATTR_NAMESPACE_USER, EXTATTR_PREFIX("user.") },
+};
+
+typedef union {
+       const char *path;
+       int filedes;
+} extattr_arg;
+
+/*
+ * Emulate a Linux-style listxattr() on top of the BSD extattr_list_*() calls.
+ *
+ * type selects the underlying call: 0 = extattr_list_file (arg.path),
+ * 1 = extattr_list_link (arg.path), 2 = extattr_list_fd (arg.filedes).
+ * A type whose call was not compiled in fails with ENOSYS.
+ *
+ * Every namespace in extattr[] is listed in turn. The native result is a
+ * sequence of <length byte><name> records without terminators; it is
+ * rewritten in place into NUL-terminated "<namespace prefix><name>" strings.
+ *
+ * Returns the number of bytes stored in list, or -1 with errno set
+ * (ERANGE when list is too small). When list is NULL nothing is stored
+ * and an upper-bound estimate of the required size is returned instead.
+ */
+static ssize_t bsd_attr_list (int type, extattr_arg arg, char *list, size_t size)
+{
+       ssize_t list_size, total_size = 0;
+       size_t used;
+       int i, t, len;
+       char *buf;
+       /* Iterate through extattr(2) namespaces */
+       for(t = 0; t < ARRAY_SIZE(extattr); t++) {
+               switch(type) {
+#if defined(HAVE_EXTATTR_LIST_FILE)
+                       case 0:
+                               list_size = extattr_list_file(arg.path, extattr[t].space, list, size);
+                               break;
+#endif
+#if defined(HAVE_EXTATTR_LIST_LINK)
+                       case 1:
+                               list_size = extattr_list_link(arg.path, extattr[t].space, list, size);
+                               break;
+#endif
+#if defined(HAVE_EXTATTR_LIST_FD)
+                       case 2:
+                               list_size = extattr_list_fd(arg.filedes, extattr[t].space, list, size);
+                               break;
+#endif
+                       default:
+                               errno = ENOSYS;
+                               return -1;
+               }
+               /* Some error happened. Errno should be set by the previous call */
+               if(list_size < 0)
+                       return -1;
+               /* No attributes */
+               if(list_size == 0)
+                       continue;
+               /* XXX: Call with an empty buffer may be used to calculate
+                  necessary buffer size. Unfortunately, we can't say, how
+                  many attributes were returned, so here is the potential
+                  problem with the emulation.
+               */
+               if(list == NULL) {
+                       /* Take the worse case of one char attribute names - 
+                          two bytes per name plus one more for sanity.
+                       */
+                       total_size += list_size + (list_size/2 + 1)*extattr[t].len;
+                       continue;
+               }
+               /* Count necessary offset to fit namespace prefixes.
+                  The length byte is unsigned: read it as such so names
+                  longer than 127 bytes do not yield a negative stride. */
+               len = 0;
+               for(i = 0; i < list_size; i += (unsigned char)list[i] + 1)
+                       len += extattr[t].len;
+
+               /* Bytes this namespace occupies once prefixes are added */
+               used = (size_t)list_size + len;
+               /* Remaining buffer is too small to fit the results.
+                  'size' is what is left after earlier namespaces, so it
+                  must be compared with this namespace's share only, not
+                  with the running total. */
+               if(used > size) {
+                       errno = ERANGE;
+                       return -1;
+               }
+               /* Shift results back, so we can prepend prefixes */
+               buf = (char *)memmove(list + len, list, list_size);
+
+               for(i = 0; i < list_size; i += len + 1) {
+                       len = (unsigned char)buf[i];
+                       /* Source and destination share one buffer and may
+                          overlap, hence memmove() rather than strncpy() */
+                       memmove(list, extattr[t].name, extattr[t].len);
+                       list += extattr[t].len;
+                       memmove(list, buf + i + 1, len);
+                       list[len] = '\0';
+                       list += len + 1;
+               }
+               total_size += used;
+               /* 'list' advanced by exactly 'used' bytes above */
+               size -= used;
+       }
+       return total_size;
+}
+
+#endif
+
+#if defined(HAVE_ATTR_LIST) && (defined(HAVE_SYS_ATTRIBUTES_H) || defined(HAVE_ATTR_ATTRIBUTES_H))
+static char attr_buffer[ATTR_MAX_VALUELEN];
+
+static ssize_t irix_attr_list(const char *path, int filedes, char *list, size_t size, int flags)
+{
+       int retval = 0, index;
+       attrlist_cursor_t *cursor = 0;
+       int total_size = 0;
+       attrlist_t * al = (attrlist_t *)attr_buffer;
+       attrlist_ent_t *ae;
+       size_t ent_size, left = size;
+       char *bp = list;
+
+       while (true) {
+           if (filedes)
+               retval = attr_listf(filedes, attr_buffer, ATTR_MAX_VALUELEN, flags, cursor);
+           else
+               retval = attr_list(path, attr_buffer, ATTR_MAX_VALUELEN, flags, cursor);
+           if (retval) break;
+           for (index = 0; index < al->al_count; index++) {
+               ae = ATTR_ENTRY(attr_buffer, index);
+               ent_size = strlen(ae->a_name) + sizeof("user.");
+               if (left >= ent_size) {
+                   strncpy(bp, "user.", sizeof("user."));
+                   strncat(bp, ae->a_name, ent_size - sizeof("user."));
+                   bp += ent_size;
+                   left -= ent_size;
+               } else if (size) {
+                   errno = ERANGE;
+                   retval = -1;
+                   break;
+               }
+               total_size += ent_size;
+           }
+           if (al->al_more == 0) break;
+       }
+       if (retval == 0) {
+           flags |= ATTR_ROOT;
+           cursor = 0;
+           while (true) {
+               if (filedes)
+                   retval = attr_listf(filedes, attr_buffer, ATTR_MAX_VALUELEN, flags, cursor);
+               else
+                   retval = attr_list(path, attr_buffer, ATTR_MAX_VALUELEN, flags, cursor);
+               if (retval) break;
+               for (index = 0; index < al->al_count; index++) {
+                   ae = ATTR_ENTRY(attr_buffer, index);
+                   ent_size = strlen(ae->a_name) + sizeof("system.");
+                   if (left >= ent_size) {
+                       strncpy(bp, "system.", sizeof("system."));
+                       strncat(bp, ae->a_name, ent_size - sizeof("system."));
+                       bp += ent_size;
+                       left -= ent_size;
+                   } else if (size) {
+                       errno = ERANGE;
+                       retval = -1;
+                       break;
+                   }
+                   total_size += ent_size;
+               }
+               if (al->al_more == 0) break;
+           }
+       }
+       return (ssize_t)(retval ? retval : total_size);
+}
+
+#endif
+
+ssize_t rep_listxattr (const char *path, char *list, size_t size)
+{
+#if defined(HAVE_LISTXATTR)
+#ifndef XATTR_ADDITIONAL_OPTIONS
+       return listxattr(path, list, size);
+#else
+/* So that we do not recursively call this function */
+#undef listxattr
+       int options = 0;
+       return listxattr(path, list, size, options);
+#endif
+#elif defined(HAVE_LISTEA)
+       return listea(path, list, size);
+#elif defined(HAVE_EXTATTR_LIST_FILE)
+       extattr_arg arg;
+       arg.path = path;
+       return bsd_attr_list(0, arg, list, size);
+#elif defined(HAVE_ATTR_LIST) && defined(HAVE_SYS_ATTRIBUTES_H)
+       return irix_attr_list(path, 0, list, size, 0);
+#elif defined(HAVE_ATTROPEN)
+       ssize_t ret = -1;
+       int attrdirfd = solaris_attropen(path, ".", O_RDONLY, 0);
+       if (attrdirfd >= 0) {
+               ret = solaris_list_xattr(attrdirfd, list, size);
+               close(attrdirfd);
+       }
+       return ret;
+#else
+       errno = ENOSYS;
+       return -1;
+#endif
+}
+
+ssize_t rep_flistxattr (int filedes, char *list, size_t size)
+{
+#if defined(HAVE_FLISTXATTR)
+#ifndef XATTR_ADDITIONAL_OPTIONS
+       return flistxattr(filedes, list, size);
+#else
+/* So that we do not recursively call this function */
+#undef flistxattr
+       int options = 0;
+       return flistxattr(filedes, list, size, options);
+#endif
+#elif defined(HAVE_FLISTEA)
+       return flistea(filedes, list, size);
+#elif defined(HAVE_EXTATTR_LIST_FD)
+       extattr_arg arg;
+       arg.filedes = filedes;
+       return bsd_attr_list(2, arg, list, size);
+#elif defined(HAVE_ATTR_LISTF)
+       return irix_attr_list(NULL, filedes, list, size, 0);
+#elif defined(HAVE_ATTROPEN)
+       ssize_t ret = -1;
+       int attrdirfd = solaris_openat(filedes, ".", O_RDONLY|O_XATTR, 0);
+       if (attrdirfd >= 0) {
+               ret = solaris_list_xattr(attrdirfd, list, size);
+               close(attrdirfd);
+       }
+       return ret;
+#else
+       errno = ENOSYS;
+       return -1;
+#endif
+}
+
+int rep_removexattr (const char *path, const char *name)
+{
+#if defined(HAVE_REMOVEXATTR)
+#ifndef XATTR_ADDITIONAL_OPTIONS
+       return removexattr(path, name);
+#else
+/* So that we do not recursively call this function */
+#undef removexattr
+       int options = 0;
+       return removexattr(path, name, options);
+#endif
+#elif defined(HAVE_REMOVEEA)
+       return removeea(path, name);
+#elif defined(HAVE_EXTATTR_DELETE_FILE)
+       char *s;
+       int attrnamespace = (strncmp(name, "system", 6) == 0) ? 
+               EXTATTR_NAMESPACE_SYSTEM : EXTATTR_NAMESPACE_USER;
+       const char *attrname = ((s=strchr(name, '.')) == NULL) ? name : s + 1;
+
+       return extattr_delete_file(path, attrnamespace, attrname);
+#elif defined(HAVE_ATTR_REMOVE)
+       int flags = 0;
+       char *attrname = strchr(name,'.') + 1;
+
+       if (strncmp(name, "system", 6) == 0) flags |= ATTR_ROOT;
+
+       return attr_remove(path, attrname, flags);
+#elif defined(HAVE_ATTROPEN)
+       int ret = -1;
+       int attrdirfd = solaris_attropen(path, ".", O_RDONLY, 0);
+       if (attrdirfd >= 0) {
+               ret = solaris_unlinkat(attrdirfd, name);
+               close(attrdirfd);
+       }
+       return ret;
+#else
+       errno = ENOSYS;
+       return -1;
+#endif
+}
+
+int rep_fremovexattr (int filedes, const char *name)
+{
+#if defined(HAVE_FREMOVEXATTR)
+#ifndef XATTR_ADDITIONAL_OPTIONS
+       return fremovexattr(filedes, name);
+#else
+/* So that we do not recursively call this function */
+#undef fremovexattr
+       int options = 0;
+       return fremovexattr(filedes, name, options);
+#endif
+#elif defined(HAVE_FREMOVEEA)
+       return fremoveea(filedes, name);
+#elif defined(HAVE_EXTATTR_DELETE_FD)
+       char *s;
+       int attrnamespace = (strncmp(name, "system", 6) == 0) ? 
+               EXTATTR_NAMESPACE_SYSTEM : EXTATTR_NAMESPACE_USER;
+       const char *attrname = ((s=strchr(name, '.')) == NULL) ? name : s + 1;
+
+       return extattr_delete_fd(filedes, attrnamespace, attrname);
+#elif defined(HAVE_ATTR_REMOVEF)
+       int flags = 0;
+       char *attrname = strchr(name,'.') + 1;
+
+       if (strncmp(name, "system", 6) == 0) flags |= ATTR_ROOT;
+
+       return attr_removef(filedes, attrname, flags);
+#elif defined(HAVE_ATTROPEN)
+       int ret = -1;
+       int attrdirfd = solaris_openat(filedes, ".", O_RDONLY|O_XATTR, 0);
+       if (attrdirfd >= 0) {
+               ret = solaris_unlinkat(attrdirfd, name);
+               close(attrdirfd);
+       }
+       return ret;
+#else
+       errno = ENOSYS;
+       return -1;
+#endif
+}
+
+int rep_setxattr (const char *path, const char *name, const void *value, size_t size, int flags)
+{
+#if defined(HAVE_SETXATTR)
+#ifndef XATTR_ADDITIONAL_OPTIONS
+       return setxattr(path, name, value, size, flags);
+#else
+/* So that we do not recursively call this function */
+#undef setxattr
+       int options = 0;
+       return setxattr(path, name, value, size, 0, options);
+#endif
+#elif defined(HAVE_SETEA)
+       return setea(path, name, value, size, flags);
+#elif defined(HAVE_EXTATTR_SET_FILE)
+       char *s;
+       int retval = 0;
+       int attrnamespace = (strncmp(name, "system", 6) == 0) ? 
+               EXTATTR_NAMESPACE_SYSTEM : EXTATTR_NAMESPACE_USER;
+       const char *attrname = ((s=strchr(name, '.')) == NULL) ? name : s + 1;
+       if (flags) {
+               /* Check attribute existence */
+               retval = extattr_get_file(path, attrnamespace, attrname, NULL, 0);
+               if (retval < 0) {
+                       /* REPLACE attribute, that doesn't exist */
+                       if (flags & XATTR_REPLACE && errno == ENOATTR) {
+                               errno = ENOATTR;
+                               return -1;
+                       }
+                       /* Ignore other errors */
+               }
+               else {
+                       /* CREATE attribute, that already exists */
+                       if (flags & XATTR_CREATE) {
+                               errno = EEXIST;
+                               return -1;
+                       }
+               }
+       }
+       retval = extattr_set_file(path, attrnamespace, attrname, value, size);
+       return (retval < 0) ? -1 : 0;
+#elif defined(HAVE_ATTR_SET)
+       int myflags = 0;
+       char *attrname = strchr(name,'.') + 1;
+
+       if (strncmp(name, "system", 6) == 0) myflags |= ATTR_ROOT;
+       if (flags & XATTR_CREATE) myflags |= ATTR_CREATE;
+       if (flags & XATTR_REPLACE) myflags |= ATTR_REPLACE;
+
+       return attr_set(path, attrname, (const char *)value, size, myflags);
+#elif defined(HAVE_ATTROPEN)
+       int ret = -1;
+       int myflags = O_RDWR;
+       int attrfd;
+       if (flags & XATTR_CREATE) myflags |= O_EXCL;
+       if (!(flags & XATTR_REPLACE)) myflags |= O_CREAT;
+       attrfd = solaris_attropen(path, name, myflags, (mode_t) SOLARIS_ATTRMODE);
+       if (attrfd >= 0) {
+               ret = solaris_write_xattr(attrfd, value, size);
+               close(attrfd);
+       }
+       return ret;
+#else
+       errno = ENOSYS;
+       return -1;
+#endif
+}
+
+int rep_fsetxattr (int filedes, const char *name, const void *value, size_t size, int flags)
+{
+#if defined(HAVE_FSETXATTR)
+#ifndef XATTR_ADDITIONAL_OPTIONS
+       return fsetxattr(filedes, name, value, size, flags);
+#else
+/* So that we do not recursively call this function */
+#undef fsetxattr
+       int options = 0;
+       return fsetxattr(filedes, name, value, size, 0, options);
+#endif
+#elif defined(HAVE_FSETEA)
+       return fsetea(filedes, name, value, size, flags);
+#elif defined(HAVE_EXTATTR_SET_FD)
+       char *s;
+       int retval = 0;
+       int attrnamespace = (strncmp(name, "system", 6) == 0) ? 
+               EXTATTR_NAMESPACE_SYSTEM : EXTATTR_NAMESPACE_USER;
+       const char *attrname = ((s=strchr(name, '.')) == NULL) ? name : s + 1;
+       if (flags) {
+               /* Check attribute existence */
+               retval = extattr_get_fd(filedes, attrnamespace, attrname, NULL, 0);
+               if (retval < 0) {
+                       /* REPLACE attribute, that doesn't exist */
+                       if (flags & XATTR_REPLACE && errno == ENOATTR) {
+                               errno = ENOATTR;
+                               return -1;
+                       }
+                       /* Ignore other errors */
+               }
+               else {
+                       /* CREATE attribute, that already exists */
+                       if (flags & XATTR_CREATE) {
+                               errno = EEXIST;
+                               return -1;
+                       }
+               }
+       }
+       retval = extattr_set_fd(filedes, attrnamespace, attrname, value, size);
+       return (retval < 0) ? -1 : 0;
+#elif defined(HAVE_ATTR_SETF)
+       int myflags = 0;
+       char *attrname = strchr(name,'.') + 1;
+
+       if (strncmp(name, "system", 6) == 0) myflags |= ATTR_ROOT;
+       if (flags & XATTR_CREATE) myflags |= ATTR_CREATE;
+       if (flags & XATTR_REPLACE) myflags |= ATTR_REPLACE;
+
+       return attr_setf(filedes, attrname, (const char *)value, size, myflags);
+#elif defined(HAVE_ATTROPEN)
+       int ret = -1;
+       int myflags = O_RDWR | O_XATTR;
+       int attrfd;
+       if (flags & XATTR_CREATE) myflags |= O_EXCL;
+       if (!(flags & XATTR_REPLACE)) myflags |= O_CREAT;
+       attrfd = solaris_openat(filedes, name, myflags, (mode_t) SOLARIS_ATTRMODE);
+       if (attrfd >= 0) {
+               ret = solaris_write_xattr(attrfd, value, size);
+               close(attrfd);
+       }
+       return ret;
+#else
+       errno = ENOSYS;
+       return -1;
+#endif
+}
+
+/**************************************************************************
+ helper functions for Solaris' EA support
+****************************************************************************/
+#ifdef HAVE_ATTROPEN
+static ssize_t solaris_read_xattr(int attrfd, void *value, size_t size)
+{
+       struct stat sbuf;
+
+       if (fstat(attrfd, &sbuf) == -1) {
+               errno = ENOATTR;
+               return -1;
+       }
+
+       /* This is to return the current size of the named extended attribute */
+       if (size == 0) {
+               return sbuf.st_size;
+       }
+
+       /* check size and read xattr */
+       if (sbuf.st_size > size) {
+               errno = ERANGE;
+               return -1;
+       }
+
+       return read(attrfd, value, sbuf.st_size);
+}
+
+/*
+ * Build a listxattr()-style name list from an attribute directory.
+ *
+ * attrdirfd is an open descriptor for the attribute directory; it is
+ * dup()ed internally and left open for the caller to close.
+ * Every directory entry except "." and ".." is reported as one
+ * NUL-terminated name.
+ *
+ * With size == 0 nothing is written and the number of bytes needed is
+ * returned. Otherwise the names are copied into list and the number of
+ * bytes used is returned; -1/ERANGE if they do not fit.
+ * Returns -1 with errno set if the directory cannot be opened or closed.
+ */
+static ssize_t solaris_list_xattr(int attrdirfd, char *list, size_t size)
+{
+       ssize_t len = 0;
+       DIR *dirp;
+       struct dirent *de;
+       int saved_errno;
+       int newfd;
+
+       /* CAUTION: The originating file descriptor should not be
+                   used again following the call to fdopendir().
+                   For that reason we dup() the file descriptor
+                   here to make things more clear. */
+       newfd = dup(attrdirfd);
+       if (newfd == -1) {
+               return -1;
+       }
+       dirp = fdopendir(newfd);
+       if (dirp == NULL) {
+               /* fdopendir() did not take ownership: do not leak newfd */
+               saved_errno = errno;
+               close(newfd);
+               errno = saved_errno;
+               return -1;
+       }
+
+       while ((de = readdir(dirp))) {
+               size_t listlen = strlen(de->d_name) + 1;
+               if (!strcmp(de->d_name, ".") || !strcmp(de->d_name, "..")) {
+                       /* we don't want "." and ".." here: */
+                       continue;
+               }
+
+               if (size == 0) {
+                       /* return the current size of the list of extended attribute names*/
+                       len += listlen;
+               } else {
+                       /* check size and copy entries + nul into list. */
+                       if ((len + listlen) > size) {
+                               errno = ERANGE;
+                               len = -1;
+                               break;
+                       } else {
+                               strlcpy(list + len, de->d_name, listlen);
+                               len += listlen;
+                       }
+               }
+       }
+
+       if (len == -1) {
+               /* keep ERANGE for the caller: closedir() may overwrite errno */
+               saved_errno = errno;
+               closedir(dirp);
+               errno = saved_errno;
+               return -1;
+       }
+
+       if (closedir(dirp) == -1) {
+               return -1;
+       }
+       return len;
+}
+
+static int solaris_unlinkat(int attrdirfd, const char *name)
+{
+       if (unlinkat(attrdirfd, name, 0) == -1) {
+               if (errno == ENOENT) {
+                       errno = ENOATTR;
+               }
+               return -1;
+       }
+       return 0;
+}
+
+static int solaris_attropen(const char *path, const char *attrpath, int oflag, mode_t mode)
+{
+       int filedes = attropen(path, attrpath, oflag, mode);
+       if (filedes == -1) {
+               if (errno == EINVAL) {
+                       errno = ENOTSUP;
+               } else {
+                       errno = ENOATTR;
+               }
+       }
+       return filedes;
+}
+
+static int solaris_openat(int fildes, const char *path, int oflag, mode_t mode)
+{
+       int filedes = openat(fildes, path, oflag, mode);
+       if (filedes == -1) {
+               if (errno == EINVAL) {
+                       errno = ENOTSUP;
+               } else {
+                       errno = ENOATTR;
+               }
+       }
+       return filedes;
+}
+
+static int solaris_write_xattr(int attrfd, const char *value, size_t size)
+{
+       if ((ftruncate(attrfd, 0) == 0) && (write(attrfd, value, size) == size)) {
+               return 0;
+       } else {
+               return -1;
+       }
+}
+#endif /*HAVE_ATTROPEN*/
+
+
diff --git a/ctdb/lib/socket_wrapper/config.m4 b/ctdb/lib/socket_wrapper/config.m4
new file mode 100644 (file)
index 0000000..6db392f
--- /dev/null
@@ -0,0 +1,21 @@
+AC_ARG_ENABLE(socket-wrapper, 
+AS_HELP_STRING([--enable-socket-wrapper], [Turn on socket wrapper library (default=no)]))
+
+DEFAULT_TEST_OPTIONS=
+HAVE_SOCKET_WRAPPER=no
+
+if eval "test x$developer = xyes"; then
+       enable_socket_wrapper=yes
+fi
+    
+if eval "test x$enable_socket_wrapper = xyes"; then
+        AC_DEFINE(SOCKET_WRAPPER,1,[Use socket wrapper library])
+       DEFAULT_TEST_OPTIONS=--socket-wrapper
+       HAVE_SOCKET_WRAPPER=yes
+
+       SOCKET_WRAPPER_OBJS="lib/socket_wrapper/socket_wrapper.o"
+fi
+
+AC_SUBST(DEFAULT_TEST_OPTIONS)
+AC_SUBST(HAVE_SOCKET_WRAPPER)
+AC_SUBST(SOCKET_WRAPPER_OBJS)
diff --git a/ctdb/lib/socket_wrapper/socket_wrapper.c b/ctdb/lib/socket_wrapper/socket_wrapper.c
new file mode 100644 (file)
index 0000000..2c24ab7
--- /dev/null
@@ -0,0 +1,2654 @@
+/*
+ * Copyright (C) Jelmer Vernooij 2005,2008 <jelmer@samba.org>
+ * Copyright (C) Stefan Metzmacher 2006-2009 <metze@samba.org>
+ *
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 
+ * 3. Neither the name of the author nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+/*
+   Socket wrapper library. Passes all socket communication over
+   unix domain sockets if the environment variable SOCKET_WRAPPER_DIR
+   is set.
+*/
+
+#include "config.h"
+
+#ifdef HAVE_LIBREPLACE
+
+#define SOCKET_WRAPPER_NOT_REPLACE
+#include "replace.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/time.h"
+
+#else /* HAVE_LIBREPLACE */
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/filio.h>
+#include <errno.h>
+#include <sys/un.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdint.h>
+
+#endif /* HAVE_LIBREPLACE */
+
+#ifndef _PUBLIC_
+#define _PUBLIC_
+#endif
+
+/*
+ * Insert item at the front of a doubly linked list.  list is the head
+ * pointer (an lvalue); after the macro it points at item, and the old
+ * head, if any, becomes item->next.
+ */
+#define SWRAP_DLIST_ADD(list,item) do { \
+       if (!(list)) { \
+               (item)->prev    = NULL; \
+               (item)->next    = NULL; \
+               (list)          = (item); \
+       } else { \
+               (item)->prev    = NULL; \
+               (item)->next    = (list); \
+               (list)->prev    = (item); \
+               (list)          = (item); \
+       } \
+} while (0)
+
+/*
+ * Unlink item from a doubly linked list.  If item is the head, list is
+ * advanced to item->next; otherwise the neighbours of item are linked
+ * to each other.  In both cases item->prev and item->next end up NULL.
+ */
+#define SWRAP_DLIST_REMOVE(list,item) do { \
+       if ((list) == (item)) { \
+               (list)          = (item)->next; \
+               if (list) { \
+                       (list)->prev    = NULL; \
+               } \
+       } else { \
+               if ((item)->prev) { \
+                       (item)->prev->next      = (item)->next; \
+               } \
+               if ((item)->next) { \
+                       (item)->next->prev      = (item)->prev; \
+               } \
+       } \
+       (item)->prev    = NULL; \
+       (item)->next    = NULL; \
+} while (0)
+
+/* LD_PRELOAD doesn't work yet, so REWRITE_CALLS is all we support
+ * for now */
+#define REWRITE_CALLS 
+
+#ifdef REWRITE_CALLS
+/*
+ * Each real_xxx name expands directly to xxx, so code written against
+ * the real_ prefix calls the function of the same name without the
+ * prefix.
+ */
+#define real_accept accept
+#define real_connect connect
+#define real_bind bind
+#define real_listen listen
+#define real_getpeername getpeername
+#define real_getsockname getsockname
+#define real_getsockopt getsockopt
+#define real_setsockopt setsockopt
+#define real_recvfrom recvfrom
+#define real_sendto sendto
+#define real_sendmsg sendmsg
+#define real_ioctl ioctl
+#define real_recv recv
+#define real_read read
+#define real_send send
+#define real_readv readv
+#define real_writev writev
+#define real_socket socket
+#define real_close close
+#define real_dup dup
+#define real_dup2 dup2
+#endif
+
+/*
+ * gettimeofday() wrapper: when HAVE_GETTIMEOFDAY_TZ is defined the
+ * two-argument form is used with a NULL second argument, otherwise the
+ * one-argument form.
+ */
+#ifdef HAVE_GETTIMEOFDAY_TZ
+#define swrapGetTimeOfDay(tval) gettimeofday(tval,NULL)
+#else
+#define swrapGetTimeOfDay(tval)        gettimeofday(tval)
+#endif
+
+/* we need to use a very terse format here as IRIX 6.4 silently
+   truncates names to 16 chars, so if we use a longer name then we
+   can't tell which port a packet came from with recvfrom() 
+   
+   with this format we have 8 chars left for the directory name
+*/
+/*
+ * File name layout: one type character, the interface number as 2 hex
+ * digits and the port as 4 hex digits.  The type character is one of
+ * the SOCKET_TYPE_CHAR_* values below.
+ */
+#define SOCKET_FORMAT "%c%02X%04X"
+#define SOCKET_TYPE_CHAR_TCP           'T'
+#define SOCKET_TYPE_CHAR_UDP           'U'
+#define SOCKET_TYPE_CHAR_TCP_V6                'X'
+#define SOCKET_TYPE_CHAR_UDP_V6                'Y'
+
+/* This limit is to avoid broadcast sendto() needing to stat too many
+ * files.  It may be raised (with a performance cost) to up to 254
+ * without changing the format above */
+#define MAX_WRAPPED_INTERFACES 32
+
+#ifdef HAVE_IPV6
+/*
+ * FD00::5357:5FXX
+ */
+/*
+ * Return a pointer to a static in6_addr holding FD00::5357:5F00.
+ * The address is parsed on the first call only; abort() is called if
+ * inet_pton() does not accept it.
+ */
+static const struct in6_addr *swrap_ipv6(void)
+{
+       static struct in6_addr v;
+       static int initialized;
+
+       if (!initialized) {
+               /* flag is set before parsing, exactly once */
+               initialized = 1;
+
+               if (inet_pton(AF_INET6, "FD00::5357:5F00", &v) <= 0) {
+                       abort();
+               }
+       }
+
+       return &v;
+}
+#endif
+
+/*
+ * Return a malloc()ed copy of the len bytes at data, typed as a
+ * struct sockaddr.  Returns NULL if the allocation fails.  The result
+ * comes from malloc(), so it is released with free().
+ */
+static struct sockaddr *sockaddr_dup(const void *data, socklen_t len)
+{
+       struct sockaddr *ret = (struct sockaddr *)malloc(len);
+       if (ret == NULL) {
+               /* never memcpy() into a failed allocation */
+               return NULL;
+       }
+       memcpy(ret, data, len);
+       return ret;
+}
+
+/*
+ * Store prt, converted to network byte order, in the port field of
+ * addr.  family says whether addr is a sockaddr_in (AF_INET) or, with
+ * HAVE_IPV6, a sockaddr_in6 (AF_INET6); other families leave addr
+ * untouched.
+ */
+static void set_port(int family, int prt, struct sockaddr *addr)
+{
+       if (family == AF_INET) {
+               struct sockaddr_in *in4 = (struct sockaddr_in *)addr;
+
+               in4->sin_port = htons(prt);
+               return;
+       }
+#ifdef HAVE_IPV6
+       if (family == AF_INET6) {
+               struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)addr;
+
+               in6->sin6_port = htons(prt);
+               return;
+       }
+#endif
+}
+
+/*
+ * Size of the socket address structure for family: sockaddr_in for
+ * AF_INET, sockaddr_in6 for AF_INET6 (with HAVE_IPV6), 0 for anything
+ * else.
+ */
+static size_t socket_length(int family)
+{
+       if (family == AF_INET) {
+               return sizeof(struct sockaddr_in);
+       }
+#ifdef HAVE_IPV6
+       if (family == AF_INET6) {
+               return sizeof(struct sockaddr_in6);
+       }
+#endif
+       return 0;
+}
+
+/*
+ * One file descriptor belonging to a socket_info; elements are chained
+ * through prev/next into the list hanging off socket_info.fds.
+ */
+struct socket_info_fd {
+       struct socket_info_fd *prev, *next;
+       int fd;
+};
+
+/*
+ * Bookkeeping for one wrapped socket.  Entries are chained through
+ * prev/next; find_socket_info() locates an entry by file descriptor.
+ */
+struct socket_info
+{
+       /* list of file descriptors that refer to this socket */
+       struct socket_info_fd *fds;
+
+       /* address family; the address converters handle AF_INET and AF_INET6 */
+       int family;
+       /* socket type; the address converters handle SOCK_STREAM and SOCK_DGRAM */
+       int type;
+       /*
+        * NOTE(review): the fields below are not used in this part of
+        * the file; presumably protocol plus per-socket state flags --
+        * confirm against the wrapper functions further down.
+        */
+       int protocol;
+       int bound;
+       int bcast;
+       int is_server;
+       int connected;
+       int defer_connect;
+
+       /* NOTE(review): presumably a temporary unix socket path -- confirm */
+       char *tmp_path;
+
+       /* local address; convert_in_un_alloc() stores an auto-allocated port here */
+       struct sockaddr *myname;
+       socklen_t myname_len;
+
+       /* NOTE(review): presumably the remote address -- confirm */
+       struct sockaddr *peername;
+       socklen_t peername_len;
+
+       /* NOTE(review): presumably sent/received counters -- confirm */
+       struct {
+               unsigned long pck_snd;
+               unsigned long pck_rcv;
+       } io;
+
+       struct socket_info *prev, *next;
+};
+
+/* head of the socket_info list walked by find_socket_info() */
+static struct socket_info *sockets;
+
+/*
+ * Return the value of the SOCKET_WRAPPER_DIR environment variable with
+ * one leading "./" removed, or NULL if the variable is not set.
+ */
+const char *socket_wrapper_dir(void)
+{
+       const char *dir = getenv("SOCKET_WRAPPER_DIR");
+
+       if (dir != NULL && strncmp(dir, "./", 2) == 0) {
+               return dir + 2;
+       }
+       return dir;
+}
+
+/*
+ * Return the interface number given in the SOCKET_WRAPPER_DEFAULT_IFACE
+ * environment variable if it parses as an unsigned number in the range
+ * 1..MAX_WRAPPED_INTERFACES; otherwise return 1.
+ */
+unsigned int socket_wrapper_default_iface(void)
+{
+       unsigned int iface;
+       const char *env = getenv("SOCKET_WRAPPER_DEFAULT_IFACE");
+
+       if (env == NULL) {
+               return 1;/* 127.0.0.1 */
+       }
+       if (sscanf(env, "%u", &iface) != 1) {
+               return 1;
+       }
+       if (iface < 1 || iface > MAX_WRAPPED_INTERFACES) {
+               return 1;
+       }
+       return iface;
+}
+
+/*
+ * Translate the file name part of a unix socket path (SOCKET_FORMAT:
+ * type character, interface number, port) back into an IPv4 or, with
+ * HAVE_IPV6, IPv6 socket address.
+ *
+ * On entry *len is the space available at in; on success the address
+ * is written to in, *len is set to its size and 0 is returned.  Every
+ * failure returns -1 with errno set to EINVAL.
+ */
+static int convert_un_in(const struct sockaddr_un *un, struct sockaddr *in, socklen_t *len)
+{
+       const char *name;
+       char type;
+       unsigned int iface;
+       unsigned int prt;
+
+       /* only the last path component carries the encoded address */
+       name = strrchr(un->sun_path, '/');
+       name = (name != NULL) ? name + 1 : un->sun_path;
+
+       if (sscanf(name, SOCKET_FORMAT, &type, &iface, &prt) != 3) {
+               goto invalid;
+       }
+       if (iface == 0 || iface > MAX_WRAPPED_INTERFACES || prt > 0xFFFF) {
+               goto invalid;
+       }
+
+       if (type == SOCKET_TYPE_CHAR_TCP || type == SOCKET_TYPE_CHAR_UDP) {
+               struct sockaddr_in *in4 = (struct sockaddr_in *)(void *)in;
+
+               if ((*len) < sizeof(*in4)) {
+                       goto invalid;
+               }
+
+               /* 127.0.0.<iface> */
+               memset(in4, 0, sizeof(*in4));
+               in4->sin_family = AF_INET;
+               in4->sin_addr.s_addr = htonl((127<<24) | iface);
+               in4->sin_port = htons(prt);
+
+               *len = sizeof(*in4);
+               return 0;
+       }
+#ifdef HAVE_IPV6
+       if (type == SOCKET_TYPE_CHAR_TCP_V6 || type == SOCKET_TYPE_CHAR_UDP_V6) {
+               struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)(void *)in;
+
+               if ((*len) < sizeof(*in6)) {
+                       goto invalid;
+               }
+
+               /* swrap_ipv6() prefix with the interface in the last byte */
+               memset(in6, 0, sizeof(*in6));
+               in6->sin6_family = AF_INET6;
+               in6->sin6_addr = *swrap_ipv6();
+               in6->sin6_addr.s6_addr[15] = iface;
+               in6->sin6_port = htons(prt);
+
+               *len = sizeof(*in6);
+               return 0;
+       }
+#endif
+
+invalid:
+       errno = EINVAL;
+       return -1;
+}
+
+/*
+ * Map an IPv4/IPv6 destination address to the path of the unix socket
+ * that stands in for it.
+ *
+ * si     - socket the address is used with; si->type selects the type
+ *          character of the file name
+ * inaddr - destination address; inaddr->sa_family selects the branch
+ * un     - receives the resulting path in sun_path
+ * bcast  - if not NULL, set to 0 (unicast), 1 (127.255.255.255) or
+ *          2 (255.255.255.255); the broadcast values are only produced
+ *          for SOCK_DGRAM sockets
+ *
+ * Returns 0 on success.  Returns -1 with errno = ENETUNREACH for an
+ * unsupported family or an address outside 127.0.0.X / the swrap_ipv6()
+ * prefix, and -1 with errno = EINVAL for port 0.
+ */
+static int convert_in_un_remote(struct socket_info *si, const struct sockaddr *inaddr, struct sockaddr_un *un,
+                               int *bcast)
+{
+       char type = '\0';
+       unsigned int prt;
+       unsigned int iface;
+       int is_bcast = 0;
+
+       if (bcast) *bcast = 0;
+
+       switch (inaddr->sa_family) {
+       case AF_INET: {
+               const struct sockaddr_in *in = 
+                   (const struct sockaddr_in *)(const void *)inaddr;
+               unsigned int addr = ntohl(in->sin_addr.s_addr);
+               /* type characters for unicast (u), 127.255.255.255 (b) and 255.255.255.255 (a) */
+               char u_type = '\0';
+               char b_type = '\0';
+               char a_type = '\0';
+
+               /* a_type/b_type stay '\0' for SOCK_STREAM, which disables the broadcast branches */
+               switch (si->type) {
+               case SOCK_STREAM:
+                       u_type = SOCKET_TYPE_CHAR_TCP;
+                       break;
+               case SOCK_DGRAM:
+                       u_type = SOCKET_TYPE_CHAR_UDP;
+                       a_type = SOCKET_TYPE_CHAR_UDP;
+                       b_type = SOCKET_TYPE_CHAR_UDP;
+                       break;
+               }
+
+               prt = ntohs(in->sin_port);
+               if (a_type && addr == 0xFFFFFFFF) {
+                       /* 255.255.255.255 only udp */
+                       is_bcast = 2;
+                       type = a_type;
+                       iface = socket_wrapper_default_iface();
+               } else if (b_type && addr == 0x7FFFFFFF) {
+                       /* 127.255.255.255 only udp */
+                       is_bcast = 1;
+                       type = b_type;
+                       iface = socket_wrapper_default_iface();
+               } else if ((addr & 0xFFFFFF00) == 0x7F000000) {
+                       /* 127.0.0.X */
+                       is_bcast = 0;
+                       type = u_type;
+                       iface = (addr & 0x000000FF);
+               } else {
+                       errno = ENETUNREACH;
+                       return -1;
+               }
+               if (bcast) *bcast = is_bcast;
+               break;
+       }
+#ifdef HAVE_IPV6
+       case AF_INET6: {
+               const struct sockaddr_in6 *in = 
+                   (const struct sockaddr_in6 *)(const void *)inaddr;
+               struct in6_addr cmp1, cmp2;
+
+               switch (si->type) {
+               case SOCK_STREAM:
+                       type = SOCKET_TYPE_CHAR_TCP_V6;
+                       break;
+               case SOCK_DGRAM:
+                       type = SOCKET_TYPE_CHAR_UDP_V6;
+                       break;
+               }
+
+               /* XXX no multicast/broadcast */
+
+               prt = ntohs(in->sin6_port);
+
+               /* compare with the last byte cleared: that byte is the interface number */
+               cmp1 = *swrap_ipv6();
+               cmp2 = in->sin6_addr;
+               cmp2.s6_addr[15] = 0;
+               if (IN6_ARE_ADDR_EQUAL(&cmp1, &cmp2)) {
+                       iface = in->sin6_addr.s6_addr[15];
+               } else {
+                       errno = ENETUNREACH;
+                       return -1;
+               }
+
+               break;
+       }
+#endif
+       default:
+               errno = ENETUNREACH;
+               return -1;
+       }
+
+       if (prt == 0) {
+               errno = EINVAL;
+               return -1;
+       }
+
+       if (is_bcast) {
+               snprintf(un->sun_path, sizeof(un->sun_path), "%s/EINVAL", 
+                        socket_wrapper_dir());
+               /* the caller needs to do more processing */
+               return 0;
+       }
+
+       snprintf(un->sun_path, sizeof(un->sun_path), "%s/"SOCKET_FORMAT, 
+                socket_wrapper_dir(), type, iface, prt);
+
+       return 0;
+}
+
+/*
+ * Map a local IPv4/IPv6 address to the unix socket path to use for it,
+ * choosing a port when the address has port 0.
+ *
+ * si     - socket the address belongs to; si->family selects the branch
+ *          and si->type the type character of the file name
+ * inaddr - local address
+ * un     - receives the resulting path in sun_path
+ * bcast  - if not NULL, set to 0 (unicast or wildcard),
+ *          1 (127.255.255.255) or 2 (255.255.255.255)
+ *
+ * Returns 0 on success.  Returns -1 with errno = EADDRNOTAVAIL for an
+ * unsupported si->family or an address that is not handled, EINVAL for
+ * an interface number of 0 or above MAX_WRAPPED_INTERFACES, and ENFILE
+ * when no unused port is found in 5001..9999.
+ */
+static int convert_in_un_alloc(struct socket_info *si, const struct sockaddr *inaddr, struct sockaddr_un *un,
+                              int *bcast)
+{
+       char type = '\0';
+       unsigned int prt;
+       unsigned int iface;
+       struct stat st;
+       int is_bcast = 0;
+
+       if (bcast) *bcast = 0;
+
+       switch (si->family) {
+       case AF_INET: {
+               const struct sockaddr_in *in = 
+                   (const struct sockaddr_in *)(const void *)inaddr;
+               unsigned int addr = ntohl(in->sin_addr.s_addr);
+               /* type characters for unicast (u), 0.0.0.0 (d), 127.255.255.255 (b) and 255.255.255.255 (a) */
+               char u_type = '\0';
+               char d_type = '\0';
+               char b_type = '\0';
+               char a_type = '\0';
+
+               prt = ntohs(in->sin_port);
+
+               /* a_type/b_type stay '\0' for SOCK_STREAM, which disables the broadcast branches */
+               switch (si->type) {
+               case SOCK_STREAM:
+                       u_type = SOCKET_TYPE_CHAR_TCP;
+                       d_type = SOCKET_TYPE_CHAR_TCP;
+                       break;
+               case SOCK_DGRAM:
+                       u_type = SOCKET_TYPE_CHAR_UDP;
+                       d_type = SOCKET_TYPE_CHAR_UDP;
+                       a_type = SOCKET_TYPE_CHAR_UDP;
+                       b_type = SOCKET_TYPE_CHAR_UDP;
+                       break;
+               }
+
+               if (addr == 0) {
+                       /* 0.0.0.0 */
+                       is_bcast = 0;
+                       type = d_type;
+                       iface = socket_wrapper_default_iface();
+               } else if (a_type && addr == 0xFFFFFFFF) {
+                       /* 255.255.255.255 only udp */
+                       is_bcast = 2;
+                       type = a_type;
+                       iface = socket_wrapper_default_iface();
+               } else if (b_type && addr == 0x7FFFFFFF) {
+                       /* 127.255.255.255 only udp */
+                       is_bcast = 1;
+                       type = b_type;
+                       iface = socket_wrapper_default_iface();
+               } else if ((addr & 0xFFFFFF00) == 0x7F000000) {
+                       /* 127.0.0.X */
+                       is_bcast = 0;
+                       type = u_type;
+                       iface = (addr & 0x000000FF);
+               } else {
+                       errno = EADDRNOTAVAIL;
+                       return -1;
+               }
+               break;
+       }
+#ifdef HAVE_IPV6
+       case AF_INET6: {
+               const struct sockaddr_in6 *in = 
+                   (const struct sockaddr_in6 *)(const void *)inaddr;
+               struct in6_addr cmp1, cmp2;
+
+               switch (si->type) {
+               case SOCK_STREAM:
+                       type = SOCKET_TYPE_CHAR_TCP_V6;
+                       break;
+               case SOCK_DGRAM:
+                       type = SOCKET_TYPE_CHAR_UDP_V6;
+                       break;
+               }
+
+               /* XXX no multicast/broadcast */
+
+               prt = ntohs(in->sin6_port);
+
+               /* compare with the last byte cleared: that byte is the interface number */
+               cmp1 = *swrap_ipv6();
+               cmp2 = in->sin6_addr;
+               cmp2.s6_addr[15] = 0;
+               if (IN6_IS_ADDR_UNSPECIFIED(&in->sin6_addr)) {
+                       iface = socket_wrapper_default_iface();
+               } else if (IN6_ARE_ADDR_EQUAL(&cmp1, &cmp2)) {
+                       iface = in->sin6_addr.s6_addr[15];
+               } else {
+                       errno = EADDRNOTAVAIL;
+                       return -1;
+               }
+
+               break;
+       }
+#endif
+       default:
+               errno = EADDRNOTAVAIL;
+               return -1;
+       }
+
+
+       if (bcast) *bcast = is_bcast;
+
+       if (iface == 0 || iface > MAX_WRAPPED_INTERFACES) {
+               errno = EINVAL;
+               return -1;
+       }
+
+       if (prt == 0) {
+               /* handle auto-allocation of ephemeral ports */
+               for (prt = 5001; prt < 10000; prt++) {
+                       snprintf(un->sun_path, sizeof(un->sun_path), "%s/"SOCKET_FORMAT, 
+                                socket_wrapper_dir(), type, iface, prt);
+                       /* a path that stat() can see is treated as taken */
+                       if (stat(un->sun_path, &st) == 0) continue;
+
+                       /* record the chosen port in the socket's local address */
+                       set_port(si->family, prt, si->myname);
+                       break;
+               }
+               /* the loop ran to its end without finding an unused path */
+               if (prt == 10000) {
+                       errno = ENFILE;
+                       return -1;
+               }
+       }
+
+       snprintf(un->sun_path, sizeof(un->sun_path), "%s/"SOCKET_FORMAT, 
+                socket_wrapper_dir(), type, iface, prt);
+       return 0;
+}
+
+/*
+ * Look up the socket_info that owns file descriptor fd by scanning the
+ * fds list of every entry in the global sockets list.  Returns NULL if
+ * no entry holds fd.
+ */
+static struct socket_info *find_socket_info(int fd)
+{
+       struct socket_info *si = sockets;
+
+       while (si != NULL) {
+               struct socket_info_fd *entry = si->fds;
+
+               while (entry != NULL) {
+                       if (entry->fd == fd) {
+                               return si;
+                       }
+                       entry = entry->next;
+               }
+               si = si->next;
+       }
+
+       return NULL;
+}
+
+/*
+ * Convert the inet address in_addr into the unix socket address
+ * out_addr for socket si.
+ *
+ * A NULL out_addr is a no-op that returns 0.  Otherwise out_addr is
+ * marked AF_UNIX and the path is produced by convert_in_un_alloc()
+ * (alloc_sock != 0) or convert_in_un_remote() (alloc_sock == 0), whose
+ * result is returned.  Returns -1 with errno = EAFNOSUPPORT for an
+ * address family other than AF_INET/AF_INET6 and with errno =
+ * ESOCKTNOSUPPORT for a socket type other than SOCK_STREAM/SOCK_DGRAM.
+ */
+static int sockaddr_convert_to_un(struct socket_info *si, const struct sockaddr *in_addr, socklen_t in_len,
+                                 struct sockaddr_un *out_addr, int alloc_sock, int *bcast)
+{
+       struct sockaddr *out;
+       int is_inet;
+
+       if (out_addr == NULL) {
+               return 0;
+       }
+
+       out = (struct sockaddr *)(void *)out_addr;
+       out->sa_family = AF_UNIX;
+#ifdef HAVE_STRUCT_SOCKADDR_SA_LEN
+       out->sa_len = sizeof(*out_addr);
+#endif
+
+       is_inet = (in_addr->sa_family == AF_INET);
+#ifdef HAVE_IPV6
+       is_inet = is_inet || (in_addr->sa_family == AF_INET6);
+#endif
+       if (!is_inet) {
+               errno = EAFNOSUPPORT;
+               return -1;
+       }
+
+       if (si->type != SOCK_STREAM && si->type != SOCK_DGRAM) {
+               errno = ESOCKTNOSUPPORT;
+               return -1;
+       }
+
+       if (alloc_sock) {
+               return convert_in_un_alloc(si, in_addr, out_addr, bcast);
+       }
+       return convert_in_un_remote(si, in_addr, out_addr, bcast);
+}
+
+/*
+ * Convert the unix socket address in_addr back into an inet address of
+ * the given family, written to out_addr / *out_addrlen.
+ *
+ * Returns 0 without doing anything if out_addr or out_addrlen is NULL;
+ * returns 0 with *out_addrlen = 0 if un_addrlen is 0.  Otherwise the
+ * result of convert_un_in() is returned.  Returns -1 with errno =
+ * EAFNOSUPPORT for a family other than AF_INET/AF_INET6 and with
+ * errno = ESOCKTNOSUPPORT for a socket type other than
+ * SOCK_STREAM/SOCK_DGRAM.
+ */
+static int sockaddr_convert_from_un(const struct socket_info *si,
+                                   const struct sockaddr_un *in_addr,
+                                   socklen_t un_addrlen,
+                                   int family,
+                                   struct sockaddr *out_addr,
+                                   socklen_t *out_addrlen)
+{
+       int ret;
+       int is_inet;
+
+       if (out_addr == NULL || out_addrlen == NULL) {
+               return 0;
+       }
+
+       if (un_addrlen == 0) {
+               *out_addrlen = 0;
+               return 0;
+       }
+
+       is_inet = (family == AF_INET);
+#ifdef HAVE_IPV6
+       is_inet = is_inet || (family == AF_INET6);
+#endif
+       if (!is_inet) {
+               errno = EAFNOSUPPORT;
+               return -1;
+       }
+
+       if (si->type != SOCK_STREAM && si->type != SOCK_DGRAM) {
+               errno = ESOCKTNOSUPPORT;
+               return -1;
+       }
+
+       ret = convert_un_in(in_addr, out_addr, out_addrlen);
+#ifdef HAVE_STRUCT_SOCKADDR_SA_LEN
+       out_addr->sa_len = *out_addrlen;
+#endif
+       return ret;
+}
+
+/*
+ * Kinds of socket events.
+ * NOTE(review): presumably these tell the packet dumping code which
+ * event of a wrapped call to record -- the users of this enum are
+ * outside this part of the file, confirm there.
+ */
+enum swrap_packet_type {
+       SWRAP_CONNECT_SEND,
+       SWRAP_CONNECT_UNREACH,
+       SWRAP_CONNECT_RECV,
+       SWRAP_CONNECT_ACK,
+       SWRAP_ACCEPT_SEND,
+       SWRAP_ACCEPT_RECV,
+       SWRAP_ACCEPT_ACK,
+       SWRAP_RECVFROM,
+       SWRAP_SENDTO,
+       SWRAP_SENDTO_UNREACH,
+       SWRAP_PENDING_RST,
+       SWRAP_RECV,
+       SWRAP_RECV_RST,
+       SWRAP_SEND,
+       SWRAP_SEND_RST,
+       SWRAP_CLOSE_SEND,
+       SWRAP_CLOSE_RECV,
+       SWRAP_CLOSE_ACK,
+};
+
+/*
+ * File header layout.  socket_wrapper_pcap_file() disables PCAP
+ * support unless sizeof() of this struct equals SWRAP_FILE_HDR_SIZE,
+ * i.e. unless the compiler adds no padding.
+ * NOTE(review): the field names look like the pcap global file
+ * header -- confirm against the code that writes it.
+ */
+struct swrap_file_hdr {
+       uint32_t        magic;
+       uint16_t        version_major;
+       uint16_t        version_minor;
+       int32_t         timezone;
+       uint32_t        sigfigs;
+       uint32_t        frame_max_len;
+#define SWRAP_FRAME_LENGTH_MAX 0xFFFF
+       uint32_t        link_type;
+};
+#define SWRAP_FILE_HDR_SIZE 24
+
+/*
+ * Per-packet record header at the start of the buffer built by
+ * swrap_packet_init(): the timestamp taken from the caller's struct
+ * timeval followed by the recorded and the full packet length.
+ * SWRAP_PACKET_FRAME_SIZE is the expected unpadded size.
+ */
+struct swrap_packet_frame {
+       uint32_t seconds;
+       uint32_t micro_seconds;
+       uint32_t recorded_length;
+       uint32_t full_length;
+};
+#define SWRAP_PACKET_FRAME_SIZE 16
+
+/*
+ * IP header overlay that swrap_packet_init() fills in directly after
+ * the frame header: v4 is used for AF_INET, v6 for AF_INET6.  The
+ * *_SIZE macros are the expected unpadded sizes of the members and of
+ * the whole union.
+ */
+union swrap_packet_ip {
+       struct {
+               uint8_t         ver_hdrlen;
+               uint8_t         tos;
+               uint16_t        packet_length;
+               uint16_t        identification;
+               uint8_t         flags;
+               uint8_t         fragment;
+               uint8_t         ttl;
+               uint8_t         protocol;
+               uint16_t        hdr_checksum;
+               uint32_t        src_addr;
+               uint32_t        dest_addr;
+       } v4;
+#define SWRAP_PACKET_IP_V4_SIZE 20
+       struct {
+               uint8_t         ver_prio;
+               uint8_t         flow_label_high;
+               uint16_t        flow_label_low;
+               uint16_t        payload_length;
+               uint8_t         next_header;
+               uint8_t         hop_limit;
+               uint8_t         src_addr[16];
+               uint8_t         dest_addr[16];
+       } v6;
+#define SWRAP_PACKET_IP_V6_SIZE 40
+};
+#define SWRAP_PACKET_IP_SIZE 40
+
+/*
+ * Header overlay that follows the IP header in a packet built by
+ * swrap_packet_init(): one member each for TCP, UDP, ICMPv4 and
+ * ICMPv6.  The *_SIZE macros are the expected unpadded sizes of the
+ * members and of the whole union.
+ */
+union swrap_packet_payload {
+       struct {
+               uint16_t        source_port;
+               uint16_t        dest_port;
+               uint32_t        seq_num;
+               uint32_t        ack_num;
+               uint8_t         hdr_length;
+               uint8_t         control;
+               uint16_t        window;
+               uint16_t        checksum;
+               uint16_t        urg;
+       } tcp;
+#define SWRAP_PACKET_PAYLOAD_TCP_SIZE 20
+       struct {
+               uint16_t        source_port;
+               uint16_t        dest_port;
+               uint16_t        length;
+               uint16_t        checksum;
+       } udp;
+#define SWRAP_PACKET_PAYLOAD_UDP_SIZE 8
+       struct {
+               uint8_t         type;
+               uint8_t         code;
+               uint16_t        checksum;
+               uint32_t        unused;
+       } icmp4;
+#define SWRAP_PACKET_PAYLOAD_ICMP4_SIZE 8
+       struct {
+               uint8_t         type;
+               uint8_t         code;
+               uint16_t        checksum;
+               uint32_t        unused;
+       } icmp6;
+#define SWRAP_PACKET_PAYLOAD_ICMP6_SIZE 8
+};
+#define SWRAP_PACKET_PAYLOAD_SIZE 20
+
+/*
+ * Lower bound for the buffer swrap_packet_init() allocates: the frame
+ * header plus the full size of the IP and payload unions.
+ */
+#define SWRAP_PACKET_MIN_ALLOC \
+       (SWRAP_PACKET_FRAME_SIZE + \
+        SWRAP_PACKET_IP_SIZE + \
+        SWRAP_PACKET_PAYLOAD_SIZE)
+
+/*
+ * Return the capture file name from the SOCKET_WRAPPER_PCAP_FILE
+ * environment variable with one leading "./" removed.
+ *
+ * The answer is computed on the first call and cached.  NULL is
+ * returned when the variable is not set, or when any of the packet
+ * structs does not have its expected SWRAP_*_SIZE.
+ */
+static const char *socket_wrapper_pcap_file(void)
+{
+       static int initialized = 0;
+       static const char *s = NULL;
+       /* only used as sizeof() operands */
+       static const struct swrap_file_hdr h;
+       static const struct swrap_packet_frame f;
+       static const union swrap_packet_ip i;
+       static const union swrap_packet_payload p;
+       int layout_ok;
+
+       if (initialized == 1) {
+               return s;
+       }
+       initialized = 1;
+
+       /*
+        * TODO: don't use the structs use plain buffer offsets
+        *       and PUSH_U8(), PUSH_U16() and PUSH_U32()
+        *
+        * for now make sure we disable PCAP support
+        * if the struct has alignment!
+        */
+       layout_ok =
+               sizeof(h) == SWRAP_FILE_HDR_SIZE &&
+               sizeof(f) == SWRAP_PACKET_FRAME_SIZE &&
+               sizeof(i) == SWRAP_PACKET_IP_SIZE &&
+               sizeof(i.v4) == SWRAP_PACKET_IP_V4_SIZE &&
+               sizeof(i.v6) == SWRAP_PACKET_IP_V6_SIZE &&
+               sizeof(p) == SWRAP_PACKET_PAYLOAD_SIZE &&
+               sizeof(p.tcp) == SWRAP_PACKET_PAYLOAD_TCP_SIZE &&
+               sizeof(p.udp) == SWRAP_PACKET_PAYLOAD_UDP_SIZE &&
+               sizeof(p.icmp4) == SWRAP_PACKET_PAYLOAD_ICMP4_SIZE &&
+               sizeof(p.icmp6) == SWRAP_PACKET_PAYLOAD_ICMP6_SIZE;
+       if (!layout_ok) {
+               return NULL;
+       }
+
+       s = getenv("SOCKET_WRAPPER_PCAP_FILE");
+       if (s != NULL && strncmp(s, "./", 2) == 0) {
+               s += 2;
+       }
+       return s;
+}
+
+static uint8_t *swrap_packet_init(struct timeval *tval,
+                                 const struct sockaddr *src,
+                                 const struct sockaddr *dest,
+                                 int socket_type,
+                                 const uint8_t *payload,
+                                 size_t payload_len,
+                                 unsigned long tcp_seqno,
+                                 unsigned long tcp_ack,
+                                 unsigned char tcp_ctl,
+                                 int unreachable,
+                                 size_t *_packet_len)
+{
+       uint8_t *base;
+       uint8_t *buf;
+       struct swrap_packet_frame *frame;
+       union swrap_packet_ip *ip;
+       union swrap_packet_payload *pay;
+       size_t packet_len;
+       size_t alloc_len;
+       size_t nonwire_len = sizeof(*frame);
+       size_t wire_hdr_len = 0;
+       size_t wire_len = 0;
+       size_t ip_hdr_len = 0;
+       size_t icmp_hdr_len = 0;
+       size_t icmp_truncate_len = 0;
+       uint8_t protocol = 0, icmp_protocol = 0;
+       const struct sockaddr_in *src_in = NULL;
+       const struct sockaddr_in *dest_in = NULL;
+#ifdef HAVE_IPV6
+       const struct sockaddr_in6 *src_in6 = NULL;
+       const struct sockaddr_in6 *dest_in6 = NULL;
+#endif
+       uint16_t src_port;
+       uint16_t dest_port;
+
+       switch (src->sa_family) {
+       case AF_INET:
+               src_in = (const struct sockaddr_in *)src;
+               dest_in = (const struct sockaddr_in *)dest;
+               src_port = src_in->sin_port;
+               dest_port = dest_in->sin_port;
+               ip_hdr_len = sizeof(ip->v4);
+               break;
+#ifdef HAVE_IPV6
+       case AF_INET6:
+               src_in6 = (const struct sockaddr_in6 *)src;
+               dest_in6 = (const struct sockaddr_in6 *)dest;
+               src_port = src_in6->sin6_port;
+               dest_port = dest_in6->sin6_port;
+               ip_hdr_len = sizeof(ip->v6);
+               break;
+#endif
+       default:
+               return NULL;
+       }
+
+       switch (socket_type) {
+       case SOCK_STREAM:
+               protocol = 0x06; /* TCP */
+               wire_hdr_len = ip_hdr_len + sizeof(pay->tcp);
+               wire_len = wire_hdr_len + payload_len;
+               break;
+
+       case SOCK_DGRAM:
+               protocol = 0x11; /* UDP */
+               wire_hdr_len = ip_hdr_len + sizeof(pay->udp);
+               wire_len = wire_hdr_len + payload_len;
+               break;
+
+       default:
+               return NULL;
+       }
+
+       if (unreachable) {
+               icmp_protocol = protocol;
+               switch (src->sa_family) {
+               case AF_INET:
+                       protocol = 0x01; /* ICMPv4 */
+                       icmp_hdr_len = ip_hdr_len + sizeof(pay->icmp4);
+                       break;
+#ifdef HAVE_IPV6
+               case AF_INET6:
+                       protocol = 0x3A; /* ICMPv6 */
+                       icmp_hdr_len = ip_hdr_len + sizeof(pay->icmp6);
+                       break;
+#endif
+               }
+               if (wire_len > 64 ) {
+                       icmp_truncate_len = wire_len - 64;
+               }
+               wire_hdr_len += icmp_hdr_len;
+               wire_len += icmp_hdr_len;
+       }
+
+       packet_len = nonwire_len + wire_len;
+       alloc_len = packet_len;
+       if (alloc_len < SWRAP_PACKET_MIN_ALLOC) {
+               alloc_len = SWRAP_PACKET_MIN_ALLOC;
+       }
+
+       base = (uint8_t *)malloc(alloc_len);
+       if (!base) return NULL;
+
+       buf = base;
+
+       frame = (struct swrap_packet_frame *)buf;
+       frame->seconds          = tval->tv_sec;
+       frame->micro_seconds    = tval->tv_usec;
+       frame->recorded_length  = wire_len - icmp_truncate_len;
+       frame->full_length      = wire_len - icmp_truncate_len;
+       buf += SWRAP_PACKET_FRAME_SIZE;
+
+       ip = (union swrap_packet_ip *)buf;
+       switch (src->sa_family) {
+       case AF_INET:
+               ip->v4.ver_hdrlen       = 0x45; /* version 4 and 5 * 32 bit words */
+               ip->v4.tos              = 0x00;
+               ip->v4.packet_length    = htons(wire_len - icmp_truncate_len);
+               ip->v4.identification   = htons(0xFFFF);
+               ip->v4.flags            = 0x40; /* BIT 1 set - means don't fraqment */
+               ip->v4.fragment         = htons(0x0000);
+               ip->v4.ttl              = 0xFF;
+               ip->v4.protocol         = protocol;
+               ip->v4.hdr_checksum     = htons(0x0000);
+               ip->v4.src_addr         = src_in->sin_addr.s_addr;
+               ip->v4.dest_addr        = dest_in->sin_addr.s_addr;
+               buf += SWRAP_PACKET_IP_V4_SIZE;
+               break;
+#ifdef HAVE_IPV6
+       case AF_INET6:
+               ip->v6.ver_prio         = 0x60; /* version 4 and 5 * 32 bit words */
+               ip->v6.flow_label_high  = 0x00;
+               ip->v6.flow_label_low   = 0x0000;
+               ip->v6.payload_length   = htons(wire_len - icmp_truncate_len); /* TODO */
+               ip->v6.next_header      = protocol;
+               memcpy(ip->v6.src_addr, src_in6->sin6_addr.s6_addr, 16);
+               memcpy(ip->v6.dest_addr, dest_in6->sin6_addr.s6_addr, 16);
+               buf += SWRAP_PACKET_IP_V6_SIZE;
+               break;
+#endif
+       }
+
+       if (unreachable) {
+               pay = (union swrap_packet_payload *)buf;
+               switch (src->sa_family) {
+               case AF_INET:
+                       pay->icmp4.type         = 0x03; /* destination unreachable */
+                       pay->icmp4.code         = 0x01; /* host unreachable */
+                       pay->icmp4.checksum     = htons(0x0000);
+                       pay->icmp4.unused       = htonl(0x00000000);
+                       buf += SWRAP_PACKET_PAYLOAD_ICMP4_SIZE;
+
+                       /* set the ip header in the ICMP payload */
+                       ip = (union swrap_packet_ip *)buf;
+                       ip->v4.ver_hdrlen       = 0x45; /* version 4 and 5 * 32 bit words */
+                       ip->v4.tos              = 0x00;
+                       ip->v4.packet_length    = htons(wire_len - icmp_hdr_len);
+                       ip->v4.identification   = htons(0xFFFF);
+                       ip->v4.flags            = 0x40; /* BIT 1 set - means don't fraqment */
+                       ip->v4.fragment         = htons(0x0000);
+                       ip->v4.ttl              = 0xFF;
+                       ip->v4.protocol         = icmp_protocol;
+                       ip->v4.hdr_checksum     = htons(0x0000);
+                       ip->v4.src_addr         = dest_in->sin_addr.s_addr;
+                       ip->v4.dest_addr        = src_in->sin_addr.s_addr;
+                       buf += SWRAP_PACKET_IP_V4_SIZE;
+
+                       src_port = dest_in->sin_port;
+                       dest_port = src_in->sin_port;
+                       break;
+#ifdef HAVE_IPV6
+               case AF_INET6:
+                       pay->icmp6.type         = 0x01; /* destination unreachable */
+                       pay->icmp6.code         = 0x03; /* address unreachable */
+                       pay->icmp6.checksum     = htons(0x0000);
+                       pay->icmp6.unused       = htonl(0x00000000);
+                       buf += SWRAP_PACKET_PAYLOAD_ICMP6_SIZE;
+
+                       /* set the ip header in the ICMP payload */
+                       ip = (union swrap_packet_ip *)buf;
+                       ip->v6.ver_prio         = 0x60; /* version 4 and 5 * 32 bit words */
+                       ip->v6.flow_label_high  = 0x00;
+                       ip->v6.flow_label_low   = 0x0000;
+                       ip->v6.payload_length   = htons(wire_len - icmp_truncate_len); /* TODO */
+                       ip->v6.next_header      = protocol;
+                       memcpy(ip->v6.src_addr, dest_in6->sin6_addr.s6_addr, 16);
+                       memcpy(ip->v6.dest_addr, src_in6->sin6_addr.s6_addr, 16);
+                       buf += SWRAP_PACKET_IP_V6_SIZE;
+
+                       src_port = dest_in6->sin6_port;
+                       dest_port = src_in6->sin6_port;
+                       break;
+#endif
+               }
+       }
+
+       pay = (union swrap_packet_payload *)buf;
+
+       switch (socket_type) {
+       case SOCK_STREAM:
+               pay->tcp.source_port    = src_port;
+               pay->tcp.dest_port      = dest_port;
+               pay->tcp.seq_num        = htonl(tcp_seqno);
+               pay->tcp.ack_num        = htonl(tcp_ack);
+               pay->tcp.hdr_length     = 0x50; /* 5 * 32 bit words */
+               pay->tcp.control        = tcp_ctl;
+               pay->tcp.window         = htons(0x7FFF);
+               pay->tcp.checksum       = htons(0x0000);
+               pay->tcp.urg            = htons(0x0000);
+               buf += SWRAP_PACKET_PAYLOAD_TCP_SIZE;
+
+               break;
+
+       case SOCK_DGRAM:
+               pay->udp.source_port    = src_port;
+               pay->udp.dest_port      = dest_port;
+               pay->udp.length         = htons(8 + payload_len);
+               pay->udp.checksum       = htons(0x0000);
+               buf += SWRAP_PACKET_PAYLOAD_UDP_SIZE;
+
+               break;
+       }
+
+       if (payload && payload_len > 0) {
+               memcpy(buf, payload, payload_len);
+       }
+
+       *_packet_len = packet_len - icmp_truncate_len;
+       return base;
+}
+
+/*
+ * Return the descriptor of the capture file, opening it on first use.
+ *
+ * The descriptor is cached in a function-local static, so once a call
+ * has succeeded later calls return the same value without opening the
+ * file again.  A file that this call creates gets a struct
+ * swrap_file_hdr written at its start; if the exclusive create fails
+ * the file is opened for appending instead.
+ *
+ * Returns -1 if no usable descriptor could be obtained.
+ */
+static int swrap_get_pcap_fd(const char *fname)
+{
+       static int fd = -1;
+       struct swrap_file_hdr hdr;
+       ssize_t written;
+
+       if (fd != -1) {
+               return fd;
+       }
+
+       fd = open(fname, O_WRONLY|O_CREAT|O_EXCL|O_APPEND, 0644);
+       if (fd == -1) {
+               /* exclusive create failed: fall back to plain append */
+               fd = open(fname, O_WRONLY|O_APPEND, 0644);
+               return fd;
+       }
+
+       /* we created the file, so it needs the file header first */
+       hdr.magic               = 0xA1B2C3D4;
+       hdr.version_major       = 0x0002;
+       hdr.version_minor       = 0x0004;
+       hdr.timezone            = 0x00000000;
+       hdr.sigfigs             = 0x00000000;
+       hdr.frame_max_len       = SWRAP_FRAME_LENGTH_MAX;
+       hdr.link_type           = 0x0065; /* 101 RAW IP */
+
+       written = write(fd, &hdr, sizeof(hdr));
+       if (written != sizeof(hdr)) {
+               /* short or failed write: give the descriptor up again */
+               close(fd);
+               fd = -1;
+       }
+
+       return fd;
+}
+
+/*
+ * Build the capture record for one socket event.
+ *
+ * Depending on 'type' this selects which of si->myname, si->peername
+ * and 'addr' act as source and destination, picks the TCP sequence
+ * number, acknowledgement number and control flags from the
+ * si->io.pck_snd / si->io.pck_rcv counters, advances those counters
+ * where the event consumes sequence space, and finally hands
+ * everything to swrap_packet_init() together with the current time.
+ *
+ * si         - socket the event belongs to; its io counters are updated
+ * addr       - remote address for the event types that take one
+ * type       - the event to record
+ * buf, len   - payload passed through to swrap_packet_init()
+ * packet_len - passed through to swrap_packet_init()
+ *
+ * Returns the result of swrap_packet_init(), or NULL when si->family is
+ * neither AF_INET nor (with HAVE_IPV6) AF_INET6, when 'type' is not
+ * handled, or when 'type' does not apply to the socket type.
+ */
+static uint8_t *swrap_marshall_packet(struct socket_info *si,
+                                     const struct sockaddr *addr,
+                                     enum swrap_packet_type type,
+                                     const void *buf, size_t len,
+                                     size_t *packet_len)
+{
+       const struct sockaddr *src_addr;
+       const struct sockaddr *dest_addr;
+       unsigned long tcp_seqno = 0;
+       unsigned long tcp_ack = 0;
+       unsigned char tcp_ctl = 0;
+       int unreachable = 0;
+
+       struct timeval tv;
+
+       /* only IPv4 and (optionally) IPv6 sockets are captured */
+       switch (si->family) {
+       case AF_INET:
+               break;
+#ifdef HAVE_IPV6
+       case AF_INET6:
+               break;
+#endif
+       default:
+               return NULL;
+       }
+
+       /*
+        * Events where the local side is the sender use myname as the
+        * source and pck_snd as the sequence number; events where the
+        * peer is the sender swap the addresses and use pck_rcv.
+        */
+       switch (type) {
+       case SWRAP_CONNECT_SEND:
+               if (si->type != SOCK_STREAM) return NULL;
+
+               src_addr = si->myname;
+               dest_addr = addr;
+
+               tcp_seqno = si->io.pck_snd;
+               tcp_ack = si->io.pck_rcv;
+               tcp_ctl = 0x02; /* SYN */
+
+               si->io.pck_snd += 1;
+
+               break;
+
+       case SWRAP_CONNECT_RECV:
+               if (si->type != SOCK_STREAM) return NULL;
+
+               dest_addr = si->myname;
+               src_addr = addr;
+
+               tcp_seqno = si->io.pck_rcv;
+               tcp_ack = si->io.pck_snd;
+               tcp_ctl = 0x12; /** SYN,ACK */
+
+               si->io.pck_rcv += 1;
+
+               break;
+
+       case SWRAP_CONNECT_UNREACH:
+               if (si->type != SOCK_STREAM) return NULL;
+
+               dest_addr = si->myname;
+               src_addr = addr;
+
+               /* Unreachable: resend the data of SWRAP_CONNECT_SEND */
+               tcp_seqno = si->io.pck_snd - 1;
+               tcp_ack = si->io.pck_rcv;
+               tcp_ctl = 0x02; /* SYN */
+               unreachable = 1;
+
+               break;
+
+       case SWRAP_CONNECT_ACK:
+               if (si->type != SOCK_STREAM) return NULL;
+
+               src_addr = si->myname;
+               dest_addr = addr;
+
+               tcp_seqno = si->io.pck_snd;
+               tcp_ack = si->io.pck_rcv;
+               tcp_ctl = 0x10; /* ACK */
+
+               break;
+
+       case SWRAP_ACCEPT_SEND:
+               if (si->type != SOCK_STREAM) return NULL;
+
+               dest_addr = si->myname;
+               src_addr = addr;
+
+               tcp_seqno = si->io.pck_rcv;
+               tcp_ack = si->io.pck_snd;
+               tcp_ctl = 0x02; /* SYN */
+
+               si->io.pck_rcv += 1;
+
+               break;
+
+       case SWRAP_ACCEPT_RECV:
+               if (si->type != SOCK_STREAM) return NULL;
+
+               src_addr = si->myname;
+               dest_addr = addr;
+
+               tcp_seqno = si->io.pck_snd;
+               tcp_ack = si->io.pck_rcv;
+               tcp_ctl = 0x12; /* SYN,ACK */
+
+               si->io.pck_snd += 1;
+
+               break;
+
+       case SWRAP_ACCEPT_ACK:
+               if (si->type != SOCK_STREAM) return NULL;
+
+               dest_addr = si->myname;
+               src_addr = addr;
+
+               tcp_seqno = si->io.pck_rcv;
+               tcp_ack = si->io.pck_snd;
+               tcp_ctl = 0x10; /* ACK */
+
+               break;
+
+       case SWRAP_SEND:
+               src_addr = si->myname;
+               dest_addr = si->peername;
+
+               tcp_seqno = si->io.pck_snd;
+               tcp_ack = si->io.pck_rcv;
+               tcp_ctl = 0x18; /* PSH,ACK */
+
+               /* payload bytes consume sequence space */
+               si->io.pck_snd += len;
+
+               break;
+
+       case SWRAP_SEND_RST:
+               dest_addr = si->myname;
+               src_addr = si->peername;
+
+               /* datagram sockets record an unreachable event instead */
+               if (si->type == SOCK_DGRAM) {
+                       return swrap_marshall_packet(si, si->peername,
+                                         SWRAP_SENDTO_UNREACH,
+                                         buf, len, packet_len);
+               }
+
+               tcp_seqno = si->io.pck_rcv;
+               tcp_ack = si->io.pck_snd;
+               tcp_ctl = 0x14; /** RST,ACK */
+
+               break;
+
+       case SWRAP_PENDING_RST:
+               dest_addr = si->myname;
+               src_addr = si->peername;
+
+               if (si->type == SOCK_DGRAM) {
+                       return NULL;
+               }
+
+               tcp_seqno = si->io.pck_rcv;
+               tcp_ack = si->io.pck_snd;
+               tcp_ctl = 0x14; /* RST,ACK */
+
+               break;
+
+       case SWRAP_RECV:
+               dest_addr = si->myname;
+               src_addr = si->peername;
+
+               tcp_seqno = si->io.pck_rcv;
+               tcp_ack = si->io.pck_snd;
+               tcp_ctl = 0x18; /* PSH,ACK */
+
+               /* payload bytes consume sequence space */
+               si->io.pck_rcv += len;
+
+               break;
+
+       case SWRAP_RECV_RST:
+               dest_addr = si->myname;
+               src_addr = si->peername;
+
+               if (si->type == SOCK_DGRAM) {
+                       return NULL;
+               }
+
+               tcp_seqno = si->io.pck_rcv;
+               tcp_ack = si->io.pck_snd;
+               tcp_ctl = 0x14; /* RST,ACK */
+
+               break;
+
+       case SWRAP_SENDTO:
+               src_addr = si->myname;
+               dest_addr = addr;
+
+               si->io.pck_snd += len;
+
+               break;
+
+       case SWRAP_SENDTO_UNREACH:
+               dest_addr = si->myname;
+               src_addr = addr;
+
+               unreachable = 1;
+
+               break;
+
+       case SWRAP_RECVFROM:
+               dest_addr = si->myname;
+               src_addr = addr;
+
+               si->io.pck_rcv += len;
+
+               break;
+
+       case SWRAP_CLOSE_SEND:
+               if (si->type != SOCK_STREAM) return NULL;
+
+               src_addr = si->myname;
+               dest_addr = si->peername;
+
+               tcp_seqno = si->io.pck_snd;
+               tcp_ack = si->io.pck_rcv;
+               tcp_ctl = 0x11; /* FIN, ACK */
+
+               si->io.pck_snd += 1;
+
+               break;
+
+       case SWRAP_CLOSE_RECV:
+               if (si->type != SOCK_STREAM) return NULL;
+
+               dest_addr = si->myname;
+               src_addr = si->peername;
+
+               tcp_seqno = si->io.pck_rcv;
+               tcp_ack = si->io.pck_snd;
+               tcp_ctl = 0x11; /* FIN,ACK */
+
+               si->io.pck_rcv += 1;
+
+               break;
+
+       case SWRAP_CLOSE_ACK:
+               if (si->type != SOCK_STREAM) return NULL;
+
+               src_addr = si->myname;
+               dest_addr = si->peername;
+
+               tcp_seqno = si->io.pck_snd;
+               tcp_ack = si->io.pck_rcv;
+               tcp_ctl = 0x10; /* ACK */
+
+               break;
+       default:
+               return NULL;
+       }
+
+       /* timestamp the record with the current time */
+       swrapGetTimeOfDay(&tv);
+
+       return swrap_packet_init(&tv, src_addr, dest_addr, si->type,
+                                (const uint8_t *)buf, len,
+                                tcp_seqno, tcp_ack, tcp_ctl, unreachable,
+                                packet_len);
+}
+
+/*
+ * Append the capture record for one socket event to the capture file.
+ *
+ * Does nothing when socket_wrapper_pcap_file() returns no file name or
+ * when swrap_marshall_packet() produces no record for this event.
+ * Writing is best effort: a failed or short write is not reported.
+ */
+static void swrap_dump_packet(struct socket_info *si,
+                             const struct sockaddr *addr,
+                             enum swrap_packet_type type,
+                             const void *buf, size_t len)
+{
+       const char *pcap_name = socket_wrapper_pcap_file();
+       size_t record_len = 0;
+       uint8_t *record;
+       int pcap_fd;
+
+       if (pcap_name == NULL) {
+               return;
+       }
+
+       record = swrap_marshall_packet(si, addr, type, buf, len, &record_len);
+       if (record == NULL) {
+               return;
+       }
+
+       pcap_fd = swrap_get_pcap_fd(pcap_name);
+       if (pcap_fd != -1) {
+               /* the record is released below whether or not this works */
+               ssize_t written = write(pcap_fd, record, record_len);
+               (void)written;
+       }
+
+       free(record);
+}
+
+/*
+ * socket() replacement.
+ *
+ * When socket_wrapper_dir() reports no directory, or for AF_UNIX, the
+ * call is passed straight to real_socket().  For AF_INET (and AF_INET6
+ * with HAVE_IPV6) an AF_UNIX socket of the requested type is created
+ * instead and a struct socket_info describing the emulated socket is
+ * added to the global 'sockets' list.
+ *
+ * Returns the new descriptor, or -1 with errno set:
+ *   EAFNOSUPPORT    - unsupported address family
+ *   EPROTONOSUPPORT - type is neither SOCK_STREAM nor SOCK_DGRAM, or
+ *                     protocol does not match the type
+ *   ENOMEM          - bookkeeping allocation failed
+ */
+_PUBLIC_ int swrap_socket(int family, int type, int protocol)
+{
+       struct socket_info *si;
+       struct socket_info_fd *fi;
+       int fd;
+       int real_type = type;
+#ifdef SOCK_CLOEXEC
+       real_type &= ~SOCK_CLOEXEC;
+#endif
+#ifdef SOCK_NONBLOCK
+       real_type &= ~SOCK_NONBLOCK;
+#endif
+
+       if (!socket_wrapper_dir()) {
+               return real_socket(family, type, protocol);
+       }
+
+       switch (family) {
+       case AF_INET:
+#ifdef HAVE_IPV6
+       case AF_INET6:
+#endif
+               break;
+       case AF_UNIX:
+               return real_socket(family, type, protocol);
+       default:
+               errno = EAFNOSUPPORT;
+               return -1;
+       }
+
+       switch (real_type) {
+       case SOCK_STREAM:
+               break;
+       case SOCK_DGRAM:
+               break;
+       default:
+               errno = EPROTONOSUPPORT;
+               return -1;
+       }
+
+       /* protocol 0 is always fine, 6 needs a stream, 17 a datagram */
+       switch (protocol) {
+       case 0:
+               break;
+       case 6:
+               if (real_type == SOCK_STREAM) {
+                       break;
+               }
+               /*fall through*/
+       case 17:
+               if (real_type == SOCK_DGRAM) {
+                       break;
+               }
+               /*fall through*/
+       default:
+               errno = EPROTONOSUPPORT;
+               return -1;
+       }
+
+       /* We must call real_socket with type, from the caller, not the version we removed
+          SOCK_CLOEXEC and SOCK_NONBLOCK from */
+       fd = real_socket(AF_UNIX, type, 0);
+
+       if (fd == -1) return -1;
+
+       si = (struct socket_info *)calloc(1, sizeof(struct socket_info));
+       if (si == NULL) {
+               /* do not leak the descriptor we just created */
+               close(fd);
+               errno = ENOMEM;
+               return -1;
+       }
+
+       si->family = family;
+
+       /* however, the rest of the socket_wrapper code expects just
+        * the type, not the flags */
+       si->type = real_type;
+       si->protocol = protocol;
+
+       fi = (struct socket_info_fd *)calloc(1, sizeof(struct socket_info_fd));
+       if (fi == NULL) {
+               free(si);
+               /* do not leak the descriptor we just created */
+               close(fd);
+               errno = ENOMEM;
+               return -1;
+       }
+
+       fi->fd = fd;
+
+       SWRAP_DLIST_ADD(si->fds, fi);
+       SWRAP_DLIST_ADD(sockets, si);
+
+       return fd;
+}
+
+/*
+ * accept() replacement.
+ *
+ * Descriptors that find_socket_info() does not know are passed to
+ * real_accept().  Otherwise the connection is accepted on the
+ * underlying unix socket, the peer and local unix addresses are
+ * converted back with sockaddr_convert_from_un(), and a new
+ * struct socket_info for the accepted descriptor is added to the
+ * global 'sockets' list.
+ *
+ * addr/addrlen - optional; when both are non-NULL the peer address is
+ *                copied (truncated to *addrlen) and *addrlen is set to
+ *                the full address length.
+ *
+ * Returns the accepted descriptor, or -1 with errno set.  On every
+ * error path the accepted descriptor and all allocations made here
+ * are released.
+ */
+_PUBLIC_ int swrap_accept(int s, struct sockaddr *addr, socklen_t *addrlen)
+{
+       struct socket_info *parent_si, *child_si;
+       struct socket_info_fd *child_fi;
+       int fd;
+       struct sockaddr_un un_addr;
+       socklen_t un_addrlen = sizeof(un_addr);
+       struct sockaddr_un un_my_addr;
+       socklen_t un_my_addrlen = sizeof(un_my_addr);
+       struct sockaddr *my_addr;
+       socklen_t my_addrlen, len;
+       int ret;
+
+       parent_si = find_socket_info(s);
+       if (!parent_si) {
+               return real_accept(s, addr, addrlen);
+       }
+
+       /*
+        * assume our sockaddr has the same size as the one of the
+        * parent socket family
+        */
+       my_addrlen = socket_length(parent_si->family);
+       if (my_addrlen <= 0) {
+               errno = EINVAL;
+               return -1;
+       }
+
+       my_addr = (struct sockaddr *)malloc(my_addrlen);
+       if (my_addr == NULL) {
+               errno = ENOMEM;
+               return -1;
+       }
+
+       memset(&un_addr, 0, sizeof(un_addr));
+       memset(&un_my_addr, 0, sizeof(un_my_addr));
+
+       ret = real_accept(s, (struct sockaddr *)(void *)&un_addr, &un_addrlen);
+       if (ret == -1) {
+               free(my_addr);
+               return ret;
+       }
+
+       fd = ret;
+
+       len = my_addrlen;
+       ret = sockaddr_convert_from_un(parent_si, &un_addr, un_addrlen,
+                                      parent_si->family, my_addr, &len);
+       if (ret == -1) {
+               free(my_addr);
+               close(fd);
+               return ret;
+       }
+
+       /* calloc gives the zeroed structure and lets us detect failure */
+       child_si = (struct socket_info *)calloc(1, sizeof(struct socket_info));
+       if (child_si == NULL) {
+               free(my_addr);
+               close(fd);
+               errno = ENOMEM;
+               return -1;
+       }
+
+       child_fi = (struct socket_info_fd *)calloc(1, sizeof(struct socket_info_fd));
+       if (child_fi == NULL) {
+               free(child_si);
+               free(my_addr);
+               close(fd);
+               errno = ENOMEM;
+               return -1;
+       }
+
+       child_fi->fd = fd;
+
+       SWRAP_DLIST_ADD(child_si->fds, child_fi);
+
+       child_si->family = parent_si->family;
+       child_si->type = parent_si->type;
+       child_si->protocol = parent_si->protocol;
+       child_si->bound = 1;
+       child_si->is_server = 1;
+       child_si->connected = 1;
+
+       child_si->peername_len = len;
+       child_si->peername = sockaddr_dup(my_addr, len);
+
+       if (addr != NULL && addrlen != NULL) {
+               size_t copy_len = MIN(*addrlen, len);
+               if (copy_len > 0) {
+                       memcpy(addr, my_addr, copy_len);
+               }
+               *addrlen = len;
+       }
+
+       ret = real_getsockname(fd, (struct sockaddr *)(void *)&un_my_addr,
+                              &un_my_addrlen);
+       if (ret == -1) {
+               free(child_si->peername);
+               free(child_fi);
+               free(child_si);
+               free(my_addr);
+               close(fd);
+               return ret;
+       }
+
+       len = my_addrlen;
+       ret = sockaddr_convert_from_un(child_si, &un_my_addr, un_my_addrlen,
+                                      child_si->family, my_addr, &len);
+       if (ret == -1) {
+               free(child_si->peername);
+               free(child_fi);
+               free(child_si);
+               free(my_addr);
+               close(fd);
+               return ret;
+       }
+
+       child_si->myname_len = len;
+       child_si->myname = sockaddr_dup(my_addr, len);
+       free(my_addr);
+
+       SWRAP_DLIST_ADD(sockets, child_si);
+
+       /*
+        * Capture the handshake using our own copy of the peer address:
+        * the caller's 'addr' may be NULL or may have been truncated.
+        */
+       if (child_si->peername != NULL) {
+               swrap_dump_packet(child_si, child_si->peername,
+                                 SWRAP_ACCEPT_SEND, NULL, 0);
+               swrap_dump_packet(child_si, child_si->peername,
+                                 SWRAP_ACCEPT_RECV, NULL, 0);
+               swrap_dump_packet(child_si, child_si->peername,
+                                 SWRAP_ACCEPT_ACK, NULL, 0);
+       }
+
+       return fd;
+}
+
+static int autobind_start_init;
+static int autobind_start;
+
+/* Using sendto() or connect() on an unbound socket would give the
+   recipient no way to reply: unlike UDP and TCP, a unix domain
+   socket can't auto-assign ephemeral port numbers, so we need to
+   assign one here.
+   Note: this might change the family from ipv6 to ipv4
+*/
+/*
+ * Give an unbound wrapped socket a local address and bind the
+ * underlying unix socket to a free path.
+ *
+ * si->myname is set to the default interface address for 'family',
+ * then up to 1000 consecutive port numbers starting at autobind_start
+ * are probed; the first one whose socket path does not exist yet is
+ * bound.  On success si->tmp_path, si->bound, si->family and the port
+ * in si->myname are updated and 0 is returned.
+ *
+ * Returns -1 with errno set:
+ *   ESOCKTNOSUPPORT - unsupported socket type or address family
+ *   ENETUNREACH     - AF_INET6 requested on a socket of another family
+ *   ENFILE          - none of the probed ports was free
+ *   (or whatever real_bind() reported)
+ */
+static int swrap_auto_bind(int fd, struct socket_info *si, int family)
+{
+       struct sockaddr_un un_addr;
+       struct stat st;
+       char type;
+       int attempt;
+
+       /* derive the first port to try from our pid, once per process */
+       if (autobind_start_init != 1) {
+               autobind_start_init = 1;
+               autobind_start = getpid();
+               autobind_start = (autobind_start % 50000) + 10000;
+       }
+
+       un_addr.sun_family = AF_UNIX;
+
+       switch (family) {
+       case AF_INET: {
+               struct sockaddr_in in;
+
+               if (si->type == SOCK_STREAM) {
+                       type = SOCKET_TYPE_CHAR_TCP;
+               } else if (si->type == SOCK_DGRAM) {
+                       type = SOCKET_TYPE_CHAR_UDP;
+               } else {
+                       errno = ESOCKTNOSUPPORT;
+                       return -1;
+               }
+
+               memset(&in, 0, sizeof(in));
+               in.sin_family = AF_INET;
+               in.sin_addr.s_addr = htonl(127<<24 |
+                                          socket_wrapper_default_iface());
+
+               si->myname_len = sizeof(in);
+               si->myname = sockaddr_dup(&in, si->myname_len);
+               break;
+       }
+#ifdef HAVE_IPV6
+       case AF_INET6: {
+               struct sockaddr_in6 in6;
+
+               if (si->family != family) {
+                       errno = ENETUNREACH;
+                       return -1;
+               }
+
+               if (si->type == SOCK_STREAM) {
+                       type = SOCKET_TYPE_CHAR_TCP_V6;
+               } else if (si->type == SOCK_DGRAM) {
+                       type = SOCKET_TYPE_CHAR_UDP_V6;
+               } else {
+                       errno = ESOCKTNOSUPPORT;
+                       return -1;
+               }
+
+               memset(&in6, 0, sizeof(in6));
+               in6.sin6_family = AF_INET6;
+               in6.sin6_addr = *swrap_ipv6();
+               in6.sin6_addr.s6_addr[15] = socket_wrapper_default_iface();
+               si->myname_len = sizeof(in6);
+               si->myname = sockaddr_dup(&in6, si->myname_len);
+               break;
+       }
+#endif
+       default:
+               errno = ESOCKTNOSUPPORT;
+               return -1;
+       }
+
+       /* wrap around once the start has drifted too high */
+       if (autobind_start > 60000) {
+               autobind_start = 10000;
+       }
+
+       for (attempt = 0; attempt < 1000; attempt++) {
+               int port = autobind_start + attempt;
+
+               snprintf(un_addr.sun_path, sizeof(un_addr.sun_path),
+                        "%s/"SOCKET_FORMAT, socket_wrapper_dir(),
+                        type, socket_wrapper_default_iface(), port);
+               if (stat(un_addr.sun_path, &st) == 0) {
+                       /* path exists already, try the next port */
+                       continue;
+               }
+
+               if (real_bind(fd, (struct sockaddr *)(void *)&un_addr,
+                             sizeof(un_addr)) == -1) {
+                       return -1;
+               }
+
+               si->tmp_path = strdup(un_addr.sun_path);
+               si->bound = 1;
+               autobind_start = port + 1;
+
+               si->family = family;
+               set_port(si->family, port, si->myname);
+
+               return 0;
+       }
+
+       /* every candidate path was already present */
+       errno = ENFILE;
+       return -1;
+}
+
+
+/*
+ * connect() replacement.
+ *
+ * Descriptors that find_socket_info() does not know are passed to
+ * real_connect().  An unbound wrapped socket is auto-bound first.
+ * The target address is converted to a unix socket path; datagram
+ * sockets only mark the connect as deferred, stream sockets connect
+ * the underlying unix socket right away.
+ *
+ * Returns 0 on success, -1 with errno set otherwise:
+ *   EINVAL       - address family differs from the socket's family
+ *   ENETUNREACH  - the target is a broadcast address
+ *   EHOSTUNREACH - real_connect() failed with ENOENT
+ *   (or whatever the helpers / real_connect() reported)
+ */
+_PUBLIC_ int swrap_connect(int s, const struct sockaddr *serv_addr, socklen_t addrlen)
+{
+       int ret;
+       int saved_errno;
+       struct sockaddr_un un_addr;
+       struct socket_info *si = find_socket_info(s);
+       int bcast = 0;
+
+       if (!si) {
+               return real_connect(s, serv_addr, addrlen);
+       }
+
+       if (si->bound == 0) {
+               ret = swrap_auto_bind(s, si, serv_addr->sa_family);
+               if (ret == -1) return -1;
+       }
+
+       if (si->family != serv_addr->sa_family) {
+               errno = EINVAL;
+               return -1;
+       }
+
+       ret = sockaddr_convert_to_un(si, serv_addr,
+                                    addrlen, &un_addr, 0, &bcast);
+       if (ret == -1) return -1;
+
+       if (bcast) {
+               errno = ENETUNREACH;
+               return -1;
+       }
+
+       if (si->type == SOCK_DGRAM) {
+               si->defer_connect = 1;
+               ret = 0;
+       } else {
+               swrap_dump_packet(si, serv_addr, SWRAP_CONNECT_SEND, NULL, 0);
+
+               ret = real_connect(s, (struct sockaddr *)(void *)&un_addr,
+                                  sizeof(struct sockaddr_un));
+       }
+
+       /* to give better errors */
+       if (ret == -1 && errno == ENOENT) {
+               errno = EHOSTUNREACH;
+       }
+
+       if (ret == 0) {
+               si->peername_len = addrlen;
+               si->peername = sockaddr_dup(serv_addr, addrlen);
+               si->connected = 1;
+
+               swrap_dump_packet(si, serv_addr, SWRAP_CONNECT_RECV, NULL, 0);
+               swrap_dump_packet(si, serv_addr, SWRAP_CONNECT_ACK, NULL, 0);
+       } else {
+               /*
+                * Writing the capture record may change errno; the
+                * caller must still see the reason connect failed.
+                */
+               saved_errno = errno;
+               swrap_dump_packet(si, serv_addr, SWRAP_CONNECT_UNREACH, NULL, 0);
+               errno = saved_errno;
+       }
+
+       return ret;
+}
+
+/*
+ * bind() replacement.
+ *
+ * Descriptors that find_socket_info() does not know are passed to
+ * real_bind().  For wrapped sockets the requested address is stored
+ * in si->myname, converted to a unix socket path, any stale file at
+ * that path is removed, and the underlying unix socket is bound to it.
+ *
+ * Returns 0 on success (and marks the socket as bound), -1 otherwise.
+ */
+_PUBLIC_ int swrap_bind(int s, const struct sockaddr *myaddr, socklen_t addrlen)
+{
+       struct socket_info *si = find_socket_info(s);
+       struct sockaddr_un un_addr;
+       int rc;
+
+       if (si == NULL) {
+               return real_bind(s, myaddr, addrlen);
+       }
+
+       /* remember the address the caller asked for */
+       si->myname_len = addrlen;
+       si->myname = sockaddr_dup(myaddr, addrlen);
+
+       if (sockaddr_convert_to_un(si, myaddr, addrlen, &un_addr, 1,
+                                  &si->bcast) == -1) {
+               return -1;
+       }
+
+       /* get rid of a leftover socket file before binding */
+       unlink(un_addr.sun_path);
+
+       rc = real_bind(s, (struct sockaddr *)(void *)&un_addr,
+                      sizeof(struct sockaddr_un));
+       if (rc != 0) {
+               return rc;
+       }
+
+       si->bound = 1;
+       return 0;
+}
+
+/*
+ * listen() replacement.
+ *
+ * Wrapped and unwrapped descriptors are both handed to real_listen();
+ * the socket_info lookup only distinguishes the two cases.
+ */
+_PUBLIC_ int swrap_listen(int s, int backlog)
+{
+       struct socket_info *si = find_socket_info(s);
+
+       if (si == NULL) {
+               /* not one of our sockets */
+               return real_listen(s, backlog);
+       }
+
+       /* wrapped sockets need no extra handling */
+       return real_listen(s, backlog);
+}
+
+/*
+ * getpeername() replacement.
+ *
+ * Descriptors that find_socket_info() does not know are passed to
+ * real_getpeername().  For wrapped sockets the stored peer address is
+ * copied to 'name', truncated to the buffer size given in *addrlen,
+ * and *addrlen is set to the full length of the stored address (the
+ * same convention swrap_accept() uses).
+ *
+ * Returns 0 on success, -1 with errno set:
+ *   ENOTCONN - the socket has no peer address
+ *   EFAULT   - name or addrlen is NULL
+ */
+_PUBLIC_ int swrap_getpeername(int s, struct sockaddr *name, socklen_t *addrlen)
+{
+       struct socket_info *si = find_socket_info(s);
+       size_t copy_len;
+
+       if (!si) {
+               return real_getpeername(s, name, addrlen);
+       }
+
+       if (!si->peername)
+       {
+               errno = ENOTCONN;
+               return -1;
+       }
+
+       if (name == NULL || addrlen == NULL) {
+               errno = EFAULT;
+               return -1;
+       }
+
+       /* never write more than the caller's buffer can hold */
+       copy_len = MIN(*addrlen, si->peername_len);
+       if (copy_len > 0) {
+               memcpy(name, si->peername, copy_len);
+       }
+       *addrlen = si->peername_len;
+
+       return 0;
+}
+
+/*
+ * getsockname() replacement.
+ *
+ * Descriptors that find_socket_info() does not know are passed to
+ * real_getsockname().  For wrapped sockets the stored local address
+ * is copied to 'name', truncated to the buffer size given in
+ * *addrlen, and *addrlen is set to the full length of the stored
+ * address (the same convention swrap_accept() uses).  A socket
+ * without a stored local address yields a length of 0.
+ *
+ * Returns 0 on success, -1 with errno set to EFAULT when name or
+ * addrlen is NULL.
+ */
+_PUBLIC_ int swrap_getsockname(int s, struct sockaddr *name, socklen_t *addrlen)
+{
+       struct socket_info *si = find_socket_info(s);
+       size_t copy_len;
+
+       if (!si) {
+               return real_getsockname(s, name, addrlen);
+       }
+
+       if (name == NULL || addrlen == NULL) {
+               errno = EFAULT;
+               return -1;
+       }
+
+       if (si->myname == NULL) {
+               /* nothing stored yet: report an empty address */
+               *addrlen = 0;
+               return 0;
+       }
+
+       /* never write more than the caller's buffer can hold */
+       copy_len = MIN(*addrlen, si->myname_len);
+       if (copy_len > 0) {
+               memcpy(name, si->myname, copy_len);
+       }
+       *addrlen = si->myname_len;
+
+       return 0;
+}
+
+/*
+ * getsockopt() replacement.
+ *
+ * Unwrapped descriptors and SOL_SOCKET options on wrapped ones go to
+ * real_getsockopt(); any other level on a wrapped socket fails with
+ * ENOPROTOOPT.
+ */
+_PUBLIC_ int swrap_getsockopt(int s, int level, int optname, void *optval, socklen_t *optlen)
+{
+       struct socket_info *si = find_socket_info(s);
+
+       if (si == NULL || level == SOL_SOCKET) {
+               return real_getsockopt(s, level, optname, optval, optlen);
+       }
+
+       errno = ENOPROTOOPT;
+       return -1;
+}
+
+/*
+ * setsockopt() replacement.
+ *
+ * Unwrapped descriptors and SOL_SOCKET options on wrapped ones go to
+ * real_setsockopt().  Options at any other level are accepted without
+ * effect on AF_INET (and, with HAVE_IPV6, AF_INET6) sockets; for other
+ * families the call fails with ENOPROTOOPT.
+ */
+_PUBLIC_ int swrap_setsockopt(int s, int  level,  int  optname,  const  void  *optval, socklen_t optlen)
+{
+       struct socket_info *si = find_socket_info(s);
+
+       if (si == NULL || level == SOL_SOCKET) {
+               return real_setsockopt(s, level, optname, optval, optlen);
+       }
+
+       /* pretend the option was applied on the emulated families */
+       if (si->family == AF_INET) {
+               return 0;
+       }
+#ifdef HAVE_IPV6
+       if (si->family == AF_INET6) {
+               return 0;
+       }
+#endif
+
+       errno = ENOPROTOOPT;
+       return -1;
+}
+
+/*
+ * ioctl() replacement.
+ *
+ * The request is always executed by real_ioctl().  For wrapped
+ * sockets a FIONREAD request additionally records a pending reset in
+ * the capture when the ioctl failed with something other than EAGAIN
+ * or ENOBUFS, or when it succeeded and reported 0 readable bytes.
+ *
+ * Returns the result of real_ioctl(); errno is left as real_ioctl()
+ * set it, regardless of what writing the capture record does.
+ */
+_PUBLIC_ int swrap_ioctl(int s, int r, void *p)
+{
+       int ret;
+       int saved_errno;
+       struct socket_info *si = find_socket_info(s);
+
+       if (!si) {
+               return real_ioctl(s, r, p);
+       }
+
+       ret = real_ioctl(s, r, p);
+       saved_errno = errno;
+
+       switch (r) {
+       case FIONREAD:
+               if (ret == -1) {
+                       /* *p was not filled in, only look at errno */
+                       if (saved_errno != EAGAIN && saved_errno != ENOBUFS) {
+                               swrap_dump_packet(si, NULL, SWRAP_PENDING_RST, NULL, 0);
+                       }
+               } else if (p != NULL && *((int *)p) == 0) { /* END OF FILE */
+                       swrap_dump_packet(si, NULL, SWRAP_PENDING_RST, NULL, 0);
+               }
+               break;
+       }
+
+       /* writing the capture record must not change what the caller sees */
+       errno = saved_errno;
+       return ret;
+}
+
+/*
+ * Prepare a msghdr for sending on the underlying unix socket.
+ *
+ * Stream sockets: the socket must be connected.  The iovec list is
+ * cut down so that the accumulated length of the vectors kept does
+ * not exceed 1500 bytes; if even the first vector is larger, a copy
+ * of it truncated to 1500 bytes is placed in *tmp_iov and used
+ * instead.
+ *
+ * Datagram sockets: an explicit destination in msg->msg_name is
+ * converted to a unix address in *tmp_un (which then replaces
+ * msg_name), an unbound socket is auto-bound, and a connect deferred
+ * by swrap_connect() is carried out now.
+ *
+ * fd       - descriptor of the underlying socket
+ * si       - bookkeeping for that socket
+ * msg      - message to send; msg_iov/msg_iovlen/msg_name may be changed
+ * tmp_iov  - caller-provided storage for a truncated first vector
+ * tmp_un   - caller-provided storage for the converted destination
+ * to_un    - optional out: set to tmp_un when a destination was converted
+ * to       - optional out: set to the original destination address
+ * bcast    - optional out: broadcast flag from the address conversion
+ *
+ * Returns 0 on success, -1 with errno set otherwise (ENOTCONN,
+ * EISCONN, EHOSTUNREACH, or whatever a helper reported).
+ */
+static ssize_t swrap_sendmsg_before(int fd,
+                                   struct socket_info *si,
+                                   struct msghdr *msg,
+                                   struct iovec *tmp_iov,
+                                   struct sockaddr_un *tmp_un,
+                                   const struct sockaddr_un **to_un,
+                                   const struct sockaddr **to,
+                                   int *bcast)
+{
+       size_t i, len = 0;
+       ssize_t ret;
+
+       if (to_un) {
+               *to_un = NULL;
+       }
+       if (to) {
+               *to = NULL;
+       }
+       if (bcast) {
+               *bcast = 0;
+       }
+
+       switch (si->type) {
+       case SOCK_STREAM:
+               if (!si->connected) {
+                       errno = ENOTCONN;
+                       return -1;
+               }
+
+               if (msg->msg_iovlen == 0) {
+                       break;
+               }
+
+               /*
+                * cut down to 1500 byte packets for stream sockets,
+                * which makes it easier to format PCAP capture files
+                * (as the caller will simply continue from here)
+                */
+
+               for (i=0; i < msg->msg_iovlen; i++) {
+                       size_t nlen;
+                       nlen = len + msg->msg_iov[i].iov_len;
+                       if (nlen > 1500) {
+                               break;
+                       }
+                       /* keep the running total, the limit is cumulative */
+                       len = nlen;
+               }
+               msg->msg_iovlen = i;
+               if (msg->msg_iovlen == 0) {
+                       /* even the first vector is too big: send a part of it */
+                       *tmp_iov = msg->msg_iov[0];
+                       tmp_iov->iov_len = MIN(tmp_iov->iov_len, 1500);
+                       msg->msg_iov = tmp_iov;
+                       msg->msg_iovlen = 1;
+               }
+               break;
+
+       case SOCK_DGRAM:
+               if (si->connected) {
+                       if (msg->msg_name) {
+                               errno = EISCONN;
+                               return -1;
+                       }
+               } else {
+                       const struct sockaddr *msg_name;
+                       msg_name = (const struct sockaddr *)msg->msg_name;
+
+                       if (msg_name == NULL) {
+                               errno = ENOTCONN;
+                               return -1;
+                       }
+
+
+                       ret = sockaddr_convert_to_un(si, msg_name, msg->msg_namelen,
+                                                    tmp_un, 0, bcast);
+                       if (ret == -1) return -1;
+
+                       if (to_un) {
+                               *to_un = tmp_un;
+                       }
+                       if (to) {
+                               *to = msg_name;
+                       }
+                       msg->msg_name = tmp_un;
+                       msg->msg_namelen = sizeof(*tmp_un);
+               }
+
+               if (si->bound == 0) {
+                       ret = swrap_auto_bind(fd, si, si->family);
+                       if (ret == -1) return -1;
+               }
+
+               if (!si->defer_connect) {
+                       break;
+               }
+
+               /* carry out the connect that swrap_connect() deferred */
+               ret = sockaddr_convert_to_un(si, si->peername, si->peername_len,
+                                            tmp_un, 0, NULL);
+               if (ret == -1) return -1;
+
+               ret = real_connect(fd, (struct sockaddr *)(void *)tmp_un,
+                                  sizeof(*tmp_un));
+
+               /* to give better errors */
+               if (ret == -1 && errno == ENOENT) {
+                       errno = EHOSTUNREACH;
+               }
+
+               if (ret == -1) {
+                       return ret;
+               }
+
+               si->defer_connect = 0;
+               break;
+       default:
+               errno = EHOSTUNREACH;
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * Record the outcome of a send operation in the capture file.
+ *
+ * The data is gathered from the iovec list into one temporary buffer:
+ * 'ret' bytes after a successful send, at most 80 bytes after a
+ * failed one.  Stream sockets record a send (plus a reset if the send
+ * failed); datagram sockets record a sendto (plus an unreachable
+ * event if it failed), addressed to si->peername when the socket is
+ * connected and to 'to' otherwise.
+ *
+ * errno is restored before returning; ENOENT from a failed send is
+ * replaced by EHOSTUNREACH.  If the temporary buffer cannot be
+ * allocated nothing is recorded.
+ */
+static void swrap_sendmsg_after(struct socket_info *si,
+                               struct msghdr *msg,
+                               const struct sockaddr *to,
+                               ssize_t ret)
+{
+       int saved_errno = errno;
+       int failed = (ret == -1);
+       size_t offered = 0;
+       size_t left;
+       size_t copied = 0;
+       size_t idx;
+       uint8_t *data;
+
+       /* to give better errors */
+       if (failed && saved_errno == ENOENT) {
+               saved_errno = EHOSTUNREACH;
+       }
+
+       for (idx = 0; idx < msg->msg_iovlen; idx++) {
+               offered += msg->msg_iov[idx].iov_len;
+       }
+
+       /* a failed send is documented with a short excerpt only */
+       if (failed) {
+               left = MIN(80, offered);
+       } else {
+               left = ret;
+       }
+
+       /* we capture it as one single packet */
+       data = (uint8_t *)malloc(left);
+       if (data == NULL) {
+               /* no buffer, so this packet is simply not captured */
+               errno = saved_errno;
+               return;
+       }
+
+       for (idx = 0; idx < msg->msg_iovlen; idx++) {
+               size_t chunk = MIN(left, msg->msg_iov[idx].iov_len);
+
+               memcpy(data + copied, msg->msg_iov[idx].iov_base, chunk);
+               copied += chunk;
+               left -= chunk;
+       }
+
+       if (si->type == SOCK_STREAM) {
+               swrap_dump_packet(si, NULL, SWRAP_SEND, data, copied);
+               if (failed) {
+                       swrap_dump_packet(si, NULL, SWRAP_SEND_RST, NULL, 0);
+               }
+       } else if (si->type == SOCK_DGRAM) {
+               if (si->connected) {
+                       to = si->peername;
+               }
+               swrap_dump_packet(si, to, SWRAP_SENDTO, data, copied);
+               if (failed) {
+                       swrap_dump_packet(si, to, SWRAP_SENDTO_UNREACH,
+                                         data, copied);
+               }
+       }
+
+       free(data);
+       errno = saved_errno;
+}
+
+/*
+ * recvfrom() replacement.
+ *
+ * Descriptors that are not tracked by the wrapper go straight to the real
+ * recvfrom(). For wrapped sockets the data is received with a unix-domain
+ * source address, which is then converted to an address of the socket's
+ * family and stored in *from / *fromlen; the received data is written to
+ * the packet capture.
+ *
+ * Returns the number of bytes received, or -1 on error.
+ */
+_PUBLIC_ ssize_t swrap_recvfrom(int s, void *buf, size_t len, int flags, struct sockaddr *from, socklen_t *fromlen)
+{
+       struct socket_info *si = find_socket_info(s);
+       struct sockaddr_storage scratch;
+       socklen_t scratch_len = sizeof(scratch);
+       struct sockaddr_un peer_un;
+       socklen_t peer_un_len = sizeof(peer_un);
+       int nread;
+       int rc;
+
+       if (si == NULL) {
+               return real_recvfrom(s, buf, len, flags, from, fromlen);
+       }
+
+       /* the caller does not want the address: convert into a scratch buffer */
+       if (from == NULL) {
+               from = (struct sockaddr *)(void *)&scratch;
+               fromlen = &scratch_len;
+       }
+
+       /* cut down to 1500 byte packets for stream sockets,
+        * which makes it easier to format PCAP capture files
+        * (as the caller will simply continue from here) */
+       if (si->type == SOCK_STREAM) {
+               len = MIN(len, 1500);
+       }
+
+       /* irix 6.4 forgets to null terminate the sun_path string :-( */
+       memset(&peer_un, 0, sizeof(peer_un));
+       nread = real_recvfrom(s, buf, len, flags,
+                             (struct sockaddr *)(void *)&peer_un, &peer_un_len);
+       if (nread == -1) {
+               return nread;
+       }
+
+       rc = sockaddr_convert_from_un(si, &peer_un, peer_un_len,
+                                     si->family, from, fromlen);
+       if (rc == -1) {
+               return -1;
+       }
+
+       swrap_dump_packet(si, from, SWRAP_RECVFROM, buf, nread);
+
+       return nread;
+}
+
+
+/*
+ * sendto() replacement.
+ *
+ * Descriptors that are not tracked by the wrapper go straight to the real
+ * sendto(). For wrapped sockets swrap_sendmsg_before() prepares the
+ * unix-domain destination. If it flags the destination as a broadcast,
+ * the datagram is sent to every wrapped-interface socket file that exists
+ * for the destination port, errors of the individual sends are ignored
+ * and the full length is reported as sent.
+ *
+ * Returns the number of bytes sent, or -1 on error.
+ */
+_PUBLIC_ ssize_t swrap_sendto(int s, const void *buf, size_t len, int flags, const struct sockaddr *to, socklen_t tolen)
+{
+       struct socket_info *si = find_socket_info(s);
+       struct sockaddr_un un_addr;
+       const struct sockaddr_un *to_un = NULL;
+       struct iovec tmp;
+       struct msghdr msg;
+       int bcast = 0;
+       ssize_t ret;
+       struct stat st;
+       unsigned int iface;
+       unsigned int prt;
+       char type;
+
+       if (si == NULL) {
+               return real_sendto(s, buf, len, flags, to, tolen);
+       }
+
+       tmp.iov_base = discard_const_p(char, buf);
+       tmp.iov_len = len;
+
+       /*
+        * Only the address and iovec members are filled in explicitly;
+        * msg_control, msg_controllen and msg_flags are not available on
+        * solaris and stay zeroed.
+        */
+       ZERO_STRUCT(msg);
+       msg.msg_name = discard_const_p(struct sockaddr, to);
+       msg.msg_namelen = tolen;
+       msg.msg_iov = &tmp;
+       msg.msg_iovlen = 1;
+
+       ret = swrap_sendmsg_before(s, si, &msg, &tmp, &un_addr, &to_un, &to, &bcast);
+       if (ret == -1) {
+               return -1;
+       }
+
+       /* the hook may have adjusted the payload description */
+       buf = msg.msg_iov[0].iov_base;
+       len = msg.msg_iov[0].iov_len;
+
+       if (!bcast) {
+               ret = real_sendto(s, buf, len, flags,
+                                 (struct sockaddr *)msg.msg_name,
+                                 msg.msg_namelen);
+
+               swrap_sendmsg_after(si, &msg, to, ret);
+
+               return ret;
+       }
+
+       prt = ntohs(((const struct sockaddr_in *)to)->sin_port);
+       type = SOCKET_TYPE_CHAR_UDP;
+
+       for (iface = 0; iface <= MAX_WRAPPED_INTERFACES; iface++) {
+               snprintf(un_addr.sun_path, sizeof(un_addr.sun_path), "%s/"SOCKET_FORMAT,
+                        socket_wrapper_dir(), type, iface, prt);
+
+               /* only interfaces whose socket file exists get a copy */
+               if (stat(un_addr.sun_path, &st) == 0) {
+                       /* ignore any errors in broadcast sends */
+                       real_sendto(s, buf, len, flags,
+                                   (struct sockaddr *)(void *)&un_addr,
+                                   sizeof(un_addr));
+               }
+       }
+
+       swrap_dump_packet(si, to, SWRAP_SENDTO, buf, len);
+
+       return len;
+}
+
+/*
+ * recv() replacement.
+ *
+ * Descriptors that are not tracked by the wrapper go straight to the real
+ * recv(). For wrapped sockets the result is written to the packet capture:
+ * received data as SWRAP_RECV, end of file or a hard error as
+ * SWRAP_RECV_RST. EAGAIN and ENOBUFS are not captured.
+ *
+ * Returns whatever the real recv() returned.
+ */
+_PUBLIC_ ssize_t swrap_recv(int s, void *buf, size_t len, int flags)
+{
+       struct socket_info *si = find_socket_info(s);
+       int nread;
+
+       if (si == NULL) {
+               return real_recv(s, buf, len, flags);
+       }
+
+       /* cut down to 1500 byte packets for stream sockets,
+        * which makes it easier to format PCAP capture files
+        * (as the caller will simply continue from here) */
+       if (si->type == SOCK_STREAM) {
+               len = MIN(len, 1500);
+       }
+
+       nread = real_recv(s, buf, len, flags);
+
+       if (nread > 0) {
+               swrap_dump_packet(si, NULL, SWRAP_RECV, buf, nread);
+       } else if ((nread == -1 && errno != EAGAIN && errno != ENOBUFS) ||
+                  nread == 0) {
+               /* hard error or END OF FILE */
+               swrap_dump_packet(si, NULL, SWRAP_RECV_RST, NULL, 0);
+       }
+
+       return nread;
+}
+
+/*
+ * read() replacement.
+ *
+ * Descriptors that are not tracked by the wrapper go straight to the real
+ * read(). For wrapped sockets the result is written to the packet capture:
+ * received data as SWRAP_RECV, end of file or a hard error as
+ * SWRAP_RECV_RST. EAGAIN and ENOBUFS are not captured.
+ *
+ * Returns whatever the real read() returned.
+ */
+_PUBLIC_ ssize_t swrap_read(int s, void *buf, size_t len)
+{
+       struct socket_info *si = find_socket_info(s);
+       int nread;
+
+       if (si == NULL) {
+               return real_read(s, buf, len);
+       }
+
+       /* cut down to 1500 byte packets for stream sockets,
+        * which makes it easier to format PCAP capture files
+        * (as the caller will simply continue from here) */
+       if (si->type == SOCK_STREAM) {
+               len = MIN(len, 1500);
+       }
+
+       nread = real_read(s, buf, len);
+
+       if (nread > 0) {
+               swrap_dump_packet(si, NULL, SWRAP_RECV, buf, nread);
+       } else if ((nread == -1 && errno != EAGAIN && errno != ENOBUFS) ||
+                  nread == 0) {
+               /* hard error or END OF FILE */
+               swrap_dump_packet(si, NULL, SWRAP_RECV_RST, NULL, 0);
+       }
+
+       return nread;
+}
+
+
+/*
+ * send() replacement.
+ *
+ * Descriptors that are not tracked by the wrapper go straight to the real
+ * send(). For wrapped sockets the buffer is described by a one-element
+ * msghdr so that the shared swrap_sendmsg_before()/swrap_sendmsg_after()
+ * hooks can run around the real send().
+ *
+ * Returns the number of bytes sent, or -1 on error.
+ */
+_PUBLIC_ ssize_t swrap_send(int s, const void *buf, size_t len, int flags)
+{
+       struct socket_info *si = find_socket_info(s);
+       struct sockaddr_un un_addr;
+       struct iovec vec;
+       struct msghdr hdr;
+       ssize_t sent;
+
+       if (si == NULL) {
+               return real_send(s, buf, len, flags);
+       }
+
+       vec.iov_base = discard_const_p(char, buf);
+       vec.iov_len = len;
+
+       /*
+        * No destination address is given. msg_control, msg_controllen and
+        * msg_flags are not available on solaris and stay zeroed.
+        */
+       ZERO_STRUCT(hdr);
+       hdr.msg_name = NULL;
+       hdr.msg_namelen = 0;
+       hdr.msg_iov = &vec;
+       hdr.msg_iovlen = 1;
+
+       sent = swrap_sendmsg_before(s, si, &hdr, &vec, &un_addr, NULL, NULL, NULL);
+       if (sent == -1) {
+               return -1;
+       }
+
+       /* send what the hook left in the first iovec element */
+       sent = real_send(s,
+                        hdr.msg_iov[0].iov_base,
+                        hdr.msg_iov[0].iov_len,
+                        flags);
+
+       swrap_sendmsg_after(si, &hdr, NULL, sent);
+
+       return sent;
+}
+
+/*
+ * sendmsg() replacement.
+ *
+ * Descriptors that are not tracked by the wrapper go straight to the real
+ * sendmsg(). For wrapped sockets the caller's header is copied, prepared
+ * by swrap_sendmsg_before() and then sent. If the hook flags the
+ * destination as a broadcast, the message is sent to every
+ * wrapped-interface socket file that exists for the destination port,
+ * errors of the individual sends are ignored and the total payload length
+ * is reported as sent.
+ *
+ * Returns the number of bytes sent, or -1 on error.
+ */
+_PUBLIC_ ssize_t swrap_sendmsg(int s, const struct msghdr *omsg, int flags)
+{
+       struct socket_info *si = find_socket_info(s);
+       struct msghdr msg;
+       struct iovec tmp;
+       struct sockaddr_un un_addr;
+       const struct sockaddr_un *to_un = NULL;
+       const struct sockaddr *to = NULL;
+       int bcast = 0;
+       ssize_t ret;
+       struct stat st;
+       unsigned int iface;
+       unsigned int prt;
+       char type;
+       size_t idx;
+       size_t total = 0;
+       size_t len;
+       size_t remain;
+       off_t pos = 0;
+       uint8_t *pkt;
+
+       if (si == NULL) {
+               return real_sendmsg(s, omsg, flags);
+       }
+
+       tmp.iov_base = NULL;
+       tmp.iov_len = 0;
+
+       /* all later changes are made to this copy of the caller's header */
+       msg = *omsg;
+
+       ret = swrap_sendmsg_before(s, si, &msg, &tmp, &un_addr, &to_un, &to, &bcast);
+       if (ret == -1) {
+               return -1;
+       }
+
+       if (!bcast) {
+               ret = real_sendmsg(s, &msg, flags);
+
+               swrap_sendmsg_after(si, &msg, to, ret);
+
+               return ret;
+       }
+
+       prt = ntohs(((const struct sockaddr_in *)to)->sin_port);
+
+       for (idx = 0; idx < msg.msg_iovlen; idx++) {
+               total += msg.msg_iov[idx].iov_len;
+       }
+
+       len = total;
+       remain = total;
+
+       /* we capture it as one single packet */
+       pkt = (uint8_t *)malloc(remain);
+       if (pkt == NULL) {
+               return -1;
+       }
+
+       /* flatten the scatter/gather array into the capture buffer */
+       for (idx = 0; idx < msg.msg_iovlen; idx++) {
+               size_t chunk = MIN(remain, msg.msg_iov[idx].iov_len);
+
+               memcpy(pkt + pos, msg.msg_iov[idx].iov_base, chunk);
+               pos += chunk;
+               remain -= chunk;
+       }
+
+       type = SOCKET_TYPE_CHAR_UDP;
+
+       for (iface = 0; iface <= MAX_WRAPPED_INTERFACES; iface++) {
+               snprintf(un_addr.sun_path, sizeof(un_addr.sun_path), "%s/"SOCKET_FORMAT,
+                        socket_wrapper_dir(), type, iface, prt);
+
+               /* only interfaces whose socket file exists get a copy */
+               if (stat(un_addr.sun_path, &st) == 0) {
+                       msg.msg_name = &un_addr;
+                       msg.msg_namelen = sizeof(un_addr);
+
+                       /* ignore any errors in broadcast sends */
+                       real_sendmsg(s, &msg, flags);
+               }
+       }
+
+       swrap_dump_packet(si, to, SWRAP_SENDTO, pkt, len);
+       free(pkt);
+
+       return len;
+}
+
+/*
+ * readv() replacement.
+ *
+ * Descriptors that are not tracked by the wrapper go straight to the real
+ * readv(). A wrapped socket that is not connected fails with ENOTCONN.
+ *
+ * For stream sockets the request is limited to 1500 bytes in total: only
+ * the leading iovec elements whose combined length fits are passed on, and
+ * if even the first element is too large a truncated copy of it is used.
+ *
+ * The result is written to the packet capture: received data as one
+ * SWRAP_RECV packet, end of file or a hard error as SWRAP_RECV_RST.
+ * EAGAIN and ENOBUFS are not captured.
+ *
+ * Returns whatever the real readv() returned, or -1 with errno set.
+ */
+int swrap_readv(int s, const struct iovec *vector, size_t count)
+{
+       int ret;
+       struct socket_info *si = find_socket_info(s);
+       struct iovec v;
+
+       if (!si) {
+               return real_readv(s, vector, count);
+       }
+
+       if (!si->connected) {
+               errno = ENOTCONN;
+               return -1;
+       }
+
+       if (si->type == SOCK_STREAM && count > 0) {
+               /* cut down to 1500 byte packets for stream sockets,
+                * which makes it easier to format PCAP capture files
+                * (as the caller will simply continue from here) */
+               size_t i, len = 0;
+
+               for (i=0; i < count; i++) {
+                       size_t nlen;
+                       nlen = len + vector[i].iov_len;
+                       if (nlen > 1500) {
+                               break;
+                       }
+                       /* keep the running total, so that the limit applies
+                        * to the sum of the elements and not to each one */
+                       len = nlen;
+               }
+               count = i;
+               if (count == 0) {
+                       /* the first element alone is too big: read into a
+                        * truncated copy of it */
+                       v = vector[0];
+                       v.iov_len = MIN(v.iov_len, 1500);
+                       vector = &v;
+                       count = 1;
+               }
+       }
+
+       ret = real_readv(s, vector, count);
+       if (ret == -1 && errno != EAGAIN && errno != ENOBUFS) {
+               swrap_dump_packet(si, NULL, SWRAP_RECV_RST, NULL, 0);
+       } else if (ret == 0) { /* END OF FILE */
+               swrap_dump_packet(si, NULL, SWRAP_RECV_RST, NULL, 0);
+       } else if (ret > 0) {
+               uint8_t *buf;
+               off_t ofs = 0;
+               size_t i;
+               size_t remain = ret;
+
+               /* we capture it as one single packet */
+               buf = (uint8_t *)malloc(ret);
+               if (!buf) {
+                       /* the packet is simply not captured; the read itself
+                        * succeeded, so the allocation error is cleared */
+                       errno = 0;
+                       return ret;
+               }
+
+               /* flatten the filled part of the iovec array */
+               for (i=0; i < count; i++) {
+                       size_t this_time = MIN(remain, vector[i].iov_len);
+                       memcpy(buf + ofs,
+                              vector[i].iov_base,
+                              this_time);
+                       ofs += this_time;
+                       remain -= this_time;
+               }
+
+               swrap_dump_packet(si, NULL, SWRAP_RECV, buf, ret);
+               free(buf);
+       }
+
+       return ret;
+}
+
+/*
+ * writev() replacement.
+ *
+ * Descriptors that are not tracked by the wrapper go straight to the real
+ * writev(). For wrapped sockets the iovec array is described by a msghdr
+ * so that the shared swrap_sendmsg_before()/swrap_sendmsg_after() hooks
+ * can run around the real writev().
+ *
+ * Returns the number of bytes written, or -1 on error.
+ */
+int swrap_writev(int s, const struct iovec *vector, size_t count)
+{
+       struct socket_info *si = find_socket_info(s);
+       struct sockaddr_un un_addr;
+       struct iovec scratch;
+       struct msghdr hdr;
+       ssize_t written;
+
+       if (si == NULL) {
+               return real_writev(s, vector, count);
+       }
+
+       scratch.iov_base = NULL;
+       scratch.iov_len = 0;
+
+       /*
+        * No destination address is given. msg_control, msg_controllen and
+        * msg_flags are not available on solaris and stay zeroed.
+        */
+       ZERO_STRUCT(hdr);
+       hdr.msg_name = NULL;
+       hdr.msg_namelen = 0;
+       hdr.msg_iov = discard_const_p(struct iovec, vector);
+       hdr.msg_iovlen = count;
+
+       written = swrap_sendmsg_before(s, si, &hdr, &scratch, &un_addr, NULL, NULL, NULL);
+       if (written == -1) {
+               return -1;
+       }
+
+       /* write what the hook left in the header */
+       written = real_writev(s, hdr.msg_iov, hdr.msg_iovlen);
+
+       swrap_sendmsg_after(si, &hdr, NULL, written);
+
+       return written;
+}
+
+/*
+ * close() replacement.
+ *
+ * Descriptors that are not tracked by the wrapper go straight to the real
+ * close(). Otherwise the descriptor is dropped from the socket's fd list;
+ * while other descriptors still refer to the socket only the real close()
+ * is done. When the last one goes away the socket is removed from the
+ * global list, the close is written to the packet capture (if both the
+ * local and the peer address are known), the temporary socket file is
+ * unlinked and the wrapper state is freed.
+ *
+ * Returns the result of the real close().
+ */
+_PUBLIC_ int swrap_close(int fd)
+{
+       struct socket_info *si = find_socket_info(fd);
+       struct socket_info_fd *entry;
+       int rc;
+
+       if (si == NULL) {
+               return real_close(fd);
+       }
+
+       /* drop the list entry that belongs to this descriptor, if any */
+       entry = si->fds;
+       while (entry != NULL && entry->fd != fd) {
+               entry = entry->next;
+       }
+       if (entry != NULL) {
+               SWRAP_DLIST_REMOVE(si->fds, entry);
+               free(entry);
+       }
+
+       if (si->fds != NULL) {
+               /* there are still references left */
+               return real_close(fd);
+       }
+
+       SWRAP_DLIST_REMOVE(sockets, si);
+
+       if (si->myname && si->peername) {
+               swrap_dump_packet(si, NULL, SWRAP_CLOSE_SEND, NULL, 0);
+       }
+
+       rc = real_close(fd);
+
+       if (si->myname && si->peername) {
+               swrap_dump_packet(si, NULL, SWRAP_CLOSE_RECV, NULL, 0);
+               swrap_dump_packet(si, NULL, SWRAP_CLOSE_ACK, NULL, 0);
+       }
+
+       /* free(NULL) is a no-op, so unset names need no guard */
+       free(si->myname);
+       free(si->peername);
+       if (si->tmp_path != NULL) {
+               unlink(si->tmp_path);
+               free(si->tmp_path);
+       }
+       free(si);
+
+       return rc;
+}
+
+/*
+ * dup() replacement.
+ *
+ * Descriptors that are not tracked by the wrapper go straight to the real
+ * dup(). For wrapped sockets the new descriptor is added to the socket's
+ * fd list, so that it refers to the same wrapper state.
+ *
+ * Returns the new descriptor, or -1 with errno set.
+ */
+_PUBLIC_ int swrap_dup(int fd)
+{
+       struct socket_info *si = find_socket_info(fd);
+       struct socket_info_fd *entry;
+
+       if (si == NULL) {
+               return real_dup(fd);
+       }
+
+       /* allocate the list entry first, so a failure leaves no stray fd */
+       entry = (struct socket_info_fd *)calloc(1, sizeof(struct socket_info_fd));
+       if (entry == NULL) {
+               errno = ENOMEM;
+               return -1;
+       }
+
+       entry->fd = real_dup(fd);
+       if (entry->fd == -1) {
+               /* free() must not hide the error reported by dup() */
+               int saved_errno = errno;
+               free(entry);
+               errno = saved_errno;
+               return -1;
+       }
+
+       SWRAP_DLIST_ADD(si->fds, entry);
+       return entry->fd;
+}
+
+/*
+ * dup2() replacement.
+ *
+ * Descriptors that are not tracked by the wrapper go straight to the real
+ * dup2(). For wrapped sockets:
+ *  - if fd and newfd are the same, nothing is done and newfd is returned,
+ *    as dup2() itself behaves for a valid descriptor;
+ *  - if newfd is itself a wrapped socket, it is closed through
+ *    swrap_close() first, emulating the implicit close done by dup2();
+ *  - the duplicated descriptor is added to the socket's fd list.
+ *
+ * Returns newfd on success, or -1 with errno set.
+ */
+_PUBLIC_ int swrap_dup2(int fd, int newfd)
+{
+       struct socket_info *si;
+       struct socket_info_fd *fi;
+
+       si = find_socket_info(fd);
+
+       if (!si) {
+               return real_dup2(fd, newfd);
+       }
+
+       if (fd == newfd) {
+               /*
+                * Without this check the emulated close below would really
+                * close the descriptor and, for the last reference, free
+                * 'si' while it is still used further down.
+                */
+               return newfd;
+       }
+
+       if (find_socket_info(newfd)) {
+               /* dup2() does an implicit close of newfd, which we
+                * need to emulate */
+               swrap_close(newfd);
+       }
+
+       fi = (struct socket_info_fd *)calloc(1, sizeof(struct socket_info_fd));
+       if (fi == NULL) {
+               errno = ENOMEM;
+               return -1;
+       }
+
+       fi->fd = real_dup2(fd, newfd);
+       if (fi->fd == -1) {
+               /* free() must not hide the error reported by dup2() */
+               int saved_errno = errno;
+               free(fi);
+               errno = saved_errno;
+               return -1;
+       }
+
+       SWRAP_DLIST_ADD(si->fds, fi);
+       return fi->fd;
+}
diff --git a/ctdb/lib/socket_wrapper/socket_wrapper.h b/ctdb/lib/socket_wrapper/socket_wrapper.h
new file mode 100644 (file)
index 0000000..32c9de6
--- /dev/null
@@ -0,0 +1,177 @@
+/*
+ * Copyright (C) Jelmer Vernooij 2005 <jelmer@samba.org>
+ * Copyright (C) Stefan Metzmacher 2006 <metze@samba.org>
+ *
+ * All rights reserved.
+ * 
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 
+ * 3. Neither the name of the author nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#ifndef __SOCKET_WRAPPER_H__
+#define __SOCKET_WRAPPER_H__
+
+/*
+ * Configuration accessors. testsuite.c exercises them through the
+ * SOCKET_WRAPPER_DIR and SOCKET_WRAPPER_DEFAULT_IFACE environment
+ * variables.
+ */
+const char *socket_wrapper_dir(void);
+unsigned int socket_wrapper_default_iface(void);
+/*
+ * swrap_* counterparts of the libc socket and descriptor calls. With
+ * SOCKET_WRAPPER_REPLACE defined, the macros further down in this header
+ * map the plain libc names onto these functions.
+ */
+int swrap_socket(int family, int type, int protocol);
+int swrap_accept(int s, struct sockaddr *addr, socklen_t *addrlen);
+int swrap_connect(int s, const struct sockaddr *serv_addr, socklen_t addrlen);
+int swrap_bind(int s, const struct sockaddr *myaddr, socklen_t addrlen);
+int swrap_listen(int s, int backlog);
+int swrap_getpeername(int s, struct sockaddr *name, socklen_t *addrlen);
+int swrap_getsockname(int s, struct sockaddr *name, socklen_t *addrlen);
+int swrap_getsockopt(int s, int level, int optname, void *optval, socklen_t *optlen);
+int swrap_setsockopt(int s, int  level,  int  optname,  const  void  *optval, socklen_t optlen);
+ssize_t swrap_recvfrom(int s, void *buf, size_t len, int flags, struct sockaddr *from, socklen_t *fromlen);
+ssize_t swrap_sendto(int s, const void *buf, size_t len, int flags, const struct sockaddr *to, socklen_t tolen);
+ssize_t swrap_sendmsg(int s, const struct msghdr *msg, int flags);
+ssize_t swrap_recvmsg(int s, struct msghdr *msg, int flags);
+int swrap_ioctl(int s, int req, void *ptr);
+ssize_t swrap_recv(int s, void *buf, size_t len, int flags);
+ssize_t swrap_read(int s, void *buf, size_t len);
+ssize_t swrap_send(int s, const void *buf, size_t len, int flags);
+/* NOTE(review): these two return int although readv()/writev() return
+ * ssize_t -- confirm whether that is intended */
+int swrap_readv(int s, const struct iovec *vector, size_t count);
+int swrap_writev(int s, const struct iovec *vector, size_t count);
+int swrap_close(int);
+int swrap_dup(int oldfd);
+int swrap_dup2(int oldfd, int newfd);
+
+#ifdef SOCKET_WRAPPER_REPLACE
+/*
+ * With SOCKET_WRAPPER_REPLACE defined, the plain libc names below are
+ * redirected to the corresponding swrap_* functions by function-like
+ * macros. Each name is #undef'd first in case it is already defined as a
+ * macro.
+ */
+
+#ifdef accept
+#undef accept
+#endif
+#define accept(s,addr,addrlen)         swrap_accept(s,addr,addrlen)
+
+#ifdef connect
+#undef connect
+#endif
+#define connect(s,serv_addr,addrlen)   swrap_connect(s,serv_addr,addrlen)
+
+#ifdef bind
+#undef bind
+#endif
+#define bind(s,myaddr,addrlen)         swrap_bind(s,myaddr,addrlen)
+
+#ifdef listen
+#undef listen
+#endif
+#define listen(s,blog)                 swrap_listen(s,blog)
+
+#ifdef getpeername
+#undef getpeername
+#endif
+#define getpeername(s,name,addrlen)    swrap_getpeername(s,name,addrlen)
+
+#ifdef getsockname
+#undef getsockname
+#endif
+#define getsockname(s,name,addrlen)    swrap_getsockname(s,name,addrlen)
+
+#ifdef getsockopt
+#undef getsockopt
+#endif
+#define getsockopt(s,level,optname,optval,optlen) swrap_getsockopt(s,level,optname,optval,optlen)
+
+#ifdef setsockopt
+#undef setsockopt
+#endif
+#define setsockopt(s,level,optname,optval,optlen) swrap_setsockopt(s,level,optname,optval,optlen)
+
+#ifdef recvfrom
+#undef recvfrom
+#endif
+#define recvfrom(s,buf,len,flags,from,fromlen)           swrap_recvfrom(s,buf,len,flags,from,fromlen)
+
+#ifdef sendto
+#undef sendto
+#endif
+#define sendto(s,buf,len,flags,to,tolen)          swrap_sendto(s,buf,len,flags,to,tolen)
+
+#ifdef sendmsg
+#undef sendmsg
+#endif
+#define sendmsg(s,msg,flags)            swrap_sendmsg(s,msg,flags)
+
+#ifdef recvmsg
+#undef recvmsg
+#endif
+#define recvmsg(s,msg,flags)            swrap_recvmsg(s,msg,flags)
+
+#ifdef ioctl
+#undef ioctl
+#endif
+#define ioctl(s,req,ptr)               swrap_ioctl(s,req,ptr)
+
+#ifdef recv
+#undef recv
+#endif
+#define recv(s,buf,len,flags)          swrap_recv(s,buf,len,flags)
+
+#ifdef read
+#undef read
+#endif
+#define read(s,buf,len)                swrap_read(s,buf,len)
+
+#ifdef send
+#undef send
+#endif
+#define send(s,buf,len,flags)          swrap_send(s,buf,len,flags)
+
+#ifdef readv
+#undef readv
+#endif
+#define readv(s, vector, count)                swrap_readv(s,vector, count)
+
+#ifdef writev
+#undef writev
+#endif
+#define writev(s, vector, count)       swrap_writev(s,vector, count)
+
+#ifdef socket
+#undef socket
+#endif
+#define socket(domain,type,protocol)   swrap_socket(domain,type,protocol)
+
+#ifdef close
+#undef close
+#endif
+#define close(s)                       swrap_close(s)
+
+#ifdef dup
+#undef dup
+#endif
+#define dup(s)                 swrap_dup(s)
+
+#ifdef dup2
+#undef dup2
+#endif
+#define dup2(s, s2)            swrap_dup2(s, s2)
+
+#endif /* SOCKET_WRAPPER_REPLACE */
+#endif /* __SOCKET_WRAPPER_H__ */
diff --git a/ctdb/lib/socket_wrapper/testsuite.c b/ctdb/lib/socket_wrapper/testsuite.c
new file mode 100644 (file)
index 0000000..9274e7f
--- /dev/null
@@ -0,0 +1,105 @@
+/* 
+   Unix SMB/CIFS implementation.
+
+   local testing of the socket wrapper
+
+   Copyright (C) Jelmer Vernooij 2007
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include <stdlib.h>
+#include <string.h>
+#include "system/network.h"
+#include "../socket_wrapper/socket_wrapper.h"
+#include "torture/torture.h"
+
+/* private copies of the environment values saved by backup_env() */
+static char *old_dir = NULL;
+static char *old_iface = NULL;
+
+/*
+ * Return a heap copy of the value of the environment variable 'name', or
+ * NULL if it is unset (or if the copy cannot be allocated).
+ */
+static char *dup_env(const char *name)
+{
+       const char *value = getenv(name);
+
+       return (value != NULL) ? strdup(value) : NULL;
+}
+
+/* Put 'saved' back into the environment; NULL means "was not set". */
+static void reset_env(const char *name, const char *saved)
+{
+       if (saved == NULL)
+               unsetenv(name);
+       else
+               setenv(name, saved, 1);
+}
+
+/*
+ * Remember the current socket wrapper settings.
+ *
+ * The values are copied: the pointer returned by getenv() may be
+ * invalidated by the setenv()/unsetenv() calls the tests make afterwards.
+ */
+static void backup_env(void)
+{
+       /* drop copies left over from a backup that was never restored */
+       free(old_dir);
+       free(old_iface);
+
+       old_dir = dup_env("SOCKET_WRAPPER_DIR");
+       old_iface = dup_env("SOCKET_WRAPPER_DEFAULT_IFACE");
+}
+
+/*
+ * Put the settings saved by backup_env() back into the environment and
+ * release the saved copies.
+ */
+static void restore_env(void)
+{
+       reset_env("SOCKET_WRAPPER_DIR", old_dir);
+       reset_env("SOCKET_WRAPPER_DEFAULT_IFACE", old_iface);
+
+       free(old_dir);
+       old_dir = NULL;
+       free(old_iface);
+       old_iface = NULL;
+}
+
+/*
+ * Check that socket_wrapper_dir() follows the SOCKET_WRAPPER_DIR
+ * environment variable: "foo" is expected back unchanged, "./foo" is
+ * expected back as "foo", and NULL is expected once the variable is unset.
+ *
+ * The environment is saved on entry and restored before returning true.
+ * NOTE(review): the torture_assert_* macros presumably return early on a
+ * failed check, which would skip restore_env() -- confirm.
+ */
+static bool test_socket_wrapper_dir(struct torture_context *tctx)
+{
+       backup_env();
+
+       setenv("SOCKET_WRAPPER_DIR", "foo", 1);
+       torture_assert_str_equal(tctx, socket_wrapper_dir(), "foo", "setting failed");
+       /* a leading "./" is expected to be dropped */
+       setenv("SOCKET_WRAPPER_DIR", "./foo", 1);
+       torture_assert_str_equal(tctx, socket_wrapper_dir(), "foo", "setting failed");
+       unsetenv("SOCKET_WRAPPER_DIR");
+       torture_assert_str_equal(tctx, socket_wrapper_dir(), NULL, "resetting failed");
+
+       restore_env();
+
+       return true;
+}
+
+/*
+ * Check the error reporting of swrap_socket() while SOCKET_WRAPPER_DIR is
+ * set: an unknown address family must fail with EAFNOSUPPORT, an unknown
+ * socket type and an unknown protocol must fail with EPROTONOSUPPORT.
+ *
+ * The environment is saved on entry and restored before returning true.
+ * NOTE(review): the torture_assert_* macros presumably return early on a
+ * failed check, which would skip restore_env() -- confirm.
+ */
+static bool test_swrap_socket(struct torture_context *tctx)
+{
+       backup_env();
+       setenv("SOCKET_WRAPPER_DIR", "foo", 1);
+
+       /* 1337 is used as a value that is neither a valid family nor type */
+       torture_assert_int_equal(tctx, swrap_socket(1337, 1337, 0), -1, "unknown address family fails");
+       torture_assert_int_equal(tctx, errno, EAFNOSUPPORT, "correct errno set");
+       torture_assert_int_equal(tctx, swrap_socket(AF_INET, 1337, 0), -1, "unknown type fails");
+       torture_assert_int_equal(tctx, errno, EPROTONOSUPPORT, "correct errno set");
+       torture_assert_int_equal(tctx, swrap_socket(AF_INET, SOCK_DGRAM, 10), -1, "unknown protocol fails");
+       torture_assert_int_equal(tctx, errno, EPROTONOSUPPORT, "correct errno set");
+
+       restore_env();
+
+       return true;
+}
+
+unsigned int socket_wrapper_default_iface(void);
+/*
+ * Check that socket_wrapper_default_iface() follows the
+ * SOCKET_WRAPPER_DEFAULT_IFACE environment variable: 1 is expected when
+ * the variable is unset, 2 when it is "2", and 1 again when it holds the
+ * non-numeric value "bla".
+ *
+ * Each check carries its own failure message, so that a failing case can
+ * be told apart from the others.
+ *
+ * The environment is saved on entry and restored before returning true.
+ */
+static bool test_socket_wrapper_default_iface(struct torture_context *tctx)
+{
+       backup_env();
+       unsetenv("SOCKET_WRAPPER_DEFAULT_IFACE");
+       torture_assert_int_equal(tctx, socket_wrapper_default_iface(), 1, "unset");
+       setenv("SOCKET_WRAPPER_DEFAULT_IFACE", "2", 1);
+       torture_assert_int_equal(tctx, socket_wrapper_default_iface(), 2, "set to 2");
+       setenv("SOCKET_WRAPPER_DEFAULT_IFACE", "bla", 1);
+       torture_assert_int_equal(tctx, socket_wrapper_default_iface(), 1, "invalid value");
+       restore_env();
+       return true;
+}
+
+/*
+ * Build the "socket-wrapper" torture suite on mem_ctx and register the
+ * three local tests of this file with it.
+ *
+ * Returns the suite as created by torture_suite_create().
+ */
+struct torture_suite *torture_local_socket_wrapper(TALLOC_CTX *mem_ctx)
+{
+       struct torture_suite *suite;
+
+       suite = torture_suite_create(mem_ctx, "socket-wrapper");
+
+       torture_suite_add_simple_test(suite,
+                                     "socket_wrapper_dir",
+                                     test_socket_wrapper_dir);
+       torture_suite_add_simple_test(suite,
+                                     "socket",
+                                     test_swrap_socket);
+       torture_suite_add_simple_test(suite,
+                                     "socket_wrapper_default_iface",
+                                     test_socket_wrapper_default_iface);
+
+       return suite;
+}
diff --git a/ctdb/lib/socket_wrapper/wscript b/ctdb/lib/socket_wrapper/wscript
new file mode 100644 (file)
index 0000000..9da578f
--- /dev/null
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+
+import Options
+
+def set_options(opt):
+    gr = opt.option_group('developer options')
+    gr.add_option('--enable-socket-wrapper',
+                   help=("Turn on socket wrapper library (default=no)"),
+                   action="store_true", dest='enable_socket_wrapper', default=False)
+
+def configure(conf):
+    if (Options.options.enable_socket_wrapper or Options.options.developer or Options.options.enable_selftest):
+        conf.DEFINE('SOCKET_WRAPPER', 1)
+        conf.ADD_GLOBAL_DEPENDENCY('socket_wrapper')
+
diff --git a/ctdb/lib/socket_wrapper/wscript_build b/ctdb/lib/socket_wrapper/wscript_build
new file mode 100644 (file)
index 0000000..a81c7aa
--- /dev/null
@@ -0,0 +1,9 @@
+#!/usr/bin/env python
+
+# Build socket_wrapper.c as the private library 'socket_wrapper' in the
+# 'base_libraries' group. The target is only enabled when SOCKET_WRAPPER is
+# set in the configuration, and it depends on 'replace'.
+bld.SAMBA_LIBRARY('socket_wrapper',
+                  source='socket_wrapper.c',
+                  group='base_libraries',
+                  private_library=True,
+                  enabled=bld.CONFIG_SET('SOCKET_WRAPPER'),
+                  deps='replace')
+
diff --git a/ctdb/lib/talloc/ABI/pytalloc-util-2.0.6.sigs b/ctdb/lib/talloc/ABI/pytalloc-util-2.0.6.sigs
new file mode 100644 (file)
index 0000000..961c1a8
--- /dev/null
@@ -0,0 +1,6 @@
+pytalloc_CObject_FromTallocPtr: PyObject *(void *)
+pytalloc_Check: int (PyObject *)
+pytalloc_GetObjectType: PyTypeObject *(void)
+pytalloc_reference_ex: PyObject *(PyTypeObject *, TALLOC_CTX *, void *)
+pytalloc_steal: PyObject *(PyTypeObject *, void *)
+pytalloc_steal_ex: PyObject *(PyTypeObject *, TALLOC_CTX *, void *)
diff --git a/ctdb/lib/talloc/ABI/pytalloc-util-2.0.7.sigs b/ctdb/lib/talloc/ABI/pytalloc-util-2.0.7.sigs
new file mode 100644 (file)
index 0000000..961c1a8
--- /dev/null
@@ -0,0 +1,6 @@
+pytalloc_CObject_FromTallocPtr: PyObject *(void *)
+pytalloc_Check: int (PyObject *)
+pytalloc_GetObjectType: PyTypeObject *(void)
+pytalloc_reference_ex: PyObject *(PyTypeObject *, TALLOC_CTX *, void *)
+pytalloc_steal: PyObject *(PyTypeObject *, void *)
+pytalloc_steal_ex: PyObject *(PyTypeObject *, TALLOC_CTX *, void *)
diff --git a/ctdb/lib/talloc/ABI/pytalloc-util-2.0.8.sigs b/ctdb/lib/talloc/ABI/pytalloc-util-2.0.8.sigs
new file mode 100644 (file)
index 0000000..961c1a8
--- /dev/null
@@ -0,0 +1,6 @@
+pytalloc_CObject_FromTallocPtr: PyObject *(void *)
+pytalloc_Check: int (PyObject *)
+pytalloc_GetObjectType: PyTypeObject *(void)
+pytalloc_reference_ex: PyObject *(PyTypeObject *, TALLOC_CTX *, void *)
+pytalloc_steal: PyObject *(PyTypeObject *, void *)
+pytalloc_steal_ex: PyObject *(PyTypeObject *, TALLOC_CTX *, void *)
diff --git a/ctdb/lib/talloc/ABI/talloc-2.0.2.sigs b/ctdb/lib/talloc/ABI/talloc-2.0.2.sigs
new file mode 100644 (file)
index 0000000..6e236d5
--- /dev/null
@@ -0,0 +1,62 @@
+_talloc: void *(const void *, size_t)
+_talloc_array: void *(const void *, size_t, unsigned int, const char *)
+_talloc_free: int (void *, const char *)
+_talloc_get_type_abort: void *(const void *, const char *, const char *)
+_talloc_memdup: void *(const void *, const void *, size_t, const char *)
+_talloc_move: void *(const void *, const void *)
+_talloc_realloc: void *(const void *, void *, size_t, const char *)
+_talloc_realloc_array: void *(const void *, void *, size_t, unsigned int, const char *)
+_talloc_reference_loc: void *(const void *, const void *, const char *)
+_talloc_set_destructor: void (const void *, int (*)(void *))
+_talloc_steal_loc: void *(const void *, const void *, const char *)
+_talloc_zero: void *(const void *, size_t, const char *)
+_talloc_zero_array: void *(const void *, size_t, unsigned int, const char *)
+talloc_asprintf: char *(const void *, const char *, ...)
+talloc_asprintf_append: char *(char *, const char *, ...)
+talloc_asprintf_append_buffer: char *(char *, const char *, ...)
+talloc_autofree_context: void *(void)
+talloc_check_name: void *(const void *, const char *)
+talloc_disable_null_tracking: void (void)
+talloc_enable_leak_report: void (void)
+talloc_enable_leak_report_full: void (void)
+talloc_enable_null_tracking: void (void)
+talloc_enable_null_tracking_no_autofree: void (void)
+talloc_find_parent_byname: void *(const void *, const char *)
+talloc_free_children: void (void *)
+talloc_get_name: const char *(const void *)
+talloc_get_size: size_t (const void *)
+talloc_increase_ref_count: int (const void *)
+talloc_init: void *(const char *, ...)
+talloc_is_parent: int (const void *, const void *)
+talloc_named: void *(const void *, size_t, const char *, ...)
+talloc_named_const: void *(const void *, size_t, const char *)
+talloc_parent: void *(const void *)
+talloc_parent_name: const char *(const void *)
+talloc_pool: void *(const void *, size_t)
+talloc_realloc_fn: void *(const void *, void *, size_t)
+talloc_reference_count: size_t (const void *)
+talloc_reparent: void *(const void *, const void *, const void *)
+talloc_report: void (const void *, FILE *)
+talloc_report_depth_cb: void (const void *, int, int, void (*)(const void *, int, int, int, void *), void *)
+talloc_report_depth_file: void (const void *, int, int, FILE *)
+talloc_report_full: void (const void *, FILE *)
+talloc_set_abort_fn: void (void (*)(const char *))
+talloc_set_log_fn: void (void (*)(const char *))
+talloc_set_log_stderr: void (void)
+talloc_set_name: const char *(const void *, const char *, ...)
+talloc_set_name_const: void (const void *, const char *)
+talloc_show_parents: void (const void *, FILE *)
+talloc_strdup: char *(const void *, const char *)
+talloc_strdup_append: char *(char *, const char *)
+talloc_strdup_append_buffer: char *(char *, const char *)
+talloc_strndup: char *(const void *, const char *, size_t)
+talloc_strndup_append: char *(char *, const char *, size_t)
+talloc_strndup_append_buffer: char *(char *, const char *, size_t)
+talloc_total_blocks: size_t (const void *)
+talloc_total_size: size_t (const void *)
+talloc_unlink: int (const void *, void *)
+talloc_vasprintf: char *(const void *, const char *, va_list)
+talloc_vasprintf_append: char *(char *, const char *, va_list)
+talloc_vasprintf_append_buffer: char *(char *, const char *, va_list)
+talloc_version_major: int (void)
+talloc_version_minor: int (void)
diff --git a/ctdb/lib/talloc/ABI/talloc-2.0.3.sigs b/ctdb/lib/talloc/ABI/talloc-2.0.3.sigs
new file mode 100644 (file)
index 0000000..6e236d5
--- /dev/null
@@ -0,0 +1,62 @@
+_talloc: void *(const void *, size_t)
+_talloc_array: void *(const void *, size_t, unsigned int, const char *)
+_talloc_free: int (void *, const char *)
+_talloc_get_type_abort: void *(const void *, const char *, const char *)
+_talloc_memdup: void *(const void *, const void *, size_t, const char *)
+_talloc_move: void *(const void *, const void *)
+_talloc_realloc: void *(const void *, void *, size_t, const char *)
+_talloc_realloc_array: void *(const void *, void *, size_t, unsigned int, const char *)
+_talloc_reference_loc: void *(const void *, const void *, const char *)
+_talloc_set_destructor: void (const void *, int (*)(void *))
+_talloc_steal_loc: void *(const void *, const void *, const char *)
+_talloc_zero: void *(const void *, size_t, const char *)
+_talloc_zero_array: void *(const void *, size_t, unsigned int, const char *)
+talloc_asprintf: char *(const void *, const char *, ...)
+talloc_asprintf_append: char *(char *, const char *, ...)
+talloc_asprintf_append_buffer: char *(char *, const char *, ...)
+talloc_autofree_context: void *(void)
+talloc_check_name: void *(const void *, const char *)
+talloc_disable_null_tracking: void (void)
+talloc_enable_leak_report: void (void)
+talloc_enable_leak_report_full: void (void)
+talloc_enable_null_tracking: void (void)
+talloc_enable_null_tracking_no_autofree: void (void)
+talloc_find_parent_byname: void *(const void *, const char *)
+talloc_free_children: void (void *)
+talloc_get_name: const char *(const void *)
+talloc_get_size: size_t (const void *)
+talloc_increase_ref_count: int (const void *)
+talloc_init: void *(const char *, ...)
+talloc_is_parent: int (const void *, const void *)
+talloc_named: void *(const void *, size_t, const char *, ...)
+talloc_named_const: void *(const void *, size_t, const char *)
+talloc_parent: void *(const void *)
+talloc_parent_name: const char *(const void *)
+talloc_pool: void *(const void *, size_t)
+talloc_realloc_fn: void *(const void *, void *, size_t)
+talloc_reference_count: size_t (const void *)
+talloc_reparent: void *(const void *, const void *, const void *)
+talloc_report: void (const void *, FILE *)
+talloc_report_depth_cb: void (const void *, int, int, void (*)(const void *, int, int, int, void *), void *)
+talloc_report_depth_file: void (const void *, int, int, FILE *)
+talloc_report_full: void (const void *, FILE *)
+talloc_set_abort_fn: void (void (*)(const char *))
+talloc_set_log_fn: void (void (*)(const char *))
+talloc_set_log_stderr: void (void)
+talloc_set_name: const char *(const void *, const char *, ...)
+talloc_set_name_const: void (const void *, const char *)
+talloc_show_parents: void (const void *, FILE *)
+talloc_strdup: char *(const void *, const char *)
+talloc_strdup_append: char *(char *, const char *)
+talloc_strdup_append_buffer: char *(char *, const char *)
+talloc_strndup: char *(const void *, const char *, size_t)
+talloc_strndup_append: char *(char *, const char *, size_t)
+talloc_strndup_append_buffer: char *(char *, const char *, size_t)
+talloc_total_blocks: size_t (const void *)
+talloc_total_size: size_t (const void *)
+talloc_unlink: int (const void *, void *)
+talloc_vasprintf: char *(const void *, const char *, va_list)
+talloc_vasprintf_append: char *(char *, const char *, va_list)
+talloc_vasprintf_append_buffer: char *(char *, const char *, va_list)
+talloc_version_major: int (void)
+talloc_version_minor: int (void)
diff --git a/ctdb/lib/talloc/ABI/talloc-2.0.4.sigs b/ctdb/lib/talloc/ABI/talloc-2.0.4.sigs
new file mode 100644 (file)
index 0000000..6e236d5
--- /dev/null
@@ -0,0 +1,62 @@
+_talloc: void *(const void *, size_t)
+_talloc_array: void *(const void *, size_t, unsigned int, const char *)
+_talloc_free: int (void *, const char *)
+_talloc_get_type_abort: void *(const void *, const char *, const char *)
+_talloc_memdup: void *(const void *, const void *, size_t, const char *)
+_talloc_move: void *(const void *, const void *)
+_talloc_realloc: void *(const void *, void *, size_t, const char *)
+_talloc_realloc_array: void *(const void *, void *, size_t, unsigned int, const char *)
+_talloc_reference_loc: void *(const void *, const void *, const char *)
+_talloc_set_destructor: void (const void *, int (*)(void *))
+_talloc_steal_loc: void *(const void *, const void *, const char *)
+_talloc_zero: void *(const void *, size_t, const char *)
+_talloc_zero_array: void *(const void *, size_t, unsigned int, const char *)
+talloc_asprintf: char *(const void *, const char *, ...)
+talloc_asprintf_append: char *(char *, const char *, ...)
+talloc_asprintf_append_buffer: char *(char *, const char *, ...)
+talloc_autofree_context: void *(void)
+talloc_check_name: void *(const void *, const char *)
+talloc_disable_null_tracking: void (void)
+talloc_enable_leak_report: void (void)
+talloc_enable_leak_report_full: void (void)
+talloc_enable_null_tracking: void (void)
+talloc_enable_null_tracking_no_autofree: void (void)
+talloc_find_parent_byname: void *(const void *, const char *)
+talloc_free_children: void (void *)
+talloc_get_name: const char *(const void *)
+talloc_get_size: size_t (const void *)
+talloc_increase_ref_count: int (const void *)
+talloc_init: void *(const char *, ...)
+talloc_is_parent: int (const void *, const void *)
+talloc_named: void *(const void *, size_t, const char *, ...)
+talloc_named_const: void *(const void *, size_t, const char *)
+talloc_parent: void *(const void *)
+talloc_parent_name: const char *(const void *)
+talloc_pool: void *(const void *, size_t)
+talloc_realloc_fn: void *(const void *, void *, size_t)
+talloc_reference_count: size_t (const void *)
+talloc_reparent: void *(const void *, const void *, const void *)
+talloc_report: void (const void *, FILE *)
+talloc_report_depth_cb: void (const void *, int, int, void (*)(const void *, int, int, int, void *), void *)
+talloc_report_depth_file: void (const void *, int, int, FILE *)
+talloc_report_full: void (const void *, FILE *)
+talloc_set_abort_fn: void (void (*)(const char *))
+talloc_set_log_fn: void (void (*)(const char *))
+talloc_set_log_stderr: void (void)
+talloc_set_name: const char *(const void *, const char *, ...)
+talloc_set_name_const: void (const void *, const char *)
+talloc_show_parents: void (const void *, FILE *)
+talloc_strdup: char *(const void *, const char *)
+talloc_strdup_append: char *(char *, const char *)
+talloc_strdup_append_buffer: char *(char *, const char *)
+talloc_strndup: char *(const void *, const char *, size_t)
+talloc_strndup_append: char *(char *, const char *, size_t)
+talloc_strndup_append_buffer: char *(char *, const char *, size_t)
+talloc_total_blocks: size_t (const void *)
+talloc_total_size: size_t (const void *)
+talloc_unlink: int (const void *, void *)
+talloc_vasprintf: char *(const void *, const char *, va_list)
+talloc_vasprintf_append: char *(char *, const char *, va_list)
+talloc_vasprintf_append_buffer: char *(char *, const char *, va_list)
+talloc_version_major: int (void)
+talloc_version_minor: int (void)
diff --git a/ctdb/lib/talloc/ABI/talloc-2.0.5.sigs b/ctdb/lib/talloc/ABI/talloc-2.0.5.sigs
new file mode 100644 (file)
index 0000000..6e236d5
--- /dev/null
@@ -0,0 +1,62 @@
+_talloc: void *(const void *, size_t)
+_talloc_array: void *(const void *, size_t, unsigned int, const char *)
+_talloc_free: int (void *, const char *)
+_talloc_get_type_abort: void *(const void *, const char *, const char *)
+_talloc_memdup: void *(const void *, const void *, size_t, const char *)
+_talloc_move: void *(const void *, const void *)
+_talloc_realloc: void *(const void *, void *, size_t, const char *)
+_talloc_realloc_array: void *(const void *, void *, size_t, unsigned int, const char *)
+_talloc_reference_loc: void *(const void *, const void *, const char *)
+_talloc_set_destructor: void (const void *, int (*)(void *))
+_talloc_steal_loc: void *(const void *, const void *, const char *)
+_talloc_zero: void *(const void *, size_t, const char *)
+_talloc_zero_array: void *(const void *, size_t, unsigned int, const char *)
+talloc_asprintf: char *(const void *, const char *, ...)
+talloc_asprintf_append: char *(char *, const char *, ...)
+talloc_asprintf_append_buffer: char *(char *, const char *, ...)
+talloc_autofree_context: void *(void)
+talloc_check_name: void *(const void *, const char *)
+talloc_disable_null_tracking: void (void)
+talloc_enable_leak_report: void (void)
+talloc_enable_leak_report_full: void (void)
+talloc_enable_null_tracking: void (void)
+talloc_enable_null_tracking_no_autofree: void (void)
+talloc_find_parent_byname: void *(const void *, const char *)
+talloc_free_children: void (void *)
+talloc_get_name: const char *(const void *)
+talloc_get_size: size_t (const void *)
+talloc_increase_ref_count: int (const void *)
+talloc_init: void *(const char *, ...)
+talloc_is_parent: int (const void *, const void *)
+talloc_named: void *(const void *, size_t, const char *, ...)
+talloc_named_const: void *(const void *, size_t, const char *)
+talloc_parent: void *(const void *)
+talloc_parent_name: const char *(const void *)
+talloc_pool: void *(const void *, size_t)
+talloc_realloc_fn: void *(const void *, void *, size_t)
+talloc_reference_count: size_t (const void *)
+talloc_reparent: void *(const void *, const void *, const void *)
+talloc_report: void (const void *, FILE *)
+talloc_report_depth_cb: void (const void *, int, int, void (*)(const void *, int, int, int, void *), void *)
+talloc_report_depth_file: void (const void *, int, int, FILE *)
+talloc_report_full: void (const void *, FILE *)
+talloc_set_abort_fn: void (void (*)(const char *))
+talloc_set_log_fn: void (void (*)(const char *))
+talloc_set_log_stderr: void (void)
+talloc_set_name: const char *(const void *, const char *, ...)
+talloc_set_name_const: void (const void *, const char *)
+talloc_show_parents: void (const void *, FILE *)
+talloc_strdup: char *(const void *, const char *)
+talloc_strdup_append: char *(char *, const char *)
+talloc_strdup_append_buffer: char *(char *, const char *)
+talloc_strndup: char *(const void *, const char *, size_t)
+talloc_strndup_append: char *(char *, const char *, size_t)
+talloc_strndup_append_buffer: char *(char *, const char *, size_t)
+talloc_total_blocks: size_t (const void *)
+talloc_total_size: size_t (const void *)
+talloc_unlink: int (const void *, void *)
+talloc_vasprintf: char *(const void *, const char *, va_list)
+talloc_vasprintf_append: char *(char *, const char *, va_list)
+talloc_vasprintf_append_buffer: char *(char *, const char *, va_list)
+talloc_version_major: int (void)
+talloc_version_minor: int (void)
diff --git a/ctdb/lib/talloc/ABI/talloc-2.0.6.sigs b/ctdb/lib/talloc/ABI/talloc-2.0.6.sigs
new file mode 100644 (file)
index 0000000..6e236d5
--- /dev/null
@@ -0,0 +1,62 @@
+_talloc: void *(const void *, size_t)
+_talloc_array: void *(const void *, size_t, unsigned int, const char *)
+_talloc_free: int (void *, const char *)
+_talloc_get_type_abort: void *(const void *, const char *, const char *)
+_talloc_memdup: void *(const void *, const void *, size_t, const char *)
+_talloc_move: void *(const void *, const void *)
+_talloc_realloc: void *(const void *, void *, size_t, const char *)
+_talloc_realloc_array: void *(const void *, void *, size_t, unsigned int, const char *)
+_talloc_reference_loc: void *(const void *, const void *, const char *)
+_talloc_set_destructor: void (const void *, int (*)(void *))
+_talloc_steal_loc: void *(const void *, const void *, const char *)
+_talloc_zero: void *(const void *, size_t, const char *)
+_talloc_zero_array: void *(const void *, size_t, unsigned int, const char *)
+talloc_asprintf: char *(const void *, const char *, ...)
+talloc_asprintf_append: char *(char *, const char *, ...)
+talloc_asprintf_append_buffer: char *(char *, const char *, ...)
+talloc_autofree_context: void *(void)
+talloc_check_name: void *(const void *, const char *)
+talloc_disable_null_tracking: void (void)
+talloc_enable_leak_report: void (void)
+talloc_enable_leak_report_full: void (void)
+talloc_enable_null_tracking: void (void)
+talloc_enable_null_tracking_no_autofree: void (void)
+talloc_find_parent_byname: void *(const void *, const char *)
+talloc_free_children: void (void *)
+talloc_get_name: const char *(const void *)
+talloc_get_size: size_t (const void *)
+talloc_increase_ref_count: int (const void *)
+talloc_init: void *(const char *, ...)
+talloc_is_parent: int (const void *, const void *)
+talloc_named: void *(const void *, size_t, const char *, ...)
+talloc_named_const: void *(const void *, size_t, const char *)
+talloc_parent: void *(const void *)
+talloc_parent_name: const char *(const void *)
+talloc_pool: void *(const void *, size_t)
+talloc_realloc_fn: void *(const void *, void *, size_t)
+talloc_reference_count: size_t (const void *)
+talloc_reparent: void *(const void *, const void *, const void *)
+talloc_report: void (const void *, FILE *)
+talloc_report_depth_cb: void (const void *, int, int, void (*)(const void *, int, int, int, void *), void *)
+talloc_report_depth_file: void (const void *, int, int, FILE *)
+talloc_report_full: void (const void *, FILE *)
+talloc_set_abort_fn: void (void (*)(const char *))
+talloc_set_log_fn: void (void (*)(const char *))
+talloc_set_log_stderr: void (void)
+talloc_set_name: const char *(const void *, const char *, ...)
+talloc_set_name_const: void (const void *, const char *)
+talloc_show_parents: void (const void *, FILE *)
+talloc_strdup: char *(const void *, const char *)
+talloc_strdup_append: char *(char *, const char *)
+talloc_strdup_append_buffer: char *(char *, const char *)
+talloc_strndup: char *(const void *, const char *, size_t)
+talloc_strndup_append: char *(char *, const char *, size_t)
+talloc_strndup_append_buffer: char *(char *, const char *, size_t)
+talloc_total_blocks: size_t (const void *)
+talloc_total_size: size_t (const void *)
+talloc_unlink: int (const void *, void *)
+talloc_vasprintf: char *(const void *, const char *, va_list)
+talloc_vasprintf_append: char *(char *, const char *, va_list)
+talloc_vasprintf_append_buffer: char *(char *, const char *, va_list)
+talloc_version_major: int (void)
+talloc_version_minor: int (void)
diff --git a/ctdb/lib/talloc/ABI/talloc-2.0.7.sigs b/ctdb/lib/talloc/ABI/talloc-2.0.7.sigs
new file mode 100644 (file)
index 0000000..6e236d5
--- /dev/null
@@ -0,0 +1,62 @@
+_talloc: void *(const void *, size_t)
+_talloc_array: void *(const void *, size_t, unsigned int, const char *)
+_talloc_free: int (void *, const char *)
+_talloc_get_type_abort: void *(const void *, const char *, const char *)
+_talloc_memdup: void *(const void *, const void *, size_t, const char *)
+_talloc_move: void *(const void *, const void *)
+_talloc_realloc: void *(const void *, void *, size_t, const char *)
+_talloc_realloc_array: void *(const void *, void *, size_t, unsigned int, const char *)
+_talloc_reference_loc: void *(const void *, const void *, const char *)
+_talloc_set_destructor: void (const void *, int (*)(void *))
+_talloc_steal_loc: void *(const void *, const void *, const char *)
+_talloc_zero: void *(const void *, size_t, const char *)
+_talloc_zero_array: void *(const void *, size_t, unsigned int, const char *)
+talloc_asprintf: char *(const void *, const char *, ...)
+talloc_asprintf_append: char *(char *, const char *, ...)
+talloc_asprintf_append_buffer: char *(char *, const char *, ...)
+talloc_autofree_context: void *(void)
+talloc_check_name: void *(const void *, const char *)
+talloc_disable_null_tracking: void (void)
+talloc_enable_leak_report: void (void)
+talloc_enable_leak_report_full: void (void)
+talloc_enable_null_tracking: void (void)
+talloc_enable_null_tracking_no_autofree: void (void)
+talloc_find_parent_byname: void *(const void *, const char *)
+talloc_free_children: void (void *)
+talloc_get_name: const char *(const void *)
+talloc_get_size: size_t (const void *)
+talloc_increase_ref_count: int (const void *)
+talloc_init: void *(const char *, ...)
+talloc_is_parent: int (const void *, const void *)
+talloc_named: void *(const void *, size_t, const char *, ...)
+talloc_named_const: void *(const void *, size_t, const char *)
+talloc_parent: void *(const void *)
+talloc_parent_name: const char *(const void *)
+talloc_pool: void *(const void *, size_t)
+talloc_realloc_fn: void *(const void *, void *, size_t)
+talloc_reference_count: size_t (const void *)
+talloc_reparent: void *(const void *, const void *, const void *)
+talloc_report: void (const void *, FILE *)
+talloc_report_depth_cb: void (const void *, int, int, void (*)(const void *, int, int, int, void *), void *)
+talloc_report_depth_file: void (const void *, int, int, FILE *)
+talloc_report_full: void (const void *, FILE *)
+talloc_set_abort_fn: void (void (*)(const char *))
+talloc_set_log_fn: void (void (*)(const char *))
+talloc_set_log_stderr: void (void)
+talloc_set_name: const char *(const void *, const char *, ...)
+talloc_set_name_const: void (const void *, const char *)
+talloc_show_parents: void (const void *, FILE *)
+talloc_strdup: char *(const void *, const char *)
+talloc_strdup_append: char *(char *, const char *)
+talloc_strdup_append_buffer: char *(char *, const char *)
+talloc_strndup: char *(const void *, const char *, size_t)
+talloc_strndup_append: char *(char *, const char *, size_t)
+talloc_strndup_append_buffer: char *(char *, const char *, size_t)
+talloc_total_blocks: size_t (const void *)
+talloc_total_size: size_t (const void *)
+talloc_unlink: int (const void *, void *)
+talloc_vasprintf: char *(const void *, const char *, va_list)
+talloc_vasprintf_append: char *(char *, const char *, va_list)
+talloc_vasprintf_append_buffer: char *(char *, const char *, va_list)
+talloc_version_major: int (void)
+talloc_version_minor: int (void)
diff --git a/ctdb/lib/talloc/ABI/talloc-2.0.8.sigs b/ctdb/lib/talloc/ABI/talloc-2.0.8.sigs
new file mode 100644 (file)
index 0000000..15a9e95
--- /dev/null
@@ -0,0 +1,63 @@
+_talloc: void *(const void *, size_t)
+_talloc_array: void *(const void *, size_t, unsigned int, const char *)
+_talloc_free: int (void *, const char *)
+_talloc_get_type_abort: void *(const void *, const char *, const char *)
+_talloc_memdup: void *(const void *, const void *, size_t, const char *)
+_talloc_move: void *(const void *, const void *)
+_talloc_realloc: void *(const void *, void *, size_t, const char *)
+_talloc_realloc_array: void *(const void *, void *, size_t, unsigned int, const char *)
+_talloc_reference_loc: void *(const void *, const void *, const char *)
+_talloc_set_destructor: void (const void *, int (*)(void *))
+_talloc_steal_loc: void *(const void *, const void *, const char *)
+_talloc_zero: void *(const void *, size_t, const char *)
+_talloc_zero_array: void *(const void *, size_t, unsigned int, const char *)
+talloc_asprintf: char *(const void *, const char *, ...)
+talloc_asprintf_append: char *(char *, const char *, ...)
+talloc_asprintf_append_buffer: char *(char *, const char *, ...)
+talloc_autofree_context: void *(void)
+talloc_check_name: void *(const void *, const char *)
+talloc_disable_null_tracking: void (void)
+talloc_enable_leak_report: void (void)
+talloc_enable_leak_report_full: void (void)
+talloc_enable_null_tracking: void (void)
+talloc_enable_null_tracking_no_autofree: void (void)
+talloc_find_parent_byname: void *(const void *, const char *)
+talloc_free_children: void (void *)
+talloc_get_name: const char *(const void *)
+talloc_get_size: size_t (const void *)
+talloc_increase_ref_count: int (const void *)
+talloc_init: void *(const char *, ...)
+talloc_is_parent: int (const void *, const void *)
+talloc_named: void *(const void *, size_t, const char *, ...)
+talloc_named_const: void *(const void *, size_t, const char *)
+talloc_parent: void *(const void *)
+talloc_parent_name: const char *(const void *)
+talloc_pool: void *(const void *, size_t)
+talloc_realloc_fn: void *(const void *, void *, size_t)
+talloc_reference_count: size_t (const void *)
+talloc_reparent: void *(const void *, const void *, const void *)
+talloc_report: void (const void *, FILE *)
+talloc_report_depth_cb: void (const void *, int, int, void (*)(const void *, int, int, int, void *), void *)
+talloc_report_depth_file: void (const void *, int, int, FILE *)
+talloc_report_full: void (const void *, FILE *)
+talloc_set_abort_fn: void (void (*)(const char *))
+talloc_set_log_fn: void (void (*)(const char *))
+talloc_set_log_stderr: void (void)
+talloc_set_memlimit: int (const void *, size_t)
+talloc_set_name: const char *(const void *, const char *, ...)
+talloc_set_name_const: void (const void *, const char *)
+talloc_show_parents: void (const void *, FILE *)
+talloc_strdup: char *(const void *, const char *)
+talloc_strdup_append: char *(char *, const char *)
+talloc_strdup_append_buffer: char *(char *, const char *)
+talloc_strndup: char *(const void *, const char *, size_t)
+talloc_strndup_append: char *(char *, const char *, size_t)
+talloc_strndup_append_buffer: char *(char *, const char *, size_t)
+talloc_total_blocks: size_t (const void *)
+talloc_total_size: size_t (const void *)
+talloc_unlink: int (const void *, void *)
+talloc_vasprintf: char *(const void *, const char *, va_list)
+talloc_vasprintf_append: char *(char *, const char *, va_list)
+talloc_vasprintf_append_buffer: char *(char *, const char *, va_list)
+talloc_version_major: int (void)
+talloc_version_minor: int (void)
diff --git a/ctdb/lib/talloc/NEWS b/ctdb/lib/talloc/NEWS
new file mode 100644 (file)
index 0000000..e5b3aa0
--- /dev/null
@@ -0,0 +1,13 @@
+1.0.1  26 May 2007
+
+ BUGS
+  
+   * Set the name correctly when using talloc_append_string() (metze)
+
+ LICENSE
+   
+   * Change license of files in lib/replace to LGPL (was GPL). (jelmer)
+
+1.0.0  30 April 2007
+ Initial release.
diff --git a/ctdb/lib/talloc/compat/talloc_compat1.c b/ctdb/lib/talloc/compat/talloc_compat1.c
new file mode 100644 (file)
index 0000000..519e8c3
--- /dev/null
@@ -0,0 +1,51 @@
+/*
+   Samba trivial allocation library - compat functions
+
+   Copyright (C) Stefan Metzmacher 2009
+
+     ** NOTE! The following LGPL license applies to the talloc
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+ * This file contains only the functions needed to build a
+ * compat talloc.so.1 library on top of talloc.so.2
+ */
+
+#include "replace.h"
+#include "talloc.h"
+
+void *_talloc_reference(const void *context, const void *ptr); /* prototype declared locally; presumably talloc.h 2.x does not declare this 1.x symbol -- TODO confirm */
+void *_talloc_reference(const void *context, const void *ptr) { /* compat entry point: forwards to _talloc_reference_loc() with a fixed location string */
+       return _talloc_reference_loc(context, ptr,
+                                    "Called from talloc compat1 "
+                                    "_talloc_reference");
+}
+
+void *_talloc_steal(const void *new_ctx, const void *ptr); /* prototype declared locally; presumably talloc.h 2.x does not declare this 1.x symbol -- TODO confirm */
+void *_talloc_steal(const void *new_ctx, const void *ptr)
+{ /* compat entry point: implemented as talloc_reparent() from ptr's current talloc_parent() to new_ctx; returns that call's result */
+       return talloc_reparent(talloc_parent(ptr), new_ctx, ptr);
+}
+
+#undef talloc_free /* presumably talloc.h 2.x defines talloc_free as a macro; undefine it so a real function of that name can be defined -- TODO confirm */
+int talloc_free(void *ptr); /* prototype for the compat export defined just below */
+int talloc_free(void *ptr)
+{ /* compat entry point: implemented as talloc_unlink() of ptr from its current talloc_parent(); returns that call's result */
+       return talloc_unlink(talloc_parent(ptr), ptr);
+}
+
diff --git a/ctdb/lib/talloc/compat/talloc_compat1.m4 b/ctdb/lib/talloc/compat/talloc_compat1.m4
new file mode 100644 (file)
index 0000000..2ec530e
--- /dev/null
@@ -0,0 +1,14 @@
+TALLOC_COMPAT1_MK=""  # default: empty, so no compat makefile fragment is pulled in
+AC_SUBST(TALLOC_COMPAT1_MK)
+
+AC_ARG_ENABLE(talloc-compat1,
+       [AS_HELP_STRING([--enable-talloc-compat1],
+               [Build talloc 1.x.x compat library [default=no]])],
+       [ enable_talloc_compat1=$enableval ],
+       [ enable_talloc_compat1=no ]
+)
+
+if test "x$enable_talloc_compat1" = x"yes"; then  # only when the option value is exactly yes
+       TALLOC_COMPAT1_MK='include $(tallocdir)/compat/talloc_compat1.mk'
+fi
+
diff --git a/ctdb/lib/talloc/compat/talloc_compat1.mk b/ctdb/lib/talloc/compat/talloc_compat1.mk
new file mode 100644 (file)
index 0000000..d1817f0
--- /dev/null
@@ -0,0 +1,21 @@
+talloccompatdir := $(tallocdir)/compat
+
+TALLOC_COMPAT1_VERSION_MAJOR = 1
+TALLOC_COMPAT1_OBJ = $(talloccompatdir)/talloc_compat1.o
+
+TALLOC_COMPAT1_SOLIB = libtalloc-compat1-$(TALLOC_VERSION).$(SHLIBEXT)
+TALLOC_COMPAT1_SONAME = libtalloc.$(SHLIBEXT).$(TALLOC_COMPAT1_VERSION_MAJOR)
+
+$(TALLOC_COMPAT1_SOLIB): $(TALLOC_COMPAT1_OBJ) $(TALLOC_SOLIB) # link the compat object against the talloc shared library, using the major-version-1 soname
+       $(SHLD) $(SHLD_FLAGS) -o $@ $(TALLOC_COMPAT1_OBJ) \
+               $(TALLOC_SOLIB) $(SONAMEFLAG)$(TALLOC_COMPAT1_SONAME)
+
+all:: $(TALLOC_COMPAT1_SOLIB) # double-colon rule: adds the compat library to the default build
+
+install:: # double-colon rule: creates libdir and copies the compat library there (no prerequisite on the library itself)
+       ${INSTALLCMD} -d $(DESTDIR)$(libdir)
+       ${INSTALLCMD} -m 755 $(TALLOC_COMPAT1_SOLIB) $(DESTDIR)$(libdir)
+
+clean:: # double-colon rule: removes the compat object and shared library
+       rm -f $(TALLOC_COMPAT1_OBJ) $(TALLOC_COMPAT1_SOLIB)
+
diff --git a/ctdb/lib/talloc/doc/context.png b/ctdb/lib/talloc/doc/context.png
new file mode 100644 (file)
index 0000000..48a6ca0
Binary files /dev/null and b/ctdb/lib/talloc/doc/context.png differ
diff --git a/ctdb/lib/talloc/doc/context_tree.png b/ctdb/lib/talloc/doc/context_tree.png
new file mode 100644 (file)
index 0000000..9723459
Binary files /dev/null and b/ctdb/lib/talloc/doc/context_tree.png differ
diff --git a/ctdb/lib/talloc/doc/mainpage.dox b/ctdb/lib/talloc/doc/mainpage.dox
new file mode 100644 (file)
index 0000000..3b56898
--- /dev/null
@@ -0,0 +1,110 @@
+/**
+ * @mainpage
+ *
+ * talloc is a hierarchical, reference counted memory pool system with
+ * destructors. It is the core memory allocator used in Samba.
+ *
+ * @section talloc_download Download
+ *
+ * You can download the latest releases of talloc from the
+ * <a href="http://samba.org/ftp/talloc" target="_blank">talloc directory</a>
+ * on the samba public source archive.
+ *
+ * @section main-tutorial Tutorial
+ *
+ * You should start by reading @subpage libtalloc_tutorial, then reading the documentation of
+ * the interesting functions as you go.
+ *
+ * @section talloc_bugs Discussion and bug reports
+ *
+ * talloc does not currently have its own mailing list or bug tracking system.
+ * For now, please use the
+ * <a href="https://lists.samba.org/mailman/listinfo/samba-technical" target="_blank">samba-technical</a>
+ * mailing list, and the
+ * <a href="http://bugzilla.samba.org/" target="_blank">Samba bugzilla</a>
+ * bug tracking system.
+ *
+ * @section talloc_devel Development
+ * You can download the latest code either via git or rsync.
+ *
+ * To fetch via git see the following guide:
+ *
+ * <a href="http://wiki.samba.org/index.php/Using_Git_for_Samba_Development" target="_blank">Using Git for Samba Development</a>
+ *
+ * Once you have cloned the tree, switch to the master branch and cd into the
+ * lib/talloc directory.
+ *
+ * To fetch via rsync use this command:
+ *
+ * rsync -Pavz samba.org::ftp/unpacked/standalone_projects/lib/talloc .
+ *
+ * @section talloc_preample Preamble
+ *
+ * talloc is a hierarchical, reference counted memory pool system with
+ * destructors.
+ *
+ * Perhaps the biggest difference from other memory pool systems is that there
+ * is no distinction between a "talloc context" and a "talloc pointer". Any
+ * pointer returned from talloc() is itself a valid talloc context. This means
+ * you can do this:
+ *
+ * @code
+ *      struct foo *X = talloc(mem_ctx, struct foo);
+ *      X->name = talloc_strdup(X, "foo");
+ * @endcode
+ *
+ * The pointer X->name would be a "child" of the talloc context "X" which is
+ * itself a child of mem_ctx. So if you do talloc_free(mem_ctx) then it is all
+ * destroyed, whereas if you do talloc_free(X) then just X and X->name are
+ * destroyed, and if you do talloc_free(X->name) then just the name element of
+ * X is destroyed.
+ *
+ * If you think about this, then what this effectively gives you is an n-ary
+ * tree, where you can free any part of the tree with talloc_free().
+ *
+ * If you find this confusing, then run the testsuite to watch talloc in
+ * action. You may also like to add your own tests to testsuite.c to clarify
+ * how some particular situation is handled.
+ *
+ * @section talloc_performance Performance
+ *
+ * All the additional features of talloc() over malloc() do come at a price. We
+ * have a simple performance test in Samba4 that measures talloc() versus
+ * malloc() performance, and it seems that talloc() is about 4% slower than
+ * malloc() on my x86 Debian Linux box. For Samba, the great reduction in code
+ * complexity that we get by using talloc makes this worthwhile, especially as
+ * the total overhead of talloc/malloc in Samba is already quite small.
+ *
+ * @section talloc_named Named blocks
+ *
+ * Every talloc chunk has a name that can be used as a dynamic type-checking
+ * system. If for some reason like a callback function you had to cast a
+ * "struct foo *" to a "void *" variable, later you can safely reassign the
+ * "void *" pointer to a "struct foo *" by using the talloc_get_type() or
+ * talloc_get_type_abort() macros.
+ *
+ * @code
+ *      struct foo *X = talloc_get_type_abort(ptr, struct foo);
+ * @endcode
+ *
+ * This will abort if "ptr" does not contain a pointer that has been created
+ * with talloc(mem_ctx, struct foo).
+ *
+ * @section talloc_threading Multi-threading
+ *
+ * talloc itself does not deal with threads. It is thread-safe (assuming the
+ * underlying "malloc" is), as long as each thread uses different memory
+ * contexts.
+ *
+ * If two threads use the same context then they need to synchronize in order
+ * to be safe. In particular:
+ *
+ *   - when using talloc_enable_leak_report(), passing NULL directly as a parent
+ *     context implicitly refers to a hidden "null context" global variable, so
+ *     this should not be used in a multi-threaded environment without proper
+ *     synchronization.
+ *   - the context returned by talloc_autofree_context() is also global so
+ *     shouldn't be used by several threads simultaneously without
+ *     synchronization.
+ *
+ */
diff --git a/ctdb/lib/talloc/doc/stealing.png b/ctdb/lib/talloc/doc/stealing.png
new file mode 100644 (file)
index 0000000..8833e06
Binary files /dev/null and b/ctdb/lib/talloc/doc/stealing.png differ
diff --git a/ctdb/lib/talloc/doc/tutorial_bestpractices.dox b/ctdb/lib/talloc/doc/tutorial_bestpractices.dox
new file mode 100644 (file)
index 0000000..3634446
--- /dev/null
@@ -0,0 +1,192 @@
+/**
+@page libtalloc_bestpractices Chapter 7: Best practices
+
+The following sections contain several best practices and good manners that were
+found by the <a href="http://www.samba.org">Samba</a> and
+<a href="https://fedorahosted.org/sssd">SSSD</a> developers over the years.
+These will help you to write code which is better, easier to debug and with as
+few (hopefully none) memory leaks as possible.
+
+@section bp-hierarchy Keep the context hierarchy steady
+
+Talloc is a hierarchical memory allocator. Its hierarchical nature is what makes
+programming with it less error-prone. It makes the memory easier to manage and to
+free.  Therefore, the first thing we should have on our mind is: always project
+your data structures into the talloc context hierarchy.
+
+That means if we have a structure, we should always use it as a parent context
+for its elements. This way we will not encounter any troubles when freeing the
+structure or when changing its parent. The same rule applies for arrays.
+
+For example, the structure <code>user</code> from section @ref context-hierarchy
+should be created with the context hierarchy illustrated on the next image.
+
+@image html context_tree.png
+
+@section bp-tmpctx Every function should use its own context
+
+It is a good practice to create a temporary talloc context at the function
+beginning and free the context just before the return statement. All the data
+must be allocated on this context or on its children. This ensures that no
+memory leaks are created as long as we do not forget to free the temporary
+context.
+
+This pattern applies to both situations - when a function does not return any
+dynamically allocated value and when it does. However, it needs a little
+extension for the latter case.
+
+@subsection bp-tmpctx-1 Functions that do not return any dynamically allocated
+value
+
+If the function does not return any value created on the heap, we will just obey
+the aforementioned pattern.
+
+@code
+int bar()
+{
+  int ret;
+  TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+  if (tmp_ctx == NULL) {
+    ret = ENOMEM;
+    goto done;
+  }
+  /* allocate data on tmp_ctx or on its descendants */
+  ret = EOK;
+done:
+  talloc_free(tmp_ctx);
+  return ret;
+}
+@endcode
+
+@subsection bp-tmpctx-2 Functions returning dynamically allocated values
+
+If our function returns any dynamically allocated data, its first parameter
+should always be the destination talloc context. This context serves as a parent
+for the output values. But again, we will create the output values as the
+descendants of the temporary context. If everything goes well, we will change
+the parent of the output values from the temporary to the destination talloc
+context.
+
+This pattern ensures that if an error occurs (e.g. I/O error or insufficient
+amount of the memory), all allocated data is freed and no garbage appears on
+the destination context.
+
+@code
+int struct_foo_init(TALLOC_CTX *mem_ctx, struct foo **_foo)
+{
+  int ret;
+  struct foo *foo = NULL;
+  TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+  if (tmp_ctx == NULL) {
+    ret = ENOMEM;
+    goto done;
+  }
+  foo = talloc_zero(tmp_ctx, struct foo);
+  /* ... */
+  *_foo = talloc_steal(mem_ctx, foo);
+  ret = EOK;
+done:
+  talloc_free(tmp_ctx);
+  return ret;
+}
+@endcode
+
+@section bp-null Allocate temporary contexts on NULL
+
+As can be seen in the previous listing, instead of allocating the temporary
+context directly on <code>mem_ctx</code>, we created a new top level context
+using <code>NULL</code> as the parameter for <code>talloc_new()</code> function.
+Take a look at the following example:
+
+@code
+char *create_user_filter(TALLOC_CTX *mem_ctx,
+                         uid_t uid, const char *username)
+{
+  char *filter = NULL;
+  char *sanitized_username = NULL;
+  /* tmp_ctx is a child of mem_ctx */
+  TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+  if (tmp_ctx == NULL) {
+    return NULL;
+  }
+
+  sanitized_username = sanitize_string(tmp_ctx, username);
+  if (sanitized_username == NULL) {
+    talloc_free(tmp_ctx);
+    return NULL;
+  }
+
+  filter = talloc_asprintf(tmp_ctx,"(|(uid=%llu)(uname=%s))",
+                           uid, sanitized_username);
+  if (filter == NULL) {
+    return NULL; /* tmp_ctx is not freed */
+  }
+
+  /* filter becomes a child of mem_ctx */
+  filter = talloc_steal(mem_ctx, filter);
+  talloc_free(tmp_ctx);
+  return filter;
+}
+@endcode
+
+We forgot to free <code>tmp_ctx</code> before the <code>return</code> statement
+in the <code>filter == NULL</code> condition. However, it is created as a child
+of <code>mem_ctx</code> context and as such it will be freed as soon as the
+<code>mem_ctx</code> is freed. Therefore, no detectable memory leak is created.
+
+On the other hand, we do not have any way to access the allocated data
+and for all we know <code>mem_ctx</code> may exist for the lifetime of our
+application. For these reasons this should be considered as a memory leak. How
+can we detect if it is unreferenced but still attached to its parent context?
+The only way is to notice the mistake in the source code.
+
+But if we create the temporary context as a top level context, it will not be
+freed and memory diagnostic tools
+(e.g. <a href="http://valgrind.org">valgrind</a>) are able to do their job.
+
+@section bp-pool Temporary contexts and the talloc pool
+
+If we want to take advantage of the talloc pool but also keep to the
+pattern introduced in the previous section, we are unable to do it directly. The
+best thing to do is to create a conditional build where we can decide how we
+want to create the temporary context. For example, we can create the following
+macros:
+
+@code
+#ifdef USE_POOL_CONTEXT
+  #define CREATE_POOL_CTX(ctx, size) talloc_pool(ctx, size)
+  #define CREATE_TMP_CTX(ctx)        talloc_new(ctx)
+#else
+  #define CREATE_POOL_CTX(ctx, size) talloc_new(ctx)
+  #define CREATE_TMP_CTX(ctx)        talloc_new(NULL)
+#endif
+@endcode
+
+Now if our application is under development, we will build it with macro
+<code>USE_POOL_CONTEXT</code> undefined. This way, we  can use memory diagnostic
+utilities to detect memory leaks.
+
+The release version will be compiled with the macro defined. This will  enable
+pool contexts and therefore reduce the <code>malloc()</code> calls, which will
+end up in a little bit faster processing.
+
+@code
+int struct_foo_init(TALLOC_CTX *mem_ctx, struct foo **_foo)
+{
+  int ret;
+  struct foo *foo = NULL;
+  TALLOC_CTX *tmp_ctx = CREATE_TMP_CTX(mem_ctx);
+  /* ... */
+}
+
+errno_t handle_request(TALLOC_CTX *mem_ctx)
+{
+  int ret;
+  struct foo *foo = NULL;
+  TALLOC_CTX *pool_ctx = CREATE_POOL_CTX(NULL, 1024);
+  ret = struct_foo_init(mem_ctx, &foo);
+  /* ... */
+}
+@endcode
+
+*/
diff --git a/ctdb/lib/talloc/doc/tutorial_context.dox b/ctdb/lib/talloc/doc/tutorial_context.dox
new file mode 100644 (file)
index 0000000..b8bfe26
--- /dev/null
@@ -0,0 +1,198 @@
+/**
+@page libtalloc_context Chapter 1: Talloc context
+@section context Talloc context
+
+The talloc context is the most important part of this library and is
+responsible for every single feature of this memory allocator. It is a logical
+unit which represents a memory space managed by talloc.
+
+From the programmer's point of view, the talloc context is completely
+equivalent to a pointer that would be returned by the memory routines from the
+C standard library. This means that every context that is returned from the
+talloc library can be used directly in functions that do not use talloc
+internally. For example we can do the following:
+
+@code
+char *str1 = strdup("I am NOT a talloc context");
+char *str2 = talloc_strdup(NULL, "I AM a talloc context");
+
+printf("%d\n", strcmp(str1, str2) == 0);
+
+free(str1);
+talloc_free(str2); /* we can not use free() on str2 */
+@endcode
+
+This is possible because the context is internally handled as a special
+fixed-length structure called talloc chunk. Each chunk stores context metadata
+followed by the memory space requested by the programmer. When a talloc
+function returns a context (pointer), it will in fact return a pointer to the user
+space portion of the talloc chunk. If we want to manipulate this context using
+talloc functions, the talloc library transforms the user-space pointer back to
+the starting address of the chunk. This is also the reason why we were unable
+to use <code>free(str2)</code> in the previous example - because
+<code>str2</code> does not point at the beginning of the allocated block of
+memory. This is illustrated on the next image:
+
+@image html context.png
+
+The type TALLOC_CTX is defined in talloc.h to identify a talloc context in
+function parameters. However, this type is just an alias for <code>void</code>
+and exists only for semantical reasons - thus we can differentiate between
+<code>void *</code> (arbitrary data) and <code>TALLOC_CTX *</code> (talloc
+context).
+
+@subsection metadata Context meta data
+
+Every talloc context carries several pieces of internal information along with
+the allocated memory:
+
+  - name - which is used in reports of context hierarchy and to simulate
+    a dynamic type system,
+  - size of the requested memory in bytes - this can be used to determine
+    the number of elements in arrays,
+  - attached destructor - which is executed just before the memory block is
+    about to be freed,
+  - references to the context
+  - children and parent contexts - create the hierarchical view on the
+    memory.
+
+@section context-hierarchy Hierarchy of talloc context
+
+Every talloc context contains information about its parent and children. Talloc
+uses this information to create a hierarchical model of memory or to be more
+precise, it creates an n-ary tree where each node represents a single talloc
+context. The root node of the tree is referred to as a top level context - a
+context without any parent.
+
+This approach has several advantages:
+
+  - as a consequence of freeing a talloc context, all of its children
+    will be properly deallocated as well,
+  - the parent of a context can be changed at any time, which
+    results in moving the whole subtree under another node,
+  - it creates a more natural way of managing data structures.
+
+@subsection Example
+
+We have a structure that stores basic information about a user - his/her name,
+identification number and groups he/she is a member of:
+
+@code
+struct user {
+  uid_t uid;
+  char *username;
+  size_t num_groups;
+  char **groups;
+};
+@endcode
+
+We will allocate this structure using talloc. The result will be the following
+context tree:
+
+@image html context_tree.png
+
+@code
+/* create new top level context */
+struct user *user = talloc(NULL, struct user);
+
+user->uid = 1000;
+user->num_groups = N;
+
+/* make user the parent of following contexts */
+user->username = talloc_strdup(user, "Test user");
+user->groups = talloc_array(user, char*, user->num_groups);
+
+for (i = 0; i < user->num_groups; i++) {
+  /* make user->groups the parent of following context */
+  user->groups[i] = talloc_asprintf(user->groups,
+                                    "Test group %d", i);
+}
+@endcode
+
+This way, we have gained a lot of additional capabilities, one of which is
+very simple deallocation of the structure and all of its elements.
+
+With the C standard library we need first to iterate over the array of groups
+and free every element separately. Then we must deallocate the array that stores
+them. Next we deallocate the username and as the last step free the structure
+itself. But with talloc, the only operation we need to execute is freeing the
+structure context. Its descendants will be freed automatically.
+
+@code
+talloc_free(user);
+@endcode
+
+@section keep-hierarchy Always keep the hierarchy steady!
+
+Talloc is a hierarchical memory allocator. Its hierarchical nature is what makes
+programming with it less error-prone. It makes the memory easier to manage and to
+free.  Therefore, the first thing we should have on our mind is: <strong>always
+project our data structures into the talloc context hierarchy</strong>.
+
+That means if we have a structure, we should always use it as a parent context
+for its elements. This way we will not encounter any troubles when freeing this
+structure or when changing its parent. The same rule applies for arrays.
+
+@section creating-context Creating a talloc context
+
+Here are the most important functions that create a new talloc context.
+
+@subsection type-safe Type-safe functions
+
+It allocates the size that is necessary for the given type and returns a new,
+properly-casted pointer. This is the preferred way to create a new context as
+we can rely on the compiler to detect type mismatches.
+
+The name of the context is automatically set to the name of the data type which
+is used to simulate a dynamic type system.
+
+@code
+struct user *user = talloc(ctx, struct user);
+
+/* initialize to default values */
+user->uid = 0;
+user->username = NULL;
+user->num_groups = 0;
+user->groups = NULL;
+
+/* or we can achieve the same result with */
+struct user *user_zero = talloc_zero(ctx, struct user);
+@endcode
+
+@subsection zero-length Zero-length contexts
+
+The zero-length context is basically a context without any special semantical
+meaning. We can use it the same way as any other context. The only difference
+is that it consists only of the meta data about the context. Therefore, it is
+strictly of type <code>TALLOC_CTX*</code>. It is often used in cases where we
+want to aggregate several data structures under one parent (zero-length)
+context, such as a temporary context to contain memory needed within a single
+function that is not interesting to the caller. Allocating on a zero-length
+temporary context will make clean-up of the function simpler.
+
+@code
+TALLOC_CTX *tmp_ctx = NULL;
+struct foo *foo = NULL;
+struct bar *bar = NULL;
+
+/* new zero-length top level context */
+tmp_ctx = talloc_new(NULL);
+if (tmp_ctx == NULL) {
+  return ENOMEM;
+}
+
+foo = talloc(tmp_ctx, struct foo);
+bar = talloc(tmp_ctx, struct bar);
+
+/* free everything at once */
+talloc_free(tmp_ctx);
+@endcode
+
+@subsection context-see-also See also
+
+- talloc_size()
+- talloc_named()
+- @ref talloc_array
+- @ref talloc_string
+
+*/
diff --git a/ctdb/lib/talloc/doc/tutorial_debugging.dox b/ctdb/lib/talloc/doc/tutorial_debugging.dox
new file mode 100644 (file)
index 0000000..aadbb0d
--- /dev/null
@@ -0,0 +1,116 @@
+/**
+@page libtalloc_debugging Chapter 6: Debugging
+
+Although talloc makes memory management significantly easier than the C standard
+library, developers are still only humans and can make mistakes. Therefore, it
+can be handy to know some tools for the inspection of talloc memory usage.
+
+@section log-abort Talloc log and abort
+
+We have already encountered the abort function in section @ref dts.
+In that case it was used when a type mismatch was detected. However, talloc
+calls this abort function in several more situations:
+
+- when the provided pointer is not a valid talloc context,
+- when the meta data is invalid - probably due to memory corruption,
+- and when an access after free is detected.
+
+The third one is probably the most interesting. It can help us with detecting
+an attempt to double-free a context or any other manipulation with it via
+talloc functions (using it as a parent, stealing it, etc.).
+
+Before the context is freed talloc sets a flag in the meta data. This is then
+used to detect the access after free. It basically works on the assumption that
+the memory stays unchanged (at least for a while) even when it is properly
+deallocated. This will work even if the memory is filled with the value
+specified in <code>TALLOC_FREE_FILL</code> environment variable, because it
+fills only the data part and leaves the meta data intact.
+
+Apart from the abort function, talloc uses a log function to provide additional
+information to the aforementioned violations. To enable logging we shall set the
+log function with one of:
+
+- talloc_set_log_fn()
+- talloc_set_log_stderr()
+
+The following code is a sample output of accessing a context after it has been
+freed:
+
+@code
+talloc_set_log_stderr();
+TALLOC_CTX *ctx = talloc_new(NULL);
+
+talloc_free(ctx);
+talloc_free(ctx);
+
+results in:
+talloc: access after free error - first free may be at ../src/main.c:55
+Bad talloc magic value - access after free
+@endcode
+
+Another example is an invalid context:
+
+@code
+talloc_set_log_stderr();
+TALLOC_CTX *ctx = talloc_new(NULL);
+char *str = strdup("not a talloc context");
+talloc_steal(ctx, str);
+
+results in:
+Bad talloc magic value - unknown value
+@endcode
+
+@section reports Memory usage reports
+
+Talloc can print reports of memory usage of a specified talloc context to a
+file (to <code>stdout</code> or <code>stderr</code>). The report can be
+simple or full. The simple report provides information only about the context
+itself and its direct descendants. The full report goes recursively through the
+entire context tree. See:
+
+- talloc_report()
+- talloc_report_full()
+
+We will use the following code to retrieve the sample report:
+
+@code
+struct foo {
+  char *str;
+};
+
+TALLOC_CTX *ctx = talloc_new(NULL);
+char *str =  talloc_strdup(ctx, "my string");
+struct foo *foo = talloc_zero(ctx, struct foo);
+foo->str = talloc_strdup(foo, "I am Foo");
+char *str2 = talloc_strdup(foo, "Foo is my parent");
+
+/* print full report */
+talloc_report_full(ctx, stdout);
+@endcode
+
+It will print a full report of <code>ctx</code> to the standard output.
+The message should be similar to:
+
+@code
+full talloc report on 'talloc_new: ../src/main.c:82' (total 46 bytes in 5 blocks)
+  struct foo contains 34 bytes in 3 blocks (ref 0) 0x1495130
+    Foo is my parent contains 17 bytes in 1 blocks (ref 0) 0x1495200
+    I am Foo contains 9 bytes in 1 blocks (ref 0) 0x1495190
+  my string contains 10 bytes in 1 blocks (ref 0) 0x14950c0
+@endcode
+
+We can notice in this report that something is wrong with the context containing
+<code>struct foo</code>. We know that the structure has only one string element.
+However, we can see in the report that it has two children. This indicates that
+we have either violated the memory hierarchy or forgotten to free it as
+temporary data. Looking into the code, we can see that <code>"Foo is my parent"
+</code> should be attached to <code>ctx</code>.
+
+See also:
+
+- talloc_enable_null_tracking()
+- talloc_disable_null_tracking()
+- talloc_enable_leak_report()
+- talloc_enable_leak_report_full()
+
+*/
diff --git a/ctdb/lib/talloc/doc/tutorial_destructors.dox b/ctdb/lib/talloc/doc/tutorial_destructors.dox
new file mode 100644 (file)
index 0000000..ed06387
--- /dev/null
@@ -0,0 +1,82 @@
+/**
+@page libtalloc_destructors Chapter 4: Using destructors
+
+@section destructors Using destructors
+
+Destructors are well known methods in the world of object oriented programming.
+A destructor is a method of an object that is automatically run when the object
+is destroyed. It is usually used to return resources taken by the object back to
+the system (e.g. closing file descriptors, terminating connection to a database,
+deallocating memory).
+
+With talloc we can take advantage of destructors even in C. We can easily
+attach our own destructor to a talloc context. When the context is freed, the
+destructor will run automatically.
+
+To attach/detach a destructor to a talloc context use: talloc_set_destructor().
+
+@section destructors-example Example
+
+Imagine that we have a dynamically created linked list. Before we deallocate an
+element of the list, we need to make sure that we have successfully removed it
+from the list. Normally, this would be done by two commands in the exact order:
+remove it from the list and then free the element. With talloc, we can do this
+at once by setting a destructor on the element which will remove it from the
+list and talloc_free() will do the rest.
+
+The destructor would be:
+
+@code
+int list_remove(void *ctx)
+{
+    struct list_el *el = NULL;
+    el = talloc_get_type_abort(ctx, struct list_el);
+    /* remove element from the list */
+}
+@endcode
+
+GCC version 3 and newer can check for the types during the compilation. So if
+it is our major compiler, we can use a more advanced destructor:
+
+@code
+int list_remove(struct list_el *el)
+{
+    /* remove element from the list */
+}
+@endcode
+
+Now we will assign the destructor to the list element. We can do this directly
+in the function that inserts it.
+
+@code
+struct list_el* list_insert(TALLOC_CTX *mem_ctx,
+                            struct list_el *where,
+                            void *ptr)
+{
+  struct list_el *el = talloc(mem_ctx, struct list_el);
+  el->data = ptr;
+  /* insert into list */
+
+  talloc_set_destructor(el, list_remove);
+  return el;
+}
+@endcode
+
+Because talloc is a hierarchical memory allocator, we can go a step further and
+free the data with the element as well:
+
+@code
+struct list_el* list_insert_free(TALLOC_CTX *mem_ctx,
+                                 struct list_el *where,
+                                 void *ptr)
+{
+  struct list_el *el = NULL;
+  el = list_insert(mem_ctx, where, ptr);
+
+  talloc_steal(el, ptr);
+
+  return el;
+}
+@endcode
+
+*/
diff --git a/ctdb/lib/talloc/doc/tutorial_dts.dox b/ctdb/lib/talloc/doc/tutorial_dts.dox
new file mode 100644 (file)
index 0000000..75b5172
--- /dev/null
@@ -0,0 +1,109 @@
+/**
+@page libtalloc_dts Chapter 3: Dynamic type system
+
+@section dts Dynamic type system
+
+Generic programming in the C language is very difficult. There is no inheritance
+nor templates known from object oriented languages. There is no dynamic type
+system. Therefore, generic programming in this language is usually done by
+type-casting a variable to <code>void*</code> and transferring it through
+a generic function to a specialized callback as illustrated on the next listing.
+
+@code
+void generic_function(callback_fn cb, void *pvt)
+{
+  /* do some stuff and call the callback */
+  cb(pvt);
+}
+
+void specific_callback(void *pvt)
+{
+  struct specific_struct *data;
+  data = (struct specific_struct*)pvt;
+  /* ... */
+}
+
+void specific_function()
+{
+  struct specific_struct data;
+  generic_function(specific_callback, &data);
+}
+@endcode
+
+Unfortunately, the type information is lost as a result of this type cast. The
+compiler cannot check the type during the compilation nor are we able to do it
+at runtime. Providing an invalid data type to the callback will result in
+unexpected behaviour (not necessarily a crash) of the application. This mistake
+is usually hard to detect because it is not the first thing which comes to
+mind.
+
+As we already know, every talloc context contains a name. This name is available
+at any time and it can be used to determine the type of a context even if we
+lose the type of a variable.
+
+Although the name of the context can be set to any arbitrary string, the best
+way of using it to simulate the dynamic type system is to set it directly to the
+type of the variable.
+
+It is recommended to use one of talloc() and talloc_array() (or its
+variants) to create the context as they set its name to the name of the
+given type automatically.
+
+If we have a context with such a name, we can use two similar functions that
+do both the type check and the type cast for us:
+
+- talloc_get_type()
+- talloc_get_type_abort()
+
+@section dts-examples Examples
+
+The following example will show how generic programming with talloc is handled -
+if we provide invalid data to the callback, the program will be aborted. This
+is a sufficient reaction for such an error in most applications.
+
+@code
+void foo_callback(void *pvt)
+{
+  struct foo *data = talloc_get_type_abort(pvt, struct foo);
+  /* ... */
+}
+
+int do_foo()
+{
+  struct foo *data = talloc_zero(NULL, struct foo);
+  /* ... */
+  return generic_function(foo_callback, data);
+}
+@endcode
+
+But what if we are creating a service application that should be running for the
+uptime of a server? We may want to abort the application during the development
+process (to make sure the error is not overlooked) and try to recover from the
+error in the customer release. This can be achieved by creating a custom abort
+function with a conditional build.
+
+@code
+void my_abort(const char *reason)
+{
+  fprintf(stderr, "talloc abort: %s\n", reason);
+#ifdef ABORT_ON_TYPE_MISMATCH
+  abort();
+#endif
+}
+@endcode
+
+The usage of talloc_get_type_abort() would be then:
+
+@code
+talloc_set_abort_fn(my_abort);
+
+TALLOC_CTX *ctx = talloc_new(NULL);
+char *str = talloc_get_type_abort(ctx, char);
+if (str == NULL) {
+  /* recovery code */
+}
+/* talloc abort: ../src/main.c:25: Type mismatch:
+   name[talloc_new: ../src/main.c:24] expected[char] */
+@endcode
+
+*/
diff --git a/ctdb/lib/talloc/doc/tutorial_introduction.dox b/ctdb/lib/talloc/doc/tutorial_introduction.dox
new file mode 100644 (file)
index 0000000..02777b9
--- /dev/null
@@ -0,0 +1,43 @@
+/**
+@page libtalloc_tutorial The Tutorial
+@section introduction Introduction
+
+Talloc is a hierarchical, reference counted memory pool system with destructors.
+It is built atop the C standard library and it defines a set of utility
+functions that altogether simplify allocation and deallocation of data,
+especially for complex structures that contain many dynamically allocated
+elements such as strings and arrays.
+
+The main goals of this library are: removing the need for creating a cleanup
+function for every complex structure, providing a logical organization of
+allocated memory blocks and reducing the likelihood of creating memory leaks in
+long-running applications. All of this is achieved by allocating memory in a
+hierarchical structure of talloc contexts such that deallocating one context
+recursively frees all of its descendants as well.
+
+@section main-features Main features
+- An open source project
+- A hierarchical memory model
+- Natural projection of data structures into the memory space
+- Simplifies memory management of large data structures
+- Automatic execution of a destructor before the memory is freed
+- Simulates a dynamic type system
+- Implements a transparent memory pool
+
+@section toc Table of contents:
+
+@subpage libtalloc_context
+
+@subpage libtalloc_stealing
+
+@subpage libtalloc_dts
+
+@subpage libtalloc_destructors
+
+@subpage libtalloc_pools
+
+@subpage libtalloc_debugging
+
+@subpage libtalloc_bestpractices
+
+*/
\ No newline at end of file
diff --git a/ctdb/lib/talloc/doc/tutorial_pools.dox b/ctdb/lib/talloc/doc/tutorial_pools.dox
new file mode 100644 (file)
index 0000000..a0d1e1a
--- /dev/null
@@ -0,0 +1,93 @@
+/**
+@page libtalloc_pools Chapter 5: Memory pools
+
+@section pools Memory pools
+
+Allocation of new memory is an expensive operation and large programs can
+contain thousands of calls of malloc() for a single computation, where every
+call allocates only a very small amount of the memory. This can result in an
+undesirable slowdown of the application. We can avoid this slowdown by
+decreasing the number of malloc() calls by using a memory pool.
+
+A memory pool is a preallocated memory space with a fixed size. If we need to
+allocate new data we will take the desired amount of the memory from the pool
+instead of requesting new memory from the system. This is done by creating a
+pointer that points inside the preallocated memory. Such a pool must not be
+reallocated as it would change its location - pointers that were pointing
+inside the pool would become invalid. Therefore, a memory pool requires a very
+good estimate of the required memory space.
+
+The talloc library contains its own implementation of a memory pool. It is
+highly transparent for the programmer. The only thing that needs to be done is
+an initialization of a new pool context using talloc_pool() -
+which can be used in the same way as any other context.
+
+Refactoring of existing code (that uses talloc) to take advantage of a
+memory pool is quite simple due to the following properties of the pool context:
+
+- if we are allocating data on a pool context, it takes the desired
+  amount of memory from the pool,
+- if the context is a descendant of the pool context, it takes the space
+  from the pool as well,
+- if the pool does not have sufficient portion of memory left, it will
+  create a new non-pool context, leaving the pool intact
+
+@code
+/* allocate 1KiB in a pool */
+TALLOC_CTX *pool_ctx = talloc_pool(NULL, 1024);
+
+/* Take 512B from the pool, 512B is left there */
+void *ptr = talloc_size(pool_ctx, 512);
+
+/* 1024B > 512B, this will create new talloc chunk outside
+   the pool */
+void *ptr2 = talloc_size(ptr, 1024);
+
+/* The pool still contains 512 free bytes
+ * this will take 200B from them. */
+void *ptr3 = talloc_size(ptr, 200);
+
+/* This will destroy context 'ptr3' but the memory
+ * is not freed, the available space in the pool
+ * will increase to 512B. */
+talloc_free(ptr3);
+
+/* This will free memory taken by 'pool_ctx'
+ * and 'ptr2' as well. */
+talloc_free(pool_ctx);
+@endcode
+
+The behaviour shown above is very convenient, but there is one big issue to be kept in
+mind. If the parent of a talloc pool child is changed to a parent that is
+outside of this pool, the whole pool memory will not be freed until the child is
+freed. For this reason we must be very careful when stealing a descendant of a
+pool context.
+
+@code
+TALLOC_CTX *mem_ctx = talloc_new(NULL);
+TALLOC_CTX *pool_ctx = talloc_pool(NULL, 1024);
+struct foo *foo = talloc(pool_ctx, struct foo);
+
+/* mem_ctx is not in the pool */
+talloc_steal(mem_ctx, foo);
+
+/* pool_ctx is marked as freed but the memory is not
+   deallocated, accessing the pool_ctx again will cause
+   an error */
+talloc_free(pool_ctx);
+
+/* This deallocates the pool_ctx. */
+talloc_free(mem_ctx);
+@endcode
+
+It may often be better to copy the memory we want instead of stealing it to
+avoid this problem. If we do not need to retain the context name (to keep the
+type information), we can use talloc_memdup() to do this.
+
+Copying the memory out of the pool may, however, discard all the performance
+boost given by the pool, depending on the size of the copied memory. Therefore,
+the code should be well profiled before taking this path. In general, the
+golden rule is: if we need to steal from the pool context, we should not
+use a pool context.
+
+*/
diff --git a/ctdb/lib/talloc/doc/tutorial_stealing.dox b/ctdb/lib/talloc/doc/tutorial_stealing.dox
new file mode 100644 (file)
index 0000000..67eae1d
--- /dev/null
@@ -0,0 +1,55 @@
+/**
+@page libtalloc_stealing Chapter 2: Stealing a context
+
+@section stealing Stealing a context
+
+Talloc has the ability to change the parent of a talloc context to another
+one. This operation is commonly referred to as stealing and it is one of
+the most important actions performed with talloc contexts.
+
+Stealing a context is necessary if we want the pointer to outlive the context it
+is created on. This has many possible use cases, for instance stealing a result
+of a database search to an in-memory cache context, changing the parent of a
+field of a generic structure to a more specific one or vice-versa. The most
+common scenario, at least in Samba, is to steal output data from a function-specific
+context to the output context given as an argument of that function.
+
+@code
+struct foo {
+    char *a1;
+    char *a2;
+    char *a3;
+};
+
+struct bar {
+    char *wurst;
+    struct foo *foo;
+};
+
+struct foo *foo = talloc_zero(ctx, struct foo);
+foo->a1 = talloc_strdup(foo, "a1");
+foo->a2 = talloc_strdup(foo, "a2");
+foo->a3 = talloc_strdup(foo, "a3");
+
+struct bar *bar = talloc_zero(NULL, struct bar);
+/* change parent of foo from ctx to bar */
+bar->foo = talloc_steal(bar, foo);
+
+/* or do the same but assign foo = NULL */
+bar->foo = talloc_move(bar, &foo);
+@endcode
+
+The talloc_move() function is similar to the talloc_steal() function but
+additionally sets the source pointer to NULL.
+
+In general, the source pointer itself is not changed (it only replaces the
+parent in the meta data). But the common usage is that the result is
+assigned to another variable, thus further accessing the pointer from the
+original variable should be avoided unless it is necessary. In this case
+talloc_move() is the preferred way of stealing a context. It additionally sets
+the source pointer to NULL, thus protecting the pointer from being accidentally
+freed or accessed using the old variable after its parent has been changed.
+
+@image html stealing.png
+
+*/
diff --git a/ctdb/lib/talloc/doxy.config b/ctdb/lib/talloc/doxy.config
new file mode 100644 (file)
index 0000000..0e27d61
--- /dev/null
@@ -0,0 +1,1807 @@
+# Doxyfile 1.8.0
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a hash (#) is considered a comment and will be ignored.
+# The format is:
+#       TAG = value [value, ...]
+# For lists items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or sequence of words) that should
+# identify the project. Note that if you do not use Doxywizard you need
+# to put quotes around the project name if it contains spaces.
+
+PROJECT_NAME           = talloc
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER         = 2.0
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer
+# a quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          =
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is
+# included in the documentation. The maximum height of the logo should not
+# exceed 55 pixels and the maximum width should not exceed 200 pixels.
+# Doxygen will copy the logo to the output directory.
+
+PROJECT_LOGO           =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = doc
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF       = "The $name class" \
+                         "The $name widget" \
+                         "The $name file" \
+                         is \
+                         provides \
+                         specifies \
+                         contains \
+                         represents \
+                         a \
+                         an \
+                         the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES        = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF      = YES
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE               = 8
+
+# This tag can be used to specify a number of aliases that act
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES                =
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding
+# "class=itcl::class" will allow you to use the command class in the
+# itcl::class meaning.
+
+TCL_SUBST              =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C  = YES
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given extension.
+# Doxygen has a built-in mapping, but you can override or extend it using this
+# tag. The format is ext=language, where ext is a file extension, and language
+# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C,
+# C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, C++. For instance to make
+# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
+# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions
+# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen.
+
+EXTENSION_MAPPING      =
+
+# If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all
+# comments according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you
+# can mix doxygen, HTML, and XML commands with Markdown formatting.
+# Disable only in case of backward compatibility issues.
+
+MARKDOWN_SUPPORT       = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); v.s.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and
+# unions are shown inside the group in which they are included (e.g. using
+# @ingroup) instead of on a separate page (for HTML and Man pages) or
+# section (for LaTeX and RTF).
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and
+# unions with only public data fields will be shown inline in the documentation
+# of the scope in which they are defined (i.e. file, namespace, or group
+# documentation), provided this scope is documented. If set to NO (the default),
+# structs, classes, and unions are shown on a separate page (for HTML and Man
+# pages) or section (for LaTeX and RTF).
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+SYMBOL_CACHE_SIZE      = 0
+
+# Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be
+# set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given
+# their name and scope. Since this can be an expensive process and often the
+# same symbol appears multiple times in the code, doxygen keeps a cache of
+# pre-resolved symbols. If the cache is too small doxygen will become slower.
+# If the cache is too large, memory is wasted. The cache size is given by this
+# formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+LOOKUP_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal scope will be included in the documentation.
+
+EXTRACT_PACKAGE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC         = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES  = NO
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS     = YES
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES     = YES
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
+# will list include files with double quotes in the documentation
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
+# will sort the (brief and detailed) documentation of class members so that
+# constructors and destructors are listed first. If set to NO (the default)
+# the constructors will appear in the respective orders defined by
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to
+# do proper type resolution of all parameters of a function it will reject a
+# match between the prototype and the implementation of a member function even
+# if there is only one candidate or it is obvious which candidate to choose
+# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen
+# will still accept a match between prototype and implementation in such cases.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or macro consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and macros in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES        = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE            =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files
+# containing the references data. This must be a list of .bib files. The
+# .bib extension is automatically appended if omitted. Using this command
+# requires the bibtex tool to be installed. See also
+# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style
+# of the bibliography can be controlled using LATEX_BIB_STYLE. To use this
+# feature you need bibtex and perl available in the search path.
+
+CITE_BIB_FILES         =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS               = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR      = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT                  = . \
+                         doc
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.for *.vhd *.vhdl
+
+FILE_PATTERNS          = *.cpp \
+                         *.cc \
+                         *.c \
+                         *.h \
+                         *.hh \
+                         *.hpp \
+                         *.dox
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE              = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE                =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       = */.git/*
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH           =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS       =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH             = doc
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+# If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.
+# Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.
+# The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty or if
+# none of the patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERNS (if any)
+# and it is also possible to disable source filtering for a specific pattern
+# using *.ext= (so without naming a filter). This option only has effect when
+# FILTER_SOURCE_FILES is enabled.
+
+FILTER_SOURCE_PATTERNS =
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.
+# Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX     = NO
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header. Note that when using a custom header you are responsible
+# for the proper inclusion of any scripts and style sheets that doxygen
+# needs, which is dependent on the configuration options used.
+# It is advised to generate a default header using "doxygen -w html
+# header.html footer.html stylesheet.css YourConfigFile" and then modify
+# that header. Note that the header is subject to change so you typically
+# have to redo this when upgrading to a newer version of doxygen or when
+# changing the value of configuration settings such as GENERATE_TREEVIEW!
+
+HTML_HEADER            =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER            =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# style sheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET        =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that
+# the files will be copied as-is; there are no commands or markers available.
+
+HTML_EXTRA_FILES       =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output.
+# Doxygen will adjust the colors in the style sheet and background images
+# according to this color. Hue is specified as an angle on a colorwheel,
+# see http://en.wikipedia.org/wiki/Hue for more information.
+# For instance the value 0 represents red, 60 is yellow, 120 is green,
+# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
+# The allowed range is 0 to 359.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
+# the colors in the HTML output. For a value of 0 the output will use
+# grayscales only. A value of 255 will produce the most vivid colors.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
+# the luminance component of the colors in the HTML output. Values below
+# 100 gradually make the output lighter, whereas values above 100 make
+# the output darker. The value divided by 100 is the actual gamma applied,
+# so 80 represents a gamma of 0.8, the value 220 represents a gamma of 2.2,
+# and 100 does not change the gamma.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP         = NO
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS     = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox,
+# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+
+GENERATE_DOCSET        = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP      = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE               =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION           =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI           = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING     =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated
+# that can be used as input for Qt's qhelpgenerator to generate a
+# Qt Compressed Help (.qch) of the generated HTML documentation.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE          =
+
+# The QHP_VIRTUAL_FOLDER tag specifies the virtual folder to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
+# add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">
+# Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's
+# filter section matches.
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">
+# Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+# will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs)
+# at top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it. Since the tabs have the same information as the
+# navigation tree you can set this option to YES if you already set
+# GENERATE_TREEVIEW to YES.
+
+DISABLE_INDEX          = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+# Since the tree basically has the same information as the tab index you
+# could consider setting DISABLE_INDEX to YES when enabling this option.
+
+GENERATE_TREEVIEW      = NONE
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values
+# (range [0,1..20]) that doxygen will group on one line in the generated HTML
+# documentation. Note that a value of 0 will completely suppress the enum
+# values from appearing in the overview section.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH         = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open
+# links to external symbols imported via tag files in a separate window.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are
+# not supported properly for IE 6.0, but are supported on all modern browsers.
+# Note that when changing this option you need to delete any form_*.png files
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT    = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
+# (see http://www.mathjax.org) which uses client side Javascript for the
+# rendering instead of using prerendered bitmaps. Use this if you do not
+# have LaTeX installed or if you want formulas to look prettier in the HTML
+# output. When enabled you may also need to install MathJax separately and
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX            = NO
+
+# When MathJax is enabled you need to specify the location relative to the
+# HTML output directory using the MATHJAX_RELPATH option. The destination
+# directory should contain the MathJax.js script. For instance, if the mathjax
+# directory is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to
+# the MathJax Content Delivery Network so you can quickly see the result without
+# installing MathJax.
+# However, it is strongly recommended to install a local
+# copy of MathJax from http://www.mathjax.org before deployment.
+
+MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering.
+
+MATHJAX_EXTENSIONS     =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
+
+SEARCHENGINE           = NO
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full text search. The disadvantages are that it is more difficult to setup
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX         = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+# Note that when enabling USE_PDFLATEX this option is only used for
+# generating bitmaps for formulas in the HTML output, but not in the
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE             = a4wide
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER           =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for
+# the generated latex document. The footer should contain everything after
+# the last chapter. If it is left blank doxygen will generate a
+# standard footer. Notice: only use this tag if you know what you are doing!
+
+LATEX_FOOTER           =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE        = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES     = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE      = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See
+# http://en.wikipedia.org/wiki/BibTeX for more info.
+
+LATEX_BIB_STYLE        = plain
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output.
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS         = NO
+
+# Load style sheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE    =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN           = YES
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION          = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA             =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD                =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader.
+# This is useful
+# if you want to understand what is going on.
+# On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION        = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF     = YES
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
+# pointed to by INCLUDE_PATH will be searched when a #include is found.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED             = DOXYGEN \
+                         PRINTF_ATTRIBUTE(x,y)=
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition that
+# overrules the definition found in the source code.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all references to function-like macros
+# that are alone on a line, have an all uppercase name, and do not end with a
+# semicolon, because these will confuse the parser if not removed.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles. For each
+# tag file the location of the external documentation should be added. The
+# format of a tag file without this location is as follows:
+#
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths
+# or URLs. Note that each tag file must have a unique name (where the name does
+# NOT include the path). If a tag file is not located in the directory in which
+# doxygen is run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS        = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option also works with HAVE_DOT disabled, but it is recommended to
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS         = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH            =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT               = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is
+# allowed to run in parallel. When set to 0 (the default) doxygen will
+# base this on the number of processors available in the system. You can set it
+# explicitly to a value larger than 0 to get control over the balance
+# between CPU load and processing speed.
+
+DOT_NUM_THREADS        = 0
+
+# By default doxygen will use the Helvetica font for all dot files that
+# doxygen generates. When you want a differently looking font you can specify
+# the font name using DOT_FONTNAME. You need to make sure dot is able to find
+# the font, which can be done by putting it in a standard location or by setting
+# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the
+# directory containing the font.
+
+DOT_FONTNAME           = FreeSans
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the Helvetica font.
+# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to
+# set the path where dot can find it.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct groups dependencies
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK               = NO
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside
+# the class node. If there are many fields or methods and many nodes the
+# graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS
+# threshold limits the number of items for each type to make the size more
+# manageable. Set this to 0 for no limit. Note that the threshold may be
+# exceeded by 50% before the limit is enforced.
+
+UML_LIMIT_NUM_FIELDS   = 10
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH          = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are svg, png, jpg, or gif.
+# If left blank png will be used. If you choose svg you need to set
+# HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible in IE 9+ (other browsers do not have this requirement).
+
+DOT_IMAGE_FORMAT       = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+# Note that this requires a modern browser other than Internet Explorer.
+# Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you
+# need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible. Older versions of IE do not have SVG support.
+
+INTERACTIVE_SVG        = NO
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS           =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the
+# \mscfile command).
+
+MSCFILE_DIRS           =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lay further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT        = YES
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP            = YES
diff --git a/ctdb/lib/talloc/install-sh b/ctdb/lib/talloc/install-sh
new file mode 100755 (executable)
index 0000000..5871924
--- /dev/null
@@ -0,0 +1,238 @@
+#! /bin/sh
+#
+# install - install a program, script, or datafile
+# This comes from X11R5.
+#
+# Calling this script install-sh is preferred over install.sh, to prevent
+# `make' implicit rules from creating a file called install from it
+# when there is no Makefile.
+#
+# This script is compatible with the BSD install script, but was written
+# from scratch.
+#
+
+
+# set DOITPROG to echo to test this script
+
+# Don't use :- since 4.3BSD and earlier shells don't like it.
+# Command prefix for every state-changing step; empty by default, set
+# DOITPROG=echo for a dry run.
+doit="${DOITPROG-}"
+
+
+# put in absolute paths if you don't have them in your path; or use env. vars.
+
+mvprog="${MVPROG-mv}"
+cpprog="${CPPROG-cp}"
+chmodprog="${CHMODPROG-chmod}"
+chownprog="${CHOWNPROG-chown}"
+chgrpprog="${CHGRPPROG-chgrp}"
+stripprog="${STRIPPROG-strip}"
+rmprog="${RMPROG-rm}"
+mkdirprog="${MKDIRPROG-mkdir}"
+
+# Working state for the option parser.  Everything is reset explicitly so
+# that a same-named variable inherited from the environment cannot leak in.
+transformbasename=""
+# This must be spelled "transformarg": that is the variable the -t= option
+# assigns and the rename step tests.
+transformarg=""
+# Default action is a move (-c switches to copy), mode 0755, no chown,
+# chgrp or strip.
+instcmd="$mvprog"
+chmodcmd="$chmodprog 0755"
+chowncmd=""
+chgrpcmd=""
+stripcmd=""
+rmcmd="$rmprog -f"
+mvcmd="$mvprog"
+src=""
+dst=""
+dir_arg=""
+
+# Parse the command line one word at a time.
+#   -c        copy instead of move
+#   -d        create a directory (the operand is the directory name)
+#   -m MODE   mode handed to $chmodprog
+#   -o OWNER  owner handed to $chownprog
+#   -g GROUP  group handed to $chgrpprog
+#   -s        strip the installed file
+#   -t=EXPR   sed expression applied to the destination base name
+#   -b=SUFFIX suffix removed before, and re-appended after, the -t transform
+# The first non-option word becomes $src; every later one overwrites $dst.
+# The loop ends at the first empty argument because of the x"$1" != x test.
+while [ x"$1" != x ]; do
+    case $1 in
+       -c) instcmd="$cpprog"
+           shift
+           continue;;
+
+       -d) dir_arg=true
+           shift
+           continue;;
+
+       -m) chmodcmd="$chmodprog $2"
+           shift
+           shift
+           continue;;
+
+       -o) chowncmd="$chownprog $2"
+           shift
+           shift
+           continue;;
+
+       -g) chgrpcmd="$chgrpprog $2"
+           shift
+           shift
+           continue;;
+
+       -s) stripcmd="$stripprog"
+           shift
+           continue;;
+
+       -t=*) transformarg=`echo $1 | sed 's/-t=//'`
+           shift
+           continue;;
+
+       -b=*) transformbasename=`echo $1 | sed 's/-b=//'`
+           shift
+           continue;;
+
+       *)  if [ x"$src" = x ]
+           then
+               src=$1
+           else
+               # this colon is to work around a 386BSD /bin/sh bug
+               :
+               dst=$1
+           fi
+           shift
+           continue;;
+    esac
+done
+
+if [ x"$src" = x ]
+then
+       echo "install:  no input file specified"
+       exit 1
+else
+       true
+fi
+
+if [ x"$dir_arg" != x ]; then
+       dst=$src
+       src=""
+       
+       if [ -d $dst ]; then
+               instcmd=:
+       else
+               instcmd=mkdir
+       fi
+else
+
+# Waiting for this to be detected by the "$instcmd $src $dsttmp" command
+# might cause directories to be created, which would be especially bad 
+# if $src (and thus $dsttmp) contains '*'.
+
+       if [ -f $src -o -d $src ]
+       then
+               true
+       else
+               echo "install:  $src does not exist"
+               exit 1
+       fi
+       
+       if [ x"$dst" = x ]
+       then
+               echo "install:  no destination specified"
+               exit 1
+       else
+               true
+       fi
+
+# If destination is a directory, append the input filename; if your system
+# does not like double slashes in filenames, you may need to add some logic
+
+       if [ -d $dst ]
+       then
+               dst="$dst"/`basename $src`
+       else
+               true
+       fi
+fi
+
+## this sed command emulates the dirname command
+## (strip the last path component, drop a trailing slash, map "" to ".")
+dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'`
+
+# Make sure that the destination directory exists.
+#  this part is taken from Noah Friedman's mkinstalldirs script
+
+# Skip lots of stat calls in the usual case.
+if [ ! -d "$dstdir" ]; then
+defaultIFS='   
+'
+IFS="${IFS-${defaultIFS}}"
+
+oIFS="${IFS}"
+# Some sh's can't handle IFS=/ for some reason.
+# So every "/" is rewritten to "%" and the path is split on "%" instead;
+# a leading "%" is turned back into "/" so absolute paths stay absolute.
+IFS='%'
+set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'`
+IFS="${oIFS}"
+
+pathcomp=''
+
+# Walk the components left to right, creating each missing prefix.
+# NOTE(review): mkdir is run directly here, not through $doit, so a
+# DOITPROG=echo dry run still creates directories - confirm that is intended.
+while [ $# -ne 0 ] ; do
+       pathcomp="${pathcomp}${1}"
+       shift
+
+       if [ ! -d "${pathcomp}" ] ;
+        then
+               $mkdirprog "${pathcomp}"
+       else
+               true
+       fi
+
+       pathcomp="${pathcomp}/"
+done
+fi
+
+if [ x"$dir_arg" != x ]
+then
+       $doit $instcmd $dst &&
+
+       if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi &&
+       if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi &&
+       if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi &&
+       if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi
+else
+
+# If we're going to rename the final executable, determine the name now.
+
+       if [ x"$transformarg" = x ] 
+       then
+               dstfile=`basename $dst`
+       else
+               dstfile=`basename $dst $transformbasename | 
+                       sed $transformarg`$transformbasename
+       fi
+
+# don't allow the sed command to completely eliminate the filename
+
+       if [ x"$dstfile" = x ] 
+       then
+               dstfile=`basename $dst`
+       else
+               true
+       fi
+
+# Make a temp file name in the proper directory.
+
+       dsttmp=$dstdir/#inst.$$#
+
+# Move or copy the file name to the temp name
+
+       $doit $instcmd $src $dsttmp &&
+
+       trap "rm -f ${dsttmp}" 0 &&
+
+# and set any options; do chmod last to preserve setuid bits
+
+# If any of these fail, we abort the whole thing.  If we want to
+# ignore errors from any of these, just make sure not to ignore
+# errors from the above "$doit $instcmd $src $dsttmp" command.
+
+       if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi &&
+       if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi &&
+       if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi &&
+       if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi &&
+
+# Now rename the file to the real destination.
+
+       $doit $rmcmd -f $dstdir/$dstfile &&
+       $doit $mvcmd $dsttmp $dstdir/$dstfile 
+
+fi &&
+
+
+exit 0
diff --git a/ctdb/lib/talloc/libtalloc.m4 b/ctdb/lib/talloc/libtalloc.m4
new file mode 100644 (file)
index 0000000..65227d8
--- /dev/null
@@ -0,0 +1,63 @@
+dnl Check to see if we should use the included talloc
+dnl
+dnl INCLUDED_TALLOC starts as "auto" and is replaced by the argument of
+dnl --with-included-talloc when that option is given.
+
+INCLUDED_TALLOC=auto
+AC_ARG_WITH(included-talloc,
+    [AC_HELP_STRING([--with-included-talloc], [use bundled talloc library, not from system])],
+    [ INCLUDED_TALLOC=$withval ])
+
+AC_SUBST(TALLOC_LIBS)
+AC_SUBST(TALLOC_CFLAGS)
+
+dnl Unless the bundled copy was explicitly requested, probe for a system
+dnl talloc (header plus talloc_init in -ltalloc).  If either probe fails,
+dnl fall back to the bundled copy; otherwise settle on the system one.
+if test x"$INCLUDED_TALLOC" != x"yes" ; then
+    AC_CHECK_HEADERS(talloc.h)
+    AC_CHECK_LIB(talloc, talloc_init, [ TALLOC_LIBS="-ltalloc" ])
+    if test x"$ac_cv_header_talloc_h" = x"no" -o x"$ac_cv_lib_talloc_talloc_init" = x"no" ; then
+        INCLUDED_TALLOC=yes
+        TALLOC_CFLAGS=""
+    else
+        INCLUDED_TALLOC=no
+    fi
+fi
+
+AC_MSG_CHECKING(whether to use included talloc)
+AC_MSG_RESULT($INCLUDED_TALLOC)
+if test x"$INCLUDED_TALLOC" != x"no" ; then
+    dnl find the talloc sources. This is meant to work both for 
+    dnl talloc standalone builds, and builds of packages using talloc
+    dnl The first directory under $srcdir that contains talloc.c wins.
+    tallocdir=""
+    tallocpaths=". lib/talloc talloc ../talloc ../lib/talloc"
+    for d in $tallocpaths; do
+       if test -f "$srcdir/$d/talloc.c"; then
+               tallocdir="$d"
+               AC_SUBST(tallocdir)
+               break
+       fi
+    done
+    if test x"$tallocdir" = "x"; then
+        AC_MSG_ERROR([cannot find talloc source in $tallocpaths])
+    fi
+    dnl Build the bundled object and point the compiler at its headers;
+    dnl no external library is linked in this case.
+    TALLOC_OBJ="talloc.o"
+    AC_SUBST(TALLOC_OBJ)
+
+    TALLOC_CFLAGS="-I$srcdir/$tallocdir"
+    AC_SUBST(TALLOC_CFLAGS)
+
+    TALLOC_LIBS=""
+    AC_SUBST(TALLOC_LIBS)
+fi
+
+dnl Refuse to configure when size_t is narrower than a pointer.
+dnl NOTE(review): the second argument "cross" looks like the value used
+dnl when cross-compiling, which would make the numeric test below
+dnl ill-formed - confirm.
+AC_CHECK_SIZEOF(size_t,cross)
+AC_CHECK_SIZEOF(void *,cross)
+
+if test $ac_cv_sizeof_size_t -lt $ac_cv_sizeof_void_p; then
+       AC_WARN([size_t cannot represent the amount of used memory of a process])
+       AC_WARN([please report this to <samba-technical@samba.org>])
+       AC_WARN([sizeof(size_t) = $ac_cv_sizeof_size_t])
+       AC_WARN([sizeof(void *) = $ac_cv_sizeof_void_p])
+       AC_ERROR([sizeof(size_t) < sizeof(void *)])
+fi
+
+dnl Substitute an exports file name only when VERSIONSCRIPT is non-empty.
+if test x"$VERSIONSCRIPT" != "x"; then
+    EXPORTSFILE=talloc.exports
+    AC_SUBST(EXPORTSFILE)
+fi
diff --git a/ctdb/lib/talloc/pytalloc-util.pc.in b/ctdb/lib/talloc/pytalloc-util.pc.in
new file mode 100644 (file)
index 0000000..bc704b4
--- /dev/null
@@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: pytalloc-util
+Description: Utility functions for using talloc objects with Python
+Version: @TALLOC_VERSION@
+Libs: -L${libdir} -lpytalloc-util
+Cflags: @LIB_RPATH@ -I${includedir}
+URL: http://talloc.samba.org/
diff --git a/ctdb/lib/talloc/pytalloc.c b/ctdb/lib/talloc/pytalloc.c
new file mode 100644 (file)
index 0000000..80196c6
--- /dev/null
@@ -0,0 +1,134 @@
+/* 
+   Unix SMB/CIFS implementation.
+   Python Talloc Module
+   Copyright (C) Jelmer Vernooij <jelmer@samba.org> 2010-2011
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <Python.h>
+#include <talloc.h>
+#include <pytalloc.h>
+
+void inittalloc(void);
+
+/**
+ * Print a full talloc tree report to stdout.
+ *
+ * Takes one optional argument.  When it is omitted or None the report is
+ * produced for the NULL context; otherwise it is produced for the memory
+ * context of the given object.
+ * NOTE(review): the argument is cast to pytalloc_Object without a type
+ * check - confirm callers only pass talloc-backed objects.
+ *
+ * Returns a new reference to None, or NULL if argument parsing fails.
+ */
+static PyObject *pytalloc_report_full(PyObject *self, PyObject *args)
+{
+       PyObject *py_obj = Py_None;
+
+       if (!PyArg_ParseTuple(args, "|O", &py_obj))
+               return NULL;
+
+       if (py_obj == Py_None) {
+               talloc_report_full(NULL, stdout);
+       } else {
+               talloc_report_full(pytalloc_get_mem_ctx(py_obj), stdout);
+       }
+
+       /* The caller owns the result, so None must be returned with its
+        * reference count incremented; a bare "return Py_None" would
+        * eventually drive the count of the None singleton to zero. */
+       Py_RETURN_NONE;
+}
+
+/**
+ * Enable talloc's tracking of allocations made on the NULL context.
+ *
+ * Takes no arguments and returns a new reference to None.
+ */
+static PyObject *pytalloc_enable_null_tracking(PyObject *self)
+{
+       talloc_enable_null_tracking();
+
+       /* Return an owned reference; a bare "return Py_None" hands out a
+        * reference that was never taken. */
+       Py_RETURN_NONE;
+}
+
+/**
+ * Return the talloc block count as a Python long.
+ *
+ * The optional argument selects whose memory context is counted; when it
+ * is omitted or None, the NULL context is used.
+ */
+static PyObject *pytalloc_total_blocks(PyObject *self, PyObject *args)
+{
+       PyObject *target = Py_None;
+       TALLOC_CTX *ctx = NULL;
+
+       if (!PyArg_ParseTuple(args, "|O", &target)) {
+               return NULL;
+       }
+
+       /* Anything other than None supplies its own memory context. */
+       if (target != Py_None) {
+               ctx = pytalloc_get_mem_ctx(target);
+       }
+
+       return PyLong_FromLong(talloc_total_blocks(ctx));
+}
+
+/*
+ * Functions exported by the "talloc" Python module.  Each entry gives the
+ * Python-visible name, the C implementation, the calling convention and
+ * the docstring; the all-NULL entry terminates the table.
+ */
+static PyMethodDef talloc_methods[] = {
+       { "report_full", (PyCFunction)pytalloc_report_full, METH_VARARGS,
+               "show a talloc tree for an object"},
+       { "enable_null_tracking", (PyCFunction)pytalloc_enable_null_tracking, METH_NOARGS,
+               "enable tracking of the NULL object"},
+       { "total_blocks", (PyCFunction)pytalloc_total_blocks, METH_VARARGS,
+               "return talloc block count"},
+       { NULL }
+};
+
+/**
+ * Default (but only slightly more useful than the default) implementation of Repr().
+ *
+ * Produces "<TYPENAME talloc object at ADDRESS>", where ADDRESS is the
+ * wrapped pointer.
+ */
+static PyObject *pytalloc_default_repr(PyObject *obj)
+{
+       pytalloc_Object *talloc_obj = (pytalloc_Object *)obj;
+       /* Read the type through the borrowed ob_type field.  PyObject_Type()
+        * returns a new reference, which was never released here and so
+        * leaked one type reference per repr() call. */
+       PyTypeObject *type = obj->ob_type;
+
+       /* PyString_FromFormat's %p already emits the leading "0x", so the
+        * format must not add another one. */
+       return PyString_FromFormat("<%s talloc object at %p>",
+                                  type->tp_name, talloc_obj->ptr);
+}
+
+/**
+ * Simple dealloc for talloc-wrapping PyObjects
+ *
+ * Drops the object's hold on its talloc context and then releases the
+ * Python object through the type's tp_free slot.
+ */
+static void pytalloc_dealloc(PyObject* self)
+{
+       pytalloc_Object *obj = (pytalloc_Object *)self;
+       int ret;
+
+       /* The unlink must run unconditionally.  Placing the call inside
+        * assert() removed it from NDEBUG builds, leaking every context. */
+       ret = talloc_unlink(NULL, obj->talloc_ctx);
+       assert(ret != -1);
+       (void)ret; /* silences "unused" warnings when assert() is compiled out */
+
+       obj->talloc_ctx = NULL;
+       self->ob_type->tp_free(self);
+}
+
+/**
+ * Default (but only slightly more useful than the default) implementation of cmp.
+ *
+ * Objects of different types are ordered by the address of their type
+ * object; objects of the same type are ordered by the address of the
+ * wrapped pointer.
+ *
+ * Returns -1, 0 or 1.
+ */
+static int pytalloc_default_cmp(PyObject *_obj1, PyObject *_obj2)
+{
+       pytalloc_Object *obj1 = (pytalloc_Object *)_obj1,
+                                        *obj2 = (pytalloc_Object *)_obj2;
+       const char *lhs, *rhs;
+
+       if (obj1->ob_type != obj2->ob_type) {
+               lhs = (const char *)obj1->ob_type;
+               rhs = (const char *)obj2->ob_type;
+       } else {
+               lhs = (const char *)pytalloc_get_ptr(obj1);
+               rhs = (const char *)pytalloc_get_ptr(obj2);
+       }
+
+       /* Compare rather than subtract: a pointer difference does not fit
+        * in an int on 64-bit platforms (it can even truncate to 0 for
+        * unequal pointers), and tp_compare is expected to yield -1/0/1. */
+       return (lhs > rhs) - (lhs < rhs);
+}
+
+/*
+ * Type object that inittalloc() publishes as "Object" in the talloc
+ * module.  Py_TPFLAGS_BASETYPE allows it to be subclassed.  Only the
+ * slots named here are set; the rest are left zero by the static
+ * initialiser.
+ */
+static PyTypeObject TallocObject_Type = {
+       .tp_name = "talloc.Object",
+       .tp_doc = "Python wrapper for a talloc-maintained object.",
+       .tp_basicsize = sizeof(pytalloc_Object),
+       .tp_dealloc = (destructor)pytalloc_dealloc,
+       .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,
+       .tp_repr = pytalloc_default_repr,
+       .tp_compare = pytalloc_default_cmp,
+};
+
+/*
+ * Module initialisation entry point for the "talloc" extension: finalises
+ * the Object type, creates the module and publishes the type in it.
+ * Returns silently if either of the first two steps fails.
+ */
+void inittalloc(void)
+{
+       PyObject *module;
+
+       if (PyType_Ready(&TallocObject_Type) < 0) {
+               return;
+       }
+
+       module = Py_InitModule3("talloc", talloc_methods,
+                               "Python wrapping of talloc-maintained objects.");
+       if (module != NULL) {
+               /* The module takes over one reference to the type. */
+               Py_INCREF(&TallocObject_Type);
+               PyModule_AddObject(module, "Object",
+                                  (PyObject *)&TallocObject_Type);
+       }
+}
diff --git a/ctdb/lib/talloc/pytalloc.h b/ctdb/lib/talloc/pytalloc.h
new file mode 100644 (file)
index 0000000..2d2c57b
--- /dev/null
@@ -0,0 +1,54 @@
+/* 
+   Unix SMB/CIFS implementation.
+   Samba utility functions
+   Copyright (C) Jelmer Vernooij <jelmer@samba.org> 2008
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _PYTALLOC_H_
+#define _PYTALLOC_H_
+
+#include <Python.h>
+#include <talloc.h>
+
+/* Python object that wraps a talloc-managed pointer. */
+typedef struct {
+       PyObject_HEAD
+       /* talloc context tied to this Python object (read through
+        * pytalloc_get_mem_ctx) */
+       TALLOC_CTX *talloc_ctx;
+       /* the wrapped pointer (read through pytalloc_get_ptr) */
+       void *ptr;
+} pytalloc_Object;
+
+PyTypeObject *pytalloc_GetObjectType(void);
+int pytalloc_Check(PyObject *);
+
+/* Retrieve the pointer for a pytalloc_object. Like talloc_get_type() 
+ * but for pytalloc_Objects. */
+
+/* FIXME: Call PyErr_SetString(PyExc_TypeError, "expected " __STR(type) ") 
+ * when talloc_get_type() returns NULL. */
+#define pytalloc_get_type(py_obj, type) (talloc_get_type(pytalloc_get_ptr(py_obj), type))
+
+/* Accessors for the two fields of a pytalloc_Object.  The argument and the
+ * whole expansion are parenthesised so the macros stay correct when given
+ * a compound expression or used inside a larger expression; the result is
+ * still an lvalue. */
+#define pytalloc_get_ptr(py_obj) (((pytalloc_Object *)(py_obj))->ptr)
+#define pytalloc_get_mem_ctx(py_obj) (((pytalloc_Object *)(py_obj))->talloc_ctx)
+
+/* Create a py_type instance that wraps ptr; mem_ctx is moved (talloc_steal)
+ * under a context owned by the new Python object. */
+PyObject *pytalloc_steal_ex(PyTypeObject *py_type, TALLOC_CTX *mem_ctx, void *ptr);
+/* NOTE(review): presumably pytalloc_steal_ex with ptr as its own memory
+ * context - definition not shown here, confirm. */
+PyObject *pytalloc_steal(PyTypeObject *py_type, void *ptr);
+/* NOTE(review): presumably takes a talloc reference to mem_ctx instead of
+ * moving it - definition not shown here, confirm. */
+PyObject *pytalloc_reference_ex(PyTypeObject *py_type, TALLOC_CTX *mem_ctx, void *ptr);
+/* Shorthand that passes talloc_ptr as both the memory context and the
+ * wrapped pointer. */
+#define pytalloc_reference(py_type, talloc_ptr) pytalloc_reference_ex(py_type, talloc_ptr, talloc_ptr)
+
+/* Allocate a zeroed `type` on the NULL context and hand it to
+ * pytalloc_steal() to wrap it in a new typeobj instance. */
+#define pytalloc_new(type, typeobj) pytalloc_steal(typeobj, talloc_zero(NULL, type))
+
+PyObject *pytalloc_CObject_FromTallocPtr(void *);
+
+#endif /* _PYTALLOC_H_ */
diff --git a/ctdb/lib/talloc/pytalloc_util.c b/ctdb/lib/talloc/pytalloc_util.c
new file mode 100644 (file)
index 0000000..89a093b
--- /dev/null
@@ -0,0 +1,118 @@
+/* 
+   Unix SMB/CIFS implementation.
+   Python/Talloc glue
+   Copyright (C) Jelmer Vernooij <jelmer@samba.org> 2008
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <Python.h>
+#include "replace.h"
+#include <talloc.h>
+#include "pytalloc.h"
+#include <assert.h>
+
+/**
+ * Look up the "Object" type of the Python "talloc" module.
+ *
+ * The result of a successful lookup is kept in a function-local static and
+ * handed out on every later call.  NULL is returned when the module cannot
+ * be imported or the attribute lookup fails; the next call then retries.
+ */
+_PUBLIC_ PyTypeObject *pytalloc_GetObjectType(void)
+{
+       static PyTypeObject *cached_type = NULL;
+
+       if (cached_type == NULL) {
+               PyObject *talloc_module = PyImport_ImportModule("talloc");
+
+               if (talloc_module == NULL) {
+                       return NULL;
+               }
+
+               cached_type = (PyTypeObject *)PyObject_GetAttrString(talloc_module, "Object");
+               Py_DECREF(talloc_module);
+       }
+
+       return cached_type;
+}
+
+/**
+ * Import an existing talloc pointer into a Python object.
+ *
+ * A fresh top-level talloc context is created for the new object and
+ * mem_ctx is moved onto it with talloc_steal(); ptr is stored as the
+ * wrapped pointer.
+ *
+ * Returns a new py_type instance, or NULL with a Python exception set when
+ * the object or its context cannot be allocated or talloc_steal() reports
+ * failure.
+ */
+_PUBLIC_ PyObject *pytalloc_steal_ex(PyTypeObject *py_type, TALLOC_CTX *mem_ctx,
+                                                  void *ptr)
+{
+       pytalloc_Object *ret = (pytalloc_Object *)py_type->tp_alloc(py_type, 0);
+       if (ret == NULL) {
+               /* tp_alloc is expected to have set the exception already */
+               return NULL;
+       }
+       ret->talloc_ctx = talloc_new(NULL);
+       if (ret->talloc_ctx == NULL) {
+               /* NOTE(review): ret itself is not released on the failure
+                * paths; the type's dealloc is defined outside this file and
+                * may not cope with a NULL talloc_ctx - confirm before adding
+                * a Py_DECREF() here. */
+               return PyErr_NoMemory();
+       }
+       if (talloc_steal(ret->talloc_ctx, mem_ctx) == NULL) {
+               /* do not leak the context that was just created */
+               talloc_free(ret->talloc_ctx);
+               ret->talloc_ctx = NULL;
+               return PyErr_NoMemory();
+       }
+       talloc_set_name_const(ret->talloc_ctx, py_type->tp_name);
+       ret->ptr = ptr;
+       return (PyObject *)ret;
+}
+
+/**
+ * Convenience form of pytalloc_steal_ex() for the case in which the
+ * wrapped pointer is also the talloc context to be taken over.
+ */
+_PUBLIC_ PyObject *pytalloc_steal(PyTypeObject *py_type, void *ptr)
+{
+       TALLOC_CTX *owner = ptr;
+
+       return pytalloc_steal_ex(py_type, owner, ptr);
+}
+
+
+/**
+ * Import an existing talloc pointer into a Python object, leaving the
+ * original parent, and creating a reference to the object in the python
+ * object.
+ *
+ * A fresh top-level talloc context is created for the new object and made
+ * an additional parent of mem_ctx with talloc_reference(); ptr is stored
+ * as the wrapped pointer.
+ *
+ * Returns None when ptr is NULL.  Otherwise returns a new py_type
+ * instance, or NULL with a Python exception set when the object, its
+ * context or the reference cannot be created.
+ */
+_PUBLIC_ PyObject *pytalloc_reference_ex(PyTypeObject *py_type, TALLOC_CTX *mem_ctx, void *ptr)
+{
+       pytalloc_Object *ret;
+
+       if (ptr == NULL) {
+               Py_RETURN_NONE;
+       }
+
+       ret = (pytalloc_Object *)py_type->tp_alloc(py_type, 0);
+       if (ret == NULL) {
+               /* tp_alloc is expected to have set the exception already */
+               return NULL;
+       }
+       ret->talloc_ctx = talloc_new(NULL);
+       if (ret->talloc_ctx == NULL) {
+               /* NOTE(review): ret itself is not released on the failure
+                * paths; the type's dealloc is defined outside this file and
+                * may not cope with a NULL talloc_ctx - confirm before adding
+                * a Py_DECREF() here. */
+               return PyErr_NoMemory();
+       }
+       if (talloc_reference(ret->talloc_ctx, mem_ctx) == NULL) {
+               /* do not leak the context that was just created */
+               talloc_free(ret->talloc_ctx);
+               ret->talloc_ctx = NULL;
+               return PyErr_NoMemory();
+       }
+       talloc_set_name_const(ret->talloc_ctx, py_type->tp_name);
+       ret->ptr = ptr;
+       return (PyObject *)ret;
+}
+
+/* PyCObject destructor: release the wrapped talloc pointer.  The result
+ * of talloc_free() is deliberately discarded. */
+static void py_cobject_talloc_free(void *ptr)
+{
+       (void)talloc_free(ptr);
+}
+
+/**
+ * Wrap a talloc pointer in a PyCObject whose destructor is
+ * py_cobject_talloc_free().  A NULL pointer yields None instead.
+ */
+_PUBLIC_ PyObject *pytalloc_CObject_FromTallocPtr(void *ptr)
+{
+       if (ptr != NULL) {
+               return PyCObject_FromVoidPtr(ptr, py_cobject_talloc_free);
+       }
+
+       Py_RETURN_NONE;
+}
+
+/**
+ * Report whether obj passes PyObject_TypeCheck() against the type returned
+ * by pytalloc_GetObjectType().
+ */
+_PUBLIC_ int pytalloc_Check(PyObject *obj)
+{
+       /* Keep the type in a local: it is looked up exactly once. */
+       PyTypeObject *talloc_type = pytalloc_GetObjectType();
+       int matches = PyObject_TypeCheck(obj, talloc_type);
+
+       return matches;
+}
diff --git a/ctdb/lib/talloc/talloc.3.xml b/ctdb/lib/talloc/talloc.3.xml
new file mode 100644 (file)
index 0000000..99e8bcd
--- /dev/null
@@ -0,0 +1,801 @@
+<?xml version="1.0"?>
+<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN" "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
+<refentry>
+  <refmeta>
+    <refentrytitle>talloc</refentrytitle>
+    <manvolnum>3</manvolnum>
+  </refmeta>
+  <refnamediv>
+    <refname>talloc</refname>
+<refpurpose>hierarchical reference counted memory pool system with destructors</refpurpose>
+  </refnamediv>
+  <refsynopsisdiv>
+<synopsis>#include &lt;talloc.h&gt;</synopsis>
+  </refsynopsisdiv>
+  <refsect1><title>DESCRIPTION</title>
+    <para>
+      If you are used to talloc from Samba3 then please read this
+      carefully, as talloc has changed a lot.
+    </para>
+    <para>
+      The new talloc is a hierarchical, reference counted memory pool
+      system with destructors. Quite a mouthful really, but not too bad
+      once you get used to it.
+    </para>
+    <para>
+      Perhaps the biggest change from Samba3 is that there is no
+      distinction between a "talloc context" and a "talloc pointer".  Any
+      pointer returned from talloc() is itself a valid talloc context. 
+      This means you can do this:
+    </para>
+    <programlisting>
+    struct foo *X = talloc(mem_ctx, struct foo);
+    X->name = talloc_strdup(X, "foo");
+    </programlisting>
+    <para>
+      and the pointer <literal role="code">X-&gt;name</literal>
+      would be a "child" of the talloc context <literal
+      role="code">X</literal> which is itself a child of
+      <literal role="code">mem_ctx</literal>.  So if you do
+      <literal role="code">talloc_free(mem_ctx)</literal> then
+      it is all destroyed, whereas if you do <literal
+      role="code">talloc_free(X)</literal> then just <literal
+      role="code">X</literal> and <literal
+      role="code">X-&gt;name</literal> are destroyed, and if
+      you do <literal
+      role="code">talloc_free(X-&gt;name)</literal> then just
+      the name element of <literal role="code">X</literal> is
+      destroyed.
+    </para>
+    <para>
+      If you think about this, then what this effectively gives you is an
+      n-ary tree, where you can free any part of the tree with
+      talloc_free().
+    </para>
+    <para>
+      If you find this confusing, then I suggest you run the <literal
+      role="code">testsuite</literal> program to watch talloc
+      in action.  You may also like to add your own tests to <literal
+      role="code">testsuite.c</literal> to clarify how some
+      particular situation is handled.
+    </para>
+  </refsect1>
+  <refsect1><title>TALLOC API</title>
+    <para>
+      The following is a complete guide to the talloc API. Read it all at
+      least twice.
+    </para>
+    <refsect2><title>(type *)talloc(const void *ctx, type);</title>
+        <para>
+         The talloc() macro is the core of the talloc library.  It takes a
+         memory <emphasis role="italic">ctx</emphasis> and a <emphasis
+         role="italic">type</emphasis>, and returns a pointer to a new
+         area of memory of the given <emphasis
+         role="italic">type</emphasis>.
+        </para>
+        <para>
+         The returned pointer is itself a talloc context, so you can use
+         it as the <emphasis role="italic">ctx</emphasis> argument to more
+         calls to talloc() if you wish.
+        </para>
+        <para>
+         The returned pointer is a "child" of the supplied context.  This
+         means that if you talloc_free() the <emphasis
+         role="italic">ctx</emphasis> then the new child disappears as
+         well.  Alternatively you can free just the child.
+        </para>
+        <para>
+         The <emphasis role="italic">ctx</emphasis> argument to talloc()
+         can be NULL, in which case a new top level context is created.
+        </para>
+    </refsect2>
+    <refsect2><title>void *talloc_size(const void *ctx, size_t size);</title>
+        <para>
+         The function talloc_size() should be used when you don't have a
+         convenient type to pass to talloc().  Unlike talloc(), it is not
+         type safe (as it returns a void *), so you are on your own for
+         type checking.
+        </para>
+    </refsect2>
+    <refsect2><title>(typeof(ptr)) talloc_ptrtype(const void *ctx, ptr);</title>
+        <para>
+         The talloc_ptrtype() macro should be used when you have a pointer and
+         want to allocate memory to point at with this pointer. When compiling
+         with gcc >= 3 it is typesafe. Note this is a wrapper of talloc_size()
+         and talloc_get_name() will return the current location in the source file
+         and not the type.
+        </para>
+    </refsect2>
+    <refsect2><title>int talloc_free(void *ptr);</title>
+        <para>
+         The talloc_free() function frees a piece of talloc memory, and
+         all its children.  You can call talloc_free() on any pointer
+         returned by talloc().
+        </para>
+        <para>
+         The return value of talloc_free() indicates success or failure,
+         with 0 returned for success and -1 for failure.  The only
+         possible failure condition is if <emphasis
+         role="italic">ptr</emphasis> had a destructor attached to it and
+         the destructor returned -1.  See <link
+         linkend="talloc_set_destructor"><quote>talloc_set_destructor()</quote></link>
+         for details on destructors.
+        </para>
+        <para>
+         If this pointer has an additional parent when talloc_free() is
+         called then the memory is not actually released, but instead the
+         most recently established parent is destroyed.  See <link
+         linkend="talloc_reference"><quote>talloc_reference()</quote></link>
+         for details on establishing additional parents.
+        </para>
+        <para>
+         For more control on which parent is removed, see <link
+         linkend="talloc_unlink"><quote>talloc_unlink()</quote></link>.
+        </para>
+        <para>
+         talloc_free() operates recursively on its children.
+        </para>
+       <para>
+         From the 2.0 version of talloc, as a special case,
+         talloc_free() is refused on pointers that have more than one
+         parent, as talloc would have no way of knowing which parent
+         should be removed. To free a pointer that has more than one
+         parent please use talloc_unlink().
+       </para>
+       <para>
+         To help you find problems in your code caused by this behaviour, if
+         you do try and free a pointer with more than one parent then the
+         talloc logging function will be called to give output like this:
+       </para>
+       <para>
+         <screen format="linespecific">
+           ERROR: talloc_free with references at some_dir/source/foo.c:123
+               reference at some_dir/source/other.c:325
+               reference at some_dir/source/third.c:121
+         </screen>
+       </para>
+       <para>
+         Please see the documentation for talloc_set_log_fn() and
+         talloc_set_log_stderr() for more information on talloc logging
+         functions.
+       </para>
+    </refsect2>
+    <refsect2 id="talloc_reference"><title>void *talloc_reference(const void *ctx, const void *ptr);</title>
+        <para>
+         The talloc_reference() function makes <emphasis
+         role="italic">ctx</emphasis> an additional parent of <emphasis
+         role="italic">ptr</emphasis>.
+        </para>
+        <para>
+         The return value of talloc_reference() is always the original
+         pointer <emphasis role="italic">ptr</emphasis>, unless talloc ran
+         out of memory in creating the reference in which case it will
+         return NULL (each additional reference consumes around 48 bytes
+         of memory on intel x86 platforms).
+        </para>
+        <para>
+         If <emphasis role="italic">ptr</emphasis> is NULL, then the
+         function is a no-op, and simply returns NULL.
+        </para>
+        <para>
+         After creating a reference you can free it in one of the
+         following ways:
+        </para>
+      <para>
+        <itemizedlist>
+          <listitem>
+            <para>
+             you can talloc_free() any parent of the original pointer. 
+             That will reduce the number of parents of this pointer by 1,
+             and will cause this pointer to be freed if it runs out of
+             parents.
+            </para>
+          </listitem>
+          <listitem>
+            <para>
+             you can talloc_free() the pointer itself.  That will destroy
+             the most recently established parent to the pointer and leave
+             the pointer as a child of its current parent.
+            </para>
+          </listitem>
+        </itemizedlist>
+      </para>
+      <para>
+       For more control on which parent to remove, see <link
+       linkend="talloc_unlink"><quote>talloc_unlink()</quote></link>.
+      </para>
+    </refsect2>
+    <refsect2 id="talloc_unlink"><title>int talloc_unlink(const void *ctx, const void *ptr);</title>
+        <para>
+         The talloc_unlink() function removes a specific parent from
+         <emphasis role="italic">ptr</emphasis>. The <emphasis
+         role="italic">ctx</emphasis> passed must either be a context used
+         in talloc_reference() with this pointer, or must be a direct
+         parent of ptr.
+        </para>
+        <para>
+         Note that if the parent has already been removed using
+         talloc_free() then this function will fail and will return -1. 
+         Likewise, if <emphasis role="italic">ptr</emphasis> is NULL, then
+         the function will make no modifications and return -1.
+        </para>
+        <para>
+         Usually you can just use talloc_free() instead of
+         talloc_unlink(), but sometimes it is useful to have the
+         additional control on which parent is removed.
+        </para>
+    </refsect2>
+    <refsect2 id="talloc_set_destructor"><title>void talloc_set_destructor(const void *ptr, int (*destructor)(void *));</title>
+        <para>
+         The function talloc_set_destructor() sets the <emphasis
+         role="italic">destructor</emphasis> for the pointer <emphasis
+         role="italic">ptr</emphasis>.  A <emphasis
+         role="italic">destructor</emphasis> is a function that is called
+         when the memory used by a pointer is about to be released.  The
+         destructor receives <emphasis role="italic">ptr</emphasis> as an
+         argument, and should return 0 for success and -1 for failure.
+        </para>
+        <para>
+         The <emphasis role="italic">destructor</emphasis> can do anything
+         it wants to, including freeing other pieces of memory.  A common
+         use for destructors is to clean up operating system resources
+         (such as open file descriptors) contained in the structure the
+         destructor is placed on.
+        </para>
+        <para>
+         You can only place one destructor on a pointer.  If you need more
+         than one destructor then you can create a zero-length child of
+         the pointer and place an additional destructor on that.
+        </para>
+        <para>
+         To remove a destructor call talloc_set_destructor() with NULL for
+         the destructor.
+        </para>
+        <para>
+         If your destructor attempts to talloc_free() the pointer that it
+         is the destructor for then talloc_free() will return -1 and the
+         free will be ignored.  This would be a pointless operation
+         anyway, as the destructor is only called when the memory is just
+         about to go away.
+        </para>
+    </refsect2>
+    <refsect2><title>int talloc_increase_ref_count(const void *<emphasis role="italic">ptr</emphasis>);</title>
+        <para>
+         The talloc_increase_ref_count(<emphasis
+         role="italic">ptr</emphasis>) function is exactly equivalent to:
+        </para>
+        <programlisting>talloc_reference(NULL, ptr);</programlisting>
+        <para>
+         You can use either syntax, depending on which you think is
+         clearer in your code.
+        </para>
+        <para>
+         It returns 0 on success and -1 on failure.
+        </para>
+    </refsect2>
+    <refsect2><title>size_t talloc_reference_count(const void *<emphasis role="italic">ptr</emphasis>);</title>
+        <para>
+         Return the number of references to the pointer.
+        </para>
+    </refsect2>
+    <refsect2 id="talloc_set_name"><title>void talloc_set_name(const void *ptr, const char *fmt, ...);</title>
+        <para>
+         Each talloc pointer has a "name".  The name is used principally
+         for debugging purposes, although it is also possible to set and
+         get the name on a pointer as a way of "marking" pointers in
+         your code.
+        </para>
+        <para>
+         The main use for names on pointers is for "talloc reports".  See
+         <link
+         linkend="talloc_report_depth_cb"><quote>talloc_report_depth_cb()</quote></link>,
+         <link
+         linkend="talloc_report_depth_file"><quote>talloc_report_depth_file()</quote></link>,
+         <link
+         linkend="talloc_report"><quote>talloc_report()</quote></link>
+         and <link
+         linkend="talloc_report_full"><quote>talloc_report_full()</quote></link>
+         for details.  Also see <link
+         linkend="talloc_enable_leak_report"><quote>talloc_enable_leak_report()</quote></link>
+         and <link
+         linkend="talloc_enable_leak_report_full"><quote>talloc_enable_leak_report_full()</quote></link>.
+        </para>
+        <para>
+         The talloc_set_name() function allocates memory as a child of the
+         pointer.  It is logically equivalent to:
+        </para>
+        <programlisting>talloc_set_name_const(ptr, talloc_asprintf(ptr, fmt, ...));</programlisting>
+        <para>
+         Note that multiple calls to talloc_set_name() will allocate more
+         memory without releasing the name.  All of the memory is released
+         when the ptr is freed using talloc_free().
+        </para>
+    </refsect2>
+    <refsect2><title>void talloc_set_name_const(const void *<emphasis role="italic">ptr</emphasis>, const char *<emphasis role="italic">name</emphasis>);</title>
+        <para>
+         The function talloc_set_name_const() is just like
+         talloc_set_name(), but it takes a string constant, and is much
+         faster.  It is extensively used by the "auto naming" macros, such
+         as talloc_p().
+        </para>
+        <para>
+         This function does not allocate any memory.  It just copies the
+         supplied pointer into the internal representation of the talloc
+         ptr. This means you must not pass a <emphasis
+         role="italic">name</emphasis> pointer to memory that will
+         disappear before <emphasis role="italic">ptr</emphasis> is freed
+         with talloc_free().
+        </para>
+    </refsect2>
+    <refsect2><title>void *talloc_named(const void *<emphasis role="italic">ctx</emphasis>, size_t <emphasis role="italic">size</emphasis>, const char *<emphasis role="italic">fmt</emphasis>, ...);</title>
+        <para>
+         The talloc_named() function creates a named talloc pointer.  It
+         is equivalent to:
+        </para>
+        <programlisting>ptr = talloc_size(ctx, size);
+talloc_set_name(ptr, fmt, ....);</programlisting>
+    </refsect2>
+    <refsect2><title>void *talloc_named_const(const void *<emphasis role="italic">ctx</emphasis>, size_t <emphasis role="italic">size</emphasis>, const char *<emphasis role="italic">name</emphasis>);</title>
+        <para>
+         This is equivalent to:
+        </para>
+        <programlisting>ptr = talloc_size(ctx, size);
+talloc_set_name_const(ptr, name);</programlisting>
+    </refsect2>
+    <refsect2><title>const char *talloc_get_name(const void *<emphasis role="italic">ptr</emphasis>);</title>
+        <para>
+         This returns the current name for the given talloc pointer,
+         <emphasis role="italic">ptr</emphasis>. See <link
+         linkend="talloc_set_name"><quote>talloc_set_name()</quote></link>
+         for details.
+        </para>
+    </refsect2>
+    <refsect2><title>void *talloc_init(const char *<emphasis role="italic">fmt</emphasis>, ...);</title>
+        <para>
+         This function creates a zero length named talloc context as a top
+         level context.  It is equivalent to:
+        </para>
+        <programlisting>talloc_named(NULL, 0, fmt, ...);</programlisting>
+    </refsect2>
+    <refsect2><title>void *talloc_new(void *<emphasis role="italic">ctx</emphasis>);</title>
+        <para>
+         This is a utility macro that creates a new memory context hanging
+         off an existing context, automatically naming it "talloc_new:
+         __location__" where __location__ is the source line it is called
+         from.  It is particularly useful for creating a new temporary
+         working context.
+        </para>
+    </refsect2>
+    <refsect2><title>(<emphasis role="italic">type</emphasis> *)talloc_realloc(const void *<emphasis role="italic">ctx</emphasis>, void *<emphasis role="italic">ptr</emphasis>, <emphasis role="italic">type</emphasis>, <emphasis role="italic">count</emphasis>);</title>
+        <para>
+         The talloc_realloc() macro changes the size of a talloc pointer. 
+         It has the following equivalences:
+        </para>
+        <programlisting>talloc_realloc(ctx, NULL, type, 1) ==> talloc(ctx, type);
+talloc_realloc(ctx, ptr, type, 0)  ==> talloc_free(ptr);</programlisting>
+        <para>
+         The <emphasis role="italic">ctx</emphasis> argument is only used
+         if <emphasis role="italic">ptr</emphasis> is NULL, otherwise
+         it is ignored.
+        </para>
+        <para>
+         talloc_realloc() returns the new pointer, or NULL on failure. 
+         The call will fail either due to a lack of memory, or because the
+         pointer has more than one parent (see <link
+         linkend="talloc_reference"><quote>talloc_reference()</quote></link>).
+        </para>
+    </refsect2>
+    <refsect2><title>void *talloc_realloc_size(const void *ctx, void *ptr, size_t size);</title>
+        <para>
+         The talloc_realloc_size() function is useful when the type is not
+         known so the type-safe talloc_realloc() cannot be used.
+        </para>
+    </refsect2>
+    <refsect2><title>TYPE *talloc_steal(const void *<emphasis role="italic">new_ctx</emphasis>, const TYPE *<emphasis role="italic">ptr</emphasis>);</title>
+        <para>
+         The talloc_steal() function changes the parent context of a
+         talloc pointer.  It is typically used when the context that the
+         pointer is currently a child of is going to be freed and you wish
+         to keep the memory for a longer time.
+        </para>
+        <para>
+         The talloc_steal() function returns the pointer that you pass it.
+          It does not have any failure modes.
+        </para>
+        <para>
+         It is possible to produce loops in the parent/child
+         relationship if you are not careful with talloc_steal().  No
+         guarantees are provided as to your sanity or the safety of your
+         data if you do this.
+        </para>
+        <para>
+         Note that if you try and call talloc_steal() on a pointer that has
+         more than one parent then the result is ambiguous. Talloc will choose
+         to remove the parent that is currently indicated by talloc_parent()
+         and replace it with the chosen parent. You will also get a message
+         like this via the talloc logging functions:
+        </para>
+        <para>
+         <screen format="linespecific">
+         WARNING: talloc_steal with references at some_dir/source/foo.c:123
+               reference at some_dir/source/other.c:325
+               reference at some_dir/source/third.c:121
+         </screen>
+        </para>
+        <para>
+         To unambiguously change the parent of a pointer please see
+         the
+         function <link linkend="talloc_reparent"><quote>talloc_reparent()</quote></link>. See
+         the talloc_set_log_fn() documentation for more information
+         on talloc logging.
+       </para>
+    </refsect2>
+    <refsect2 id="talloc_reparent"><title>TYPE *talloc_reparent(const void *<emphasis role="italic">old_parent</emphasis>, const void *<emphasis role="italic">new_parent</emphasis>, const TYPE *<emphasis role="italic">ptr</emphasis>);</title>
+        <para>
+         The talloc_reparent() function changes the parent context of a talloc
+         pointer. It is typically used when the context that the pointer is
+         currently a child of is going to be freed and you wish to keep the
+         memory for a longer time.
+        </para>
+        <para>
+         The talloc_reparent() function returns the pointer that you pass it. It
+         does not have any failure modes.
+        </para>
+        <para>
+         The difference between talloc_reparent() and talloc_steal() is that
+         talloc_reparent() can specify which parent you wish to change. This is
+         useful when a pointer has multiple parents via references.
+        </para>
+    </refsect2>
+    <refsect2><title>TYPE *talloc_move(const void *<emphasis role="italic">new_ctx</emphasis>, TYPE **<emphasis role="italic">ptr</emphasis>);</title>
+        <para>
+         The talloc_move() function is a wrapper around
+         talloc_steal() which zeros the source pointer after the
+         move. This avoids a potential source of bugs where a
+         programmer leaves a pointer in two structures, and uses the
+         pointer from the old structure after it has been moved to a
+         new one.
+        </para>
+    </refsect2>
+    <refsect2><title>size_t talloc_total_size(const void *<emphasis role="italic">ptr</emphasis>);</title>
+        <para>
+         The talloc_total_size() function returns the total size in bytes
+         used by this pointer and all child pointers.  Mostly useful for
+         debugging.
+        </para>
+        <para>
+         Passing NULL is allowed, but it will only give a meaningful
+         result if talloc_enable_leak_report() or
+         talloc_enable_leak_report_full() has been called.
+        </para>
+    </refsect2>
+    <refsect2><title>size_t talloc_total_blocks(const void *<emphasis role="italic">ptr</emphasis>);</title>
+        <para>
+         The talloc_total_blocks() function returns the total memory block
+         count used by this pointer and all child pointers.  Mostly useful
+         for debugging.
+        </para>
+        <para>
+         Passing NULL is allowed, but it will only give a meaningful
+         result if talloc_enable_leak_report() or
+         talloc_enable_leak_report_full() has been called.
+        </para>
+    </refsect2>
+    <refsect2 id="talloc_report"><title>void talloc_report(const void *ptr, FILE *f);</title>
+        <para>
+         The talloc_report() function prints a summary report of all
+         memory used by <emphasis role="italic">ptr</emphasis>.  One line
+         of report is printed for each immediate child of ptr, showing the
+         total memory and number of blocks used by that child.
+        </para>
+        <para>
+         You can pass NULL for the pointer, in which case a report is
+         printed for the top level memory context, but only if
+         talloc_enable_leak_report() or talloc_enable_leak_report_full()
+         has been called.
+        </para>
+    </refsect2>
+    <refsect2 id="talloc_report_full"><title>void talloc_report_full(const void *<emphasis role="italic">ptr</emphasis>, FILE *<emphasis role="italic">f</emphasis>);</title>
+        <para>
+         This provides a more detailed report than talloc_report().  It
+         will recursively print the entire tree of memory referenced by
+         the pointer. References in the tree are shown by giving the name
+         of the pointer that is referenced.
+        </para>
+        <para>
+         You can pass NULL for the pointer, in which case a report is
+         printed for the top level memory context, but only if
+         talloc_enable_leak_report() or talloc_enable_leak_report_full()
+         has been called.
+        </para>
+    </refsect2>
+    <refsect2 id="talloc_report_depth_cb">
+     <funcsynopsis><funcprototype>
+      <funcdef>void <function>talloc_report_depth_cb</function></funcdef>
+      <paramdef><parameter>const void *ptr</parameter></paramdef>
+      <paramdef><parameter>int depth</parameter></paramdef>
+      <paramdef><parameter>int max_depth</parameter></paramdef>
+      <paramdef><parameter>void (*callback)(const void *ptr, int depth, int max_depth, int is_ref, void *priv)</parameter></paramdef>
+      <paramdef><parameter>void *priv</parameter></paramdef>
+     </funcprototype></funcsynopsis>
+        <para>
+         This provides a more flexible report than talloc_report(). It
+         will recursively call the callback for the entire tree of memory
+         referenced by the pointer. References in the tree are passed with
+         <emphasis role="italic">is_ref = 1</emphasis> and the pointer that is referenced.
+        </para>
+        <para>
+         You can pass NULL for the pointer, in which case a report is
+         printed for the top level memory context, but only if
+         talloc_enable_leak_report() or talloc_enable_leak_report_full()
+         has been called.
+        </para>
+        <para>
+         The recursion is stopped when depth >= max_depth.
+         max_depth = -1 means only stop at leaf nodes.
+        </para>
+    </refsect2>
+    <refsect2 id="talloc_report_depth_file">
+     <funcsynopsis><funcprototype>
+      <funcdef>void <function>talloc_report_depth_file</function></funcdef>
+      <paramdef><parameter>const void *ptr</parameter></paramdef>
+      <paramdef><parameter>int depth</parameter></paramdef>
+      <paramdef><parameter>int max_depth</parameter></paramdef>
+      <paramdef><parameter>FILE *f</parameter></paramdef>
+     </funcprototype></funcsynopsis>
+        <para>
+         This provides a more flexible report than talloc_report(). It
+         will let you specify the depth and max_depth.
+        </para>
+    </refsect2>
+    <refsect2 id="talloc_enable_leak_report"><title>void talloc_enable_leak_report(void);</title>
+        <para>
+         This enables calling of talloc_report(NULL, stderr) when the
+         program exits.  In Samba4 this is enabled by using the
+         --leak-report command line option.
+        </para>
+        <para>
+         For it to be useful, this function must be called before any
+         other talloc function as it establishes a "null context" that
+         acts as the top of the tree.  If you don't call this function
+         first then passing NULL to talloc_report() or
+         talloc_report_full() won't give you the full tree printout.
+        </para>
+        <para>
+         Here is a typical talloc report:
+        </para>
+        <screen format="linespecific">talloc report on 'null_context' (total 267 bytes in 15 blocks)
+libcli/auth/spnego_parse.c:55  contains   31 bytes in   2 blocks
+libcli/auth/spnego_parse.c:55  contains   31 bytes in   2 blocks
+iconv(UTF8,CP850)              contains   42 bytes in   2 blocks
+libcli/auth/spnego_parse.c:55  contains   31 bytes in   2 blocks
+iconv(CP850,UTF8)              contains   42 bytes in   2 blocks
+iconv(UTF8,UTF-16LE)           contains   45 bytes in   2 blocks
+iconv(UTF-16LE,UTF8)           contains   45 bytes in   2 blocks
+      </screen>
+    </refsect2>
+    <refsect2 id="talloc_enable_leak_report_full"><title>void talloc_enable_leak_report_full(void);</title>
+        <para>
+         This enables calling of talloc_report_full(NULL, stderr) when the
+         program exits.  In Samba4 this is enabled by using the
+         --leak-report-full command line option.
+        </para>
+        <para>
+         For it to be useful, this function must be called before any
+         other talloc function as it establishes a "null context" that
+         acts as the top of the tree.  If you don't call this function
+         first then passing NULL to talloc_report() or
+         talloc_report_full() won't give you the full tree printout.
+        </para>
+        <para>
+         Here is a typical full report:
+        </para>
+        <screen format="linespecific">full talloc report on 'root' (total 18 bytes in 8 blocks)
+p1               contains     18 bytes in   7 blocks (ref 0)
+    r1               contains     13 bytes in   2 blocks (ref 0)
+        reference to: p2
+    p2               contains      1 bytes in   1 blocks (ref 1)
+    x3               contains      1 bytes in   1 blocks (ref 0)
+    x2               contains      1 bytes in   1 blocks (ref 0)
+    x1               contains      1 bytes in   1 blocks (ref 0)
+      </screen>
+    </refsect2>
+    <refsect2><title>(<emphasis role="italic">type</emphasis> *)talloc_zero(const void *<emphasis role="italic">ctx</emphasis>, <emphasis role="italic">type</emphasis>);</title>
+        <para>
+         The talloc_zero() macro is equivalent to:
+        </para>
+        <programlisting>ptr = talloc(ctx, type);
+if (ptr) memset(ptr, 0, sizeof(type));</programlisting>
+    </refsect2>
+    <refsect2><title>void *talloc_zero_size(const void *<emphasis role="italic">ctx</emphasis>, size_t <emphasis role="italic">size</emphasis>)</title>
+        <para>
+         The talloc_zero_size() function is useful when you don't have a
+         known type.
+        </para>
+    </refsect2>
+    <refsect2><title>void *talloc_memdup(const void *<emphasis role="italic">ctx</emphasis>, const void *<emphasis role="italic">p</emphasis>, size_t size);</title>
+        <para>
+         The talloc_memdup() function is equivalent to:
+        </para>
+        <programlisting>ptr = talloc_size(ctx, size);
+if (ptr) memcpy(ptr, p, size);</programlisting>
+    </refsect2>
+    <refsect2><title>char *talloc_strdup(const void *<emphasis role="italic">ctx</emphasis>, const char *<emphasis role="italic">p</emphasis>);</title>
+        <para>
+         The talloc_strdup() function is equivalent to:
+        </para>
+        <programlisting>ptr = talloc_size(ctx, strlen(p)+1);
+if (ptr) memcpy(ptr, p, strlen(p)+1);</programlisting>
+        <para>
+         This function sets the name of the new pointer to the passed
+         string. This is equivalent to:
+        </para>
+        <programlisting>talloc_set_name_const(ptr, ptr)</programlisting>
+    </refsect2>
+    <refsect2><title>char *talloc_strndup(const void *<emphasis role="italic">t</emphasis>, const char *<emphasis role="italic">p</emphasis>, size_t <emphasis role="italic">n</emphasis>);</title>
+        <para>
+         The talloc_strndup() function is the talloc equivalent of the C
+         library function strndup(3).
+        </para>
+        <para>
+         This function sets the name of the new pointer to the passed
+         string. This is equivalent to:
+        </para>
+        <programlisting>talloc_set_name_const(ptr, ptr)</programlisting>
+    </refsect2>
+    <refsect2><title>char *talloc_vasprintf(const void *<emphasis role="italic">t</emphasis>, const char *<emphasis role="italic">fmt</emphasis>, va_list <emphasis role="italic">ap</emphasis>);</title>
+        <para>
+         The talloc_vasprintf() function is the talloc equivalent of the C
+         library function vasprintf(3).
+        </para>
+        <para>
+         This function sets the name of the new pointer to the new
+         string. This is equivalent to:
+        </para>
+        <programlisting>talloc_set_name_const(ptr, ptr)</programlisting>
+    </refsect2>
+    <refsect2><title>char *talloc_asprintf(const void *<emphasis role="italic">t</emphasis>, const char *<emphasis role="italic">fmt</emphasis>, ...);</title>
+        <para>
+         The talloc_asprintf() function is the talloc equivalent of the C
+         library function asprintf(3).
+        </para>
+        <para>
+         This function sets the name of the new pointer to the passed
+         string. This is equivalent to:
+        </para>
+        <programlisting>talloc_set_name_const(ptr, ptr)</programlisting>
+    </refsect2>
+    <refsect2><title>char *talloc_asprintf_append(char *s, const char *fmt, ...);</title>
+        <para>
+         The talloc_asprintf_append() function appends the given formatted
+         string to the given string.
+        </para>
+        <para>
+         This function sets the name of the new pointer to the new
+         string. This is equivalent to:
+        </para>
+        <programlisting>talloc_set_name_const(ptr, ptr)</programlisting>
+    </refsect2>
+    <refsect2><title>(type *)talloc_array(const void *ctx, type, unsigned int count);</title>
+        <para>
+         The talloc_array() macro is equivalent to:
+        </para>
+        <programlisting>(type *)talloc_size(ctx, sizeof(type) * count);</programlisting>
+        <para>
+         except that it provides integer overflow protection for the
+         multiply, returning NULL if the multiply overflows.
+        </para>
+    </refsect2>
+    <refsect2><title>void *talloc_array_size(const void *ctx, size_t size, unsigned int count);</title>
+        <para>
+         The talloc_array_size() function is useful when the type is not
+         known. It operates in the same way as talloc_array(), but takes a
+         size instead of a type.
+        </para>
+    </refsect2>
+    <refsect2><title>(typeof(ptr)) talloc_array_ptrtype(const void *ctx, ptr, unsigned int count);</title>
+        <para>
+         The talloc_array_ptrtype() macro should be used when you have a pointer to an array
+         and want to allocate memory of an array to point at with this pointer. When compiling
+         with gcc >= 3 it is typesafe. Note this is a wrapper of talloc_array_size()
+         and talloc_get_name() will return the current location in the source file
+         and not the type.
+        </para>
+    </refsect2>
+    <refsect2><title>void *talloc_realloc_fn(const void *ctx, void *ptr, size_t size)</title>
+        <para>
+         This is a non-macro version of talloc_realloc(), which is useful
+         as libraries sometimes want a realloc function pointer.  A
+         realloc(3) implementation encapsulates the functionality of
+         malloc(3), free(3) and realloc(3) in one call, which is why it is
+         useful to be able to pass around a single function pointer.
+        </para>
+    </refsect2>
+    <refsect2><title>void *talloc_autofree_context(void);</title>
+        <para>
+         This is a handy utility function that returns a talloc context
+         which will be automatically freed on program exit.  This can be
+         used to reduce the noise in memory leak reports.
+        </para>
+    </refsect2>
+    <refsect2><title>void *talloc_check_name(const void *ptr, const char *name);</title>
+        <para>
+         This function checks if a pointer has the specified <emphasis
+         role="italic">name</emphasis>.  If it does then the pointer is
+         returned.  If it doesn't then NULL is returned.
+        </para>
+    </refsect2>
+    <refsect2><title>(type *)talloc_get_type(const void *ptr, type);</title>
+        <para>
+         This macro allows you to do type checking on talloc pointers.  It
+         is particularly useful for void* private pointers.  It is
+         equivalent to this:
+        </para>
+        <programlisting>(type *)talloc_check_name(ptr, #type)</programlisting>
+    </refsect2>
+    <refsect2><title>talloc_set_type(const void *ptr, type);</title>
+        <para>
+         This macro allows you to force the name of a pointer to be a
+         particular <emphasis>type</emphasis>.  This can be
+         used in conjunction with talloc_get_type() to do type checking on
+         void* pointers.
+        </para>
+        <para>
+         It is equivalent to this:
+        </para>
+        <programlisting>talloc_set_name_const(ptr, #type)</programlisting>
+    </refsect2>
+    <refsect2><title>talloc_set_log_fn(void (*log_fn)(const char *message));</title>
+        <para>
+         This function sets a logging function that talloc will use for
+         warnings and errors. By default talloc will not print any warnings or
+         errors.
+       </para>
+    </refsect2>
+    <refsect2><title>talloc_set_log_stderr(void);</title>
+        <para>
+         This sets the talloc log function to write log messages to stderr.
+       </para>
+    </refsect2>
+  </refsect1>
+  <refsect1><title>PERFORMANCE</title>
+    <para>
+      All the additional features of talloc(3) over malloc(3) do come at a
+      price.  We have a simple performance test in Samba4 that measures
+      talloc() versus malloc() performance, and it seems that talloc() is
+      about 10% slower than malloc() on my x86 Debian Linux box.  For
+      Samba, the great reduction in code complexity that we get by using
+      talloc makes this worthwhile, especially as the total overhead of
+      talloc/malloc in Samba is already quite small.
+    </para>
+  </refsect1>
+  <refsect1><title>SEE ALSO</title>
+    <para>
+      malloc(3), strndup(3), vasprintf(3), asprintf(3), 
+      <ulink url="http://talloc.samba.org/"/>
+    </para>
+  </refsect1>
+  <refsect1><title>COPYRIGHT/LICENSE</title>
+    <para>
+      Copyright (C) Andrew Tridgell 2004
+    </para>
+    <para>
+      This program is free software; you can redistribute it and/or modify
+      it under the terms of the GNU Lesser General Public License as 
+      published by the Free Software Foundation; either version 3 of the
+      License, or (at your option) any later version.
+    </para>
+    <para>
+      This program is distributed in the hope that it will be useful, but
+      WITHOUT ANY WARRANTY; without even the implied warranty of
+      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+      General Public License for more details.
+    </para>
+    <para>
+      You should have received a copy of the GNU General Public License
+      along with this program; if not, see http://www.gnu.org/licenses/.
+    </para>
+  </refsect1>
+</refentry>
diff --git a/ctdb/lib/talloc/talloc.c b/ctdb/lib/talloc/talloc.c
new file mode 100644 (file)
index 0000000..3e33fc0
--- /dev/null
@@ -0,0 +1,2597 @@
+/* 
+   Samba Unix SMB/CIFS implementation.
+
+   Samba trivial allocation library - new interface
+
+   NOTE: Please read talloc_guide.txt for full documentation
+
+   Copyright (C) Andrew Tridgell 2004
+   Copyright (C) Stefan Metzmacher 2006
+   
+     ** NOTE! The following LGPL license applies to the talloc
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+  inspired by http://swapped.cc/halloc/
+*/
+
+#include "replace.h"
+#include "talloc.h"
+
+#ifdef TALLOC_BUILD_VERSION_MAJOR
+#if (TALLOC_VERSION_MAJOR != TALLOC_BUILD_VERSION_MAJOR)
+#error "TALLOC_VERSION_MAJOR != TALLOC_BUILD_VERSION_MAJOR"
+#endif
+#endif
+
+#ifdef TALLOC_BUILD_VERSION_MINOR
+#if (TALLOC_VERSION_MINOR != TALLOC_BUILD_VERSION_MINOR)
+#error "TALLOC_VERSION_MINOR != TALLOC_BUILD_VERSION_MINOR"
+#endif
+#endif
+
+/* Special macros that are no-ops except when run under Valgrind on
+ * x86.  They've moved a little bit from valgrind 1.0.4 to 1.9.4 */
+#ifdef HAVE_VALGRIND_MEMCHECK_H
+        /* memcheck.h includes valgrind.h */
+#include <valgrind/memcheck.h>
+#elif defined(HAVE_VALGRIND_H)
+#include <valgrind.h>
+#endif
+
+/* use this to force every realloc to change the pointer, to stress test
+   code that might not cope */
+#define ALWAYS_REALLOC 0
+
+
+/* __talloc() refuses any request of this size or larger */
+#define MAX_TALLOC_SIZE 0x10000000
+/*
+ * The magic value lives in the upper bits of talloc_chunk.flags; the low
+ * four bits are left free for the TALLOC_FLAG_* bits below.  The major and
+ * minor version numbers are folded in so that talloc_chunk_from_ptr() can
+ * tell a chunk from a different talloc version apart from plain garbage.
+ */
+#define TALLOC_MAGIC_BASE 0xe814ec70
+#define TALLOC_MAGIC ( \
+       TALLOC_MAGIC_BASE + \
+       (TALLOC_VERSION_MAJOR << 12) + \
+       (TALLOC_VERSION_MINOR << 4) \
+)
+
+/* flag bits kept in the low nibble of talloc_chunk.flags */
+#define TALLOC_FLAG_FREE 0x01
+#define TALLOC_FLAG_LOOP 0x02
+#define TALLOC_FLAG_POOL 0x04          /* This is a talloc pool */
+#define TALLOC_FLAG_POOLMEM 0x08       /* This is allocated in a pool */
+
+/* sentinel "name" given to the handle chunks made by _talloc_reference_loc() */
+#define TALLOC_MAGIC_REFERENCE ((const char *)1)
+
+/* by default we abort when given a bad pointer (such as when talloc_free() is
+   called on a pointer that came from malloc()).  A build can supply its own
+   TALLOC_ABORT before this point to override that. */
+#ifndef TALLOC_ABORT
+#define TALLOC_ABORT(reason) abort()
+#endif
+
+/* Cast a const pointer to a non-const "type *".  When intptr_t is known to
+   exist the cast is routed through it (NOTE(review): presumably to avoid
+   compiler warnings about discarding the qualifier - confirm). */
+#ifndef discard_const_p
+#if defined(__intptr_t_defined) || defined(HAVE_INTPTR_T)
+# define discard_const_p(type, ptr) ((type *)((intptr_t)(ptr)))
+#else
+# define discard_const_p(type, ptr) ((type *)(ptr))
+#endif
+#endif
+
+/* these macros gain us a few percent of speed on gcc */
+#if (__GNUC__ >= 3)
+/* the strange !! is to ensure that __builtin_expect() takes either 0 or 1
+   as its first argument */
+#ifndef likely
+#define likely(x)   __builtin_expect(!!(x), 1)
+#endif
+#ifndef unlikely
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#endif
+#else
+/* without gcc >= 3 the hints collapse to the bare expression */
+#ifndef likely
+#define likely(x) (x)
+#endif
+#ifndef unlikely
+#define unlikely(x) (x)
+#endif
+#endif
+
+/* this null_context is only used if talloc_enable_leak_report() or
+   talloc_enable_leak_report_full() is called, otherwise it remains
+   NULL
+*/
+static void *null_context;
+/* NOTE(review): presumably the context handed out by
+   talloc_autofree_context(); it is not touched in this part of the file */
+static void *autofree_context;
+
+/* used to enable fill of memory on free, which can be useful for
+ * catching use after free errors when valgrind is too slow
+ */
+static struct {
+       bool initialised;
+       bool enabled;           /* tested by the *_FILL_CHUNK macros before filling */
+       uint8_t fill_value;     /* byte value written over the released memory */
+} talloc_fill;
+
+/* NOTE(review): looks like the name of the environment variable that
+   configures talloc_fill; the code reading it is outside this section */
+#define TALLOC_FILL_ENV "TALLOC_FREE_FILL"
+
+/*
+ * Overwrite the payload of a chunk with the configured fill byte.
+ *
+ * do not wipe the header, to allow the
+ * double-free logic to still work
+ */
+#define TC_INVALIDATE_FULL_FILL_CHUNK(_tc) do { \
+       if (unlikely(talloc_fill.enabled)) { \
+               size_t _flen = (_tc)->size; \
+               char *_fptr = (char *)TC_PTR_FROM_CHUNK(_tc); \
+               memset(_fptr, talloc_fill.fill_value, _flen); \
+       } \
+} while (0)
+
+#if defined(DEVELOPER) && defined(VALGRIND_MAKE_MEM_NOACCESS)
+/* Mark the whole chunk (header included) as not accessible */
+#define TC_INVALIDATE_FULL_VALGRIND_CHUNK(_tc) do { \
+       size_t _flen = TC_HDR_SIZE + (_tc)->size; \
+       char *_fptr = (char *)(_tc); \
+       VALGRIND_MAKE_MEM_NOACCESS(_fptr, _flen); \
+} while(0)
+#else
+#define TC_INVALIDATE_FULL_VALGRIND_CHUNK(_tc) do { } while (0)
+#endif
+
+/* Invalidate a whole chunk: fill first, then the valgrind marking */
+#define TC_INVALIDATE_FULL_CHUNK(_tc) do { \
+       TC_INVALIDATE_FULL_FILL_CHUNK(_tc); \
+       TC_INVALIDATE_FULL_VALGRIND_CHUNK(_tc); \
+} while (0)
+
+/* Fill the tail of the payload, from _new_size up to the current size.
+   _new_size must not exceed (_tc)->size or the length underflows. */
+#define TC_INVALIDATE_SHRINK_FILL_CHUNK(_tc, _new_size) do { \
+       if (unlikely(talloc_fill.enabled)) { \
+               size_t _flen = (_tc)->size - (_new_size); \
+               char *_fptr = (char *)TC_PTR_FROM_CHUNK(_tc); \
+               _fptr += (_new_size); \
+               memset(_fptr, talloc_fill.fill_value, _flen); \
+       } \
+} while (0)
+
+#if defined(DEVELOPER) && defined(VALGRIND_MAKE_MEM_NOACCESS)
+/* Mark the unused bytes not accessible */
+#define TC_INVALIDATE_SHRINK_VALGRIND_CHUNK(_tc, _new_size) do { \
+       size_t _flen = (_tc)->size - (_new_size); \
+       char *_fptr = (char *)TC_PTR_FROM_CHUNK(_tc); \
+       _fptr += (_new_size); \
+       VALGRIND_MAKE_MEM_NOACCESS(_fptr, _flen); \
+} while (0)
+#else
+#define TC_INVALIDATE_SHRINK_VALGRIND_CHUNK(_tc, _new_size) do { } while (0)
+#endif
+
+/* Invalidate the bytes cut off by shrinking a chunk to _new_size */
+#define TC_INVALIDATE_SHRINK_CHUNK(_tc, _new_size) do { \
+       TC_INVALIDATE_SHRINK_FILL_CHUNK(_tc, _new_size); \
+       TC_INVALIDATE_SHRINK_VALGRIND_CHUNK(_tc, _new_size); \
+} while (0)
+
+/* Fill the tail of the payload, from _new_size up to the current size.
+   This performs the same fill as TC_INVALIDATE_SHRINK_FILL_CHUNK. */
+#define TC_UNDEFINE_SHRINK_FILL_CHUNK(_tc, _new_size) do { \
+       if (unlikely(talloc_fill.enabled)) { \
+               size_t _flen = (_tc)->size - (_new_size); \
+               char *_fptr = (char *)TC_PTR_FROM_CHUNK(_tc); \
+               _fptr += (_new_size); \
+               memset(_fptr, talloc_fill.fill_value, _flen); \
+       } \
+} while (0)
+
+#if defined(DEVELOPER) && defined(VALGRIND_MAKE_MEM_UNDEFINED)
+/* Mark the unused bytes as undefined */
+#define TC_UNDEFINE_SHRINK_VALGRIND_CHUNK(_tc, _new_size) do { \
+       size_t _flen = (_tc)->size - (_new_size); \
+       char *_fptr = (char *)TC_PTR_FROM_CHUNK(_tc); \
+       _fptr += (_new_size); \
+       VALGRIND_MAKE_MEM_UNDEFINED(_fptr, _flen); \
+} while (0)
+#else
+#define TC_UNDEFINE_SHRINK_VALGRIND_CHUNK(_tc, _new_size) do { } while (0)
+#endif
+
+/* Shrink variant that leaves the tail addressable but undefined */
+#define TC_UNDEFINE_SHRINK_CHUNK(_tc, _new_size) do { \
+       TC_UNDEFINE_SHRINK_FILL_CHUNK(_tc, _new_size); \
+       TC_UNDEFINE_SHRINK_VALGRIND_CHUNK(_tc, _new_size); \
+} while (0)
+
+#if defined(DEVELOPER) && defined(VALGRIND_MAKE_MEM_UNDEFINED)
+/* Mark the new bytes as undefined; the range runs from the end of the
+   current payload to the end of the payload at _new_size */
+#define TC_UNDEFINE_GROW_VALGRIND_CHUNK(_tc, _new_size) do { \
+       size_t _old_used = TC_HDR_SIZE + (_tc)->size; \
+       size_t _new_used = TC_HDR_SIZE + (_new_size); \
+       size_t _flen = _new_used - _old_used; \
+       char *_fptr = _old_used + (char *)(_tc); \
+       VALGRIND_MAKE_MEM_UNDEFINED(_fptr, _flen); \
+} while (0)
+#else
+#define TC_UNDEFINE_GROW_VALGRIND_CHUNK(_tc, _new_size) do { } while (0)
+#endif
+
+/* Growing only needs the valgrind marking; there is no fill step */
+#define TC_UNDEFINE_GROW_CHUNK(_tc, _new_size) do { \
+       TC_UNDEFINE_GROW_VALGRIND_CHUNK(_tc, _new_size); \
+} while (0)
+
+/*
+ * One entry in the doubly linked list of references that hangs off
+ * talloc_chunk.refs.
+ */
+struct talloc_reference_handle {
+       struct talloc_reference_handle *next;
+       struct talloc_reference_handle *prev;
+       void *ptr;              /* the pointer this handle refers to */
+       const char *location;   /* location string supplied by the caller */
+};
+
+/*
+ * Memory limit record.  Limits are chained through 'upper'; __talloc()
+ * charges a new allocation to every record on that chain.
+ */
+struct talloc_memlimit {
+       /* NOTE(review): presumably the chunk the limit was set on - confirm */
+       struct talloc_chunk *parent;
+       /* next limit up the chain, or NULL at the top */
+       struct talloc_memlimit *upper;
+       /* NOTE(review): presumably the ceiling enforced by talloc_memlimit_check() */
+       size_t max_size;
+       /* bytes currently charged against this limit */
+       size_t cur_size;
+};
+
+/* forward declarations of the memlimit helpers; they are used by __talloc()
+   before their definitions appear */
+static bool talloc_memlimit_check(struct talloc_memlimit *limit, size_t size);
+static bool talloc_memlimit_update(struct talloc_memlimit *limit,
+                                  size_t old_size, size_t new_size);
+
+/* signature of the destructor stored in talloc_chunk.destructor */
+typedef int (*talloc_destructor_t)(void *);
+
+/*
+ * Header placed in front of every talloc allocation; the user pointer
+ * starts TC_HDR_SIZE bytes after the start of this structure.
+ */
+struct talloc_chunk {
+       /* sibling list */
+       struct talloc_chunk *next;
+       struct talloc_chunk *prev;
+       /* tree linkage; 'parent' is only kept on the first sibling */
+       struct talloc_chunk *parent;
+       struct talloc_chunk *child;
+       /* list of references taken on this chunk */
+       struct talloc_reference_handle *refs;
+       talloc_destructor_t destructor;
+       const char *name;
+       /* payload size in bytes, header not included */
+       size_t size;
+       /* TALLOC_MAGIC in the upper bits, TALLOC_FLAG_* in the low nibble */
+       unsigned flags;
+
+       /*
+        * limit semantics:
+        * if 'limit' is set it means all *new* children of the context will
+        * be limited to a total aggregate size of max_size for memory
+        * allocations.
+        * cur_size is used to keep track of the current use
+        */
+       struct talloc_memlimit *limit;
+
+       /*
+        * "pool" has dual use:
+        *
+        * For the talloc pool itself (i.e. TALLOC_FLAG_POOL is set), "pool"
+        * marks the end of the currently allocated area.
+        *
+        * For members of the pool (i.e. TALLOC_FLAG_POOLMEM is set), "pool"
+        * is a pointer to the struct talloc_chunk of the pool that it was
+        * allocated from. This way children can quickly find the pool to chew
+        * from.
+        */
+       void *pool;
+};
+
+/* 16 byte alignment seems to keep everyone happy */
+#define TC_ALIGN16(s) (((s)+15)&~15)
+/* size of the chunk header, rounded up so the payload stays 16 byte aligned */
+#define TC_HDR_SIZE TC_ALIGN16(sizeof(struct talloc_chunk))
+/* user pointer for a chunk header; the argument is parenthesised so that an
+   expression such as "tc + 1" is cast as a whole rather than just its first
+   operand */
+#define TC_PTR_FROM_CHUNK(tc) ((void *)(TC_HDR_SIZE + (char*)(tc)))
+
+/*
+  return the major version this library was compiled as
+*/
+_PUBLIC_ int talloc_version_major(void)
+{
+       const int version = TALLOC_VERSION_MAJOR;
+
+       return version;
+}
+
+/*
+  return the minor version this library was compiled as
+*/
+_PUBLIC_ int talloc_version_minor(void)
+{
+       const int version = TALLOC_VERSION_MINOR;
+
+       return version;
+}
+
+/* callback that receives talloc's log messages; while it is NULL
+   talloc_log() returns without doing anything */
+static void (*talloc_log_fn)(const char *message);
+
+/*
+  install (or, with NULL, remove) the function that receives log messages
+*/
+_PUBLIC_ void talloc_set_log_fn(void (*log_fn)(const char *message))
+{
+       talloc_log_fn = log_fn;
+}
+
+/*
+  format a message and hand it to the installed log callback.
+
+  Does nothing when no callback is installed.  The message is built with
+  talloc_vasprintf() on the NULL context and released again once the
+  callback returns.  If the message cannot be allocated it is dropped, so
+  the callback is never called with a NULL string.
+*/
+static void talloc_log(const char *fmt, ...) PRINTF_ATTRIBUTE(1,2);
+static void talloc_log(const char *fmt, ...)
+{
+       va_list ap;
+       char *message;
+
+       if (!talloc_log_fn) {
+               return;
+       }
+
+       va_start(ap, fmt);
+       message = talloc_vasprintf(NULL, fmt, ap);
+       va_end(ap);
+
+       if (message == NULL) {
+               /* out of memory while formatting: nothing safe to report */
+               return;
+       }
+
+       talloc_log_fn(message);
+       talloc_free(message);
+}
+
+/*
+  log callback that writes the message to stderr unchanged (no newline is
+  added)
+*/
+static void talloc_log_stderr(const char *message)
+{
+       fprintf(stderr, "%s", message);
+}
+
+/*
+  send talloc's log messages to stderr
+*/
+_PUBLIC_ void talloc_set_log_stderr(void)
+{
+       talloc_log_fn = talloc_log_stderr;
+}
+
+/* callback invoked by talloc_abort(); while it is NULL talloc_abort()
+   uses TALLOC_ABORT() first */
+static void (*talloc_abort_fn)(const char *reason);
+
+/*
+  install the function talloc calls when it detects a fatal error
+*/
+_PUBLIC_ void talloc_set_abort_fn(void (*abort_fn)(const char *reason))
+{
+       talloc_abort_fn = abort_fn;
+}
+
+/*
+  log the reason and abort.
+
+  Without an installed abort callback TALLOC_ABORT() is used, which calls
+  abort() unless the build overrode it.  The callback is invoked
+  afterwards in either case.
+*/
+static void talloc_abort(const char *reason)
+{
+       talloc_log("%s\n", reason);
+
+       if (talloc_abort_fn == NULL) {
+               TALLOC_ABORT(reason);
+       }
+
+       talloc_abort_fn(reason);
+}
+
+/*
+  abort because a chunk carries the magic of a different talloc version.
+  The major/minor numbers encoded in the bad magic are logged next to the
+  ones this library was built with.
+*/
+static void talloc_abort_magic(unsigned magic)
+{
+       /* what remains after removing the base is the encoded version */
+       const unsigned version_bits = magic - TALLOC_MAGIC_BASE;
+       const unsigned found_major = (version_bits & 0xFFFFF000) >> 12;
+       const unsigned found_minor = (version_bits & 0x00000FF0) >> 4;
+
+       talloc_log("Bad talloc magic[0x%08X/%u/%u] expected[0x%08X/%u/%u]\n",
+                  magic, found_major, found_minor,
+                  TALLOC_MAGIC, TALLOC_VERSION_MAJOR, TALLOC_VERSION_MINOR);
+       talloc_abort("Bad talloc magic value - wrong talloc version used/mixed");
+}
+
+/* abort because a chunk with TALLOC_FLAG_FREE set was used again */
+static void talloc_abort_access_after_free(void)
+{
+       talloc_abort("Bad talloc magic value - access after free");
+}
+
+/* abort because the flags word holds neither a talloc magic nor the
+   free marker */
+static void talloc_abort_unknown_value(void)
+{
+       talloc_abort("Bad talloc magic value - unknown value");
+}
+
+/*
+  map a talloc pointer back to its chunk header, which sits TC_HDR_SIZE
+  bytes in front of it.
+
+  panic if we get a bad magic value.  NULL is only returned when the
+  abort handler itself returns.
+*/
+static inline struct talloc_chunk *talloc_chunk_from_ptr(const void *ptr)
+{
+       const char *pp = (const char *)ptr;
+       struct talloc_chunk *tc = discard_const_p(struct talloc_chunk, pp - TC_HDR_SIZE);
+       /* the FREE bit is part of the comparison, so a freed chunk fails
+          this test even though its magic is still intact */
+       if (unlikely((tc->flags & (TALLOC_FLAG_FREE | ~0xF)) != TALLOC_MAGIC)) { 
+               /* the base matches once the version bits are masked off:
+                  the chunk comes from another talloc version */
+               if ((tc->flags & (~0xFFF)) == TALLOC_MAGIC_BASE) {
+                       talloc_abort_magic(tc->flags & (~0xF));
+                       return NULL;
+               }
+
+               if (tc->flags & TALLOC_FLAG_FREE) {
+                       /* NOTE(review): this relies on the free path having
+                          stored a location string in tc->name - confirm */
+                       talloc_log("talloc: access after free error - first free may be at %s\n", tc->name);
+                       talloc_abort_access_after_free();
+                       return NULL;
+               } else {
+                       talloc_abort_unknown_value();
+                       return NULL;
+               }
+       }
+       return tc;
+}
+
+/* hook into the front of the list
+ *
+ * 'list' is the head pointer and is updated to point at 'p'.  Both
+ * arguments are evaluated more than once, so they must be free of side
+ * effects. */
+#define _TLIST_ADD(list, p) \
+do { \
+        if (!(list)) { \
+               (list) = (p); \
+               (p)->next = (p)->prev = NULL; \
+       } else { \
+               (list)->prev = (p); \
+               (p)->next = (list); \
+               (p)->prev = NULL; \
+               (list) = (p); \
+       }\
+} while (0)
+
+/* remove an element from a list - element doesn't have to be in list.
+ *
+ * 'p' is dereferenced before the final "(p) &&" test, so it must not be
+ * NULL.  The element's own next/prev are reset to NULL afterwards, unless
+ * it is still equal to the (updated) list head. */
+#define _TLIST_REMOVE(list, p) \
+do { \
+       if ((p) == (list)) { \
+               (list) = (p)->next; \
+               if (list) (list)->prev = NULL; \
+       } else { \
+               if ((p)->prev) (p)->prev->next = (p)->next; \
+               if ((p)->next) (p)->next->prev = (p)->prev; \
+       } \
+       if ((p) && ((p) != (list))) (p)->next = (p)->prev = NULL; \
+} while (0)
+
+
+/*
+  return the chunk header of a pointer's parent, or NULL if the pointer
+  is NULL or has no parent
+*/
+static inline struct talloc_chunk *talloc_parent_chunk(const void *ptr)
+{
+       struct talloc_chunk *tc;
+
+       if (unlikely(ptr == NULL)) {
+               return NULL;
+       }
+
+       /* __talloc() keeps the parent pointer on the first sibling only,
+          so walk back to the head of the sibling list */
+       for (tc = talloc_chunk_from_ptr(ptr); tc->prev != NULL; tc = tc->prev) {
+               /* nothing to do, just rewinding */
+       }
+
+       return tc->parent;
+}
+
+/*
+  return the parent pointer of a talloc pointer, or NULL if there is none
+*/
+_PUBLIC_ void *talloc_parent(const void *ptr)
+{
+       struct talloc_chunk *parent_tc = talloc_parent_chunk(ptr);
+
+       if (parent_tc == NULL) {
+               return NULL;
+       }
+       return TC_PTR_FROM_CHUNK(parent_tc);
+}
+
+/*
+  return the name of a pointer's parent, or NULL if it has no parent
+*/
+_PUBLIC_ const char *talloc_parent_name(const void *ptr)
+{
+       struct talloc_chunk *parent_tc = talloc_parent_chunk(ptr);
+
+       if (parent_tc == NULL) {
+               return NULL;
+       }
+       return parent_tc->name;
+}
+
+/*
+  A pool carries an in-pool object count in the first 16 bytes.
+  This is done to support talloc_steal() to a parent outside of the
+  pool. The count includes the pool itself, so a talloc_free() on a pool will
+  only destroy the pool if the count has dropped to zero. A talloc_free() of a
+  pool member will reduce the count, and eventually also call free(3) on the
+  pool memory.
+
+  The object count is not put into "struct talloc_chunk" because it is only
+  relevant for talloc pools and the alignment to 16 bytes would increase the
+  memory footprint of each talloc chunk by those 16 bytes.
+*/
+
+union talloc_pool_chunk {
+       /* This lets object_count nestle into 16-byte padding of talloc_chunk,
+        * on 32-bit platforms. */
+       struct tc_pool_hdr {
+               struct talloc_chunk c;
+               unsigned int object_count;
+       } hdr;
+       /* This makes it always 16 byte aligned. */
+       char pad[TC_ALIGN16(sizeof(struct tc_pool_hdr))];
+};
+
+/* Address one past the last byte of the pool's memory. */
+static void *tc_pool_end(union talloc_pool_chunk *pool_tc)
+{
+       char *base = (char *)pool_tc;
+
+       return base + TC_HDR_SIZE + pool_tc->hdr.c.size;
+}
+
+/* Bytes between the pool's next free position and the end of the pool. */
+static size_t tc_pool_space_left(union talloc_pool_chunk *pool_tc)
+{
+       char *pool_end = (char *)tc_pool_end(pool_tc);
+       char *next_free = (char *)pool_tc->hdr.c.pool;
+
+       return pool_end - next_free;
+}
+
+/* First usable address of a pool: directly behind the pool header. */
+static void *tc_pool_first_chunk(union talloc_pool_chunk *pool_tc)
+{
+       return &pool_tc[1];
+}
+
+/* If tc is inside a pool, this gives the next neighbour: the address
+   behind tc's header and payload, rounded up to 16 bytes. */
+static void *tc_next_chunk(struct talloc_chunk *tc)
+{
+       size_t footprint = TC_ALIGN16(TC_HDR_SIZE + tc->size);
+
+       return (char *)tc + footprint;
+}
+
+/* Mark the whole remaining (unallocated) part of the pool as not
+   accessible: fill it if filling is enabled, then tell valgrind. */
+static void tc_invalidate_pool(union talloc_pool_chunk *pool_tc)
+{
+       void *unused_start = pool_tc->hdr.c.pool;
+       size_t unused_len = tc_pool_space_left(pool_tc);
+
+       if (unlikely(talloc_fill.enabled)) {
+               memset(unused_start, talloc_fill.fill_value, unused_len);
+       }
+
+#if defined(DEVELOPER) && defined(VALGRIND_MAKE_MEM_NOACCESS)
+       VALGRIND_MAKE_MEM_NOACCESS(unused_start, unused_len);
+#endif
+}
+
+/*
+  Allocate from a pool
+
+  'size' is the full footprint of the new chunk (the caller passes
+  TC_HDR_SIZE plus the payload size).  Returns NULL when 'parent' is NULL,
+  is neither a pool nor a pool member, or the pool has too little space
+  left; the caller then falls back to malloc().
+*/
+
+static struct talloc_chunk *talloc_alloc_pool(struct talloc_chunk *parent,
+                                             size_t size)
+{
+       union talloc_pool_chunk *pool_ctx = NULL;
+       size_t space_left;
+       struct talloc_chunk *result;
+       size_t chunk_size;
+
+       if (parent == NULL) {
+               return NULL;
+       }
+
+       /* find the pool: either the parent itself, or the pool the parent
+          was allocated from */
+       if (parent->flags & TALLOC_FLAG_POOL) {
+               pool_ctx = (union talloc_pool_chunk *)parent;
+       }
+       else if (parent->flags & TALLOC_FLAG_POOLMEM) {
+               pool_ctx = (union talloc_pool_chunk *)parent->pool;
+       }
+
+       if (pool_ctx == NULL) {
+               return NULL;
+       }
+
+       space_left = tc_pool_space_left(pool_ctx);
+
+       /*
+        * Align size to 16 bytes
+        */
+       chunk_size = TC_ALIGN16(size);
+
+       if (space_left < chunk_size) {
+               return NULL;
+       }
+
+       /* the new chunk starts at the pool's current free position */
+       result = (struct talloc_chunk *)pool_ctx->hdr.c.pool;
+
+#if defined(DEVELOPER) && defined(VALGRIND_MAKE_MEM_UNDEFINED)
+       VALGRIND_MAKE_MEM_UNDEFINED(result, size);
+#endif
+
+       /* advance the free position past the aligned chunk */
+       pool_ctx->hdr.c.pool = (void *)((char *)result + chunk_size);
+
+       result->flags = TALLOC_MAGIC | TALLOC_FLAG_POOLMEM;
+       result->pool = pool_ctx;
+
+       pool_ctx->hdr.object_count++;
+
+       return result;
+}
+
+/*
+   Allocate a bit of memory as a child of an existing pointer
+
+   A NULL context is replaced by null_context, which may itself be NULL;
+   in that case the new chunk has no parent.  Returns the user pointer, or
+   NULL when size is MAX_TALLOC_SIZE or more, when a memory limit on the
+   parent would be exceeded (errno is set to ENOMEM), or when malloc()
+   fails.
+*/
+static inline void *__talloc(const void *context, size_t size)
+{
+       struct talloc_chunk *tc = NULL;
+       struct talloc_memlimit *limit = NULL;
+
+       if (unlikely(context == NULL)) {
+               context = null_context;
+       }
+
+       if (unlikely(size >= MAX_TALLOC_SIZE)) {
+               return NULL;
+       }
+
+       if (context != NULL) {
+               struct talloc_chunk *ptc = talloc_chunk_from_ptr(context);
+
+               /* the new chunk inherits the parent's limit, if any */
+               if (ptc->limit != NULL) {
+                       limit = ptc->limit;
+               }
+
+               if (!talloc_memlimit_check(limit, (TC_HDR_SIZE+size))) {
+                       errno = ENOMEM;
+                       return NULL;
+               }
+
+               /* try the parent's pool first; NULL means "not pooled or
+                  pool full" */
+               tc = talloc_alloc_pool(ptc, TC_HDR_SIZE+size);
+       }
+
+       if (tc == NULL) {
+               tc = (struct talloc_chunk *)malloc(TC_HDR_SIZE+size);
+               if (unlikely(tc == NULL)) return NULL;
+               tc->flags = TALLOC_MAGIC;
+               tc->pool  = NULL;
+       }
+
+       if (limit != NULL) {
+               struct talloc_memlimit *l;
+
+               /* charge the allocation to every limit up the chain */
+               for (l = limit; l != NULL; l = l->upper) {
+                       l->cur_size += TC_HDR_SIZE+size;
+               }
+       }
+
+       tc->limit = limit;
+       tc->size = size;
+       tc->destructor = NULL;
+       tc->child = NULL;
+       tc->name = NULL;
+       tc->refs = NULL;
+
+       if (likely(context)) {
+               struct talloc_chunk *parent = talloc_chunk_from_ptr(context);
+
+               /* insert at the head of the parent's child list; only the
+                  head child keeps a parent pointer, so the previous head
+                  gives its pointer up */
+               if (parent->child) {
+                       parent->child->parent = NULL;
+                       tc->next = parent->child;
+                       tc->next->prev = tc;
+               } else {
+                       tc->next = NULL;
+               }
+               tc->parent = parent;
+               tc->prev = NULL;
+               parent->child = tc;
+       } else {
+               tc->next = tc->prev = tc->parent = NULL;
+       }
+
+       return TC_PTR_FROM_CHUNK(tc);
+}
+
+/*
+ * Create a talloc pool
+ *
+ * Allocates a chunk with room for the pool header plus 'size' bytes and
+ * marks it as a pool.  Returns NULL if the allocation fails, or if the
+ * chunk itself came out of another pool, which is not supported.
+ */
+
+_PUBLIC_ void *talloc_pool(const void *context, size_t size)
+{
+       union talloc_pool_chunk *pool_tc;
+       /* the pool header may be larger than a plain chunk header; request
+          the difference on top of the caller's size */
+       void *result = __talloc(context, sizeof(*pool_tc) - TC_HDR_SIZE + size);
+
+       if (unlikely(result == NULL)) {
+               return NULL;
+       }
+
+       pool_tc = (union talloc_pool_chunk *)talloc_chunk_from_ptr(result);
+       if (unlikely(pool_tc->hdr.c.flags & TALLOC_FLAG_POOLMEM)) {
+               /* We don't handle this correctly, so fail. */
+               talloc_log("talloc: cannot allocate pool off another pool %s\n",
+                          talloc_get_name(context));
+               talloc_free(result);
+               return NULL;
+       }
+       pool_tc->hdr.c.flags |= TALLOC_FLAG_POOL;
+       /* nothing handed out yet: the free position is the first chunk slot */
+       pool_tc->hdr.c.pool = tc_pool_first_chunk(pool_tc);
+
+       /* the pool counts itself as one object */
+       pool_tc->hdr.object_count = 1;
+
+       tc_invalidate_pool(pool_tc);
+
+       return result;
+}
+
+/*
+  set up a destructor to be called when a pointer is freed.
+  the destructor should return 0 on success, or -1 on failure.
+  if the destructor fails then the free fails as well, and the memory
+  can continue to be used
+*/
+_PUBLIC_ void _talloc_set_destructor(const void *ptr, int (*destructor)(void *))
+{
+       talloc_chunk_from_ptr(ptr)->destructor = destructor;
+}
+
+/*
+  increase the reference count on a piece of memory by taking a
+  reference from null_context.  Returns 0 on success, -1 on failure.
+*/
+_PUBLIC_ int talloc_increase_ref_count(const void *ptr)
+{
+       if (likely(talloc_reference(null_context, ptr) != NULL)) {
+               return 0;
+       }
+       return -1;
+}
+
+/*
+  helper for talloc_reference()
+
+  Destructor of a reference handle: unlinks the handle from the refs
+  list of the chunk it refers to.  Always returns 0.
+
+  this is referenced by a function pointer and should not be inline
+*/
+static int talloc_reference_destructor(struct talloc_reference_handle *handle)
+{
+       struct talloc_chunk *ptr_tc = talloc_chunk_from_ptr(handle->ptr);
+       _TLIST_REMOVE(ptr_tc->refs, handle);
+       return 0;
+}
+
+/*
+   more efficient way to add a name to a pointer - the name is stored as
+   given, not copied, so it must point to a true string constant
+*/
+static inline void _talloc_set_name_const(const void *ptr, const char *name)
+{
+       talloc_chunk_from_ptr(ptr)->name = name;
+}
+
+/*
+  internal talloc_named_const(): allocate 'size' bytes below 'context'
+  and name the result with the constant string 'name'.
+  Returns NULL when the allocation fails.
+*/
+static inline void *_talloc_named_const(const void *context, size_t size, const char *name)
+{
+       void *mem = __talloc(context, size);
+
+       if (likely(mem != NULL)) {
+               _talloc_set_name_const(mem, name);
+       }
+
+       return mem;
+}
+
+/*
+  create a secondary reference to 'ptr' that hangs off 'context'.
+  The pointer stays valid until both the original owner and 'context'
+  have been freed.
+
+  this is mainly useful when two different structures share the same
+  underlying data and must be freeable independently, in either order.
+
+  'location' is remembered in the handle for diagnostics. Returns the
+  referenced pointer, or NULL if ptr is NULL or the handle could not be
+  allocated.
+*/
+_PUBLIC_ void *_talloc_reference_loc(const void *context, const void *ptr, const char *location)
+{
+       struct talloc_reference_handle *ref;
+       struct talloc_chunk *target;
+
+       if (unlikely(ptr == NULL)) {
+               return NULL;
+       }
+
+       target = talloc_chunk_from_ptr(ptr);
+       ref = (struct talloc_reference_handle *)_talloc_named_const(context,
+                                               sizeof(*ref),
+                                               TALLOC_MAGIC_REFERENCE);
+       if (unlikely(ref == NULL)) {
+               return NULL;
+       }
+
+       /* the destructor lives on the handle rather than on the main
+          context, which leaves the caller free to install a
+          destructor of their own on the context */
+       talloc_set_destructor(ref, talloc_reference_destructor);
+       ref->location = location;
+       ref->ptr = discard_const_p(void, ptr);
+       _TLIST_ADD(target->refs, ref);
+
+       return ref->ptr;
+}
+
+static void *_talloc_steal_internal(const void *new_ctx, const void *ptr);
+
+static inline void _talloc_free_poolmem(struct talloc_chunk *tc,
+                                       const char *location)
+{
+       union talloc_pool_chunk *pool;
+       void *next_tc;
+       /*
+        * Release 'tc', a chunk that lives inside a talloc pool: mark
+        * it as free, drop the object count of the owning pool and,
+        * where possible, give the space back to the pool.
+        * 'location' names the caller and is stored as the chunk name.
+        *
+        * The end of the chunk (next_tc) is computed up front, before
+        * the chunk gets invalidated below.
+        */
+       pool = (union talloc_pool_chunk *)tc->pool;
+       next_tc = tc_next_chunk(tc);
+
+       tc->flags |= TALLOC_FLAG_FREE;
+
+       /* we mark the freed memory with where we called the free
+        * from. This means on a double free error we can report where
+        * the first free came from
+        */
+       tc->name = location;
+
+       TC_INVALIDATE_FULL_CHUNK(tc);
+
+       if (unlikely(pool->hdr.object_count == 0)) {
+               talloc_abort("Pool object count zero!");
+               return;
+       }
+
+       pool->hdr.object_count--;
+
+       if (unlikely(pool->hdr.object_count == 1
+                    && !(pool->hdr.c.flags & TALLOC_FLAG_FREE))) {
+               /*
+                * if there is just one object left in the pool
+                * and pool->flags does not have TALLOC_FLAG_FREE,
+                * it means this is the pool itself and
+                * the rest is available for new objects
+                * again.
+                */
+               pool->hdr.c.pool = tc_pool_first_chunk(pool);
+               tc_invalidate_pool(pool);
+       } else if (unlikely(pool->hdr.object_count == 0)) {
+               /*
+                * nothing is left in the pool, so the whole pool is
+                * released.
+                *
+                * we mark the freed memory with where we called the free
+                * from. This means on a double free error we can report where
+                * the first free came from
+                */
+               pool->hdr.c.name = location;
+
+               TC_INVALIDATE_FULL_CHUNK(&pool->hdr.c);
+               free(pool);
+       } else if (pool->hdr.c.pool == next_tc) {
+               /*
+                * if pool->pool still points to end of
+                * 'tc' (which is stored in the 'next_tc' variable),
+                * we can reclaim the memory of 'tc'.
+                */
+               pool->hdr.c.pool = tc;
+       }
+}
+
+static inline void _talloc_free_children_internal(struct talloc_chunk *tc,
+                                                 void *ptr,
+                                                 const char *location);
+
+/* 
+   internal talloc_free call
+
+   Frees 'ptr' together with its children. 'location' identifies the
+   caller; it is stored as the name of the freed chunk so that a later
+   double free can report where the first free came from.
+
+   Returns -1 if ptr is NULL, if ptr still had a reference that did not
+   come from one of its own children (only that reference is dropped),
+   or if the destructor is already running or returns -1.
+   Returns 0 in all other cases.
+*/
+static inline int _talloc_free_internal(void *ptr, const char *location)
+{
+       struct talloc_chunk *tc;
+
+       if (unlikely(ptr == NULL)) {
+               return -1;
+       }
+
+       /* possibly initialise the talloc fill value from the environment */
+       if (unlikely(!talloc_fill.initialised)) {
+               const char *fill = getenv(TALLOC_FILL_ENV);
+               if (fill != NULL) {
+                       talloc_fill.enabled = true;
+                       talloc_fill.fill_value = strtoul(fill, NULL, 0);
+               }
+               talloc_fill.initialised = true;
+       }
+
+       tc = talloc_chunk_from_ptr(ptr);
+
+       if (unlikely(tc->refs)) {
+               int is_child;
+               /* check if this is a reference from a child or
+                * grandchild back to its parent or grandparent
+                *
+                * in that case we need to remove the reference and
+                * call another instance of talloc_free() on the current
+                * pointer.
+                */
+               is_child = talloc_is_parent(tc->refs, ptr);
+               _talloc_free_internal(tc->refs, location);
+               if (is_child) {
+                       return _talloc_free_internal(ptr, location);
+               }
+               return -1;
+       }
+
+       if (unlikely(tc->flags & TALLOC_FLAG_LOOP)) {
+               /* we have a free loop - stop looping */
+               return 0;
+       }
+
+       if (unlikely(tc->destructor)) {
+               talloc_destructor_t d = tc->destructor;
+               if (d == (talloc_destructor_t)-1) { /* destructor already in progress */
+                       return -1;
+               }
+               tc->destructor = (talloc_destructor_t)-1; /* marker checked above on re-entry */
+               if (d(ptr) == -1) {
+                       tc->destructor = d; /* free refused: restore the destructor */
+                       return -1;
+               }
+               tc->destructor = NULL;
+       }
+
+       if (tc->parent) {
+               _TLIST_REMOVE(tc->parent->child, tc);
+               if (tc->parent->child) {
+                       tc->parent->child->parent = tc->parent;
+               }
+       } else {
+               if (tc->prev) tc->prev->next = tc->next;
+               if (tc->next) tc->next->prev = tc->prev;
+               tc->prev = tc->next = NULL;
+       }
+
+       tc->flags |= TALLOC_FLAG_LOOP; /* tested above to stop a recursive free of this chunk */
+
+       _talloc_free_children_internal(tc, ptr, location);
+
+       tc->flags |= TALLOC_FLAG_FREE;
+
+       /*
+        * If we are part of a memory limited context hierarchy
+        * we need to subtract the memory used from the counters
+        */
+       if (tc->limit) {
+               struct talloc_memlimit *l;
+
+               for (l = tc->limit; l != NULL; l = l->upper) {
+                       if (l->cur_size >= tc->size+TC_HDR_SIZE) {
+                               l->cur_size -= tc->size+TC_HDR_SIZE;
+                       } else {
+                               talloc_abort("cur_size memlimit counter not correct!");
+                               return 0;
+                       }
+               }
+
+               if (tc->limit->parent == tc) { /* the limit record names this chunk as its parent */
+                       free(tc->limit);
+               }
+
+               tc->limit = NULL;
+       }
+
+       /* we mark the freed memory with where we called the free
+        * from. This means on a double free error we can report where
+        * the first free came from 
+        */      
+       tc->name = location;
+
+       if (tc->flags & TALLOC_FLAG_POOL) {
+               union talloc_pool_chunk *pool = (union talloc_pool_chunk *)tc;
+
+               if (unlikely(pool->hdr.object_count == 0)) {
+                       talloc_abort("Pool object count zero!");
+                       return 0;
+               }
+
+               pool->hdr.object_count--;
+               if (unlikely(pool->hdr.object_count == 0)) { /* no object left in the pool */
+                       TC_INVALIDATE_FULL_CHUNK(tc);
+                       free(tc);
+               }
+       } else if (tc->flags & TALLOC_FLAG_POOLMEM) {
+               _talloc_free_poolmem(tc, location);
+       } else {
+               TC_INVALIDATE_FULL_CHUNK(tc);
+               free(tc);
+       }
+       return 0;
+}
+
+static size_t _talloc_total_limit_size(const void *ptr,
+                                       struct talloc_memlimit *old_limit,
+                                       struct talloc_memlimit *new_limit);
+
+/* 
+   move a lump of memory from one talloc context to another return the
+   ptr on success, or NULL if it could not be transferred.
+   passing NULL as ptr will always return NULL with no side effects.
+
+   A NULL new_ctx is replaced by null_context. If that is NULL as well
+   the chunk is simply detached and ends up without a parent.
+
+   When memory limits are involved, the size of the moved tree is
+   removed from the counters above the old position and added to the
+   counters of the new parent.
+
+   Stealing a pointer onto itself or onto its current parent is a
+   no-op that returns ptr.
+*/
+static void *_talloc_steal_internal(const void *new_ctx, const void *ptr)
+{
+       struct talloc_chunk *tc, *new_tc;
+       size_t ctx_size = 0;
+
+       if (unlikely(!ptr)) {
+               return NULL;
+       }
+
+       if (unlikely(new_ctx == NULL)) {
+               new_ctx = null_context;
+       }
+
+       tc = talloc_chunk_from_ptr(ptr);
+
+       if (tc->limit != NULL) {
+
+               ctx_size = _talloc_total_limit_size(ptr, NULL, NULL);
+
+               if (!talloc_memlimit_update(tc->limit->upper, ctx_size, 0)) {
+                       talloc_abort("cur_size memlimit counter not correct!");
+                       errno = EINVAL;
+                       return NULL;
+               }
+
+               if (tc->limit->parent == tc) {
+                       tc->limit->upper = NULL; /* keep this chunk's limit, cut its upper link */
+               } else {
+                       tc->limit = NULL;
+               }
+       }
+
+       if (unlikely(new_ctx == NULL)) { /* still NULL, so null_context is NULL too */
+               if (tc->parent) {
+                       _TLIST_REMOVE(tc->parent->child, tc);
+                       if (tc->parent->child) {
+                               tc->parent->child->parent = tc->parent;
+                       }
+               } else {
+                       if (tc->prev) tc->prev->next = tc->next;
+                       if (tc->next) tc->next->prev = tc->prev;
+               }
+
+               tc->parent = tc->next = tc->prev = NULL;
+               return discard_const_p(void, ptr);
+       }
+
+       new_tc = talloc_chunk_from_ptr(new_ctx);
+
+       if (unlikely(tc == new_tc || tc->parent == new_tc)) {
+               return discard_const_p(void, ptr);
+       }
+
+       if (tc->parent) {
+               _TLIST_REMOVE(tc->parent->child, tc);
+               if (tc->parent->child) {
+                       tc->parent->child->parent = tc->parent;
+               }
+       } else {
+               if (tc->prev) tc->prev->next = tc->next;
+               if (tc->next) tc->next->prev = tc->prev;
+               tc->prev = tc->next = NULL;
+       }
+
+       tc->parent = new_tc;
+       if (new_tc->child) new_tc->child->parent = NULL; /* old list head gives up its parent link */
+       _TLIST_ADD(new_tc->child, tc);
+
+       if (tc->limit || new_tc->limit) {
+               ctx_size = _talloc_total_limit_size(ptr, tc->limit,
+                                                   new_tc->limit);
+       }
+
+       if (new_tc->limit) {
+               struct talloc_memlimit *l;
+
+               for (l = new_tc->limit; l != NULL; l = l->upper) {
+                       l->cur_size += ctx_size;
+               }
+       }
+
+       return discard_const_p(void, ptr);
+}
+
+/* 
+   move a lump of memory from one talloc context to another return the
+   ptr on success, or NULL if it could not be transferred.
+   passing NULL as ptr will always return NULL with no side effects.
+
+   This is the public entry point: if ptr has references and new_ctx
+   is not already its parent, a warning naming 'location' and the
+   location of every reference is logged before the move is done by
+   _talloc_steal_internal().
+*/
+_PUBLIC_ void *_talloc_steal_loc(const void *new_ctx, const void *ptr, const char *location)
+{
+       struct talloc_chunk *tc;
+
+       if (unlikely(ptr == NULL)) {
+               return NULL;
+       }
+       
+       tc = talloc_chunk_from_ptr(ptr);
+       
+       if (unlikely(tc->refs != NULL) && talloc_parent(ptr) != new_ctx) {
+               struct talloc_reference_handle *h;
+
+               talloc_log("WARNING: talloc_steal with references at %s\n",
+                          location);
+
+               for (h=tc->refs; h; h=h->next) {
+                       talloc_log("\treference at %s\n",
+                                  h->location);
+               }
+       }
+
+#if 0
+       /* this test is probably too expensive to have on in the
+          normal build, but it is useful for debugging */
+       if (talloc_is_parent(new_ctx, ptr)) {
+               talloc_log("WARNING: stealing into talloc child at %s\n", location);
+       }
+#endif
+       
+       return _talloc_steal_internal(new_ctx, ptr);
+}
+
+/*
+   like talloc_steal(), except that the caller names the old parent.
+   That removes the ambiguity of a talloc_steal() on a pointer that has
+   more than one parent (through references).
+
+   'old_parent' may be the real parent or the owner of a reference.
+   Returns ptr on success, and NULL if ptr is NULL, if old_parent is
+   neither of those, or if the move failed.
+*/
+_PUBLIC_ void *talloc_reparent(const void *old_parent, const void *new_parent, const void *ptr)
+{
+       struct talloc_reference_handle *ref;
+
+       if (unlikely(ptr == NULL)) {
+               return NULL;
+       }
+
+       /* the plain case: old_parent is the real parent */
+       if (talloc_parent(ptr) == old_parent) {
+               return _talloc_steal_internal(new_parent, ptr);
+       }
+
+       /* otherwise look for a reference that is owned by old_parent
+          and move that reference handle instead */
+       for (ref = talloc_chunk_from_ptr(ptr)->refs; ref != NULL; ref = ref->next) {
+               if (talloc_parent(ref) != old_parent) {
+                       continue;
+               }
+               if (_talloc_steal_internal(new_parent, ref) == ref) {
+                       return discard_const_p(void, ptr);
+               }
+               return NULL;
+       }
+
+       /* it wasn't a parent */
+       return NULL;
+}
+
+/*
+  drop a secondary reference to a pointer, undoing what
+  talloc_reference() did. 'context' and 'ptr' must be the ones that
+  were given to talloc_reference(); a NULL context stands for
+  null_context.
+
+  Returns -1 if 'context' holds no reference to 'ptr', otherwise the
+  result of freeing the reference handle.
+*/
+static inline int talloc_unreference(const void *context, const void *ptr)
+{
+       struct talloc_chunk *target = talloc_chunk_from_ptr(ptr);
+       struct talloc_reference_handle *ref;
+
+       if (unlikely(context == NULL)) {
+               context = null_context;
+       }
+
+       /* find the first handle whose owner is 'context'; a handle
+          without a parent chunk is owned by the NULL context */
+       for (ref = target->refs; ref != NULL; ref = ref->next) {
+               struct talloc_chunk *owner_tc = talloc_parent_chunk(ref);
+               const void *owner = NULL;
+
+               if (owner_tc != NULL) {
+                       owner = TC_PTR_FROM_CHUNK(owner_tc);
+               }
+               if (owner == context) {
+                       break;
+               }
+       }
+
+       if (ref == NULL) {
+               return -1;
+       }
+
+       return _talloc_free_internal(ref, __location__);
+}
+
+/*
+  remove a specific parent context from a pointer. This is a more
+  controlled variant of talloc_free()
+
+  A NULL context stands for null_context. If 'context' holds a
+  reference to ptr, only that reference is removed. If 'context' is the
+  real parent, ptr is freed when no references remain; otherwise the
+  owner of the first remaining reference becomes the new parent and
+  that reference is dropped.
+
+  Returns 0 on success, and -1 if ptr is NULL, if 'context' is neither
+  a reference owner nor the parent, or if the underlying free fails.
+*/
+_PUBLIC_ int talloc_unlink(const void *context, void *ptr)
+{
+       struct talloc_chunk *tc_p, *new_p, *tc_c;
+       void *new_parent;
+
+       if (ptr == NULL) {
+               return -1;
+       }
+
+       if (context == NULL) {
+               context = null_context;
+       }
+
+       if (talloc_unreference(context, ptr) == 0) { /* context only held a reference */
+               return 0;
+       }
+
+       if (context != NULL) {
+               tc_c = talloc_chunk_from_ptr(context);
+       } else {
+               tc_c = NULL;
+       }
+       if (tc_c != talloc_parent_chunk(ptr)) { /* context is not the parent either */
+               return -1;
+       }
+
+       tc_p = talloc_chunk_from_ptr(ptr);
+
+       if (tc_p->refs == NULL) {
+               return _talloc_free_internal(ptr, __location__);
+       }
+
+       new_p = talloc_parent_chunk(tc_p->refs); /* owner of the first reference */
+       if (new_p) {
+               new_parent = TC_PTR_FROM_CHUNK(new_p);
+       } else {
+               new_parent = NULL;
+       }
+
+       if (talloc_unreference(new_parent, ptr) != 0) {
+               return -1;
+       }
+
+       _talloc_steal_internal(new_parent, ptr);
+
+       return 0;
+}
+
+/*
+  add a name to an existing pointer - va_list version
+
+  The name is formatted with talloc_vasprintf() using ptr as the
+  context, and the name string is itself given the name ".name".
+  Returns the new name, or NULL if formatting failed; in that case the
+  name of ptr is left as NULL.
+*/
+static inline const char *talloc_set_name_v(const void *ptr, const char *fmt, va_list ap) PRINTF_ATTRIBUTE(2,0);
+
+static inline const char *talloc_set_name_v(const void *ptr, const char *fmt, va_list ap)
+{
+       struct talloc_chunk *tc = talloc_chunk_from_ptr(ptr);
+       tc->name = talloc_vasprintf(ptr, fmt, ap);
+       if (likely(tc->name)) {
+               _talloc_set_name_const(tc->name, ".name");
+       }
+       return tc->name;
+}
+
+/*
+  give an existing pointer a printf-style formatted name.
+  Returns the new name, or NULL if it could not be created.
+*/
+_PUBLIC_ const char *talloc_set_name(const void *ptr, const char *fmt, ...)
+{
+       va_list args;
+       const char *result;
+
+       va_start(args, fmt);
+       result = talloc_set_name_v(ptr, fmt, args);
+       va_end(args);
+
+       return result;
+}
+
+
+/*
+  allocate 'size' bytes below 'context' and give the result a
+  printf-style formatted name. Apart from the naming this behaves like
+  a plain talloc().
+  Returns NULL if either the allocation or the naming fails; nothing
+  stays allocated in that case.
+*/
+_PUBLIC_ void *talloc_named(const void *context, size_t size, const char *fmt, ...)
+{
+       void *mem = __talloc(context, size);
+       const char *name;
+       va_list args;
+
+       if (unlikely(mem == NULL)) {
+               return NULL;
+       }
+
+       va_start(args, fmt);
+       name = talloc_set_name_v(mem, fmt, args);
+       va_end(args);
+
+       if (likely(name != NULL)) {
+               return mem;
+       }
+
+       /* naming failed: give the fresh allocation back */
+       _talloc_free_internal(mem, __location__);
+       return NULL;
+}
+
+/*
+  return the name of a talloc ptr: ".reference" for reference handles,
+  the stored name if there is one, and "UNNAMED" otherwise
+*/
+_PUBLIC_ const char *talloc_get_name(const void *ptr)
+{
+       const char *name = talloc_chunk_from_ptr(ptr)->name;
+
+       if (unlikely(name == TALLOC_MAGIC_REFERENCE)) {
+               return ".reference";
+       }
+       if (unlikely(name == NULL)) {
+               return "UNNAMED";
+       }
+       return name;
+}
+
+
+/*
+  return ptr if it carries the given name (same pointer or equal
+  string), and NULL if it does not or if ptr is NULL
+*/
+_PUBLIC_ void *talloc_check_name(const void *ptr, const char *name)
+{
+       const char *actual;
+
+       if (unlikely(ptr == NULL)) {
+               return NULL;
+       }
+
+       actual = talloc_get_name(ptr);
+       if (unlikely(actual != name && strcmp(actual, name) != 0)) {
+               return NULL;
+       }
+
+       return discard_const_p(void, ptr);
+}
+
+/*
+  abort with a message that describes a failed type check: 'location'
+  is the caller, 'name' the name that was found (may be NULL) and
+  'expected' the name that was asked for. A fixed message is used if
+  the detailed one cannot be allocated.
+*/
+static void talloc_abort_type_mismatch(const char *location,
+                                       const char *name,
+                                       const char *expected)
+{
+       const char *found = (name != NULL) ? name : "NULL";
+       const char *reason = talloc_asprintf(NULL,
+                               "%s: Type mismatch: name[%s] expected[%s]",
+                               location, found, expected);
+
+       if (reason == NULL) {
+               reason = "Type mismatch";
+       }
+
+       talloc_abort(reason);
+}
+
+/*
+  return ptr if it carries the given name, otherwise report a type
+  mismatch for 'location' through talloc_abort_type_mismatch().
+  A NULL ptr counts as a mismatch. NULL is returned if the abort
+  handler returns.
+*/
+_PUBLIC_ void *_talloc_get_type_abort(const void *ptr, const char *name, const char *location)
+{
+       const char *actual = NULL;
+
+       if (likely(ptr != NULL)) {
+               actual = talloc_get_name(ptr);
+               if (likely(actual == name || strcmp(actual, name) == 0)) {
+                       return discard_const_p(void, ptr);
+               }
+       }
+
+       talloc_abort_type_mismatch(location, actual, name);
+       return NULL;
+}
+
+/*
+  kept for compatibility with older versions of talloc: create a
+  zero-sized context below NULL with a printf-style formatted name.
+  Returns NULL if the allocation or the naming fails.
+*/
+_PUBLIC_ void *talloc_init(const char *fmt, ...)
+{
+       void *ctx = __talloc(NULL, 0);
+       const char *name;
+       va_list args;
+
+       if (unlikely(ctx == NULL)) {
+               return NULL;
+       }
+
+       va_start(args, fmt);
+       name = talloc_set_name_v(ctx, fmt, args);
+       va_end(args);
+
+       if (likely(name != NULL)) {
+               return ctx;
+       }
+
+       /* naming failed: give the fresh context back */
+       _talloc_free_internal(ctx, __location__);
+       return NULL;
+}
+
+static inline void _talloc_free_children_internal(struct talloc_chunk *tc,
+                                                 void *ptr,
+                                                 const char *location)
+{
+       while (tc->child) {
+               /* free the children of tc one by one, always taking
+                  the current head of the child list. 'ptr' is the
+                  user pointer that belongs to tc and 'location' is
+                  handed on to every free.
+
+                  we need to work out who will own an abandoned child
+                  if it cannot be freed. In priority order, the first
+                  choice is owner of any remaining reference to this
+                  pointer, the second choice is our parent, and the
+                  final choice is the null context. */
+               void *child = TC_PTR_FROM_CHUNK(tc->child);
+               const void *new_parent = null_context;
+               if (unlikely(tc->child->refs)) {
+                       struct talloc_chunk *p = talloc_parent_chunk(tc->child->refs);
+                       if (p) new_parent = TC_PTR_FROM_CHUNK(p);
+               }
+               if (unlikely(_talloc_free_internal(child, location) == -1)) {
+                       if (new_parent == null_context) {
+                               struct talloc_chunk *p = talloc_parent_chunk(ptr);
+                               if (p) new_parent = TC_PTR_FROM_CHUNK(p);
+                       }
+                       _talloc_steal_internal(new_parent, child); /* moves the child off tc's list */
+               }
+       }
+}
+
+/*
+  this is a replacement for the Samba3 talloc_destroy_pool functionality. It
+  should probably not be used in new code. It's in here to keep the talloc
+  code consistent across Samba 3 and 4.
+
+  Frees all children of ptr but keeps ptr itself. If the name of ptr is
+  one of its children, that child is taken off the child list first and
+  put back afterwards, so the name survives. A NULL ptr is ignored.
+*/
+_PUBLIC_ void talloc_free_children(void *ptr)
+{
+       struct talloc_chunk *tc_name = NULL;
+       struct talloc_chunk *tc;
+
+       if (unlikely(ptr == NULL)) {
+               return;
+       }
+
+       tc = talloc_chunk_from_ptr(ptr);
+
+       /* we do not want to free the context name if it is a child .. */
+       if (likely(tc->child)) {
+               for (tc_name = tc->child; tc_name; tc_name = tc_name->next) {
+                       if (tc->name == TC_PTR_FROM_CHUNK(tc_name)) break;
+               }
+               if (tc_name) {
+                       _TLIST_REMOVE(tc->child, tc_name);
+                       if (tc->child) {
+                               tc->child->parent = tc;
+                       }
+               }
+       }
+
+       _talloc_free_children_internal(tc, ptr, __location__);
+
+       /* .. so we put it back after all other children have been freed */
+       if (tc_name) {
+               if (tc->child) {
+                       tc->child->parent = NULL;
+               }
+               tc_name->parent = tc;
+               _TLIST_ADD(tc->child, tc_name);
+       }
+}
+
+/* 
+   Allocate a bit of memory as a child of an existing pointer
+
+   Public entry point that forwards to the internal __talloc() and
+   returns its result unchanged.
+*/
+_PUBLIC_ void *_talloc(const void *context, size_t size)
+{
+       return __talloc(context, size);
+}
+
+/*
+  externally callable talloc_set_name_const()
+
+  Forwards to _talloc_set_name_const(): the name pointer is stored
+  without being copied.
+*/
+_PUBLIC_ void talloc_set_name_const(const void *ptr, const char *name)
+{
+       _talloc_set_name_const(ptr, name);
+}
+
+/*
+  create a talloc pointer that is named with a constant string.
+  This is the public wrapper around _talloc_named_const(): the memory
+  is allocated below 'context' and the name pointer is stored without
+  being copied. Returns NULL if the allocation fails.
+*/
+_PUBLIC_ void *talloc_named_const(const void *context, size_t size, const char *name)
+{
+       return _talloc_named_const(context, size, name);
+}
+
+/* 
+   free a talloc pointer. This also frees all child pointers of this 
+   pointer recursively
+
+   return 0 if the memory is actually freed, otherwise -1. The memory
+   will not be freed if ptr is NULL, if it still has references, or if
+   the destructor (if any) refuses the free.
+
+   A pointer with references is only handled in one case: its parent
+   is the null context and there is exactly one reference. Then
+   talloc_unlink() hands it over to the owner of that reference.
+   In every other case with references an error naming 'location' and
+   the location of each reference is logged and -1 is returned.
+*/
+_PUBLIC_ int _talloc_free(void *ptr, const char *location)
+{
+       struct talloc_chunk *tc;
+
+       if (unlikely(ptr == NULL)) {
+               return -1;
+       }
+       
+       tc = talloc_chunk_from_ptr(ptr);
+       
+       if (unlikely(tc->refs != NULL)) {
+               struct talloc_reference_handle *h;
+
+               if (talloc_parent(ptr) == null_context && tc->refs->next == NULL) {
+                       /* in this case we do know which parent should
+                          get this pointer, as there is really only
+                          one parent */
+                       return talloc_unlink(null_context, ptr);
+               }
+
+               talloc_log("ERROR: talloc_free with references at %s\n",
+                          location);
+
+               for (h=tc->refs; h; h=h->next) {
+                       talloc_log("\treference at %s\n",
+                                  h->location);
+               }
+               return -1;
+       }
+       
+       return _talloc_free_internal(ptr, location);
+}
+
+
+
+/*
+  A talloc version of realloc. The context argument is only used if
+  ptr is NULL
+*/
+_PUBLIC_ void *_talloc_realloc(const void *context, void *ptr, size_t size, const char *name)
+{
+       struct talloc_chunk *tc;
+       void *new_ptr;
+       bool malloced = false;
+       union talloc_pool_chunk *pool_tc = NULL;
+
+       /* size zero is equivalent to free() */
+       if (unlikely(size == 0)) {
+               talloc_unlink(context, ptr);
+               return NULL;
+       }
+
+       if (unlikely(size >= MAX_TALLOC_SIZE)) {
+               return NULL;
+       }
+
+       /* realloc(NULL) is equivalent to malloc() */
+       if (ptr == NULL) {
+               return _talloc_named_const(context, size, name);
+       }
+
+       tc = talloc_chunk_from_ptr(ptr);
+
+       /* don't allow realloc on referenced pointers */
+       if (unlikely(tc->refs)) {
+               return NULL;
+       }
+
+       /* don't let anybody try to realloc a talloc_pool */
+       if (unlikely(tc->flags & TALLOC_FLAG_POOL)) {
+               return NULL;
+       }
+
+       if (tc->limit && (size - tc->size > 0)) {
+               if (!talloc_memlimit_check(tc->limit, (size - tc->size))) {
+                       errno = ENOMEM;
+                       return NULL;
+               }
+       }
+
+       /* handle realloc inside a talloc_pool */
+       if (unlikely(tc->flags & TALLOC_FLAG_POOLMEM)) {
+               pool_tc = (union talloc_pool_chunk *)tc->pool;
+       }
+
+#if (ALWAYS_REALLOC == 0)
+       /* don't shrink if we have less than 1k to gain */
+       if (size < tc->size && tc->limit == NULL) {
+               if (pool_tc) {
+                       void *next_tc = tc_next_chunk(tc);
+                       TC_INVALIDATE_SHRINK_CHUNK(tc, size);
+                       tc->size = size;
+                       if (next_tc == pool_tc->hdr.c.pool) {
+                               /* note: tc->size has changed, so this works */
+                               pool_tc->hdr.c.pool = tc_next_chunk(tc);
+                       }
+                       return ptr;
+               } else if ((tc->size - size) < 1024) {
+                       /*
+                        * if we call TC_INVALIDATE_SHRINK_CHUNK() here
+                        * we would need to call TC_UNDEFINE_GROW_CHUNK()
+                        * after each realloc call, which slows down
+                        * testing a lot :-(.
+                        *
+                        * That is why we only mark memory as undefined here.
+                        */
+                       TC_UNDEFINE_SHRINK_CHUNK(tc, size);
+
+                       /* do not shrink if we have less than 1k to gain */
+                       tc->size = size;
+                       return ptr;
+               }
+       } else if (tc->size == size) {
+               /*
+                * do not change the pointer if it is exactly
+                * the same size.
+                */
+               return ptr;
+       }
+#endif
+
+       /* by resetting magic we catch users of the old memory */
+       tc->flags |= TALLOC_FLAG_FREE;
+
+#if ALWAYS_REALLOC
+       if (pool_tc) {
+               new_ptr = talloc_alloc_pool(tc, size + TC_HDR_SIZE);
+               pool_tc->hdr.object_count--;
+
+               if (new_ptr == NULL) {
+                       new_ptr = malloc(TC_HDR_SIZE+size);
+                       malloced = true;
+               }
+
+               if (new_ptr) {
+                       memcpy(new_ptr, tc, MIN(tc->size,size) + TC_HDR_SIZE);
+                       TC_INVALIDATE_FULL_CHUNK(tc);
+               }
+       } else {
+               new_ptr = malloc(size + TC_HDR_SIZE);
+               if (new_ptr) {
+                       memcpy(new_ptr, tc, MIN(tc->size, size) + TC_HDR_SIZE);
+                       free(tc);
+               }
+       }
+#else
+       if (pool_tc) {
+               void *next_tc = tc_next_chunk(tc);
+               size_t old_chunk_size = TC_ALIGN16(TC_HDR_SIZE + tc->size);
+               size_t new_chunk_size = TC_ALIGN16(TC_HDR_SIZE + size);
+               size_t space_needed;
+               size_t space_left;
+               unsigned int chunk_count = pool_tc->hdr.object_count;
+
+               if (!(pool_tc->hdr.c.flags & TALLOC_FLAG_FREE)) {
+                       chunk_count -= 1;
+               }
+
+               if (chunk_count == 1) {
+                       /*
+                        * optimize for the case where 'tc' is the only
+                        * chunk in the pool.
+                        */
+                       char *start = tc_pool_first_chunk(pool_tc);
+                       space_needed = new_chunk_size;
+                       space_left = (char *)tc_pool_end(pool_tc) - start;
+
+                       if (space_left >= space_needed) {
+                               size_t old_used = TC_HDR_SIZE + tc->size;
+                               size_t new_used = TC_HDR_SIZE + size;
+                               new_ptr = start;
+                               memmove(new_ptr, tc, old_used);
+
+                               tc = (struct talloc_chunk *)new_ptr;
+                               TC_UNDEFINE_GROW_CHUNK(tc, size);
+
+                               /*
+                                * first we do not align the pool pointer
+                                * because we want to invalidate the padding
+                                * too.
+                                */
+                               pool_tc->hdr.c.pool = new_used + (char *)new_ptr;
+                               tc_invalidate_pool(pool_tc);
+
+                               /* now the aligned pointer */
+                               pool_tc->hdr.c.pool = new_chunk_size + (char *)new_ptr;
+                               goto got_new_ptr;
+                       }
+
+                       next_tc = NULL;
+               }
+
+               if (new_chunk_size == old_chunk_size) {
+                       TC_UNDEFINE_GROW_CHUNK(tc, size);
+                       tc->flags &= ~TALLOC_FLAG_FREE;
+                       if (!talloc_memlimit_update(tc->limit,
+                                                       tc->size, size)) {
+                               talloc_abort("cur_size memlimit counter not"
+                                            " correct!");
+                               errno = EINVAL;
+                               return NULL;
+                       }
+
+                       tc->size = size;
+                       return ptr;
+               }
+
+               if (next_tc == pool_tc->hdr.c.pool) {
+                       /*
+                        * optimize for the case where 'tc' is the last
+                        * chunk in the pool.
+                        */
+                       space_needed = new_chunk_size - old_chunk_size;
+                       space_left = tc_pool_space_left(pool_tc);
+
+                       if (space_left >= space_needed) {
+                               TC_UNDEFINE_GROW_CHUNK(tc, size);
+                               tc->flags &= ~TALLOC_FLAG_FREE;
+                               if (!talloc_memlimit_update(tc->limit,
+                                                       tc->size, size)) {
+                                       talloc_abort("cur_size memlimit "
+                                                    "counter not correct!");
+                                       errno = EINVAL;
+                                       return NULL;
+                               }
+                               tc->size = size;
+                               pool_tc->hdr.c.pool = tc_next_chunk(tc);
+                               return ptr;
+                       }
+               }
+
+               new_ptr = talloc_alloc_pool(tc, size + TC_HDR_SIZE);
+
+               if (new_ptr == NULL) {
+                       new_ptr = malloc(TC_HDR_SIZE+size);
+                       malloced = true;
+               }
+
+               if (new_ptr) {
+                       memcpy(new_ptr, tc, MIN(tc->size,size) + TC_HDR_SIZE);
+
+                       _talloc_free_poolmem(tc, __location__ "_talloc_realloc");
+               }
+       }
+       else {
+               new_ptr = realloc(tc, size + TC_HDR_SIZE);
+       }
+got_new_ptr:
+#endif
+       if (unlikely(!new_ptr)) {       
+               tc->flags &= ~TALLOC_FLAG_FREE; 
+               return NULL; 
+       }
+
+       tc = (struct talloc_chunk *)new_ptr;
+       tc->flags &= ~TALLOC_FLAG_FREE;
+       if (malloced) {
+               tc->flags &= ~TALLOC_FLAG_POOLMEM;
+       }
+       if (tc->parent) {
+               tc->parent->child = tc;
+       }
+       if (tc->child) {
+               tc->child->parent = tc;
+       }
+
+       if (tc->prev) {
+               tc->prev->next = tc;
+       }
+       if (tc->next) {
+               tc->next->prev = tc;
+       }
+
+       if (!talloc_memlimit_update(tc->limit, tc->size, size)) {
+               talloc_abort("cur_size memlimit counter not correct!");
+               errno = EINVAL;
+               return NULL;
+       }
+       tc->size = size;
+       _talloc_set_name_const(TC_PTR_FROM_CHUNK(tc), name);
+
+       return TC_PTR_FROM_CHUNK(tc);
+}
+
+/*
+  Move a talloc pointer from one structure to another: the chunk referenced
+  through _pptr is stolen onto new_ctx and the source pointer variable is
+  reset to NULL.  Returns whatever talloc_steal() returned.
+*/
+_PUBLIC_ void *_talloc_move(const void *new_ctx, const void *_pptr)
+{
+       const void **slot = discard_const_p(const void *, _pptr);
+       void *payload = discard_const_p(void, *slot);
+       void *moved = talloc_steal(new_ctx, payload);
+
+       /* the old owner must no longer refer to the chunk */
+       *slot = NULL;
+       return moved;
+}
+
+/* Selects what _talloc_total_mem_internal() accumulates while walking a tree. */
+enum talloc_mem_count_type {
+       /* payload bytes (tc->size); reference chunks add nothing */
+       TOTAL_MEM_SIZE,
+       /* one per chunk visited */
+       TOTAL_MEM_BLOCKS,
+       /* payload plus TC_HDR_SIZE per chunk; reference chunks add nothing */
+       TOTAL_MEM_LIMIT,
+};
+
+/*
+  Recursively walk the tree rooted at ptr and return a total selected by
+  type (payload bytes, number of blocks, or bytes including chunk headers).
+
+  ptr == NULL means the null_context; if that is not set either, 0 is
+  returned.
+
+  When old_limit or new_limit is non-NULL the walk also rewrites memlimit
+  links: a chunk whose limit is old_limit is pointed at new_limit, and a
+  limit whose upper is old_limit gets new_limit as its upper.
+
+  TALLOC_FLAG_LOOP marks chunks that are currently being visited, so a chunk
+  reached again during the same walk contributes 0.
+*/
+static size_t _talloc_total_mem_internal(const void *ptr,
+                                        enum talloc_mem_count_type type,
+                                        struct talloc_memlimit *old_limit,
+                                        struct talloc_memlimit *new_limit)
+{
+       size_t total = 0;
+       struct talloc_chunk *c, *tc;
+
+       if (ptr == NULL) {
+               ptr = null_context;
+       }
+       if (ptr == NULL) {
+               return 0;
+       }
+
+       tc = talloc_chunk_from_ptr(ptr);
+
+       /* re-chain a limit that used to hang below old_limit */
+       if (old_limit || new_limit) {
+               if (tc->limit && tc->limit->upper == old_limit) {
+                       tc->limit->upper = new_limit;
+               }
+       }
+
+       /* optimize in the memlimits case */
+       /* a chunk that owns its limit already has the subtree total in cur_size */
+       if (type == TOTAL_MEM_LIMIT &&
+           tc->limit != NULL &&
+           tc->limit != old_limit &&
+           tc->limit->parent == tc) {
+               return tc->limit->cur_size;
+       }
+
+       /* already on the current walk path: do not count it twice */
+       if (tc->flags & TALLOC_FLAG_LOOP) {
+               return 0;
+       }
+
+       tc->flags |= TALLOC_FLAG_LOOP;
+
+       /* switch this chunk from the old limit to the new one */
+       if (old_limit || new_limit) {
+               if (old_limit == tc->limit) {
+                       tc->limit = new_limit;
+               }
+       }
+
+       /* contribution of this chunk itself */
+       switch (type) {
+       case TOTAL_MEM_SIZE:
+               if (likely(tc->name != TALLOC_MAGIC_REFERENCE)) {
+                       total = tc->size;
+               }
+               break;
+       case TOTAL_MEM_BLOCKS:
+               total++;
+               break;
+       case TOTAL_MEM_LIMIT:
+               if (likely(tc->name != TALLOC_MAGIC_REFERENCE)) {
+                       total = tc->size + TC_HDR_SIZE;
+               }
+               break;
+       }
+       /* plus everything below it */
+       for (c = tc->child; c; c = c->next) {
+               total += _talloc_total_mem_internal(TC_PTR_FROM_CHUNK(c), type,
+                                                   old_limit, new_limit);
+       }
+
+       tc->flags &= ~TALLOC_FLAG_LOOP;
+
+       return total;
+}
+
+/*
+  return the total size of a talloc pool (subtree)
+
+  This is the TOTAL_MEM_SIZE walk of _talloc_total_mem_internal() with no
+  memlimit rewriting (both limit arguments are NULL).
+*/
+_PUBLIC_ size_t talloc_total_size(const void *ptr)
+{
+       return _talloc_total_mem_internal(ptr, TOTAL_MEM_SIZE, NULL, NULL);
+}
+
+/*
+  return the total number of blocks in a talloc pool (subtree)
+
+  This is the TOTAL_MEM_BLOCKS walk of _talloc_total_mem_internal() with no
+  memlimit rewriting (both limit arguments are NULL).
+*/
+_PUBLIC_ size_t talloc_total_blocks(const void *ptr)
+{
+       return _talloc_total_mem_internal(ptr, TOTAL_MEM_BLOCKS, NULL, NULL);
+}
+
+/*
+  count the reference handles hanging off ptr, i.e. the number of external
+  references to it
+*/
+_PUBLIC_ size_t talloc_reference_count(const void *ptr)
+{
+       struct talloc_chunk *chunk = talloc_chunk_from_ptr(ptr);
+       struct talloc_reference_handle *ref = chunk->refs;
+       size_t count = 0;
+
+       while (ref != NULL) {
+               count++;
+               ref = ref->next;
+       }
+
+       return count;
+}
+
+/*
+  Walk ptr and its children, invoking callback once per chunk so the caller
+  gets a full tree view.
+
+  callback receives is_ref = 0 for real chunks and is_ref = 1 for the target
+  of a reference handle (references are reported but not descended into).
+  A negative max_depth means no depth limit.  ptr == NULL selects the
+  null_context; nothing happens if that is unset.
+*/
+_PUBLIC_ void talloc_report_depth_cb(const void *ptr, int depth, int max_depth,
+                           void (*callback)(const void *ptr,
+                                            int depth, int max_depth,
+                                            int is_ref,
+                                            void *private_data),
+                           void *private_data)
+{
+       struct talloc_chunk *node;
+       struct talloc_chunk *kid;
+
+       if (ptr == NULL) {
+               ptr = null_context;
+               if (ptr == NULL) {
+                       return;
+               }
+       }
+
+       node = talloc_chunk_from_ptr(ptr);
+
+       /* this chunk is already being reported further up the call chain */
+       if (node->flags & TALLOC_FLAG_LOOP) {
+               return;
+       }
+
+       callback(ptr, depth, max_depth, 0, private_data);
+
+       if (max_depth >= 0 && depth >= max_depth) {
+               return;
+       }
+
+       node->flags |= TALLOC_FLAG_LOOP;
+       kid = node->child;
+       while (kid != NULL) {
+               if (kid->name != TALLOC_MAGIC_REFERENCE) {
+                       talloc_report_depth_cb(TC_PTR_FROM_CHUNK(kid), depth + 1, max_depth, callback, private_data);
+               } else {
+                       struct talloc_reference_handle *handle = (struct talloc_reference_handle *)TC_PTR_FROM_CHUNK(kid);
+                       callback(handle->ptr, depth + 1, max_depth, 1, private_data);
+               }
+               kid = kid->next;
+       }
+       node->flags &= ~TALLOC_FLAG_LOOP;
+}
+
+/*
+  talloc_report_depth_cb() callback that prints one line per chunk to the
+  FILE passed as _f.
+
+  References are printed as "reference to: <name>".  A chunk that owns a
+  memlimit gets an extra line with max_size/cur_size.  The root (depth 0)
+  gets the report header line; every other chunk gets a line indented by
+  depth*4 columns with its subtree size, block count and reference count.
+*/
+static void talloc_report_depth_FILE_helper(const void *ptr, int depth, int max_depth, int is_ref, void *_f)
+{
+       const char *name = talloc_get_name(ptr);
+       struct talloc_chunk *tc;
+       FILE *f = (FILE *)_f;
+
+       if (is_ref) {
+               fprintf(f, "%*sreference to: %s\n", depth*4, "", name);
+               return;
+       }
+
+       tc = talloc_chunk_from_ptr(ptr);
+       /* only the chunk the limit was set on reports it, not its descendants */
+       if (tc->limit && tc->limit->parent == tc) {
+               fprintf(f, "%*s%-30s is a memlimit context"
+                       " (max_size = %lu bytes, cur_size = %lu bytes)\n",
+                       depth*4, "",
+                       name,
+                       (unsigned long)tc->limit->max_size,
+                       (unsigned long)tc->limit->cur_size);
+       }
+
+       /* header line for the root; a negative max_depth is labelled "full" */
+       if (depth == 0) {
+               fprintf(f,"%stalloc report on '%s' (total %6lu bytes in %3lu blocks)\n", 
+                       (max_depth < 0 ? "full " :""), name,
+                       (unsigned long)talloc_total_size(ptr),
+                       (unsigned long)talloc_total_blocks(ptr));
+               return;
+       }
+
+       fprintf(f, "%*s%-30s contains %6lu bytes in %3lu blocks (ref %d) %p\n", 
+               depth*4, "",
+               name,
+               (unsigned long)talloc_total_size(ptr),
+               (unsigned long)talloc_total_blocks(ptr),
+               (int)talloc_reference_count(ptr), ptr);
+
+       /* disabled debugging aid: dump the chunk contents byte by byte */
+#if 0
+       fprintf(f, "content: ");
+       if (talloc_total_size(ptr)) {
+               int tot = talloc_total_size(ptr);
+               int i;
+
+               for (i = 0; i < tot; i++) {
+                       if ((((char *)ptr)[i] > 31) && (((char *)ptr)[i] < 126)) {
+                               fprintf(f, "%c", ((char *)ptr)[i]);
+                       } else {
+                               fprintf(f, "~%02x", ((char *)ptr)[i]);
+                       }
+               }
+       }
+       fprintf(f, "\n");
+#endif
+}
+
+/*
+  print a tree report for ptr to f, starting at the given depth and
+  descending at most to max_depth; a NULL f makes this a no-op
+*/
+_PUBLIC_ void talloc_report_depth_file(const void *ptr, int depth, int max_depth, FILE *f)
+{
+       if (f == NULL) {
+               return;
+       }
+
+       talloc_report_depth_cb(ptr, depth, max_depth, talloc_report_depth_FILE_helper, f);
+       fflush(f);
+}
+
+/*
+  report on memory usage by all children of a pointer, giving a full tree view
+
+  Starts at depth 0 with max_depth -1, i.e. without a depth limit.
+*/
+_PUBLIC_ void talloc_report_full(const void *ptr, FILE *f)
+{
+       talloc_report_depth_file(ptr, 0, -1, f);
+}
+
+/*
+  report on memory usage by all children of a pointer
+
+  Starts at depth 0 with max_depth 1, so only ptr and its direct children
+  are listed.
+*/
+_PUBLIC_ void talloc_report(const void *ptr, FILE *f)
+{
+       talloc_report_depth_file(ptr, 0, 1, f);
+}
+
+/*
+  print a short report to stderr if anything is still hanging off the null
+  context; stays silent when its total size is 0
+*/
+static void talloc_report_null(void)
+{
+       if (talloc_total_size(null_context) == 0) {
+               return;
+       }
+
+       talloc_report(null_context, stderr);
+}
+
+/*
+  print a full tree report to stderr if anything is still hanging off the
+  null context; stays silent when its total size is 0
+*/
+static void talloc_report_null_full(void)
+{
+       if (talloc_total_size(null_context) == 0) {
+               return;
+       }
+
+       talloc_report_full(null_context, stderr);
+}
+
+/*
+  enable tracking of the NULL context
+
+  Creates the "null_context" chunk if it does not exist yet and, when an
+  autofree context is already present, reparents that under it.
+*/
+_PUBLIC_ void talloc_enable_null_tracking(void)
+{
+       if (null_context != NULL) {
+               /* tracking is already on */
+               return;
+       }
+
+       null_context = _talloc_named_const(NULL, 0, "null_context");
+       if (autofree_context == NULL) {
+               return;
+       }
+
+       talloc_reparent(NULL, null_context, autofree_context);
+}
+
+/*
+  enable tracking of the NULL context without moving the autofree context
+  into it. This is needed for the talloc testsuite
+*/
+_PUBLIC_ void talloc_enable_null_tracking_no_autofree(void)
+{
+       if (null_context != NULL) {
+               /* tracking is already on */
+               return;
+       }
+
+       null_context = _talloc_named_const(NULL, 0, "null_context");
+}
+
+/*
+  disable tracking of the NULL context
+
+  Chunks still linked to the null_context chunk are detached first (any
+  parent/prev link pointing at it is cleared, and its own child/next links
+  are reset), then null_context itself is freed and the global is set back
+  to NULL.
+*/
+_PUBLIC_ void talloc_disable_null_tracking(void)
+{
+       if (null_context != NULL) {
+               /* we have to move any children onto the real NULL
+                  context */
+               struct talloc_chunk *tc, *tc2;
+               tc = talloc_chunk_from_ptr(null_context);
+               /* first pass: the child list of null_context */
+               for (tc2 = tc->child; tc2; tc2=tc2->next) {
+                       if (tc2->parent == tc) tc2->parent = NULL;
+                       if (tc2->prev == tc) tc2->prev = NULL;
+               }
+               /* second pass: chunks linked after null_context via ->next */
+               for (tc2 = tc->next; tc2; tc2=tc2->next) {
+                       if (tc2->parent == tc) tc2->parent = NULL;
+                       if (tc2->prev == tc) tc2->prev = NULL;
+               }
+               /* null_context no longer refers to any of them */
+               tc->child = NULL;
+               tc->next = NULL;
+       }
+       talloc_free(null_context);
+       null_context = NULL;
+}
+
+/*
+  enable leak reporting on exit
+
+  Turns on null-context tracking and registers talloc_report_null() with
+  atexit().
+*/
+_PUBLIC_ void talloc_enable_leak_report(void)
+{
+       talloc_enable_null_tracking();
+       atexit(talloc_report_null);
+}
+
+/*
+  enable full leak reporting on exit
+
+  Turns on null-context tracking and registers talloc_report_null_full()
+  with atexit().
+*/
+_PUBLIC_ void talloc_enable_leak_report_full(void)
+{
+       talloc_enable_null_tracking();
+       atexit(talloc_report_null_full);
+}
+
+/*
+   talloc and zero memory: allocate size bytes under ctx, named name, and
+   clear them.  Returns NULL if the allocation fails.
+*/
+_PUBLIC_ void *_talloc_zero(const void *ctx, size_t size, const char *name)
+{
+       void *mem = _talloc_named_const(ctx, size, name);
+
+       if (mem == NULL) {
+               return NULL;
+       }
+
+       memset(mem, 0, size);
+       return mem;
+}
+
+/*
+  memdup with a talloc: allocate size bytes under t, named name, and fill
+  them with a copy of the size bytes at p.
+
+  Returns the new chunk, or NULL if the allocation fails or if p is NULL
+  while size is non-zero (there is nothing to copy from).  A zero-length
+  duplicate never touches p, so p may be NULL in that case.
+*/
+_PUBLIC_ void *_talloc_memdup(const void *t, const void *p, size_t size, const char *name)
+{
+       void *newp;
+
+       /* refuse to copy from nowhere instead of crashing in memcpy() */
+       if (unlikely(p == NULL) && likely(size > 0)) {
+               return NULL;
+       }
+
+       newp = _talloc_named_const(t, size, name);
+
+       /* memcpy() with a NULL source is undefined even for length 0 */
+       if (likely(newp != NULL) && likely(size > 0)) {
+               memcpy(newp, p, size);
+       }
+
+       return newp;
+}
+
+/*
+  duplicate the first len bytes of p into a new NUL-terminated talloc string
+  under t; the chunk is named after its own contents
+*/
+static inline char *__talloc_strlendup(const void *t, const char *p, size_t len)
+{
+       char *copy = (char *)__talloc(t, len + 1);
+
+       if (unlikely(copy == NULL)) {
+               return NULL;
+       }
+
+       memcpy(copy, p, len);
+       copy[len] = '\0';
+
+       _talloc_set_name_const(copy, copy);
+       return copy;
+}
+
+/*
+  strdup with a talloc; a NULL p yields NULL
+*/
+_PUBLIC_ char *talloc_strdup(const void *t, const char *p)
+{
+       size_t len;
+
+       if (unlikely(p == NULL)) {
+               return NULL;
+       }
+
+       len = strlen(p);
+       return __talloc_strlendup(t, p, len);
+}
+
+/*
+  strndup with a talloc: copies at most n bytes of p; a NULL p yields NULL
+*/
+_PUBLIC_ char *talloc_strndup(const void *t, const char *p, size_t n)
+{
+       size_t len;
+
+       if (unlikely(p == NULL)) {
+               return NULL;
+       }
+
+       len = strnlen(p, n);
+       return __talloc_strlendup(t, p, len);
+}
+
+/*
+  grow s to hold its first slen bytes followed by alen bytes of a and a
+  terminating NUL; returns the (possibly moved) string, or NULL if the
+  realloc fails.  The chunk is renamed after its new contents.
+*/
+static inline char *__talloc_strlendup_append(char *s, size_t slen,
+                                             const char *a, size_t alen)
+{
+       char *grown;
+
+       grown = talloc_realloc(NULL, s, char, slen + alen + 1);
+       if (unlikely(grown == NULL)) {
+               return NULL;
+       }
+
+       /* copy the addition behind the kept prefix, then terminate */
+       memcpy(grown + slen, a, alen);
+       grown[slen + alen] = '\0';
+
+       _talloc_set_name_const(grown, grown);
+       return grown;
+}
+
+/*
+ * Appends a at the end of the string s (located with strlen()).
+ * A NULL s behaves like talloc_strdup(NULL, a); a NULL a returns s unchanged.
+ */
+_PUBLIC_ char *talloc_strdup_append(char *s, const char *a)
+{
+       size_t head_len;
+       size_t tail_len;
+
+       if (unlikely(s == NULL)) {
+               return talloc_strdup(NULL, a);
+       }
+       if (unlikely(a == NULL)) {
+               return s;
+       }
+
+       head_len = strlen(s);
+       tail_len = strlen(a);
+       return __talloc_strlendup_append(s, head_len, a, tail_len);
+}
+
+/*
+ * Appends a at the end of the talloc'ed buffer s (its talloc size minus
+ * the final byte), not at the end of the string.
+ * A NULL s behaves like talloc_strdup(NULL, a); a NULL a returns s unchanged.
+ */
+_PUBLIC_ char *talloc_strdup_append_buffer(char *s, const char *a)
+{
+       size_t buf_len;
+       size_t tail_len;
+
+       if (unlikely(s == NULL)) {
+               return talloc_strdup(NULL, a);
+       }
+       if (unlikely(a == NULL)) {
+               return s;
+       }
+
+       /* drop the terminator byte from the buffer size, if there is one */
+       buf_len = talloc_get_size(s);
+       if (likely(buf_len > 0)) {
+               buf_len -= 1;
+       }
+
+       tail_len = strlen(a);
+       return __talloc_strlendup_append(s, buf_len, a, tail_len);
+}
+
+/*
+ * Appends at most n bytes of a at the end of the string s.
+ * A NULL s behaves like talloc_strndup(NULL, a, n); a NULL a returns s
+ * unchanged.
+ */
+_PUBLIC_ char *talloc_strndup_append(char *s, const char *a, size_t n)
+{
+       size_t head_len;
+       size_t tail_len;
+
+       if (unlikely(s == NULL)) {
+               return talloc_strndup(NULL, a, n);
+       }
+       if (unlikely(a == NULL)) {
+               return s;
+       }
+
+       head_len = strlen(s);
+       tail_len = strnlen(a, n);
+       return __talloc_strlendup_append(s, head_len, a, tail_len);
+}
+
+/*
+ * Appends at most n bytes of a at the end of the talloc'ed buffer s (its
+ * talloc size minus the final byte), not at the end of the string.
+ * A NULL s behaves like talloc_strndup(NULL, a, n); a NULL a returns s
+ * unchanged.
+ */
+_PUBLIC_ char *talloc_strndup_append_buffer(char *s, const char *a, size_t n)
+{
+       size_t buf_len;
+       size_t tail_len;
+
+       if (unlikely(s == NULL)) {
+               return talloc_strndup(NULL, a, n);
+       }
+       if (unlikely(a == NULL)) {
+               return s;
+       }
+
+       /* drop the terminator byte from the buffer size, if there is one */
+       buf_len = talloc_get_size(s);
+       if (likely(buf_len > 0)) {
+               buf_len -= 1;
+       }
+
+       tail_len = strnlen(a, n);
+       return __talloc_strlendup_append(s, buf_len, a, tail_len);
+}
+
+/*
+  Provide va_copy() where the platform lacks it: use __va_copy() when that
+  is available, otherwise fall back to plain assignment of the va_list.
+*/
+#ifndef HAVE_VA_COPY
+#ifdef HAVE___VA_COPY
+#define va_copy(dest, src) __va_copy(dest, src)
+#else
+#define va_copy(dest, src) (dest) = (src)
+#endif
+#endif
+
+/*
+  format fmt/ap into a newly allocated talloc string under t.
+  The output is measured with a first vsnprintf() pass and written with a
+  second one.  Returns NULL if formatting or the allocation fails.  The
+  chunk is named after its own contents.
+*/
+_PUBLIC_ char *talloc_vasprintf(const void *t, const char *fmt, va_list ap)
+{
+       va_list args;
+       char probe;
+       char *out;
+       int needed;
+
+       /* this call looks strange, but it makes it work on older solaris boxes */
+       va_copy(args, ap);
+       needed = vsnprintf(&probe, 1, fmt, args);
+       va_end(args);
+       if (unlikely(needed < 0)) {
+               return NULL;
+       }
+
+       out = (char *)__talloc(t, needed + 1);
+       if (unlikely(out == NULL)) {
+               return NULL;
+       }
+
+       /* second pass: produce the text for real */
+       va_copy(args, ap);
+       vsnprintf(out, needed + 1, fmt, args);
+       va_end(args);
+
+       _talloc_set_name_const(out, out);
+       return out;
+}
+
+
+/*
+  Perform string formatting, and return a pointer to newly allocated
+  memory holding the result, allocated under t.  Variadic front end for
+  talloc_vasprintf().
+ */
+_PUBLIC_ char *talloc_asprintf(const void *t, const char *fmt, ...)
+{
+       char *formatted;
+       va_list args;
+
+       va_start(args, fmt);
+       formatted = talloc_vasprintf(t, fmt, args);
+       va_end(args);
+
+       return formatted;
+}
+
+static inline char *__talloc_vaslenprintf_append(char *s, size_t slen,
+                                                const char *fmt, va_list ap)
+                                                PRINTF_ATTRIBUTE(3,0);
+
+/*
+  grow s so that the formatted output of fmt/ap can be written at offset
+  slen, and write it there.  Returns the (possibly moved) string, NULL if
+  the realloc fails, or s untouched when the measuring pass reports an
+  error or an empty result.
+*/
+static inline char *__talloc_vaslenprintf_append(char *s, size_t slen,
+                                                const char *fmt, va_list ap)
+{
+       va_list args;
+       ssize_t added;
+       char probe;
+       char *grown;
+
+       /* measuring pass */
+       va_copy(args, ap);
+       added = vsnprintf(&probe, 1, fmt, args);
+       va_end(args);
+
+       if (added <= 0) {
+               /* Either the vsnprintf failed or the format resulted in
+                * no characters being formatted. In the former case, we
+                * ought to return NULL, in the latter we ought to return
+                * the original string. Most current callers of this
+                * function expect it to never return NULL.
+                */
+               return s;
+       }
+
+       grown = talloc_realloc(NULL, s, char, slen + added + 1);
+       if (grown == NULL) {
+               return NULL;
+       }
+
+       /* writing pass, directly behind the kept prefix */
+       va_copy(args, ap);
+       vsnprintf(grown + slen, added + 1, fmt, args);
+       va_end(args);
+
+       _talloc_set_name_const(grown, grown);
+       return grown;
+}
+
+/**
+ * Realloc @p s to append the formatted result of @p fmt and @p ap,
+ * and return @p s, which may have moved.  Good for gradually
+ * accumulating output into a string buffer. Appends at the end
+ * of the string. A NULL @p s starts a new string on the NULL context.
+ **/
+_PUBLIC_ char *talloc_vasprintf_append(char *s, const char *fmt, va_list ap)
+{
+       size_t used;
+
+       if (unlikely(s == NULL)) {
+               return talloc_vasprintf(NULL, fmt, ap);
+       }
+
+       used = strlen(s);
+       return __talloc_vaslenprintf_append(s, used, fmt, ap);
+}
+
+/**
+ * Realloc @p s to append the formatted result of @p fmt and @p ap,
+ * and return @p s, which may have moved. Always appends at the
+ * end of the talloc'ed buffer (its talloc size minus the final byte),
+ * not the end of the string. A NULL @p s starts a new string on the
+ * NULL context.
+ **/
+_PUBLIC_ char *talloc_vasprintf_append_buffer(char *s, const char *fmt, va_list ap)
+{
+       size_t buf_len;
+
+       if (unlikely(s == NULL)) {
+               return talloc_vasprintf(NULL, fmt, ap);
+       }
+
+       /* drop the terminator byte from the buffer size, if there is one */
+       buf_len = talloc_get_size(s);
+       if (likely(buf_len > 0)) {
+               buf_len -= 1;
+       }
+
+       return __talloc_vaslenprintf_append(s, buf_len, fmt, ap);
+}
+
+/*
+  Realloc @p s to append the formatted result of @p fmt and return @p
+  s, which may have moved.  Good for gradually accumulating output
+  into a string buffer.  Variadic front end for talloc_vasprintf_append().
+ */
+_PUBLIC_ char *talloc_asprintf_append(char *s, const char *fmt, ...)
+{
+       char *out;
+       va_list args;
+
+       va_start(args, fmt);
+       out = talloc_vasprintf_append(s, fmt, args);
+       va_end(args);
+
+       return out;
+}
+
+/*
+  Realloc @p s to append the formatted result of @p fmt and return @p
+  s, which may have moved.  Good for gradually accumulating output
+  into a buffer.  Variadic front end for talloc_vasprintf_append_buffer().
+ */
+_PUBLIC_ char *talloc_asprintf_append_buffer(char *s, const char *fmt, ...)
+{
+       char *out;
+       va_list args;
+
+       va_start(args, fmt);
+       out = talloc_vasprintf_append_buffer(s, fmt, args);
+       va_end(args);
+
+       return out;
+}
+
+/*
+  alloc an array, checking for integer overflow in the array size
+
+  Returns NULL when count reaches MAX_TALLOC_SIZE / el_size, i.e. before
+  el_size * count can get too large.  An el_size of 0 cannot overflow and is
+  passed through as a zero-length allocation instead of dividing by zero in
+  the overflow check.
+*/
+_PUBLIC_ void *_talloc_array(const void *ctx, size_t el_size, unsigned count, const char *name)
+{
+       if (el_size != 0 && count >= MAX_TALLOC_SIZE/el_size) {
+               return NULL;
+       }
+       return _talloc_named_const(ctx, el_size * count, name);
+}
+
+/*
+  alloc a zeroed array, checking for integer overflow in the array size
+
+  Returns NULL when count reaches MAX_TALLOC_SIZE / el_size, i.e. before
+  el_size * count can get too large.  An el_size of 0 cannot overflow and is
+  passed through as a zero-length allocation instead of dividing by zero in
+  the overflow check.
+*/
+_PUBLIC_ void *_talloc_zero_array(const void *ctx, size_t el_size, unsigned count, const char *name)
+{
+       if (el_size != 0 && count >= MAX_TALLOC_SIZE/el_size) {
+               return NULL;
+       }
+       return _talloc_zero(ctx, el_size * count, name);
+}
+
+/*
+  realloc an array, checking for integer overflow in the array size
+
+  Returns NULL when count reaches MAX_TALLOC_SIZE / el_size, i.e. before
+  el_size * count can get too large.  An el_size of 0 cannot overflow; it is
+  handed to _talloc_realloc() as a request for 0 bytes instead of dividing
+  by zero in the overflow check.
+*/
+_PUBLIC_ void *_talloc_realloc_array(const void *ctx, void *ptr, size_t el_size, unsigned count, const char *name)
+{
+       if (el_size != 0 && count >= MAX_TALLOC_SIZE/el_size) {
+               return NULL;
+       }
+       return _talloc_realloc(ctx, ptr, el_size * count, name);
+}
+
+/*
+  a function version of talloc_realloc(), so it can be passed as a function pointer
+  to libraries that want a realloc function (a realloc function encapsulates
+  all the basic capabilities of an allocation library, which is why this is useful)
+
+  Forwards to _talloc_realloc() with a NULL name.
+*/
+_PUBLIC_ void *talloc_realloc_fn(const void *context, void *ptr, size_t size)
+{
+       return _talloc_realloc(context, ptr, size, NULL);
+}
+
+
+/*
+  destructor installed on the autofree context: forget the global pointer
+  so that talloc_autofree_context() creates a fresh context next time.
+  Always returns 0; ptr is unused.
+*/
+static int talloc_autofree_destructor(void *ptr)
+{
+       autofree_context = NULL;
+       return 0;
+}
+
+/* atexit() hook registered by talloc_autofree_context(): frees the autofree context */
+static void talloc_autofree(void)
+{
+       talloc_free(autofree_context);
+}
+
+/*
+  return a context which will be auto-freed on exit
+  this is useful for reducing the noise in leak reports
+
+  The context is created on first use, together with its destructor and an
+  atexit() hook that frees it.  Returns NULL if the context cannot be
+  allocated; in that case nothing is registered and a later call tries
+  again.
+*/
+_PUBLIC_ void *talloc_autofree_context(void)
+{
+       if (autofree_context == NULL) {
+               autofree_context = _talloc_named_const(NULL, 0, "autofree_context");
+               if (unlikely(autofree_context == NULL)) {
+                       /* do not hand a NULL context to talloc_set_destructor()
+                          or register an exit hook for nothing */
+                       return NULL;
+               }
+               talloc_set_destructor(autofree_context, talloc_autofree_destructor);
+               atexit(talloc_autofree);
+       }
+       return autofree_context;
+}
+
+/*
+  return the payload size recorded for context.  A NULL context means the
+  null_context; if that is unset as well the size is 0.
+*/
+_PUBLIC_ size_t talloc_get_size(const void *context)
+{
+       const void *target = context;
+
+       if (target == NULL) {
+               target = null_context;
+               if (target == NULL) {
+                       return 0;
+               }
+       }
+
+       return talloc_chunk_from_ptr(target)->size;
+}
+
+/*
+  find a parent of this context that has the given name, if any
+
+  The search starts with context itself and climbs one parent at a time.
+  Returns the matching talloc pointer, or NULL when context is NULL or no
+  chunk on the way up carries that name.
+*/
+_PUBLIC_ void *talloc_find_parent_byname(const void *context, const char *name)
+{
+       struct talloc_chunk *cur;
+
+       if (context == NULL) {
+               return NULL;
+       }
+
+       for (cur = talloc_chunk_from_ptr(context); cur != NULL; cur = cur->parent) {
+               if (cur->name != NULL && strcmp(cur->name, name) == 0) {
+                       return TC_PTR_FROM_CHUNK(cur);
+               }
+
+               /* rewind to the first sibling; the parent link is read from there */
+               while (cur->prev != NULL) {
+                       cur = cur->prev;
+               }
+       }
+
+       return NULL;
+}
+
+/*
+  show the parentage of a context: print context and each chunk above it,
+  one per line, to file
+*/
+_PUBLIC_ void talloc_show_parents(const void *context, FILE *file)
+{
+       struct talloc_chunk *cur;
+
+       if (context == NULL) {
+               fprintf(file, "talloc no parents for NULL\n");
+               return;
+       }
+
+       cur = talloc_chunk_from_ptr(context);
+       fprintf(file, "talloc parents of '%s'\n", talloc_get_name(context));
+
+       for (; cur != NULL; cur = cur->parent) {
+               fprintf(file, "\t'%s'\n", talloc_get_name(TC_PTR_FROM_CHUNK(cur)));
+
+               /* rewind to the first sibling; the parent link is read from there */
+               while (cur->prev != NULL) {
+                       cur = cur->prev;
+               }
+       }
+
+       fflush(file);
+}
+
+/*
+  return 1 if ptr is a parent of context (or context itself), looking at no
+  more than depth levels; 0 otherwise, including for a NULL context
+*/
+static int _talloc_is_parent(const void *context, const void *ptr, int depth)
+{
+       struct talloc_chunk *cur;
+
+       if (context == NULL) {
+               return 0;
+       }
+
+       cur = talloc_chunk_from_ptr(context);
+       for (; cur != NULL && depth > 0; cur = cur->parent, depth--) {
+               if (TC_PTR_FROM_CHUNK(cur) == ptr) {
+                       return 1;
+               }
+
+               /* rewind to the first sibling; the parent link is read from there */
+               while (cur->prev != NULL) {
+                       cur = cur->prev;
+               }
+       }
+
+       return 0;
+}
+
+/*
+  return 1 if ptr is a parent of context
+
+  The climb through the ancestry is capped at TALLOC_MAX_DEPTH levels.
+*/
+_PUBLIC_ int talloc_is_parent(const void *context, const void *ptr)
+{
+       return _talloc_is_parent(context, ptr, TALLOC_MAX_DEPTH);
+}
+
+/*
+  return the total size of memory used by this context and all children
+
+  This is the TOTAL_MEM_LIMIT walk of _talloc_total_mem_internal(), which
+  counts TC_HDR_SIZE per chunk as well; old_limit/new_limit are forwarded
+  so the walk can re-point memlimit links on the way.
+*/
+static size_t _talloc_total_limit_size(const void *ptr,
+                                       struct talloc_memlimit *old_limit,
+                                       struct talloc_memlimit *new_limit)
+{
+       return _talloc_total_mem_internal(ptr, TOTAL_MEM_LIMIT,
+                                         old_limit, new_limit);
+}
+
+/*
+  check whether an allocation of size bytes (plus TC_HDR_SIZE) still fits
+  under limit and every limit above it.  A max_size of 0 imposes no bound
+  at that level.  Returns false as soon as one level has no room.
+*/
+static bool talloc_memlimit_check(struct talloc_memlimit *limit, size_t size)
+{
+       struct talloc_memlimit *level = limit;
+
+       while (level != NULL) {
+               if (level->max_size != 0) {
+                       /* already full, or not enough room left for header + payload */
+                       if (level->max_size <= level->cur_size) {
+                               return false;
+                       }
+                       if (level->max_size - level->cur_size < TC_HDR_SIZE+size) {
+                               return false;
+                       }
+               }
+               level = level->upper;
+       }
+
+       return true;
+}
+
+/*
+  Adjust cur_size of limit and of every limit above it (via ->upper) for a
+  chunk whose payload changes from old_size to new_size.
+
+  An old_size of 0 is treated as a new allocation and charges the chunk
+  header (TC_HDR_SIZE) as well; otherwise only the difference is applied.
+
+  Returns false as soon as a counter would become negative.  Limits visited
+  before that point have already been updated and are not rolled back.
+*/
+static bool talloc_memlimit_update(struct talloc_memlimit *limit,
+                                  size_t old_size, size_t new_size)
+{
+       struct talloc_memlimit *l;
+       ssize_t d;
+
+       /* signed delta to apply at every level */
+       if (old_size == 0) {
+               d = new_size + TC_HDR_SIZE;
+       } else {
+               d = new_size - old_size;
+       }
+       for (l = limit; l != NULL; l = l->upper) {
+               ssize_t new_cur_size = l->cur_size + d;
+               /* accounting went wrong somewhere: refuse to go below zero */
+               if (new_cur_size < 0) {
+                       return false;
+               }
+               l->cur_size = new_cur_size;
+       }
+
+       return true;
+}
+
+/*
+  attach a memory limit of max_size bytes to ctx.
+
+  If ctx already owns a limit only its max_size is changed.  Otherwise a
+  new limit is allocated, its cur_size is set to the current total of the
+  subtree, and it is chained above to the limit ctx used before.
+
+  Returns 0 on success, 1 if the limit structure cannot be allocated.
+*/
+_PUBLIC_ int talloc_set_memlimit(const void *ctx, size_t max_size)
+{
+       struct talloc_chunk *tc = talloc_chunk_from_ptr(ctx);
+       struct talloc_memlimit *inherited;
+       struct talloc_memlimit *fresh;
+
+       /* ctx already owns a limit: just change its ceiling */
+       if (tc->limit != NULL && tc->limit->parent == tc) {
+               tc->limit->max_size = max_size;
+               return 0;
+       }
+
+       inherited = tc->limit;
+
+       fresh = malloc(sizeof(struct talloc_memlimit));
+       if (fresh == NULL) {
+               return 1;
+       }
+
+       fresh->parent = tc;
+       fresh->max_size = max_size;
+       /* the walk also switches the subtree from the old limit to the new one */
+       fresh->cur_size = _talloc_total_limit_size(ctx, inherited, fresh);
+
+       /* chain to the limit that applied before (NULL if there was none) */
+       fresh->upper = inherited;
+
+       return 0;
+}
diff --git a/ctdb/lib/talloc/talloc.h b/ctdb/lib/talloc/talloc.h
new file mode 100644 (file)
index 0000000..f3cbcd0
--- /dev/null
@@ -0,0 +1,1883 @@
+#ifndef _TALLOC_H_
+#define _TALLOC_H_
+/*
+   Unix SMB/CIFS implementation.
+   Samba temporary memory allocation functions
+
+   Copyright (C) Andrew Tridgell 2004-2005
+   Copyright (C) Stefan Metzmacher 2006
+
+     ** NOTE! The following LGPL license applies to the talloc
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @defgroup talloc The talloc API
+ *
+ * talloc is a hierarchical, reference counted memory pool system with
+ * destructors. It is the core memory allocator used in Samba.
+ *
+ * @{
+ */
+
+#define TALLOC_VERSION_MAJOR 2
+#define TALLOC_VERSION_MINOR 0
+
+int talloc_version_major(void);
+int talloc_version_minor(void);
+
+/**
+ * @brief Define a talloc parent type
+ *
+ * As talloc is a hierarchical memory allocator, every talloc chunk is a
+ * potential parent to other talloc chunks. So defining a separate type for a
+ * talloc chunk is not strictly necessary. TALLOC_CTX is defined nevertheless,
+ * as it provides an indicator for function arguments. You will frequently
+ * write code like
+ *
+ * @code
+ *      struct foo *foo_create(TALLOC_CTX *mem_ctx)
+ *      {
+ *              struct foo *result;
+ *              result = talloc(mem_ctx, struct foo);
+ *              if (result == NULL) return NULL;
+ *                      ... initialize foo ...
+ *              return result;
+ *      }
+ * @endcode
+ *
+ * In this type of allocating functions it is handy to have a general
+ * TALLOC_CTX type to indicate which parent to put allocated structures on.
+ */
+typedef void TALLOC_CTX;
+
+/*
+  __location__ expands to a string literal of the form "file:line" for the
+  place where it is used.
+
+  __LINE__ cannot be stringified directly: applying # to it would produce
+  the text "__LINE__".  The extra level of macro indirection
+  (__TALLOC_STRING_LINE2__ -> __TALLOC_STRING_LINE1__) makes the
+  preprocessor expand __LINE__ to its number before # is applied.
+
+  The #ifndef lets a definition of __location__ made before this header
+  is included take precedence.
+*/
+#ifndef __location__
+#define __TALLOC_STRING_LINE1__(s)    #s
+#define __TALLOC_STRING_LINE2__(s)   __TALLOC_STRING_LINE1__(s)
+#define __TALLOC_STRING_LINE3__  __TALLOC_STRING_LINE2__(__LINE__)
+#define __location__ __FILE__ ":" __TALLOC_STRING_LINE3__
+#endif
+
+#ifndef TALLOC_DEPRECATED
+#define TALLOC_DEPRECATED 0
+#endif
+
+#ifndef PRINTF_ATTRIBUTE
+#if (__GNUC__ >= 3)
+/** Use gcc attribute to check printf fns.  a1 is the 1-based index of
+ * the parameter containing the format, and a2 the index of the first
+ * argument. Note that some gcc 2.x versions don't handle this
+ * properly **/
+#define PRINTF_ATTRIBUTE(a1, a2) __attribute__ ((format (__printf__, a1, a2)))
+#else
+#define PRINTF_ATTRIBUTE(a1, a2)
+#endif
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief Create a new talloc context.
+ *
+ * The talloc() macro is the core of the talloc library. It takes a memory
+ * context and a type, and returns a pointer to a new area of memory of the
+ * given type.
+ *
+ * The returned pointer is itself a talloc context, so you can use it as the
+ * context argument to more calls to talloc if you wish.
+ *
+ * The returned pointer is a "child" of the supplied context. This means that if
+ * you talloc_free() the context then the new child disappears as well.
+ * Alternatively you can free just the child.
+ *
+ * @param[in]  ctx      A talloc context to create a new reference on or NULL to
+ *                      create a new top level context.
+ *
+ * @param[in]  type     The type of memory to allocate.
+ *
+ * @return              A type casted talloc context or NULL on error.
+ *
+ * @code
+ *      unsigned int *a, *b;
+ *
+ *      a = talloc(NULL, unsigned int);
+ *      b = talloc(a, unsigned int);
+ * @endcode
+ *
+ * @see talloc_zero
+ * @see talloc_array
+ * @see talloc_steal
+ * @see talloc_free
+ */
+void *talloc(const void *ctx, #type);
+#else
+#define talloc(ctx, type) (type *)talloc_named_const(ctx, sizeof(type), #type)
+void *_talloc(const void *context, size_t size);
+#endif
+
+/**
+ * @brief Create a new top level talloc context.
+ *
+ * This function creates a zero length named talloc context as a top level
+ * context. It is equivalent to:
+ *
+ * @code
+ *      talloc_named(NULL, 0, fmt, ...);
+ * @endcode
+ * @param[in]  fmt      Format string for the name.
+ *
+ * @param[in]  ...      Additional printf-style arguments.
+ *
+ * @return              The allocated memory chunk, NULL on error.
+ *
+ * @see talloc_named()
+ */
+void *talloc_init(const char *fmt, ...) PRINTF_ATTRIBUTE(1,2);
+
+#ifdef DOXYGEN
+/**
+ * @brief Free a chunk of talloc memory.
+ *
+ * The talloc_free() function frees a piece of talloc memory, and all its
+ * children. You can call talloc_free() on any pointer returned by
+ * talloc().
+ *
+ * The return value of talloc_free() indicates success or failure, with 0
+ * returned for success and -1 for failure. A possible failure condition
+ * is if the pointer had a destructor attached to it and the destructor
+ * returned -1. See talloc_set_destructor() for details on
+ * destructors. Likewise, if "ptr" is NULL, then the function will make
+ * no modifications and return -1.
+ *
+ * From version 2.0 and onwards, as a special case, talloc_free() is
+ * refused on pointers that have more than one parent associated, as talloc
+ * would have no way of knowing which parent should be removed. This is
+ * different from older versions, in which the reference to the most
+ * recently established parent was always the one destroyed. Hence, to free
+ * a pointer that has more than one parent, please use talloc_unlink().
+ *
+ * To help you find problems in your code caused by this behaviour, if
+ * you do try and free a pointer with more than one parent then the
+ * talloc logging function will be called to give output like this:
+ *
+ * @code
+ *   ERROR: talloc_free with references at some_dir/source/foo.c:123
+ *     reference at some_dir/source/other.c:325
+ *     reference at some_dir/source/third.c:121
+ * @endcode
+ *
+ * Please see the documentation for talloc_set_log_fn() and
+ * talloc_set_log_stderr() for more information on talloc logging
+ * functions.
+ *
+ * If <code>TALLOC_FREE_FILL</code> environment variable is set,
+ * the memory occupied by the context is filled with the value of this variable.
+ * The value should be a numeric representation of the character you want to
+ * use.
+ *
+ * talloc_free() operates recursively on its children.
+ *
+ * @param[in]  ptr      The chunk to be freed.
+ *
+ * @return              Returns 0 on success and -1 on error. A possible
+ *                      failure condition is if the pointer had a destructor
+ *                      attached to it and the destructor returned -1. Likewise,
+ *                      if "ptr" is NULL, then the function will make no
+ *                      modifications and returns -1.
+ *
+ * Example:
+ * @code
+ *      unsigned int *a, *b;
+ *      a = talloc(NULL, unsigned int);
+ *      b = talloc(a, unsigned int);
+ *
+ *      talloc_free(a); // Frees a and b
+ * @endcode
+ *
+ * @see talloc_set_destructor()
+ * @see talloc_unlink()
+ */
+int talloc_free(void *ptr);
+#else
+#define talloc_free(ctx) _talloc_free(ctx, __location__)
+int _talloc_free(void *ptr, const char *location);
+#endif
+
+/**
+ * @brief Free a talloc chunk's children.
+ *
+ * The function walks along the list of all children of a talloc context and
+ * talloc_free()s only the children, not the context itself.
+ *
+ * A NULL argument is handled as no-op.
+ *
+ * @param[in]  ptr      The chunk that you want to free the children of
+ *                      (NULL is allowed too)
+ */
+void talloc_free_children(void *ptr);
+
+#ifdef DOXYGEN
+/**
+ * @brief Assign a destructor function to be called when a chunk is freed.
+ *
+ * The function talloc_set_destructor() sets the "destructor" for the pointer
+ * "ptr". A destructor is a function that is called when the memory used by a
+ * pointer is about to be released. The destructor receives the pointer as an
+ * argument, and should return 0 for success and -1 for failure.
+ *
+ * The destructor can do anything it wants to, including freeing other pieces
+ * of memory. A common use for destructors is to clean up operating system
+ * resources (such as open file descriptors) contained in the structure the
+ * destructor is placed on.
+ *
+ * You can only place one destructor on a pointer. If you need more than one
+ * destructor then you can create a zero-length child of the pointer and place
+ * an additional destructor on that.
+ *
+ * To remove a destructor call talloc_set_destructor() with NULL for the
+ * destructor.
+ *
+ * If your destructor attempts to talloc_free() the pointer that it is the
+ * destructor for then talloc_free() will return -1 and the free will be
+ * ignored. This would be a pointless operation anyway, as the destructor is
+ * only called when the memory is just about to go away.
+ *
+ * @param[in]  ptr      The talloc chunk to add a destructor to.
+ *
+ * @param[in]  destructor  The destructor function to be called. NULL to remove
+ *                         it.
+ *
+ * Example:
+ * @code
+ *      static int destroy_fd(int *fd) {
+ *              close(*fd);
+ *              return 0;
+ *      }
+ *
+ *      int *open_file(const char *filename) {
+ *              int *fd = talloc(NULL, int);
+ *              *fd = open(filename, O_RDONLY);
+ *              if (*fd < 0) {
+ *                      talloc_free(fd);
+ *                      return NULL;
+ *              }
+ *              // Whenever they free this, we close the file.
+ *              talloc_set_destructor(fd, destroy_fd);
+ *              return fd;
+ *      }
+ * @endcode
+ *
+ * @see talloc()
+ * @see talloc_free()
+ */
+void talloc_set_destructor(const void *ptr, int (*destructor)(void *));
+
+/**
+ * @brief Change a talloc chunk's parent.
+ *
+ * The talloc_steal() function changes the parent context of a talloc
+ * pointer. It is typically used when the context that the pointer is
+ * currently a child of is going to be freed and you wish to keep the
+ * memory for a longer time.
+ *
+ * To make changing the hierarchy less error-prone, consider using
+ * talloc_move() instead.
+ *
+ * If you try and call talloc_steal() on a pointer that has more than one
+ * parent then the result is ambiguous. Talloc will choose to remove the
+ * parent that is currently indicated by talloc_parent() and replace it with
+ * the chosen parent. You will also get a message like this via the talloc
+ * logging functions:
+ *
+ * @code
+ *   WARNING: talloc_steal with references at some_dir/source/foo.c:123
+ *     reference at some_dir/source/other.c:325
+ *     reference at some_dir/source/third.c:121
+ * @endcode
+ *
+ * To unambiguously change the parent of a pointer please see the function
+ * talloc_reparent(). See the talloc_set_log_fn() documentation for more
+ * information on talloc logging.
+ *
+ * @param[in]  new_ctx  The new parent context.
+ *
+ * @param[in]  ptr      The talloc chunk to move.
+ *
+ * @return              Returns the pointer that you pass it. It does not have
+ *                      any failure modes.
+ *
+ * @note It is possible to produce loops in the parent/child relationship
+ * if you are not careful with talloc_steal(). No guarantees are provided
+ * as to your sanity or the safety of your data if you do this.
+ */
+void *talloc_steal(const void *new_ctx, const void *ptr);
+#else /* DOXYGEN */
+/* try to make talloc_set_destructor() and talloc_steal() type safe,
+   if we have a recent gcc */
+#if (__GNUC__ >= 3)
+#define _TALLOC_TYPEOF(ptr) __typeof__(ptr)
+#define talloc_set_destructor(ptr, function)                                 \
+       do {                                                                  \
+               int (*_talloc_destructor_fn)(_TALLOC_TYPEOF(ptr)) = (function);       \
+               _talloc_set_destructor((ptr), (int (*)(void *))_talloc_destructor_fn); \
+       } while(0)
+/* this extremely strange macro is to avoid some braindamaged warning
+   stupidity in gcc 4.1.x */
+#define talloc_steal(ctx, ptr) ({ _TALLOC_TYPEOF(ptr) __talloc_steal_ret = (_TALLOC_TYPEOF(ptr))_talloc_steal_loc((ctx),(ptr), __location__); __talloc_steal_ret; })
+#else /* __GNUC__ >= 3 */
+#define talloc_set_destructor(ptr, function) \
+       _talloc_set_destructor((ptr), (int (*)(void *))(function))
+#define _TALLOC_TYPEOF(ptr) void *
+#define talloc_steal(ctx, ptr) (_TALLOC_TYPEOF(ptr))_talloc_steal_loc((ctx),(ptr), __location__)
+#endif /* __GNUC__ >= 3 */
+void _talloc_set_destructor(const void *ptr, int (*_destructor)(void *));
+void *_talloc_steal_loc(const void *new_ctx, const void *ptr, const char *location);
+#endif /* DOXYGEN */
+
+/**
+ * @brief Assign a name to a talloc chunk.
+ *
+ * Each talloc pointer has a "name". The name is used principally for
+ * debugging purposes, although it is also possible to set and get the name on
+ * a pointer as a way of "marking" pointers in your code.
+ *
+ * The main use for names on pointers is for "talloc reports". See
+ * talloc_report() and talloc_report_full() for details. Also see
+ * talloc_enable_leak_report() and talloc_enable_leak_report_full().
+ *
+ * The talloc_set_name() function allocates memory as a child of the
+ * pointer. It is logically equivalent to:
+ *
+ * @code
+ *      talloc_set_name_const(ptr, talloc_asprintf(ptr, fmt, ...));
+ * @endcode
+ *
+ * @param[in]  ptr      The talloc chunk to assign a name to.
+ *
+ * @param[in]  fmt      Format string for the name.
+ *
+ * @param[in]  ...      Add printf-style additional arguments.
+ *
+ * @return              The assigned name, NULL on error.
+ *
+ * @note Multiple calls to talloc_set_name() will allocate more memory without
+ * releasing the name. All of the memory is released when the ptr is freed
+ * using talloc_free().
+ */
+const char *talloc_set_name(const void *ptr, const char *fmt, ...) PRINTF_ATTRIBUTE(2,3);
+
+#ifdef DOXYGEN
+/**
+ * @brief Change a talloc chunk's parent.
+ *
+ * This function has the same effect as talloc_steal(), and additionally sets
+ * the source pointer to NULL. You would use it like this:
+ *
+ * @code
+ *      struct foo *X = talloc(tmp_ctx, struct foo);
+ *      struct foo *Y;
+ *      Y = talloc_move(new_ctx, &X);
+ * @endcode
+ *
+ * @param[in]  new_ctx  The new parent context.
+ *
+ * @param[in]  pptr     Pointer to the talloc chunk to move.
+ *
+ * @return              The pointer of the talloc chunk it has been moved to,
+ *                      NULL on error.
+ */
+void *talloc_move(const void *new_ctx, void **pptr);
+#else
+#define talloc_move(ctx, pptr) (_TALLOC_TYPEOF(*(pptr)))_talloc_move((ctx),(void *)(pptr))
+void *_talloc_move(const void *new_ctx, const void *pptr);
+#endif
+
+/**
+ * @brief Assign a name to a talloc chunk.
+ *
+ * The function is just like talloc_set_name(), but it takes a string constant,
+ * and is much faster. It is extensively used by the "auto naming" macros, such
+ * as talloc_p().
+ *
+ * This function does not allocate any memory. It just copies the supplied
+ * pointer into the internal representation of the talloc ptr. This means you
+ * must not pass a name pointer to memory that will disappear before the ptr
+ * is freed with talloc_free().
+ *
+ * @param[in]  ptr      The talloc chunk to assign a name to.
+ *
+ * @param[in]  name     The name to assign. It is stored as-is (not treated
+ *                      as a format string and not copied).
+ */
+void talloc_set_name_const(const void *ptr, const char *name);
+
+/**
+ * @brief Create a named talloc chunk.
+ *
+ * The talloc_named() function creates a named talloc pointer. It is
+ * equivalent to:
+ *
+ * @code
+ *      ptr = talloc_size(context, size);
+ *      talloc_set_name(ptr, fmt, ....);
+ * @endcode
+ *
+ * @param[in]  context  The talloc context to hang the result off.
+ *
+ * @param[in]  size     Number of char's that you want to allocate.
+ *
+ * @param[in]  fmt      Format string for the name.
+ *
+ * @param[in]  ...      Additional printf-style arguments.
+ *
+ * @return              The allocated memory chunk, NULL on error.
+ *
+ * @see talloc_set_name()
+ */
+void *talloc_named(const void *context, size_t size,
+                  const char *fmt, ...) PRINTF_ATTRIBUTE(3,4);
+
+/**
+ * @brief Basic routine to allocate a chunk of memory.
+ *
+ * This is equivalent to:
+ *
+ * @code
+ *      ptr = talloc_size(context, size);
+ *      talloc_set_name_const(ptr, name);
+ * @endcode
+ *
+ * @param[in]  context  The parent context.
+ *
+ * @param[in]  size     The number of char's that we want to allocate.
+ *
+ * @param[in]  name     The name the talloc block has.
+ *
+ * @return             The allocated memory chunk, NULL on error.
+ */
+void *talloc_named_const(const void *context, size_t size, const char *name);
+
+#ifdef DOXYGEN
+/**
+ * @brief Untyped allocation.
+ *
+ * The function should be used when you don't have a convenient type to pass to
+ * talloc(). Unlike talloc(), it is not type safe (as it returns a void *), so
+ * you are on your own for type checking.
+ *
+ * Best to use talloc() or talloc_array() instead.
+ *
+ * @param[in]  ctx     The talloc context to hang the result off.
+ *
+ * @param[in]  size    Number of char's that you want to allocate.
+ *
+ * @return             The allocated memory chunk, NULL on error.
+ *
+ * Example:
+ * @code
+ *      void *mem = talloc_size(NULL, 100);
+ * @endcode
+ */
+void *talloc_size(const void *ctx, size_t size);
+#else
+#define talloc_size(ctx, size) talloc_named_const(ctx, size, __location__)
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief Allocate into a typed pointer.
+ *
+ * The talloc_ptrtype() macro should be used when you have a pointer and want
+ * to allocate memory to point at with this pointer. When compiling with
+ * gcc >= 3 it is typesafe. Note this is a wrapper of talloc_size() and
+ * talloc_get_name() will return the current location in the source file and
+ * not the type.
+ *
+ * @param[in]  ctx      The talloc context to hang the result off.
+ *
+ * @param[in]  type     The pointer you want to assign the result to.
+ *
+ * @return              The properly casted allocated memory chunk, NULL on
+ *                      error.
+ *
+ * Example:
+ * @code
+ *       unsigned int *a = talloc_ptrtype(NULL, a);
+ * @endcode
+ */
+void *talloc_ptrtype(const void *ctx, #type);
+#else
+#define talloc_ptrtype(ctx, ptr) (_TALLOC_TYPEOF(ptr))talloc_size(ctx, sizeof(*(ptr)))
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief Allocate a new 0-sized talloc chunk.
+ *
+ * This is a utility macro that creates a new memory context hanging off an
+ * existing context, automatically naming it "talloc_new: __location__" where
+ * __location__ is the source line it is called from. It is particularly
+ * useful for creating a new temporary working context.
+ *
+ * @param[in]  ctx      The talloc parent context.
+ *
+ * @return              A new talloc chunk, NULL on error.
+ */
+void *talloc_new(const void *ctx);
+#else
+#define talloc_new(ctx) talloc_named_const(ctx, 0, "talloc_new: " __location__)
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief Allocate a 0-initialized structure.
+ *
+ * The macro is equivalent to:
+ *
+ * @code
+ *      ptr = talloc(ctx, type);
+ *      if (ptr) memset(ptr, 0, sizeof(type));
+ * @endcode
+ *
+ * @param[in]  ctx      The talloc context to hang the result off.
+ *
+ * @param[in]  type     The type that we want to allocate.
+ *
+ * @return              Pointer to a piece of memory, properly cast to 'type *',
+ *                      NULL on error.
+ *
+ * Example:
+ * @code
+ *      unsigned int *a, *b;
+ *      a = talloc_zero(NULL, unsigned int);
+ *      b = talloc_zero(a, unsigned int);
+ * @endcode
+ *
+ * @see talloc()
+ * @see talloc_zero_size()
+ * @see talloc_zero_array()
+ */
+void *talloc_zero(const void *ctx, #type);
+
+/**
+ * @brief Allocate untyped, 0-initialized memory.
+ *
+ * @param[in]  ctx      The talloc context to hang the result off.
+ *
+ * @param[in]  size     Number of char's that you want to allocate.
+ *
+ * @return              The allocated memory chunk.
+ */
+void *talloc_zero_size(const void *ctx, size_t size);
+#else
+#define talloc_zero(ctx, type) (type *)_talloc_zero(ctx, sizeof(type), #type)
+#define talloc_zero_size(ctx, size) _talloc_zero(ctx, size, __location__)
+void *_talloc_zero(const void *ctx, size_t size, const char *name);
+#endif
+
+/**
+ * @brief Return the name of a talloc chunk.
+ *
+ * @param[in]  ptr      The talloc chunk.
+ *
+ * @return              The current name for the given talloc pointer.
+ *
+ * @see talloc_set_name()
+ */
+const char *talloc_get_name(const void *ptr);
+
+/**
+ * @brief Verify that a talloc chunk carries a specified name.
+ *
+ * This function checks if a pointer has the specified name. If it does
+ * then the pointer is returned.
+ *
+ * @param[in]  ptr       The talloc chunk to check.
+ *
+ * @param[in]  name      The name to check against.
+ *
+ * @return               The pointer if the name matches, NULL if it doesn't.
+ */
+void *talloc_check_name(const void *ptr, const char *name);
+
+/**
+ * @brief Get the parent chunk of a pointer.
+ *
+ * @param[in]  ptr      The talloc pointer to inspect.
+ *
+ * @return              The talloc parent of ptr, NULL on error.
+ */
+void *talloc_parent(const void *ptr);
+
+/**
+ * @brief Get a talloc chunk's parent name.
+ *
+ * @param[in]  ptr      The talloc pointer to inspect.
+ *
+ * @return              The name of ptr's parent chunk.
+ */
+const char *talloc_parent_name(const void *ptr);
+
+/**
+ * @brief Get the total size of a talloc chunk including its children.
+ *
+ * The function returns the total size in bytes used by this pointer and all
+ * child pointers. Mostly useful for debugging.
+ *
+ * Passing NULL is allowed, but it will only give a meaningful result if
+ * talloc_enable_leak_report() or talloc_enable_leak_report_full() has
+ * been called.
+ *
+ * @param[in]  ptr      The talloc chunk.
+ *
+ * @return              The total size.
+ */
+size_t talloc_total_size(const void *ptr);
+
+/**
+ * @brief Get the number of talloc chunks hanging off a chunk.
+ *
+ * The talloc_total_blocks() function returns the total memory block
+ * count used by this pointer and all child pointers. Mostly useful for
+ * debugging.
+ *
+ * Passing NULL is allowed, but it will only give a meaningful result if
+ * talloc_enable_leak_report() or talloc_enable_leak_report_full() has
+ * been called.
+ *
+ * @param[in]  ptr      The talloc chunk.
+ *
+ * @return              The total number of blocks.
+ */
+size_t talloc_total_blocks(const void *ptr);
+
+#ifdef DOXYGEN
+/**
+ * @brief Duplicate a memory area into a talloc chunk.
+ *
+ * The function is equivalent to:
+ *
+ * @code
+ *      ptr = talloc_size(ctx, size);
+ *      if (ptr) memcpy(ptr, p, size);
+ * @endcode
+ *
+ * @param[in]  t        The talloc context to hang the result off.
+ *
+ * @param[in]  p        The memory chunk you want to duplicate.
+ *
+ * @param[in]  size     Number of char's that you want to copy.
+ *
+ * @return              The allocated memory chunk.
+ *
+ * @see talloc_size()
+ */
+void *talloc_memdup(const void *t, const void *p, size_t size);
+#else
+#define talloc_memdup(t, p, size) _talloc_memdup(t, p, size, __location__)
+void *_talloc_memdup(const void *t, const void *p, size_t size, const char *name);
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief Assign a type to a talloc chunk.
+ *
+ * This macro allows you to force the name of a pointer to be of a particular
+ * type. This can be used in conjunction with talloc_get_type() to do type
+ * checking on void* pointers.
+ *
+ * It is equivalent to this:
+ *
+ * @code
+ *      talloc_set_name_const(ptr, #type)
+ * @endcode
+ *
+ * @param[in]  ptr      The talloc chunk to assign the type to.
+ *
+ * @param[in]  type     The type to assign.
+ */
+void talloc_set_type(const char *ptr, #type);
+
+/**
+ * @brief Get a typed pointer out of a talloc pointer.
+ *
+ * This macro allows you to do type checking on talloc pointers. It is
+ * particularly useful for void* private pointers. It is equivalent to
+ * this:
+ *
+ * @code
+ *      (type *)talloc_check_name(ptr, #type)
+ * @endcode
+ *
+ * @param[in]  ptr      The talloc pointer to check.
+ *
+ * @param[in]  type     The type to check against.
+ *
+ * @return              The properly casted pointer given by ptr, NULL on error.
+ */
+type *talloc_get_type(const void *ptr, #type);
+#else
+#define talloc_set_type(ptr, type) talloc_set_name_const(ptr, #type)
+#define talloc_get_type(ptr, type) (type *)talloc_check_name(ptr, #type)
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief Safely turn a void pointer into a typed pointer.
+ *
+ * This macro is used together with talloc(mem_ctx, struct foo). If you had to
+ * assign the talloc chunk pointer to some void pointer variable,
+ * talloc_get_type_abort() is the recommended way to convert the void
+ * pointer back to a typed pointer.
+ *
+ * @param[in]  ptr      The void pointer to convert.
+ *
+ * @param[in]  type     The type that this chunk contains
+ *
+ * @return              The same value as ptr, type-checked and properly cast.
+ */
+void *talloc_get_type_abort(const void *ptr, #type);
+#else
+#define talloc_get_type_abort(ptr, type) (type *)_talloc_get_type_abort(ptr, #type, __location__)
+void *_talloc_get_type_abort(const void *ptr, const char *name, const char *location);
+#endif
+
+/**
+ * @brief Find a parent context by name.
+ *
+ * Find a parent memory context of the current context that has the given
+ * name. This can be very useful in complex programs where it may be
+ * difficult to pass all information down to the level you need, but you
+ * know the structure you want is a parent of another context.
+ *
+ * @param[in]  ctx      The talloc chunk to start from.
+ *
+ * @param[in]  name     The name of the parent we look for.
+ *
+ * @return              The memory context we are looking for, NULL if not
+ *                      found.
+ */
+void *talloc_find_parent_byname(const void *ctx, const char *name);
+
+#ifdef DOXYGEN
+/**
+ * @brief Find a parent context by type.
+ *
+ * Find a parent memory context of the current context that has the given
+ * name. This can be very useful in complex programs where it may be
+ * difficult to pass all information down to the level you need, but you
+ * know the structure you want is a parent of another context.
+ *
+ * Like talloc_find_parent_byname() but takes a type, making it typesafe.
+ *
+ * @param[in]  ptr      The talloc chunk to start from.
+ *
+ * @param[in]  type     The type of the parent to look for.
+ *
+ * @return              The memory context we are looking for, NULL if not
+ *                      found.
+ */
+void *talloc_find_parent_bytype(const void *ptr, #type);
+#else
+#define talloc_find_parent_bytype(ptr, type) (type *)talloc_find_parent_byname(ptr, #type)
+#endif
+
+/**
+ * @brief Allocate a talloc pool.
+ *
+ * A talloc pool is a pure optimization for specific situations. In the
+ * release process for Samba 3.2 we found out that we had become considerably
+ * slower than Samba 3.0 was. Profiling showed that malloc(3) was a large CPU
+ * consumer in benchmarks. For Samba 3.2 we have internally converted many
+ * static buffers to dynamically allocated ones, so malloc(3) being beaten
+ * more was no surprise. But it made us slower.
+ *
+ * talloc_pool() is an optimization to call malloc(3) a lot less for the use
+ * pattern Samba has: The SMB protocol is mainly a request/response protocol
+ * where we have to allocate a certain amount of memory per request and free
+ * that after the SMB reply is sent to the client.
+ *
+ * talloc_pool() creates a talloc chunk that you can use as a talloc parent
+ * exactly as you would use any other ::TALLOC_CTX. The difference is that
+ * when you talloc a child of this pool, no malloc(3) is done. Instead, talloc
+ * just increments a pointer inside the talloc_pool. This also works
+ * recursively. If you use the child of the talloc pool as a parent for
+ * grand-children, their memory is also taken from the talloc pool.
+ *
+ * If there is not enough memory in the pool to allocate the new child,
+ * it will create a new talloc chunk as if the parent was a normal talloc
+ * context.
+ *
+ * If you talloc_free() children of a talloc pool, the memory is not given
+ * back to the system. Instead, free(3) is only called if the talloc_pool()
+ * itself is released with talloc_free().
+ *
+ * The downside of a talloc pool is that if you talloc_move() a child of a
+ * talloc pool to a talloc parent outside the pool, the whole pool memory is
+ * not free(3)'ed until that moved chunk is also talloc_free()ed.
+ *
+ * @param[in]  context  The talloc context to hang the result off (must not
+ *                     be another pool).
+ *
+ * @param[in]  size     Size of the talloc pool.
+ *
+ * @return              The allocated talloc pool, NULL on error.
+ */
+void *talloc_pool(const void *context, size_t size);
+
+/**
+ * @brief Free a talloc chunk and NULL out the pointer.
+ *
+ * TALLOC_FREE() frees a pointer and sets it to NULL. Use this if you want
+ * immediate feedback (i.e. a crash) if you use a pointer after having freed
+ * it.
+ *
+ * The pointer is set to NULL unconditionally; the return value of
+ * talloc_free() is discarded.
+ *
+ * @param[in,out]  ctx  The chunk to be freed. The macro expands this argument
+ *                      twice and assigns to it, so it must be a modifiable
+ *                      lvalue expression without side effects.
+ */
+#define TALLOC_FREE(ctx) do { talloc_free(ctx); ctx=NULL; } while(0)
+
+/* @} ******************************************************************/
+
+/**
+ * \defgroup talloc_ref The talloc reference function.
+ * @ingroup talloc
+ *
+ * This module contains the definitions around talloc references
+ *
+ * @{
+ */
+
+/**
+ * @brief Increase the reference count of a talloc chunk.
+ *
+ * The talloc_increase_ref_count(ptr) function is exactly equivalent to:
+ *
+ * @code
+ *      talloc_reference(NULL, ptr);
+ * @endcode
+ *
+ * You can use either syntax, depending on which you think is clearer in
+ * your code.
+ *
+ * @param[in]  ptr      The pointer to increase the reference count.
+ *
+ * @return              0 on success, -1 on error.
+ */
+int talloc_increase_ref_count(const void *ptr);
+
+/**
+ * @brief Get the number of references to a talloc chunk.
+ *
+ * @param[in]  ptr      The pointer to retrieve the reference count from.
+ *
+ * @return              The number of references.
+ */
+size_t talloc_reference_count(const void *ptr);
+
+#ifdef DOXYGEN
+/**
+ * @brief Create an additional talloc parent to a pointer.
+ *
+ * The talloc_reference() function makes "context" an additional parent of
+ * ptr. Each additional reference consumes around 48 bytes of memory on intel
+ * x86 platforms.
+ *
+ * If ptr is NULL, then the function is a no-op, and simply returns NULL.
+ *
+ * After creating a reference you can free it in one of the following ways:
+ *
+ * - you can talloc_free() any parent of the original pointer. That
+ *   will reduce the number of parents of this pointer by 1, and will
+ *   cause this pointer to be freed if it runs out of parents.
+ *
+ * - you can talloc_free() the pointer itself if it has at most one
+ *   parent. This behaviour has changed since the release of version
+ *   2.0. Further information is in the description of talloc_free().
+ *
+ * For more control on which parent to remove, see talloc_unlink()
+ * @param[in]  ctx      The additional parent.
+ *
+ * @param[in]  ptr      The pointer you want to create an additional parent for.
+ *
+ * @return              The original pointer 'ptr', NULL if talloc ran out of
+ *                      memory in creating the reference.
+ *
+ * Example:
+ * @code
+ *      unsigned int *a, *b, *c;
+ *      a = talloc(NULL, unsigned int);
+ *      b = talloc(NULL, unsigned int);
+ *      c = talloc(a, unsigned int);
+ *      // b also serves as a parent of c.
+ *      talloc_reference(b, c);
+ * @endcode
+ *
+ * @see talloc_unlink()
+ */
+void *talloc_reference(const void *ctx, const void *ptr);
+#else
+#define talloc_reference(ctx, ptr) (_TALLOC_TYPEOF(ptr))_talloc_reference_loc((ctx),(ptr), __location__)
+void *_talloc_reference_loc(const void *context, const void *ptr, const char *location);
+#endif
+
+/**
+ * @brief Remove a specific parent from a talloc chunk.
+ *
+ * The function removes a specific parent from ptr. The context passed must
+ * either be a context used in talloc_reference() with this pointer, or must be
+ * a direct parent of ptr.
+ *
+ * You can just use talloc_free() instead of talloc_unlink() if there
+ * is at most one parent. This behaviour has changed since the
+ * release of version 2.0. See the description of talloc_free() for
+ * further information.
+ *
+ * @param[in]  context  The talloc parent to remove.
+ *
+ * @param[in]  ptr      The talloc ptr you want to remove the parent from.
+ *
+ * @return              0 on success, -1 on error.
+ *
+ * @note If the parent has already been removed using talloc_free() then
+ * this function will fail and will return -1.  Likewise, if ptr is NULL,
+ * then the function will make no modifications and return -1.
+ *
+ * Example:
+ * @code
+ *      unsigned int *a, *b, *c;
+ *      a = talloc(NULL, unsigned int);
+ *      b = talloc(NULL, unsigned int);
+ *      c = talloc(a, unsigned int);
+ *      // b also serves as a parent of c.
+ *      talloc_reference(b, c);
+ *      talloc_unlink(b, c);
+ * @endcode
+ */
+int talloc_unlink(const void *context, void *ptr);
+
+/**
+ * @brief Provide a talloc context that is freed at program exit.
+ *
+ * This is a handy utility function that returns a talloc context
+ * which will be automatically freed on program exit. This can be used
+ * to reduce the noise in memory leak reports.
+ *
+ * Never use this in code that might be used in objects loaded with
+ * dlopen and unloaded with dlclose. talloc_autofree_context()
+ * internally uses atexit(3). Some platforms like modern Linux handle
+ * this fine, but for example FreeBSD does not deal well with dlopen()
+ * and atexit() used simultaneously: dlclose() does not clean up the
+ * list of atexit-handlers, so when the program exits the code that
+ * was registered from within talloc_autofree_context() is gone and the
+ * program crashes at exit.
+ *
+ * @return              A talloc context, NULL on error.
+ */
+void *talloc_autofree_context(void);
+
+/**
+ * @brief Get the size of a talloc chunk.
+ *
+ * This function lets you know the amount of memory allocated so far by
+ * this context. It does NOT account for subcontext memory.
+ * This can be used to calculate the size of an array.
+ *
+ * @param[in]  ctx      The talloc chunk.
+ *
+ * @return              The size of the talloc chunk.
+ */
+size_t talloc_get_size(const void *ctx);
+
+/**
+ * @brief Show the parentage of a context.
+ *
+ * @param[in]  context            The talloc context to look at.
+ *
+ * @param[in]  file               The output to use, a file, stdout or stderr.
+ */
+void talloc_show_parents(const void *context, FILE *file);
+
+/**
+ * @brief Check if a context is parent of a talloc chunk.
+ *
+ * This checks if context is referenced in the talloc hierarchy above ptr.
+ *
+ * @param[in]  context  The assumed talloc context.
+ *
+ * @param[in]  ptr      The talloc chunk to check.
+ *
+ * @return              Return 1 if this is the case, 0 if not.
+ */
+int talloc_is_parent(const void *context, const void *ptr);
+
+/**
+ * @brief Change the parent context of a talloc pointer.
+ *
+ * The function changes the parent context of a talloc pointer. It is typically
+ * used when the context that the pointer is currently a child of is going to be
+ * freed and you wish to keep the memory for a longer time.
+ *
+ * The difference between talloc_reparent() and talloc_steal() is that
+ * talloc_reparent() can specify which parent you wish to change. This is
+ * useful when a pointer has multiple parents via references.
+ *
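+ * @param[in]  old_parent  The current parent of ptr that should be replaced.
+ * @param[in]  new_parent  The context that becomes the new parent of ptr.
+ * @param[in]  ptr         The talloc pointer to reparent.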
+ *
+ * @return              Return the pointer you passed. It does not have any
+ *                      failure modes.
+ */
+void *talloc_reparent(const void *old_parent, const void *new_parent, const void *ptr);
+
+/* @} ******************************************************************/
+
+/**
+ * @defgroup talloc_array The talloc array functions
+ * @ingroup talloc
+ *
+ * Talloc contains some handy helpers for handling arrays conveniently.
+ *
+ * @{
+ */
+
+#ifdef DOXYGEN
+/**
+ * @brief Allocate an array.
+ *
+ * The macro is equivalent to:
+ *
+ * @code
+ *      (type *)talloc_size(ctx, sizeof(type) * count);
+ * @endcode
+ *
+ * except that it provides integer overflow protection for the multiply,
+ * returning NULL if the multiply overflows.
+ *
+ * @param[in]  ctx      The talloc context to hang the result off.
+ *
+ * @param[in]  type     The type that we want to allocate.
+ *
+ * @param[in]  count    The number of 'type' elements you want to allocate.
+ *
+ * @return              The allocated result, properly cast to 'type *', NULL on
+ *                      error.
+ *
+ * Example:
+ * @code
+ *      unsigned int *a, *b;
+ *      a = talloc_zero(NULL, unsigned int);
+ *      b = talloc_array(a, unsigned int, 100);
+ * @endcode
+ *
+ * @see talloc()
+ * @see talloc_zero_array()
+ */
+void *talloc_array(const void *ctx, #type, unsigned count);
+#else
+#define talloc_array(ctx, type, count) (type *)_talloc_array(ctx, sizeof(type), count, #type)
+void *_talloc_array(const void *ctx, size_t el_size, unsigned count, const char *name);
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief Allocate an array.
+ *
+ * @param[in]  ctx      The talloc context to hang the result off.
+ *
+ * @param[in]  size     The size of an array element.
+ *
+ * @param[in]  count    The number of elements you want to allocate.
+ *
+ * @return              The allocated result, NULL on error.
+ */
+void *talloc_array_size(const void *ctx, size_t size, unsigned count);
+#else
+#define talloc_array_size(ctx, size, count) _talloc_array(ctx, size, count, __location__)
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief Allocate an array into a typed pointer.
+ *
+ * The macro should be used when you have a pointer to an array and want to
+ * allocate memory of an array to point at with this pointer. When compiling
+ * with gcc >= 3 it is typesafe. Note this is a wrapper of talloc_array_size()
+ * and talloc_get_name() will return the current location in the source file
+ * and not the type.
+ *
+ * @param[in]  ctx      The talloc context to hang the result off.
+ *
+ * @param[in]  ptr      The pointer you want to assign the result to.
+ *
+ * @param[in]  count    The number of elements you want to allocate.
+ *
+ * @return              The allocated memory chunk, properly cast. NULL on
+ *                      error.
+ */
+void *talloc_array_ptrtype(const void *ctx, const void *ptr, unsigned count);
+#else
+#define talloc_array_ptrtype(ctx, ptr, count) (_TALLOC_TYPEOF(ptr))talloc_array_size(ctx, sizeof(*(ptr)), count)
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief Get the number of elements in a talloc'ed array.
+ *
+ * A talloc chunk carries its own size, so for talloc'ed arrays it is not
+ * necessary to store the number of elements explicitly.
+ *
+ * @param[in]  ctx      The allocated array, as a typed (non-void) pointer.
+ *
+ * @return              The chunk size divided by sizeof(*ctx).
+ */
+size_t talloc_array_length(const void *ctx);
+#else
+#define talloc_array_length(ctx) (talloc_get_size(ctx)/sizeof(*(ctx)))
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief Allocate a zero-initialized array
+ *
+ * @param[in]  ctx      The talloc context to hang the result off.
+ *
+ * @param[in]  type     The type that we want to allocate.
+ *
+ * @param[in]  count    The number of "type" elements you want to allocate.
+ *
+ * @return              The allocated result cast to "type *", NULL on error.
+ *
+ * The talloc_zero_array() macro is equivalent to:
+ *
+ * @code
+ *     ptr = talloc_array(ctx, type, count);
+ *     if (ptr) memset(ptr, 0, sizeof(type) * count);
+ * @endcode
+ */
+void *talloc_zero_array(const void *ctx, #type, unsigned count);
+#else
+#define talloc_zero_array(ctx, type, count) (type *)_talloc_zero_array(ctx, sizeof(type), count, #type)
+void *_talloc_zero_array(const void *ctx,
+                        size_t el_size,
+                        unsigned count,
+                        const char *name);
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief Change the size of a talloc array.
+ *
+ * The macro changes the size of a talloc pointer. The 'count' argument is the
+ * number of elements of type 'type' that you want the resulting pointer to
+ * hold.
+ *
+ * talloc_realloc() has the following equivalences:
+ *
+ * @code
+ *      talloc_realloc(ctx, NULL, type, 1) ==> talloc(ctx, type);
+ *      talloc_realloc(ctx, NULL, type, N) ==> talloc_array(ctx, type, N);
+ *      talloc_realloc(ctx, ptr, type, 0)  ==> talloc_free(ptr);
+ * @endcode
+ *
+ * The "ctx" argument is only used if "ptr" is NULL, otherwise it is
+ * ignored.
+ *
+ * @param[in]  ctx      The parent context used if ptr is NULL.
+ *
+ * @param[in]  ptr      The chunk to be resized.
+ *
+ * @param[in]  type     The type of the array element inside ptr.
+ *
+ * @param[in]  count    The intended number of array elements.
+ *
+ * @return              The new array, NULL on error. The call will fail either
+ *                      due to a lack of memory, or because the pointer has more
+ *                      than one parent (see talloc_reference()).
+ */
+void *talloc_realloc(const void *ctx, void *ptr, #type, size_t count);
+#else
+#define talloc_realloc(ctx, p, type, count) (type *)_talloc_realloc_array(ctx, p, sizeof(type), count, #type)
+void *_talloc_realloc_array(const void *ctx, void *ptr, size_t el_size, unsigned count, const char *name);
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief Untyped realloc to change the size of a talloc array.
+ *
+ * The macro is useful when the type is not known so the typesafe
+ * talloc_realloc() cannot be used.
+ *
+ * @param[in]  ctx      The parent context used if 'ptr' is NULL.
+ *
+ * @param[in]  ptr      The chunk to be resized.
+ *
+ * @param[in]  size     The new chunk size.
+ *
+ * @return              The new array, NULL on error.
+ */
+void *talloc_realloc_size(const void *ctx, void *ptr, size_t size);
+#else
+#define talloc_realloc_size(ctx, ptr, size) _talloc_realloc(ctx, ptr, size, __location__)
+void *_talloc_realloc(const void *context, void *ptr, size_t size, const char *name);
+#endif
+
+/**
+ * @brief Provide a function version of talloc_realloc_size.
+ *
+ * This is a non-macro version of talloc_realloc_size(), which is useful as
+ * libraries sometimes want a realloc function pointer. A realloc()
+ * implementation encapsulates the functionality of malloc(), free() and
+ * realloc() in one call, which is why it is useful to be able to pass around
+ * a single function pointer.
+ *
+ * @param[in]  context  The parent context used if ptr is NULL.
+ *
+ * @param[in]  ptr      The chunk to be resized.
+ *
+ * @param[in]  size     The new chunk size.
+ *
+ * @return              The new chunk, NULL on error.
+ */
+void *talloc_realloc_fn(const void *context, void *ptr, size_t size);
+
+/* @} ******************************************************************/
+
+/**
+ * @defgroup talloc_string The talloc string functions.
+ * @ingroup talloc
+ *
+ * talloc string allocation and manipulation functions.
+ * @{
+ */
+
+/**
+ * @brief Duplicate a string into a talloc chunk.
+ *
+ * This function is equivalent to:
+ *
+ * @code
+ *      ptr = talloc_size(ctx, strlen(p)+1);
+ *      if (ptr) memcpy(ptr, p, strlen(p)+1);
+ * @endcode
+ *
+ * This function sets the name of the new pointer to the passed
+ * string. This is equivalent to:
+ *
+ * @code
+ *      talloc_set_name_const(ptr, ptr)
+ * @endcode
+ *
+ * @param[in]  t        The talloc context to hang the result off.
+ *
+ * @param[in]  p        The string you want to duplicate.
+ *
+ * @return              The duplicated string, NULL on error.
+ */
+char *talloc_strdup(const void *t, const char *p);
+
+/**
+ * @brief Append a string to given string.
+ *
+ * The destination string is reallocated to take
+ * <code>strlen(s) + strlen(a) + 1</code> characters.
+ *
+ * This function sets the name of the new pointer to the new
+ * string. This is equivalent to:
+ *
+ * @code
+ *      talloc_set_name_const(ptr, ptr)
+ * @endcode
+ *
+ * If <code>s == NULL</code> then a new context is created.
+ *
+ * @param[in]  s        The destination to append to.
+ *
+ * @param[in]  a        The string you want to append.
+ *
+ * @return              The concatenated strings, NULL on error.
+ *
+ * @see talloc_strdup()
+ * @see talloc_strdup_append_buffer()
+ */
+char *talloc_strdup_append(char *s, const char *a);
+
+/**
+ * @brief Append a string to a given buffer.
+ *
+ * This is a more efficient version of talloc_strdup_append(). It determines the
+ * length of the destination string by the size of the talloc context.
+ *
+ * Use this very carefully as it produces a different result than
+ * talloc_strdup_append() when a zero character is in the middle of the
+ * destination string.
+ *
+ * @code
+ *      char *str_a = talloc_strdup(NULL, "hello world");
+ *      char *str_b = talloc_strdup(NULL, "hello world");
+ *      str_a[5] = str_b[5] = '\0';
+ *
+ *      char *app = talloc_strdup_append(str_a, ", hello");
+ *      char *buf = talloc_strdup_append_buffer(str_b, ", hello");
+ *
+ *      printf("%s\n", app); // hello, hello (app = "hello, hello")
+ *      printf("%s\n", buf); // hello (buf = "hello\0world, hello")
+ * @endcode
+ *
+ * If <code>s == NULL</code> then a new context is created.
+ *
+ * @param[in]  s        The destination buffer to append to.
+ *
+ * @param[in]  a        The string you want to append.
+ *
+ * @return              The concatenated strings, NULL on error.
+ *
+ * @see talloc_strdup()
+ * @see talloc_strdup_append()
+ * @see talloc_array_length()
+ */
+char *talloc_strdup_append_buffer(char *s, const char *a);
+
+/**
+ * @brief Duplicate a length-limited string into a talloc chunk.
+ *
+ * This function is the talloc equivalent of the C library function strndup(3).
+ *
+ * This function sets the name of the new pointer to the passed string. This is
+ * equivalent to:
+ *
+ * @code
+ *      talloc_set_name_const(ptr, ptr)
+ * @endcode
+ *
+ * @param[in]  t        The talloc context to hang the result off.
+ *
+ * @param[in]  p        The string you want to duplicate.
+ *
+ * @param[in]  n        The maximum string length to duplicate.
+ *
+ * @return              The duplicated string, NULL on error.
+ */
+char *talloc_strndup(const void *t, const char *p, size_t n);
+
+/**
+ * @brief Append at most n characters of a string to given string.
+ *
+ * The destination string is reallocated to take
+ * <code>strlen(s) + strnlen(a, n) + 1</code> characters.
+ *
+ * This function sets the name of the new pointer to the new
+ * string. This is equivalent to:
+ *
+ * @code
+ *      talloc_set_name_const(ptr, ptr)
+ * @endcode
+ *
+ * If <code>s == NULL</code> then a new context is created.
+ *
+ * @param[in]  s        The destination string to append to.
+ *
+ * @param[in]  a        The source string you want to append.
+ *
+ * @param[in]  n        The number of characters you want to append from the
+ *                      string.
+ *
+ * @return              The concatenated strings, NULL on error.
+ *
+ * @see talloc_strndup()
+ * @see talloc_strndup_append_buffer()
+ */
+char *talloc_strndup_append(char *s, const char *a, size_t n);
+
+/**
+ * @brief Append at most n characters of a string to given buffer
+ *
+ * This is a more efficient version of talloc_strndup_append(). It determines
+ * the length of the destination string by the size of the talloc context.
+ *
+ * Use this very carefully as it produces a different result than
+ * talloc_strndup_append() when a zero character is in the middle of the
+ * destination string.
+ *
+ * @code
+ *      char *str_a = talloc_strdup(NULL, "hello world");
+ *      char *str_b = talloc_strdup(NULL, "hello world");
+ *      str_a[5] = str_b[5] = '\0';
+ *
+ *      char *app = talloc_strndup_append(str_a, ", hello", 7);
+ *      char *buf = talloc_strndup_append_buffer(str_b, ", hello", 7);
+ *
+ *      printf("%s\n", app); // hello, hello (app = "hello, hello")
+ *      printf("%s\n", buf); // hello (buf = "hello\0world, hello")
+ * @endcode
+ *
+ * If <code>s == NULL</code> then a new context is created.
+ *
+ * @param[in]  s        The destination buffer to append to.
+ *
+ * @param[in]  a        The source string you want to append.
+ *
+ * @param[in]  n        The number of characters you want to append from the
+ *                      string.
+ *
+ * @return              The concatenated strings, NULL on error.
+ *
+ * @see talloc_strndup()
+ * @see talloc_strndup_append()
+ * @see talloc_array_length()
+ */
+char *talloc_strndup_append_buffer(char *s, const char *a, size_t n);
+
+/**
+ * @brief Format a string given a va_list.
+ *
+ * This function is the talloc equivalent of the C library function
+ * vasprintf(3).
+ *
+ * This function sets the name of the new pointer to the new string. This is
+ * equivalent to:
+ *
+ * @code
+ *      talloc_set_name_const(ptr, ptr)
+ * @endcode
+ *
+ * @param[in]  t        The talloc context to hang the result off.
+ *
+ * @param[in]  fmt      The format string.
+ *
+ * @param[in]  ap       The parameters used to fill fmt.
+ *
+ * @return              The formatted string, NULL on error.
+ */
+char *talloc_vasprintf(const void *t, const char *fmt, va_list ap) PRINTF_ATTRIBUTE(2,0);
+
+/**
+ * @brief Format a string given a va_list and append it to the given destination
+ *        string.
+ *
+ * @param[in]  s        The destination string to append to.
+ *
+ * @param[in]  fmt      The format string.
+ *
+ * @param[in]  ap       The parameters used to fill fmt.
+ *
+ * @return              The formatted string, NULL on error.
+ *
+ * @see talloc_vasprintf()
+ */
+char *talloc_vasprintf_append(char *s, const char *fmt, va_list ap) PRINTF_ATTRIBUTE(2,0);
+
+/**
+ * @brief Format a string given a va_list and append it to the given destination
+ *        buffer.
+ *
+ * @param[in]  s        The destination buffer to append to.
+ *
+ * @param[in]  fmt      The format string.
+ *
+ * @param[in]  ap       The parameters used to fill fmt.
+ *
+ * @return              The formatted string, NULL on error.
+ *
+ * @see talloc_vasprintf()
+ */
+char *talloc_vasprintf_append_buffer(char *s, const char *fmt, va_list ap) PRINTF_ATTRIBUTE(2,0);
+
+/**
+ * @brief Format a string.
+ *
+ * This function is the talloc equivalent of the C library function asprintf(3).
+ *
+ * This function sets the name of the new pointer to the new string. This is
+ * equivalent to:
+ *
+ * @code
+ *      talloc_set_name_const(ptr, ptr)
+ * @endcode
+ *
+ * @param[in]  t        The talloc context to hang the result off.
+ *
+ * @param[in]  fmt      The format string.
+ *
+ * @param[in]  ...      The parameters used to fill fmt.
+ *
+ * @return              The formatted string, NULL on error.
+ */
+char *talloc_asprintf(const void *t, const char *fmt, ...) PRINTF_ATTRIBUTE(2,3);
+
+/**
+ * @brief Append a formatted string to another string.
+ *
+ * This function appends the given formatted string to the given string. Use
+ * this variant when the string in the current talloc buffer may have been
+ * truncated in length.
+ *
+ * This function sets the name of the new pointer to the new
+ * string. This is equivalent to:
+ *
+ * @code
+ *      talloc_set_name_const(ptr, ptr)
+ * @endcode
+ *
+ * If <code>s == NULL</code> then a new context is created.
+ *
+ * @param[in]  s        The string to append to.
+ *
+ * @param[in]  fmt      The format string.
+ *
+ * @param[in]  ...      The parameters used to fill fmt.
+ *
+ * @return              The formatted string, NULL on error.
+ */
+char *talloc_asprintf_append(char *s, const char *fmt, ...) PRINTF_ATTRIBUTE(2,3);
+
+/**
+ * @brief Append a formatted string to another string.
+ *
+ * This is a more efficient version of talloc_asprintf_append(). It determines
+ * the length of the destination string by the size of the talloc context.
+ *
+ * Use this very carefully as it produces a different result than
+ * talloc_asprintf_append() when a zero character is in the middle of the
+ * destination string.
+ *
+ * @code
+ *      char *str_a = talloc_strdup(NULL, "hello world");
+ *      char *str_b = talloc_strdup(NULL, "hello world");
+ *      str_a[5] = str_b[5] = '\0';
+ *
+ *      char *app = talloc_asprintf_append(str_a, "%s", ", hello");
+ *      char *buf = talloc_asprintf_append_buffer(str_b, "%s", ", hello");
+ *
+ *      printf("%s\n", app); // hello, hello (app = "hello, hello")
+ *      printf("%s\n", buf); // hello (buf = "hello\0world, hello")
+ * @endcode
+ *
+ * If <code>s == NULL</code> then a new context is created.
+ *
+ * @param[in]  s        The string to append to
+ *
+ * @param[in]  fmt      The format string.
+ *
+ * @param[in]  ...      The parameters used to fill fmt.
+ *
+ * @return              The formatted string, NULL on error.
+ *
+ * @see talloc_asprintf()
+ * @see talloc_asprintf_append()
+ */
+char *talloc_asprintf_append_buffer(char *s, const char *fmt, ...) PRINTF_ATTRIBUTE(2,3);
+
+/* @} ******************************************************************/
+
+/**
+ * @defgroup talloc_debug The talloc debugging support functions
+ * @ingroup talloc
+ *
+ * To aid memory debugging, talloc contains routines to inspect the currently
+ * allocated memory hierarchy.
+ *
+ * @{
+ */
+
+/**
+ * @brief Walk a complete talloc hierarchy.
+ *
+ * This provides more flexible reports than talloc_report(). It
+ * will recursively call the callback for the entire tree of memory
+ * referenced by the pointer. References in the tree are passed with
+ * is_ref = 1 and the pointer that is referenced.
+ *
+ * You can pass NULL for the pointer, in which case a report is
+ * printed for the top level memory context, but only if
+ * talloc_enable_leak_report() or talloc_enable_leak_report_full()
+ * has been called.
+ *
+ * The recursion is stopped when depth >= max_depth.
+ * max_depth = -1 means only stop at leaf nodes.
+ *
+ * @param[in]  ptr      The talloc chunk.
+ *
+ * @param[in]  depth    Internal parameter to control recursion. Call with 0.
+ *
+ * @param[in]  max_depth  Maximum recursion level.
+ *
+ * @param[in]  callback  Function to be called on every chunk.
+ *
+ * @param[in]  private_data  Private pointer passed to callback.
+ */
+void talloc_report_depth_cb(const void *ptr, int depth, int max_depth,
+                           void (*callback)(const void *ptr,
+                                            int depth, int max_depth,
+                                            int is_ref,
+                                            void *private_data),
+                           void *private_data);
+
+/**
+ * @brief Print a talloc hierarchy.
+ *
+ * This provides more flexible reports than talloc_report(). It
+ * will let you specify the depth and max_depth.
+ *
+ * @param[in]  ptr      The talloc chunk.
+ *
+ * @param[in]  depth    Internal parameter to control recursion. Call with 0.
+ *
+ * @param[in]  max_depth  Maximum recursion level.
+ *
+ * @param[in]  f        The file handle to print to.
+ */
+void talloc_report_depth_file(const void *ptr, int depth, int max_depth, FILE *f);
+
+/**
+ * @brief Print a summary report of all memory used by ptr.
+ *
+ * This provides a more detailed report than talloc_report(). It will
+ * recursively print the entire tree of memory referenced by the
+ * pointer. References in the tree are shown by giving the name of the
+ * pointer that is referenced.
+ *
+ * You can pass NULL for the pointer, in which case a report is printed
+ * for the top level memory context, but only if
+ * talloc_enable_leak_report() or talloc_enable_leak_report_full() has
+ * been called.
+ *
+ * @param[in]  ptr      The talloc chunk.
+ *
+ * @param[in]  f        The file handle to print to.
+ *
+ * Example:
+ * @code
+ *      unsigned int *a, *b;
+ *      a = talloc(NULL, unsigned int);
+ *      b = talloc(a, unsigned int);
+ *      fprintf(stderr, "Dumping memory tree for a:\n");
+ *      talloc_report_full(a, stderr);
+ * @endcode
+ *
+ * @see talloc_report()
+ */
+void talloc_report_full(const void *ptr, FILE *f);
+
+/**
+ * @brief Print a summary report of all memory used by ptr.
+ *
+ * This function prints a summary report of all memory used by ptr. One line of
+ * report is printed for each immediate child of ptr, showing the total memory
+ * and number of blocks used by that child.
+ *
+ * You can pass NULL for the pointer, in which case a report is printed
+ * for the top level memory context, but only if talloc_enable_leak_report()
+ * or talloc_enable_leak_report_full() has been called.
+ *
+ * @param[in]  ptr      The talloc chunk.
+ *
+ * @param[in]  f        The file handle to print to.
+ *
+ * Example:
+ * @code
+ *      unsigned int *a, *b;
+ *      a = talloc(NULL, unsigned int);
+ *      b = talloc(a, unsigned int);
+ *      fprintf(stderr, "Summary of memory tree for a:\n");
+ *      talloc_report(a, stderr);
+ * @endcode
+ *
+ * @see talloc_report_full()
+ */
+void talloc_report(const void *ptr, FILE *f);
+
+/**
+ * @brief Enable tracking the use of NULL memory contexts.
+ *
+ * This enables tracking of the NULL memory context without enabling leak
+ * reporting on exit. Useful for when you want to do your own leak
+ * reporting call via talloc_report_null_full();
+ */
+void talloc_enable_null_tracking(void);
+
+/**
+ * @brief Enable tracking the use of NULL memory contexts.
+ *
+ * This enables tracking of the NULL memory context without enabling leak
+ * reporting on exit. Useful for when you want to do your own leak
+ * reporting call via talloc_report_null_full();
+ */
+void talloc_enable_null_tracking_no_autofree(void);
+
+/**
+ * @brief Disable tracking of the NULL memory context.
+ *
+ * This disables tracking of the NULL memory context.
+ */
+void talloc_disable_null_tracking(void);
+
+/**
+ * @brief Enable leak report when a program exits.
+ *
+ * This enables calling of talloc_report(NULL, stderr) when the program
+ * exits. In Samba4 this is enabled by using the --leak-report command
+ * line option.
+ *
+ * For it to be useful, this function must be called before any other
+ * talloc function as it establishes a "null context" that acts as the
+ * top of the tree. If you don't call this function first then passing
+ * NULL to talloc_report() or talloc_report_full() won't give you the
+ * full tree printout.
+ *
+ * Here is a typical talloc report:
+ *
+ * @code
+ * talloc report on 'null_context' (total 267 bytes in 15 blocks)
+ *      libcli/auth/spnego_parse.c:55  contains     31 bytes in   2 blocks
+ *      libcli/auth/spnego_parse.c:55  contains     31 bytes in   2 blocks
+ *      iconv(UTF8,CP850)              contains     42 bytes in   2 blocks
+ *      libcli/auth/spnego_parse.c:55  contains     31 bytes in   2 blocks
+ *      iconv(CP850,UTF8)              contains     42 bytes in   2 blocks
+ *      iconv(UTF8,UTF-16LE)           contains     45 bytes in   2 blocks
+ *      iconv(UTF-16LE,UTF8)           contains     45 bytes in   2 blocks
+ * @endcode
+ */
+void talloc_enable_leak_report(void);
+
+/**
+ * @brief Enable full leak report when a program exits.
+ *
+ * This enables calling of talloc_report_full(NULL, stderr) when the
+ * program exits. In Samba4 this is enabled by using the
+ * --leak-report-full command line option.
+ *
+ * For it to be useful, this function must be called before any other
+ * talloc function as it establishes a "null context" that acts as the
+ * top of the tree. If you don't call this function first then passing
+ * NULL to talloc_report() or talloc_report_full() won't give you the
+ * full tree printout.
+ *
+ * Here is a typical full report:
+ *
+ * @code
+ * full talloc report on 'root' (total 18 bytes in 8 blocks)
+ *      p1                             contains     18 bytes in   7 blocks (ref 0)
+ *      r1                             contains     13 bytes in   2 blocks (ref 0)
+ *      reference to: p2
+ *      p2                             contains      1 bytes in   1 blocks (ref 1)
+ *      x3                             contains      1 bytes in   1 blocks (ref 0)
+ *      x2                             contains      1 bytes in   1 blocks (ref 0)
+ *      x1                             contains      1 bytes in   1 blocks (ref 0)
+ * @endcode
+ */
+void talloc_enable_leak_report_full(void);
+
+/**
+ * @brief Set a custom "abort" function that is called on serious error.
+ *
+ * The default "abort" function is <code>abort()</code>.
+ *
+ * The "abort" function is called when:
+ *
+ * <ul>
+ *  <li>talloc_get_type_abort() fails</li>
+ *  <li>the provided pointer is not a valid talloc context</li>
+ *  <li>when the context meta data are invalid</li>
+ *  <li>when access after free is detected</li>
+ * </ul>
+ *
+ * Example:
+ *
+ * @code
+ * void my_abort(const char *reason)
+ * {
+ *      fprintf(stderr, "talloc abort: %s\n", reason);
+ *      abort();
+ * }
+ *
+ *      talloc_set_abort_fn(my_abort);
+ * @endcode
+ *
+ * @param[in]  abort_fn      The new "abort" function.
+ *
+ * @see talloc_set_log_fn()
+ * @see talloc_get_type()
+ */
+void talloc_set_abort_fn(void (*abort_fn)(const char *reason));
+
+/**
+ * @brief Set a logging function.
+ *
+ * @param[in]  log_fn      The logging function.
+ *
+ * @see talloc_set_log_stderr()
+ * @see talloc_set_abort_fn()
+ */
+void talloc_set_log_fn(void (*log_fn)(const char *message));
+
+/**
+ * @brief Set stderr as the output for logs.
+ *
+ * @see talloc_set_log_fn()
+ * @see talloc_set_abort_fn()
+ */
+void talloc_set_log_stderr(void);
+
+/**
+ * @brief Set a max memory limit for the current context hierarchy.
+ *       This affects all children of this context and constrains any
+ *       allocation in the hierarchy to never exceed the limit set.
+ *       The limit can be removed by setting 0 (unlimited) as the
+ *       max_size by calling the function again on the same context.
+ *       Memory limits can also be nested, meaning a child can have
+ *       a stricter memory limit than a parent.
+ *       Memory limits are enforced only at memory allocation time.
+ *       Stealing a context into a 'limited' hierarchy properly
+ *       updates memory usage but does *not* cause failure if the
+ *       move causes the new parent to exceed its limits. However
+ *       any further allocation on that hierarchy will then fail.
+ *
+ * @param[in]  ctx             The talloc context to set the limit on
+ * @param[in]  max_size        The (new) max_size
+ */
+int talloc_set_memlimit(const void *ctx, size_t max_size);
+
+/* @} ******************************************************************/
+
+#if TALLOC_DEPRECATED
+#define talloc_zero_p(ctx, type) talloc_zero(ctx, type)
+#define talloc_p(ctx, type) talloc(ctx, type)
+#define talloc_array_p(ctx, type, count) talloc_array(ctx, type, count)
+#define talloc_realloc_p(ctx, p, type, count) talloc_realloc(ctx, p, type, count)
+#define talloc_destroy(ctx) talloc_free(ctx)
+#define talloc_append_string(c, s, a) (s?talloc_strdup_append(s,a):talloc_strdup(c, a))
+#endif
+
+#ifndef TALLOC_MAX_DEPTH
+#define TALLOC_MAX_DEPTH 10000
+#endif
+
+#ifdef __cplusplus
+} /* end of extern "C" */
+#endif
+
+#endif
diff --git a/ctdb/lib/talloc/talloc.i b/ctdb/lib/talloc/talloc.i
new file mode 100644 (file)
index 0000000..a9afb97
--- /dev/null
@@ -0,0 +1,31 @@
+/* 
+   Unix SMB/CIFS implementation.
+   Copyright (C) Jelmer Vernooij <jelmer@samba.org> 2007
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* Don't expose talloc contexts in Python code. Python does reference 
+   counting for us, so just create a new top-level talloc context.
+ */
+%typemap(in, numinputs=0, noblock=1) TALLOC_CTX * {
+    $1 = NULL;
+}
+
+%define %talloctype(TYPE)
+%nodefaultctor TYPE;
+%extend TYPE {
+    ~TYPE() { talloc_free($self); }
+}
+%enddef
diff --git a/ctdb/lib/talloc/talloc.pc.in b/ctdb/lib/talloc/talloc.pc.in
new file mode 100644 (file)
index 0000000..437281a
--- /dev/null
@@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: talloc 
+Description: A hierarchical pool based memory system with destructors
+Version: @TALLOC_VERSION@
+Libs: @LIB_RPATH@ -L${libdir} -ltalloc
+Cflags: -I${includedir} 
+URL: http://talloc.samba.org/
diff --git a/ctdb/lib/talloc/talloc_guide.txt b/ctdb/lib/talloc/talloc_guide.txt
new file mode 100644 (file)
index 0000000..16afc9b
--- /dev/null
@@ -0,0 +1,767 @@
+Using talloc in Samba4
+======================
+
+.. contents::
+
+Andrew Tridgell
+August 2009
+
+The most current version of this document is available at
+   http://samba.org/ftp/unpacked/talloc/talloc_guide.txt
+
+If you are used to the "old" talloc from Samba3 before 3.0.20 then please read
+this carefully, as talloc has changed a lot. With 3.0.20 (or 3.0.14?) the
+Samba4 talloc has been ported back to Samba3, so this guide applies to both.
+
+The new talloc is a hierarchical, reference counted memory pool system
+with destructors. Quite a mouthful really, but not too bad once you
+get used to it.
+
+Perhaps the biggest change from Samba3 is that there is no distinction
+between a "talloc context" and a "talloc pointer". Any pointer
+returned from talloc() is itself a valid talloc context. This means
+you can do this::
+
+  struct foo *X = talloc(mem_ctx, struct foo);
+  X->name = talloc_strdup(X, "foo");
+
+and the pointer X->name would be a "child" of the talloc context "X"
+which is itself a child of "mem_ctx". So if you do talloc_free(mem_ctx)
+then it is all destroyed, whereas if you do talloc_free(X) then just X
+and X->name are destroyed, and if you do talloc_free(X->name) then
+just the name element of X is destroyed.
+
+If you think about this, then what this effectively gives you is an
+n-ary tree, where you can free any part of the tree with
+talloc_free().
+
+If you find this confusing, then I suggest you run the testsuite to
+watch talloc in action. You may also like to add your own tests to
+testsuite.c to clarify how some particular situation is handled.
+
+
+Performance
+-----------
+
+All the additional features of talloc() over malloc() do come at a
+price. We have a simple performance test in Samba4 that measures
+talloc() versus malloc() performance, and it seems that talloc() is
+about 4% slower than malloc() on my x86 Debian Linux box. For Samba,
+the great reduction in code complexity that we get by using talloc
+makes this worthwhile, especially as the total overhead of
+talloc/malloc in Samba is already quite small.
+
+
+talloc API
+----------
+
+The following is a complete guide to the talloc API. Read it all at
+least twice.
+
+Multi-threading
+---------------
+
+talloc itself does not deal with threads. It is thread-safe (assuming  
+the underlying "malloc" is), as long as each thread uses different  
+memory contexts.
+If two threads use the same context then they need to synchronize in
+order to be safe. In particular:
+- when using talloc_enable_leak_report(), giving directly NULL as a  
+parent context implicitly refers to a hidden "null context" global  
+variable, so this should not be used in a multi-threaded environment  
+without proper synchronization ;
+- the context returned by talloc_autofree_context() is also global so  
+shouldn't be used by several threads simultaneously without  
+synchronization.
+
+talloc and shared objects
+-------------------------
+
+talloc can be used in shared objects. Special care needs to be taken
+to never use talloc_autofree_context() in code that might be loaded
+with dlopen() and unloaded with dlclose(), as talloc_autofree_context()
+internally uses atexit(3). Some platforms like modern Linux handle
+this fine, but for example FreeBSD does not deal well with dlopen()
+and atexit() used simultaneously: dlclose() does not clean up the list
+of atexit-handlers, so when the program exits the code that was
+registered from within talloc_autofree_context() is gone, the program
+crashes at exit.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+(type *)talloc(const void *context, type);
+
+The talloc() macro is the core of the talloc library. It takes a
+memory context and a type, and returns a pointer to a new area of
+memory of the given type.
+
+The returned pointer is itself a talloc context, so you can use it as
+the context argument to more calls to talloc if you wish.
+
+The returned pointer is a "child" of the supplied context. This means
+that if you talloc_free() the context then the new child disappears as
+well. Alternatively you can free just the child.
+
+The context argument to talloc() can be NULL, in which case a new top
+level context is created. 
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_size(const void *context, size_t size);
+
+The function talloc_size() should be used when you don't have a
+convenient type to pass to talloc(). Unlike talloc(), it is not type
+safe (as it returns a void *), so you are on your own for type checking.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+(typeof(ptr)) talloc_ptrtype(const void *ctx, ptr);
+
+The talloc_ptrtype() macro should be used when you have a pointer and
+want to allocate memory to point at with this pointer. When compiling
+with gcc >= 3 it is typesafe. Note this is a wrapper of talloc_size()
+and talloc_get_name() will return the current location in the source file
+and not the type.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+int talloc_free(void *ptr);
+
+The talloc_free() function frees a piece of talloc memory, and all its
+children. You can call talloc_free() on any pointer returned by
+talloc().
+
+The return value of talloc_free() indicates success or failure, with 0
+returned for success and -1 for failure. A possible failure condition
+is if the pointer had a destructor attached to it and the destructor
+returned -1. See talloc_set_destructor() for details on
+destructors. Likewise, if "ptr" is NULL, then the function will make
+no modifications and returns -1.
+
+From version 2.0 onwards, as a special case, talloc_free() is
+refused on pointers that have more than one parent associated, as talloc
+would have no way of knowing which parent should be removed. This
+differs from older versions, which always destroyed the reference to
+the most recently established parent. Hence, to free a
+pointer that has more than one parent, please use talloc_unlink().
+
+To help you find problems in your code caused by this behaviour, if
+you do try and free a pointer with more than one parent then the
+talloc logging function will be called to give output like this:
+
+  ERROR: talloc_free with references at some_dir/source/foo.c:123
+       reference at some_dir/source/other.c:325
+       reference at some_dir/source/third.c:121
+
+Please see the documentation for talloc_set_log_fn() and
+talloc_set_log_stderr() for more information on talloc logging
+functions.
+
+talloc_free() operates recursively on its children.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_free_children(void *ptr);
+
+The talloc_free_children() walks along the list of all children of a
+talloc context and talloc_free()s only the children, not the context
+itself.
+
+A NULL argument is handled as no-op.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_reference(const void *context, const void *ptr);
+
+The talloc_reference() function makes "context" an additional parent
+of "ptr".
+
+The return value of talloc_reference() is always the original pointer
+"ptr", unless talloc ran out of memory in creating the reference in
+which case it will return NULL (each additional reference consumes
+around 48 bytes of memory on intel x86 platforms).
+
+If "ptr" is NULL, then the function is a no-op, and simply returns NULL.
+
+After creating a reference you can free it in one of the following
+ways:
+
+  - you can talloc_free() any parent of the original pointer. That
+    will reduce the number of parents of this pointer by 1, and will
+    cause this pointer to be freed if it runs out of parents.
+
+  - you can talloc_free() the pointer itself if it has at most one
+    parent. This behaviour has been changed since the release of version
+    2.0. Further information is given in the description of "talloc_free".
+
+For more control on which parent to remove, see talloc_unlink()
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+int talloc_unlink(const void *context, const void *ptr);
+
+The talloc_unlink() function removes a specific parent from ptr. The
+context passed must either be a context used in talloc_reference()
+with this pointer, or must be a direct parent of ptr. 
+
+Note that if the parent has already been removed using talloc_free()
+then this function will fail and will return -1.  Likewise, if "ptr"
+is NULL, then the function will make no modifications and return -1.
+
+You can just use talloc_free() instead of talloc_unlink() if there
+is at most one parent. This behaviour has been changed since the
+release of version 2.0. Further information is given in the description of
+"talloc_free".
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_set_destructor(const void *ptr, int (*destructor)(void *));
+
+The function talloc_set_destructor() sets the "destructor" for the
+pointer "ptr". A destructor is a function that is called when the
+memory used by a pointer is about to be released. The destructor
+receives the pointer as an argument, and should return 0 for success
+and -1 for failure.
+
+The destructor can do anything it wants to, including freeing other
+pieces of memory. A common use for destructors is to clean up
+operating system resources (such as open file descriptors) contained
+in the structure the destructor is placed on.
+
+You can only place one destructor on a pointer. If you need more than
+one destructor then you can create a zero-length child of the pointer
+and place an additional destructor on that.
+
+To remove a destructor call talloc_set_destructor() with NULL for the
+destructor.
+
+If your destructor attempts to talloc_free() the pointer that it is
+the destructor for then talloc_free() will return -1 and the free will
+be ignored. This would be a pointless operation anyway, as the
+destructor is only called when the memory is just about to go away.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+int talloc_increase_ref_count(const void *ptr);
+
+The talloc_increase_ref_count(ptr) function is exactly equivalent to:
+
+  talloc_reference(NULL, ptr);
+
+You can use either syntax, depending on which you think is clearer in
+your code.
+
+It returns 0 on success and -1 on failure.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+size_t talloc_reference_count(const void *ptr);
+
+Return the number of references to the pointer.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_set_name(const void *ptr, const char *fmt, ...);
+
+Each talloc pointer has a "name". The name is used principally for
+debugging purposes, although it is also possible to set and get the
+name on a pointer as a way of "marking" pointers in your code.
+
+The main use for names on pointer is for "talloc reports". See
+talloc_report() and talloc_report_full() for details. Also see
+talloc_enable_leak_report() and talloc_enable_leak_report_full().
+
+The talloc_set_name() function allocates memory as a child of the
+pointer. It is logically equivalent to:
+  talloc_set_name_const(ptr, talloc_asprintf(ptr, fmt, ...));
+
+Note that multiple calls to talloc_set_name() will allocate more
+memory without releasing the name. All of the memory is released when
+the ptr is freed using talloc_free().
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_set_name_const(const void *ptr, const char *name);
+
+The function talloc_set_name_const() is just like talloc_set_name(),
+but it takes a string constant, and is much faster. It is extensively
+used by the "auto naming" macros, such as talloc_p().
+
+This function does not allocate any memory. It just copies the
+supplied pointer into the internal representation of the talloc
+ptr. This means you must not pass a name pointer to memory that will
+disappear before the ptr is freed with talloc_free().
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_named(const void *context, size_t size, const char *fmt, ...);
+
+The talloc_named() function creates a named talloc pointer. It is
+equivalent to:
+
+   ptr = talloc_size(context, size);
+   talloc_set_name(ptr, fmt, ....);
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_named_const(const void *context, size_t size, const char *name);
+
+This is equivalent to::
+
+   ptr = talloc_size(context, size);
+   talloc_set_name_const(ptr, name);
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+const char *talloc_get_name(const void *ptr);
+
+This returns the current name for the given talloc pointer. See
+talloc_set_name() for details.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_init(const char *fmt, ...);
+
+This function creates a zero length named talloc context as a top
+level context. It is equivalent to::
+
+  talloc_named(NULL, 0, fmt, ...);
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_new(void *ctx);
+
+This is a utility macro that creates a new memory context hanging
+off an existing context, automatically naming it "talloc_new: __location__"
+where __location__ is the source line it is called from. It is
+particularly useful for creating a new temporary working context.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+(type *)talloc_realloc(const void *context, void *ptr, type, count);
+
+The talloc_realloc() macro changes the size of a talloc
+pointer. The "count" argument is the number of elements of type "type"
+that you want the resulting pointer to hold. 
+
+talloc_realloc() has the following equivalences::
+
+  talloc_realloc(context, NULL, type, 1) ==> talloc(context, type);
+  talloc_realloc(context, NULL, type, N) ==> talloc_array(context, type, N);
+  talloc_realloc(context, ptr, type, 0)  ==> talloc_free(ptr);
+
+The "context" argument is only used if "ptr" is NULL, otherwise it is
+ignored.
+
+talloc_realloc() returns the new pointer, or NULL on failure. The call
+will fail either due to a lack of memory, or because the pointer has
+more than one parent (see talloc_reference()).
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_realloc_size(const void *context, void *ptr, size_t size);
+
+The talloc_realloc_size() function is useful when the type is not
+known so the typesafe talloc_realloc() cannot be used.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_steal(const void *new_ctx, const void *ptr);
+
+The talloc_steal() function changes the parent context of a talloc
+pointer. It is typically used when the context that the pointer is
+currently a child of is going to be freed and you wish to keep the
+memory for a longer time. 
+
+The talloc_steal() function returns the pointer that you pass it. It
+does not have any failure modes.
+
+NOTE: It is possible to produce loops in the parent/child relationship
+if you are not careful with talloc_steal(). No guarantees are provided
+as to your sanity or the safety of your data if you do this.
+
+talloc_steal(new_ctx, NULL) will return NULL with no side effects.
+
+Note that if you try and call talloc_steal() on a pointer that has
+more than one parent then the result is ambiguous. Talloc will choose
+to remove the parent that is currently indicated by talloc_parent()
+and replace it with the chosen parent. You will also get a message
+like this via the talloc logging functions:
+
+  WARNING: talloc_steal with references at some_dir/source/foo.c:123
+       reference at some_dir/source/other.c:325
+       reference at some_dir/source/third.c:121
+
+To unambiguously change the parent of a pointer please see the
+function talloc_reparent(). See the talloc_set_log_fn() documentation
+for more information on talloc logging.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_reparent(const void *old_parent, const void *new_parent, const void *ptr);
+
+The talloc_reparent() function changes the parent context of a talloc
+pointer. It is typically used when the context that the pointer is
+currently a child of is going to be freed and you wish to keep the
+memory for a longer time.
+
+The talloc_reparent() function returns the pointer that you pass it. It
+does not have any failure modes.
+
+The difference between talloc_reparent() and talloc_steal() is that
+talloc_reparent() can specify which parent you wish to change. This is
+useful when a pointer has multiple parents via references.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_parent(const void *ptr);
+
+The talloc_parent() function returns the current talloc parent. This
+is usually the pointer under which this memory was originally created,
+but it may have changed due to a talloc_steal() or talloc_reparent().
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+size_t talloc_total_size(const void *ptr);
+
+The talloc_total_size() function returns the total size in bytes used
+by this pointer and all child pointers. Mostly useful for debugging.
+
+Passing NULL is allowed, but it will only give a meaningful result if
+talloc_enable_leak_report() or talloc_enable_leak_report_full() has
+been called.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+size_t talloc_total_blocks(const void *ptr);
+
+The talloc_total_blocks() function returns the total memory block
+count used by this pointer and all child pointers. Mostly useful for
+debugging.
+
+Passing NULL is allowed, but it will only give a meaningful result if
+talloc_enable_leak_report() or talloc_enable_leak_report_full() has
+been called.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_report_depth_cb(const void *ptr, int depth, int max_depth,
+                           void (*callback)(const void *ptr,
+                                            int depth, int max_depth,
+                                            int is_ref,
+                                            void *priv),
+                           void *priv);
+
+This provides a more flexible report than talloc_report(). It
+will recursively call the callback for the entire tree of memory
+referenced by the pointer. References in the tree are passed with
+is_ref = 1 and the pointer that is referenced.
+
+You can pass NULL for the pointer, in which case a report is
+printed for the top level memory context, but only if
+talloc_enable_leak_report() or talloc_enable_leak_report_full()
+has been called.
+
+The recursion is stopped when depth >= max_depth.
+max_depth = -1 means only stop at leaf nodes.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_report_depth_file(const void *ptr, int depth, int max_depth, FILE *f);
+
+This provides a more flexible report than talloc_report(). It
+will let you specify the depth and max_depth.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_report(const void *ptr, FILE *f);
+
+The talloc_report() function prints a summary report of all memory
+used by ptr. One line of report is printed for each immediate child of
+ptr, showing the total memory and number of blocks used by that child.
+
+You can pass NULL for the pointer, in which case a report is printed
+for the top level memory context, but only if
+talloc_enable_leak_report() or talloc_enable_leak_report_full() has
+been called.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_report_full(const void *ptr, FILE *f);
+
+This provides a more detailed report than talloc_report(). It will
+recursively print the entire tree of memory referenced by the
+pointer. References in the tree are shown by giving the name of the
+pointer that is referenced.
+
+You can pass NULL for the pointer, in which case a report is printed
+for the top level memory context, but only if
+talloc_enable_leak_report() or talloc_enable_leak_report_full() has
+been called.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_enable_leak_report(void);
+
+This enables calling of talloc_report(NULL, stderr) when the program
+exits. In Samba4 this is enabled by using the --leak-report command
+line option.
+
+For it to be useful, this function must be called before any other
+talloc function as it establishes a "null context" that acts as the
+top of the tree. If you don't call this function first then passing
+NULL to talloc_report() or talloc_report_full() won't give you the
+full tree printout.
+
+Here is a typical talloc report:
+
+talloc report on 'null_context' (total 267 bytes in 15 blocks)
+        libcli/auth/spnego_parse.c:55  contains     31 bytes in   2 blocks
+        libcli/auth/spnego_parse.c:55  contains     31 bytes in   2 blocks
+        iconv(UTF8,CP850)              contains     42 bytes in   2 blocks
+        libcli/auth/spnego_parse.c:55  contains     31 bytes in   2 blocks
+        iconv(CP850,UTF8)              contains     42 bytes in   2 blocks
+        iconv(UTF8,UTF-16LE)           contains     45 bytes in   2 blocks
+        iconv(UTF-16LE,UTF8)           contains     45 bytes in   2 blocks
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_enable_leak_report_full(void);
+
+This enables calling of talloc_report_full(NULL, stderr) when the
+program exits. In Samba4 this is enabled by using the
+--leak-report-full command line option.
+
+For it to be useful, this function must be called before any other
+talloc function as it establishes a "null context" that acts as the
+top of the tree. If you don't call this function first then passing
+NULL to talloc_report() or talloc_report_full() won't give you the
+full tree printout.
+
+Here is a typical full report:
+
+full talloc report on 'root' (total 18 bytes in 8 blocks)
+    p1                             contains     18 bytes in   7 blocks (ref 0)
+        r1                             contains     13 bytes in   2 blocks (ref 0)
+            reference to: p2
+        p2                             contains      1 bytes in   1 blocks (ref 1)
+        x3                             contains      1 bytes in   1 blocks (ref 0)
+        x2                             contains      1 bytes in   1 blocks (ref 0)
+        x1                             contains      1 bytes in   1 blocks (ref 0)
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_enable_null_tracking(void);
+
+This enables tracking of the NULL memory context without enabling leak
+reporting on exit. Useful for when you want to do your own leak
+reporting call via talloc_report_null_full();
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_disable_null_tracking(void);
+
+This disables tracking of the NULL memory context.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+(type *)talloc_zero(const void *ctx, type);
+
+The talloc_zero() macro is equivalent to::
+
+  ptr = talloc(ctx, type);
+  if (ptr) memset(ptr, 0, sizeof(type));
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_zero_size(const void *ctx, size_t size)
+
+The talloc_zero_size() function is useful when you don't have a known type.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_memdup(const void *ctx, const void *p, size_t size);
+
+The talloc_memdup() function is equivalent to::
+
+  ptr = talloc_size(ctx, size);
+  if (ptr) memcpy(ptr, p, size);
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+char *talloc_strdup(const void *ctx, const char *p);
+
+The talloc_strdup() function is equivalent to::
+
+  ptr = talloc_size(ctx, strlen(p)+1);
+  if (ptr) memcpy(ptr, p, strlen(p)+1);
+
+This function sets the name of the new pointer to the passed
+string. This is equivalent to::
+
+   talloc_set_name_const(ptr, ptr)
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+char *talloc_strndup(const void *t, const char *p, size_t n);
+
+The talloc_strndup() function is the talloc equivalent of the C
+library function strndup()
+
+This function sets the name of the new pointer to the passed
+string. This is equivalent to:
+   talloc_set_name_const(ptr, ptr)
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+char *talloc_append_string(const void *t, char *orig, const char *append);
+
+The talloc_append_string() function appends the string "append"
+to the string "orig".
+
+This function sets the name of the new pointer to the new
+string. This is equivalent to::
+
+   talloc_set_name_const(ptr, ptr)
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+char *talloc_vasprintf(const void *t, const char *fmt, va_list ap);
+
+The talloc_vasprintf() function is the talloc equivalent of the C
+library function vasprintf()
+
+This function sets the name of the new pointer to the new
+string. This is equivalent to::
+
+   talloc_set_name_const(ptr, ptr)
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+char *talloc_asprintf(const void *t, const char *fmt, ...);
+
+The talloc_asprintf() function is the talloc equivalent of the C
+library function asprintf()
+
+This function sets the name of the new pointer to the new
+string. This is equivalent to::
+
+   talloc_set_name_const(ptr, ptr)
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+char *talloc_asprintf_append(char *s, const char *fmt, ...);
+
+The talloc_asprintf_append() function appends the given formatted
+string to the given string.
+Use this variant when the string in the current talloc buffer may
+have been truncated in length.
+
+This function sets the name of the new pointer to the new
+string. This is equivalent to::
+
+   talloc_set_name_const(ptr, ptr)
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+char *talloc_asprintf_append_buffer(char *s, const char *fmt, ...);
+
+The talloc_asprintf_append_buffer() function appends the given formatted
+string to the end of the currently allocated talloc buffer.
+Use this variant when the string in the current talloc buffer has
+not been changed.
+
+This function sets the name of the new pointer to the new
+string. This is equivalent to::
+
+   talloc_set_name_const(ptr, ptr)
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+(type *)talloc_array(const void *ctx, type, unsigned int count);
+
+The talloc_array() macro is equivalent to::
+
+  (type *)talloc_size(ctx, sizeof(type) * count);
+
+except that it provides integer overflow protection for the multiply,
+returning NULL if the multiply overflows.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_array_size(const void *ctx, size_t size, unsigned int count);
+
+The talloc_array_size() function is useful when the type is not
+known. It operates in the same way as talloc_array(), but takes a size
+instead of a type.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+(typeof(ptr)) talloc_array_ptrtype(const void *ctx, ptr, unsigned int count);
+
+The talloc_array_ptrtype() macro should be used when you have a pointer to an array
+and want to allocate memory of an array to point at with this pointer. When compiling
+with gcc >= 3 it is typesafe. Note this is a wrapper of talloc_array_size()
+and talloc_get_name() will return the current location in the source file
+and not the type.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_realloc_fn(const void *ctx, void *ptr, size_t size);
+
+This is a non-macro version of talloc_realloc(), which is useful 
+as libraries sometimes want a realloc function pointer. A realloc()
+implementation encapsulates the functionality of malloc(), free() and
+realloc() in one call, which is why it is useful to be able to pass
+around a single function pointer.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_autofree_context(void);
+
+This is a handy utility function that returns a talloc context
+which will be automatically freed on program exit. This can be used
+to reduce the noise in memory leak reports.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_check_name(const void *ptr, const char *name);
+
+This function checks if a pointer has the specified name. If it does
+then the pointer is returned. If it doesn't then NULL is returned.
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+(type *)talloc_get_type(const void *ptr, type);
+
+This macro allows you to do type checking on talloc pointers. It is
+particularly useful for void* private pointers. It is equivalent to
+this::
+
+   (type *)talloc_check_name(ptr, #type)
+
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+talloc_set_type(const void *ptr, type);
+
+This macro allows you to force the name of a pointer to be of a
+particular type. This can be used in conjunction with
+talloc_get_type() to do type checking on void* pointers.
+
+It is equivalent to this::
+
+   talloc_set_name_const(ptr, #type)
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+talloc_get_size(const void *ctx);
+
+This function lets you know the amount of memory allocated so far by
+this context. It does NOT account for subcontext memory.
+This can be used to calculate the size of an array.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void *talloc_find_parent_byname(const void *ctx, const char *name);
+
+Find a parent memory context of the current context that has the given
+name. This can be very useful in complex programs where it may be
+difficult to pass all information down to the level you need, but you
+know the structure you want is a parent of another context.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+(type *)talloc_find_parent_bytype(ctx, type);
+
+Like talloc_find_parent_byname() but takes a type, making it typesafe.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_set_log_fn(void (*log_fn)(const char *message));
+
+This function sets a logging function that talloc will use for
+warnings and errors. By default talloc will not print any warnings or
+errors.
+
+=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+void talloc_set_log_stderr(void)
+
+This sets the talloc log function to write log messages to stderr.
diff --git a/ctdb/lib/talloc/talloc_testsuite.h b/ctdb/lib/talloc/talloc_testsuite.h
new file mode 100644 (file)
index 0000000..acb9701
--- /dev/null
@@ -0,0 +1,7 @@
+#ifndef __LIB_TALLOC_TALLOC_TESTSUITE_H__
+#define __LIB_TALLOC_TALLOC_TESTSUITE_H__
+
+/* only ever used through a pointer here, so a forward declaration suffices */
+struct torture_context;
+/*
+  presumably the entry point that runs the local talloc tests and returns
+  true on success - the definition is not part of this header.
+  NOTE(review): this header includes nothing itself, so 'bool' must already
+  be in scope in the including file.
+*/
+bool torture_local_talloc(struct torture_context *tctx);
+
+#endif
diff --git a/ctdb/lib/talloc/testsuite.c b/ctdb/lib/talloc/testsuite.c
new file mode 100644 (file)
index 0000000..d456cbb
--- /dev/null
@@ -0,0 +1,1602 @@
+/* 
+   Unix SMB/CIFS implementation.
+
+   local testing of talloc routines.
+
+   Copyright (C) Andrew Tridgell 2004
+   
+     ** NOTE! The following LGPL license applies to the talloc
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/time.h"
+#include <talloc.h>
+
+#include "talloc_testsuite.h"
+
+/*
+  return the current time of day as filled in by gettimeofday()
+  (the return value of gettimeofday() itself is not checked)
+*/
+static struct timeval timeval_current(void)
+{
+       struct timeval now;
+
+       gettimeofday(&now, NULL);
+
+       return now;
+}
+
+/*
+  return the number of seconds (with microsecond fraction) that have
+  passed between *tv and the current time
+*/
+static double timeval_elapsed(struct timeval *tv)
+{
+       struct timeval now = timeval_current();
+       double whole_secs = now.tv_sec - tv->tv_sec;
+       double frac_secs = (now.tv_usec - tv->tv_usec)*1.0e-6;
+
+       return whole_secs + frac_secs;
+}
+
+/*
+  Fail the calling test unless 'expr' is true.
+
+  On failure this prints a "failure: <test> [ ... ]" record containing the
+  source location, the stringified expression and 'str', then makes the
+  *calling function* return false - so it may only be used inside
+  functions returning bool.
+
+  Note that this expands to a bare if-statement (no do { } while (0)
+  wrapper), so it must not be used as the unbraced body of an if/else.
+*/
+#define torture_assert(test, expr, str) if (!(expr)) { \
+       printf("failure: %s [\n%s: Expression %s failed: %s\n]\n", \
+               test, __location__, #expr, str); \
+       return false; \
+}
+
+/*
+  Fail the calling test unless the strings 'arg1' and 'arg2' are equal.
+
+  Two NULL pointers count as equal.  A NULL on only one side is reported
+  as a mismatch (printed as "(null)") instead of being handed to strcmp().
+  On failure a "failure: <test> [ ... ]" record is printed, with 'arg1' in
+  the "Expected" slot and 'arg2' in the "got" slot, and the *calling
+  function* returns false.
+
+  The arguments are evaluated more than once, so they must not have side
+  effects.  Like torture_assert() this expands to a bare if/else chain.
+*/
+#define torture_assert_str_equal(test, arg1, arg2, desc) \
+       if ((arg1) == NULL && (arg2) == NULL) {                         \
+       } else if ((arg1) == NULL || (arg2) == NULL ||                  \
+                  strcmp((arg1), (arg2)) != 0) {                       \
+               printf("failure: %s [\n%s: Expected %s, got %s: %s\n]\n", \
+                  test, __location__,                                  \
+                  (arg1) != NULL ? (arg1) : "(null)",                  \
+                  (arg2) != NULL ? (arg2) : "(null)", desc);           \
+               return false; \
+       }
+
+/*
+  Fail the calling test unless talloc_total_size(ptr) equals 'tsize'.
+
+  On mismatch a "failure: <test> [ ... ]" record (the same result keyword
+  torture_assert() uses) is printed with the stringified pointer name and
+  both sizes, the tree below 'ptr' is dumped to stdout and the *calling
+  function* returns false.  'ptr' is evaluated more than once.
+*/
+#define CHECK_SIZE(test, ptr, tsize) do { \
+       if (talloc_total_size(ptr) != (tsize)) { \
+               printf("failure: %s [\n%s: wrong '%s' tree size: got %u  expected %u\n]\n", \
+                      test, __location__, #ptr, \
+                      (unsigned)talloc_total_size(ptr), \
+                      (unsigned)(tsize)); /* cast the whole expression, not its first operand */ \
+               talloc_report_full(ptr, stdout); \
+               return false; \
+       } \
+} while (0)
+
+/*
+  Fail the calling test unless talloc_total_blocks(ptr) equals 'tblocks'.
+
+  On mismatch a "failure: <test> [ ... ]" record (the same result keyword
+  torture_assert() uses) is printed with the stringified pointer name and
+  both block counts, the tree below 'ptr' is dumped to stdout and the
+  *calling function* returns false.  'ptr' is evaluated more than once.
+*/
+#define CHECK_BLOCKS(test, ptr, tblocks) do { \
+       if (talloc_total_blocks(ptr) != (tblocks)) { \
+               printf("failure: %s [\n%s: wrong '%s' tree blocks: got %u  expected %u\n]\n", \
+                      test, __location__, #ptr, \
+                      (unsigned)talloc_total_blocks(ptr), \
+                      (unsigned)(tblocks)); /* cast the whole expression, not its first operand */ \
+               talloc_report_full(ptr, stdout); \
+               return false; \
+       } \
+} while (0)
+
+/*
+  Fail the calling test unless talloc_parent(ptr) is 'parent'.
+
+  On mismatch a "failure: <test> [ ... ]" record (the same result keyword
+  torture_assert() uses) is printed with the stringified pointer name and
+  both parent pointers; the trees below 'ptr', below 'parent' and below
+  the NULL context are dumped to stdout and the *calling function*
+  returns false.  'ptr' and 'parent' are evaluated more than once.
+*/
+#define CHECK_PARENT(test, ptr, parent) do { \
+       if (talloc_parent(ptr) != (parent)) { \
+               printf("failure: %s [\n%s: '%s' has wrong parent: got %p  expected %p\n]\n", \
+                      test, __location__, #ptr, \
+                      talloc_parent(ptr), \
+                      (parent)); \
+               talloc_report_full(ptr, stdout); \
+               talloc_report_full((parent), stdout); \
+               talloc_report_full(NULL, stdout); \
+               return false; \
+       } \
+} while (0)
+
+/* incremented by test_abort_fn(); reset to 0 by test_abort_start() and test_abort_stop() */
+static unsigned int test_abort_count;
+
+/* the two helpers below are compiled out for as long as this stays "#if 0" */
+#if 0
+/* abort handler: logs the reason as a '#' comment line and counts the call */
+static void test_abort_fn(const char *reason)
+{
+       printf("# test_abort_fn(%s)\n", reason);
+       test_abort_count++;
+}
+
+/* zero the counter and install test_abort_fn() as talloc's abort function */
+static void test_abort_start(void)
+{
+       test_abort_count = 0;
+       talloc_set_abort_fn(test_abort_fn);
+}
+#endif
+
+/*
+  hand talloc a NULL abort function and zero the abort counter
+*/
+static void test_abort_stop(void)
+{
+       talloc_set_abort_fn(NULL);
+       test_abort_count = 0;
+}
+
+/*
+  log callback: write the message to stdout unchanged (no newline added)
+*/
+static void test_log_stdout(const char *message)
+{
+       printf("%s", message);
+}
+
+/*
+  test references: SINGLE REFERENCE FREE
+
+  p2 is a child of p1 (next to x1..x3) and is additionally referenced
+  from r1.  The test unlinks that reference, frees p1 and r1, and checks
+  the block counts after every step.  It also checks that referencing
+  NULL yields NULL.
+
+  Returns true on success; on the first failed check a failure record
+  is printed and false is returned.
+*/
+static bool test_ref1(void)
+{
+       void *root, *p1, *p2, *r1;
+
+       printf("test: ref1\n# SINGLE REFERENCE FREE\n");
+
+       root = talloc_named_const(NULL, 0, "root");
+       p1 = talloc_named_const(root, 1, "p1");
+       p2 = talloc_named_const(p1, 1, "p2");
+       talloc_named_const(p1, 1, "x1");
+       talloc_named_const(p1, 2, "x2");
+       talloc_named_const(p1, 3, "x3");
+
+       r1 = talloc_named_const(root, 1, "r1");
+       /* the returned handle is not needed: the reference is dropped via p2 below */
+       (void)talloc_reference(r1, p2);
+       talloc_report_full(root, stderr);
+
+       /* p1 + p2 + x1..x3; r1 is expected to count 2 blocks while it holds the reference */
+       CHECK_BLOCKS("ref1", p1, 5);
+       CHECK_BLOCKS("ref1", p2, 1);
+       CHECK_BLOCKS("ref1", r1, 2);
+
+       fprintf(stderr, "Freeing p2\n");
+       talloc_unlink(r1, p2);
+       talloc_report_full(root, stderr);
+
+       /* only the reference went away: p2 is still a child of p1 */
+       CHECK_BLOCKS("ref1", p1, 5);
+       CHECK_BLOCKS("ref1", p2, 1);
+       CHECK_BLOCKS("ref1", r1, 1);
+
+       fprintf(stderr, "Freeing p1\n");
+       talloc_free(p1);
+       talloc_report_full(root, stderr);
+
+       CHECK_BLOCKS("ref1", r1, 1);
+
+       fprintf(stderr, "Freeing r1\n");
+       talloc_free(r1);
+       talloc_report_full(NULL, stderr);
+
+       fprintf(stderr, "Testing NULL\n");
+       /* report the failure instead of silently returning false */
+       torture_assert("ref1", talloc_reference(root, NULL) == NULL,
+               "failed: talloc_reference() of NULL should give NULL\n");
+
+       CHECK_BLOCKS("ref1", root, 1);
+
+       CHECK_SIZE("ref1", root, 0);
+
+       talloc_free(root);
+       printf("success: ref1\n");
+       return true;
+}
+
+/*
+  test references: DOUBLE REFERENCE FREE
+
+  p2 is a child of p1 (next to x1..x3) and is additionally referenced
+  from r1.  The reference is removed by unlinking the value returned by
+  talloc_reference() from r1, then p2, p1 and r1 are freed in turn with
+  the block counts checked after every step.
+
+  Returns true on success; the CHECK_* macros return false on failure.
+*/
+static bool test_ref2(void)
+{
+       void *root, *p1, *p2, *ref, *r1;
+
+       printf("test: ref2\n# DOUBLE REFERENCE FREE\n");
+       root = talloc_named_const(NULL, 0, "root");
+       p1 = talloc_named_const(root, 1, "p1");
+       talloc_named_const(p1, 1, "x1");
+       talloc_named_const(p1, 1, "x2");
+       talloc_named_const(p1, 1, "x3");
+       p2 = talloc_named_const(p1, 1, "p2");
+
+       r1 = talloc_named_const(root, 1, "r1"); 
+       ref = talloc_reference(r1, p2);
+       talloc_report_full(root, stderr);
+
+       /* p1 + x1..x3 + p2; r1 is expected to count 2 blocks while it holds the reference */
+       CHECK_BLOCKS("ref2", p1, 5);
+       CHECK_BLOCKS("ref2", p2, 1);
+       CHECK_BLOCKS("ref2", r1, 2);
+
+       fprintf(stderr, "Freeing ref\n");
+       talloc_unlink(r1, ref);
+       talloc_report_full(root, stderr);
+
+       /* the reference is gone from r1, p2 itself is untouched */
+       CHECK_BLOCKS("ref2", p1, 5);
+       CHECK_BLOCKS("ref2", p2, 1);
+       CHECK_BLOCKS("ref2", r1, 1);
+
+       fprintf(stderr, "Freeing p2\n");
+       talloc_free(p2);
+       talloc_report_full(root, stderr);
+
+       /* this time p2 really leaves p1 */
+       CHECK_BLOCKS("ref2", p1, 4);
+       CHECK_BLOCKS("ref2", r1, 1);
+
+       fprintf(stderr, "Freeing p1\n");
+       talloc_free(p1);
+       talloc_report_full(root, stderr);
+
+       CHECK_BLOCKS("ref2", r1, 1);
+
+       fprintf(stderr, "Freeing r1\n");
+       talloc_free(r1);
+       talloc_report_full(root, stderr);
+
+       CHECK_SIZE("ref2", root, 0);
+
+       talloc_free(root);
+       printf("success: ref2\n");
+       return true;
+}
+
+/*
+  test references: PARENT REFERENCE FREE
+
+  r1 is a child of p1 and is additionally referenced from p2.  Freeing
+  the original parent p1 must leave r1 alive below p2; freeing p2
+  afterwards must leave nothing below root.
+
+  Returns true on success; the CHECK_* macros return false on failure.
+*/
+static bool test_ref3(void)
+{
+       void *root, *p1, *p2, *r1;
+
+       printf("test: ref3\n# PARENT REFERENCE FREE\n");
+
+       root = talloc_named_const(NULL, 0, "root");
+       p1 = talloc_named_const(root, 1, "p1");
+       p2 = talloc_named_const(root, 1, "p2");
+       r1 = talloc_named_const(p1, 1, "r1");
+       /* the returned handle is never needed in this test */
+       (void)talloc_reference(p2, r1);
+       talloc_report_full(root, stderr);
+
+       CHECK_BLOCKS("ref3", p1, 2);
+       CHECK_BLOCKS("ref3", p2, 2);
+       CHECK_BLOCKS("ref3", r1, 1);
+
+       fprintf(stderr, "Freeing p1\n");
+       talloc_free(p1);
+       talloc_report_full(root, stderr);
+
+       /* r1 is still expected to be reachable and accounted for under p2 */
+       CHECK_BLOCKS("ref3", p2, 2);
+       CHECK_BLOCKS("ref3", r1, 1);
+
+       fprintf(stderr, "Freeing p2\n");
+       talloc_free(p2);
+       talloc_report_full(root, stderr);
+
+       CHECK_SIZE("ref3", root, 0);
+
+       talloc_free(root);
+
+       printf("success: ref3\n");
+       return true;
+}
+
+/*
+  test references: REFERRER REFERENCE FREE
+
+  p2 is a child of p1 (next to x1..x3) and is additionally referenced
+  from r1.  Here the referrer r1 is freed first; p2 must stay in place
+  below p1 until it is freed itself.
+
+  Returns true on success; the CHECK_* macros return false on failure.
+*/
+static bool test_ref4(void)
+{
+       void *root, *p1, *p2, *r1;
+
+       printf("test: ref4\n# REFERRER REFERENCE FREE\n");
+
+       root = talloc_named_const(NULL, 0, "root");
+       p1 = talloc_named_const(root, 1, "p1");
+       talloc_named_const(p1, 1, "x1");
+       talloc_named_const(p1, 1, "x2");
+       talloc_named_const(p1, 1, "x3");
+       p2 = talloc_named_const(p1, 1, "p2");
+
+       r1 = talloc_named_const(root, 1, "r1");
+       /* the returned handle is never needed in this test */
+       (void)talloc_reference(r1, p2);
+       talloc_report_full(root, stderr);
+
+       CHECK_BLOCKS("ref4", p1, 5);
+       CHECK_BLOCKS("ref4", p2, 1);
+       CHECK_BLOCKS("ref4", r1, 2);
+
+       fprintf(stderr, "Freeing r1\n");
+       talloc_free(r1);
+       talloc_report_full(root, stderr);
+
+       /* freeing the referrer must not touch p2 or its parent */
+       CHECK_BLOCKS("ref4", p1, 5);
+       CHECK_BLOCKS("ref4", p2, 1);
+
+       fprintf(stderr, "Freeing p2\n");
+       talloc_free(p2);
+       talloc_report_full(root, stderr);
+
+       CHECK_BLOCKS("ref4", p1, 4);
+
+       fprintf(stderr, "Freeing p1\n");
+       talloc_free(p1);
+       talloc_report_full(root, stderr);
+
+       CHECK_SIZE("ref4", root, 0);
+
+       talloc_free(root);
+
+       printf("success: ref4\n");
+       return true;
+}
+
+
+/*
+  test talloc_unlink(): UNLINK
+
+  p2 and r1 are both children of p1, and r1 additionally holds a
+  reference to p2.  Unlinking p2 from r1 must drop only that reference
+  (p1 goes from 7 to 6 blocks); freeing p1 then releases everything.
+
+  Returns true on success; the CHECK_* macros return false on failure.
+*/
+static bool test_unlink1(void)
+{
+       void *root, *p1, *p2, *r1;
+
+       printf("test: unlink\n# UNLINK\n");
+
+       root = talloc_named_const(NULL, 0, "root");
+       p1 = talloc_named_const(root, 1, "p1");
+       talloc_named_const(p1, 1, "x1");
+       talloc_named_const(p1, 1, "x2");
+       talloc_named_const(p1, 1, "x3");
+       p2 = talloc_named_const(p1, 1, "p2");
+
+       r1 = talloc_named_const(p1, 1, "r1");
+       /* the returned handle is never needed in this test */
+       (void)talloc_reference(r1, p2);
+       talloc_report_full(root, stderr);
+
+       CHECK_BLOCKS("unlink", p1, 7);
+       CHECK_BLOCKS("unlink", p2, 1);
+       CHECK_BLOCKS("unlink", r1, 2);
+
+       fprintf(stderr, "Unreferencing r1\n");
+       talloc_unlink(r1, p2);
+       talloc_report_full(root, stderr);
+
+       CHECK_BLOCKS("unlink", p1, 6);
+       CHECK_BLOCKS("unlink", p2, 1);
+       CHECK_BLOCKS("unlink", r1, 1);
+
+       fprintf(stderr, "Freeing p1\n");
+       talloc_free(p1);
+       talloc_report_full(root, stderr);
+
+       CHECK_SIZE("unlink", root, 0);
+
+       talloc_free(root);
+
+       printf("success: unlink\n");
+       return true;
+}
+
+/*
+  destructor that always reports failure (-1); 'ptr' is ignored
+*/
+static int fail_destructor(void *ptr)
+{
+       const int refuse = -1;
+
+       return refuse;
+}
+
+/*
+  miscellaneous tests to try to get a higher test coverage percentage
+
+  Exercises, in order: oversized allocations, talloc_increase_ref_count()
+  with talloc_unlink(NULL, ...), naming, failing destructors, zeroed
+  allocations, string helpers, array overflow detection, talloc_named()/
+  talloc_init(), and references combined with talloc_unlink().
+
+  Returns true on success; every torture_assert*()/CHECK_*() returns
+  false from this function at the first failure.
+*/
+static bool test_misc(void)
+{
+       void *root, *p1;
+       char *p2;
+       double *d;
+
+       printf("test: misc\n# MISCELLANEOUS\n");
+
+       root = talloc_new(NULL);
+
+       p1 = talloc_size(root, 0x7fffffff);
+       torture_assert("misc", !p1, "failed: large talloc allowed\n");
+
+       /* three extra references on p1, dropped one at a time further down */
+       p1 = talloc_strdup(root, "foo");
+       talloc_increase_ref_count(p1);
+       talloc_increase_ref_count(p1);
+       talloc_increase_ref_count(p1);
+       CHECK_BLOCKS("misc", p1, 1);
+       CHECK_BLOCKS("misc", root, 2);
+       talloc_unlink(NULL, p1);
+       CHECK_BLOCKS("misc", p1, 1);
+       CHECK_BLOCKS("misc", root, 2);
+       talloc_unlink(NULL, p1);
+       CHECK_BLOCKS("misc", p1, 1);
+       CHECK_BLOCKS("misc", root, 2);
+       p2 = talloc_strdup(p1, "foo");
+       torture_assert("misc", talloc_unlink(root, p2) == -1,
+                                  "failed: talloc_unlink() of non-reference context should return -1\n");
+       torture_assert("misc", talloc_unlink(p1, p2) == 0,
+               "failed: talloc_unlink() of parent should succeed\n");
+       talloc_unlink(NULL, p1);
+       CHECK_BLOCKS("misc", p1, 1);
+       CHECK_BLOCKS("misc", root, 2);
+
+       /* the formatted name is expected to show up as an extra block under p1 */
+       (void)talloc_set_name(p1, "my name is %s", "foo");
+       torture_assert_str_equal("misc", talloc_get_name(p1), "my name is foo",
+               "failed: wrong name after talloc_set_name(my name is foo)");
+       CHECK_BLOCKS("misc", p1, 2);
+       CHECK_BLOCKS("misc", root, 3);
+
+       talloc_set_name_const(p1, NULL);
+       torture_assert_str_equal ("misc", talloc_get_name(p1), "UNNAMED",
+               "failed: wrong name after talloc_set_name(NULL)");
+       CHECK_BLOCKS("misc", p1, 2);
+       CHECK_BLOCKS("misc", root, 3);
+
+       torture_assert("misc", talloc_free(NULL) == -1, 
+                                  "talloc_free(NULL) should give -1\n");
+
+       talloc_set_destructor(p1, fail_destructor);
+       torture_assert("misc", talloc_free(p1) == -1, 
+               "Failed destructor should cause talloc_free to fail\n");
+       talloc_set_destructor(p1, NULL);
+
+       talloc_report(root, stderr);
+
+
+       p2 = (char *)talloc_zero_size(p1, 20);
+       torture_assert("misc", p2[19] == 0, "Failed to give zero memory\n");
+       talloc_free(p2);
+
+       torture_assert("misc", talloc_strdup(root, NULL) == NULL,
+               "failed: strdup on NULL should give NULL\n");
+
+       p2 = talloc_strndup(p1, "foo", 2);
+       torture_assert("misc", strcmp("fo", p2) == 0, 
+                                  "strndup doesn't work\n");
+       p2 = talloc_asprintf_append_buffer(p2, "o%c", 'd');
+       torture_assert("misc", strcmp("food", p2) == 0, 
+                                  "talloc_asprintf_append_buffer doesn't work\n");
+       CHECK_BLOCKS("misc", p2, 1);
+       CHECK_BLOCKS("misc", p1, 3);
+
+       p2 = talloc_asprintf_append_buffer(NULL, "hello %s", "world");
+       torture_assert("misc", strcmp("hello world", p2) == 0,
+               "talloc_asprintf_append_buffer doesn't work\n");
+       CHECK_BLOCKS("misc", p2, 1);
+       CHECK_BLOCKS("misc", p1, 3);
+       talloc_free(p2);
+
+       /* element count chosen so that the array request is expected to be refused */
+       d = talloc_array(p1, double, 0x20000000);
+       torture_assert("misc", !d, "failed: integer overflow not detected\n");
+
+       d = talloc_realloc(p1, d, double, 0x20000000);
+       torture_assert("misc", !d, "failed: integer overflow not detected\n");
+
+       talloc_free(p1);
+       CHECK_BLOCKS("misc", root, 1);
+
+       p1 = talloc_named(root, 100, "%d bytes", 100);
+       CHECK_BLOCKS("misc", p1, 2);
+       CHECK_BLOCKS("misc", root, 3);
+       talloc_unlink(root, p1);
+
+       p1 = talloc_init("%d bytes", 200);
+       p2 = talloc_asprintf(p1, "my test '%s'", "string");
+       /*
+         the description is printed through "%s", so it must not contain
+         printf conversions of its own; expected/actual values are already
+         part of the failure record
+       */
+       torture_assert_str_equal("misc", p2, "my test 'string'",
+               "failed: talloc_asprintf(\"my test '%s'\", \"string\") gave the wrong string");
+       CHECK_BLOCKS("misc", p1, 3);
+       CHECK_SIZE("misc", p2, 17);
+       CHECK_BLOCKS("misc", root, 1);
+       talloc_unlink(NULL, p1);
+
+       /* p2 is owned by root and referenced by p1: unlink from the owner first */
+       p1 = talloc_named_const(root, 10, "p1");
+       p2 = (char *)talloc_named_const(root, 20, "p2");
+       (void)talloc_reference(p1, p2);
+       talloc_report_full(root, stderr);
+       talloc_unlink(root, p2);
+       talloc_report_full(root, stderr);
+       CHECK_BLOCKS("misc", p2, 1);
+       CHECK_BLOCKS("misc", p1, 2);
+       CHECK_BLOCKS("misc", root, 3);
+       talloc_unlink(p1, p2);
+       talloc_unlink(root, p1);
+
+       /* same again, but with the reference held by the NULL context */
+       p1 = talloc_named_const(root, 10, "p1");
+       p2 = (char *)talloc_named_const(root, 20, "p2");
+       (void)talloc_reference(NULL, p2);
+       talloc_report_full(root, stderr);
+       talloc_unlink(root, p2);
+       talloc_report_full(root, stderr);
+       CHECK_BLOCKS("misc", p2, 1);
+       CHECK_BLOCKS("misc", p1, 1);
+       CHECK_BLOCKS("misc", root, 2);
+       talloc_unlink(NULL, p2);
+       talloc_unlink(root, p1);
+
+       /* Test that talloc_unlink is a no-op */
+
+       torture_assert("misc", talloc_unlink(root, NULL) == -1,
+               "failed: talloc_unlink(root, NULL) == -1\n");
+
+       talloc_report(root, stderr);
+       talloc_report(NULL, stderr);
+
+       CHECK_SIZE("misc", root, 0);
+
+       talloc_free(root);
+
+       CHECK_SIZE("misc", NULL, 0);
+
+       talloc_enable_null_tracking_no_autofree();
+       talloc_enable_leak_report();
+       talloc_enable_leak_report_full();
+
+       printf("success: misc\n");
+
+       return true;
+}
+
+
+/*
+  test realloc
+
+  Grows and shrinks blocks with talloc_realloc_size(), including blocks
+  that have children, a block with an extra reference count (realloc is
+  expected to fail), an oversized request (expected to fail) and
+  realloc to size 0.
+
+  Returns true on success; every torture_assert()/CHECK_*() returns
+  false from this function at the first failure.
+*/
+static bool test_realloc(void)
+{
+       void *root, *p1, *p2;
+
+       printf("test: realloc\n# REALLOC\n");
+
+       root = talloc_new(NULL);
+
+       p1 = talloc_size(root, 10);
+       CHECK_SIZE("realloc", p1, 10);
+
+       p1 = talloc_realloc_size(NULL, p1, 20);
+       CHECK_SIZE("realloc", p1, 20);
+
+       talloc_new(p1);
+
+       /* a NULL pointer makes this a fresh 30 byte allocation under p1 */
+       p2 = talloc_realloc_size(p1, NULL, 30);
+
+       talloc_new(p1);
+
+       p2 = talloc_realloc_size(p1, p2, 40);
+
+       /* p1 (20) + p2 (40); blocks: p1, the two talloc_new() children and p2 */
+       CHECK_SIZE("realloc", p2, 40);
+       CHECK_SIZE("realloc", root, 60);
+       CHECK_BLOCKS("realloc", p1, 4);
+
+       p1 = talloc_realloc_size(NULL, p1, 20);
+       CHECK_SIZE("realloc", p1, 60);
+
+       talloc_increase_ref_count(p2);
+       torture_assert("realloc", talloc_realloc_size(NULL, p2, 5) == NULL,
+               "failed: talloc_realloc() on a referenced pointer should fail\n");
+       CHECK_BLOCKS("realloc", p1, 4);
+
+       /* the first two calls leave the block count at 4; the third, with p1 as context, drops it to 3 */
+       talloc_realloc_size(NULL, p2, 0);
+       talloc_realloc_size(NULL, p2, 0);
+       CHECK_BLOCKS("realloc", p1, 4);
+       talloc_realloc_size(p1, p2, 0);
+       CHECK_BLOCKS("realloc", p1, 3);
+
+       torture_assert("realloc", talloc_realloc_size(NULL, p1, 0x7fffffff) == NULL,
+               "failed: oversize talloc should fail\n");
+
+       talloc_realloc_size(NULL, p1, 0);
+       CHECK_BLOCKS("realloc", root, 4);
+       talloc_realloc_size(root, p1, 0);
+       CHECK_BLOCKS("realloc", root, 1);
+
+       CHECK_SIZE("realloc", root, 0);
+
+       talloc_free(root);
+
+       printf("success: realloc\n");
+
+       return true;
+}
+
+/*
+  test realloc with a child
+
+  Builds three pointer arrays below el1, each with children of its own,
+  then reallocs every array to a larger element count and frees the
+  whole tree.  There are no explicit checks: the function always returns
+  true if it runs to completion.
+*/
+static bool test_realloc_child(void)
+{
+       void *root;
+       struct el2 {
+               const char *name;
+       } *el2; 
+       struct el1 {
+               int count;
+               struct el2 **list, **list2, **list3;
+       } *el1;
+
+       printf("test: REALLOC WITH CHILD\n");
+
+       root = talloc_new(NULL);
+
+       el1 = talloc(root, struct el1);
+       el1->list = talloc(el1, struct el2 *);
+       el1->list[0] = talloc(el1->list, struct el2);
+       el1->list[0]->name = talloc_strdup(el1->list[0], "testing");
+
+       el1->list2 = talloc(el1, struct el2 *);
+       el1->list2[0] = talloc(el1->list2, struct el2);
+       el1->list2[0]->name = talloc_strdup(el1->list2[0], "testing2");
+
+       el1->list3 = talloc(el1, struct el2 *);
+       el1->list3[0] = talloc(el1->list3, struct el2);
+       el1->list3[0]->name = talloc_strdup(el1->list3[0], "testing2");
+       
+       /* one more child per list; el2 is overwritten each time and never read */
+       el2 = talloc(el1->list, struct el2);
+       el2 = talloc(el1->list2, struct el2);
+       el2 = talloc(el1->list3, struct el2);
+
+       el1->list = talloc_realloc(el1, el1->list, struct el2 *, 100);
+       el1->list2 = talloc_realloc(el1, el1->list2, struct el2 *, 200);
+       el1->list3 = talloc_realloc(el1, el1->list3, struct el2 *, 300);
+
+       talloc_free(root);
+
+       printf("success: REALLOC WITH CHILD\n");
+       return true;
+}
+
+/*
+  test type checking
+
+  Allocates a struct el1 and checks that talloc_get_type() accepts the
+  matching type, rejects the identically laid out struct el2, and
+  accepts struct el2 once talloc_set_type() has been applied.
+
+  Returns true on success; torture_assert() returns false on failure.
+*/
+static bool test_type(void)
+{
+       void *root;
+       struct el1 {
+               int count;
+       };
+       struct el2 {
+               int count;
+       };
+       struct el1 *el1;
+
+       printf("test: type\n# talloc type checking\n");
+
+       root = talloc_new(NULL);
+
+       el1 = talloc(root, struct el1);
+
+       el1->count = 1;
+
+       torture_assert("type", talloc_get_type(el1, struct el1) == el1,
+               "type check failed on el1\n");
+       torture_assert("type", talloc_get_type(el1, struct el2) == NULL,
+               "type check failed on el1 with el2\n");
+       talloc_set_type(el1, struct el2);
+       torture_assert("type", talloc_get_type(el1, struct el2) == (struct el2 *)el1,
+               "type set failed on el1 with el2\n");
+
+       talloc_free(root);
+
+       printf("success: type\n");
+       return true;
+}
+
+/*
+  test steal
+
+  Moves blocks between root and the NULL context with talloc_steal()
+  and checks sizes and block counts after each move, including stealing
+  NULL and stealing a block onto itself.
+
+  Returns true on success; every torture_assert()/CHECK_*() returns
+  false from this function at the first failure.
+*/
+static bool test_steal(void)
+{
+       void *root, *p1, *p2;
+
+       printf("test: steal\n# STEAL\n");
+
+       root = talloc_new(NULL);
+
+       p1 = talloc_array(root, char, 10);
+       CHECK_SIZE("steal", p1, 10);
+
+       /* realloc of NULL acts as a fresh 20 byte allocation under root */
+       p2 = talloc_realloc(root, NULL, char, 20);
+       CHECK_SIZE("steal", p1, 10);
+       CHECK_SIZE("steal", root, 30);
+
+       torture_assert("steal", talloc_steal(p1, NULL) == NULL,
+               "failed: stealing NULL should give NULL\n");
+
+       torture_assert("steal", talloc_steal(p1, p1) == p1,
+               "failed: stealing to ourselves is a nop\n");
+       CHECK_BLOCKS("steal", root, 3);
+       CHECK_SIZE("steal", root, 30);
+
+       /* move both blocks out of root */
+       talloc_steal(NULL, p1);
+       talloc_steal(NULL, p2);
+       CHECK_BLOCKS("steal", root, 1);
+       CHECK_SIZE("steal", root, 0);
+
+       talloc_free(p1);
+       talloc_steal(root, p2);
+       CHECK_BLOCKS("steal", root, 2);
+       CHECK_SIZE("steal", root, 20);
+       
+       talloc_free(p2);
+
+       CHECK_BLOCKS("steal", root, 1);
+       CHECK_SIZE("steal", root, 0);
+
+       talloc_free(root);
+
+       /* an allocation on the NULL context must be accounted to it */
+       p1 = talloc_size(NULL, 3);
+       talloc_report_full(NULL, stderr);
+       CHECK_SIZE("steal", NULL, 3);
+       talloc_free(p1);
+
+       printf("success: steal\n");
+       return true;
+}
+
+/*
+  test move
+
+  Moves two members from t1 to t2 with talloc_move() and checks that
+  the source members were set to NULL while the destination members
+  still carry the original contents.
+
+  Returns true on success; torture_assert() returns false on failure.
+*/
+static bool test_move(void)
+{
+       void *root;
+       struct t_move {
+               char *p;
+               int *x;
+       } *t1, *t2;
+
+       printf("test: move\n# MOVE\n");
+
+       root = talloc_new(NULL);
+
+       t1 = talloc(root, struct t_move);
+       t2 = talloc(root, struct t_move);
+       t1->p = talloc_strdup(t1, "foo");
+       t1->x = talloc(t1, int);
+       *t1->x = 42;
+
+       t2->p = talloc_move(t2, &t1->p);
+       t2->x = talloc_move(t2, &t1->x);
+       torture_assert("move", t1->p == NULL && t1->x == NULL &&
+           strcmp(t2->p, "foo") == 0 && *t2->x == 42,
+               "talloc move failed");
+
+       talloc_free(root);
+
+       printf("success: move\n");
+
+       return true;
+}
+
+/*
+  test talloc_realloc_fn
+
+  Uses talloc_realloc_fn() to allocate (NULL pointer), grow and finally
+  release (size 0) a block below root, checking root's block count and
+  size after each call.
+
+  Returns true on success; the CHECK_* macros return false on failure.
+*/
+static bool test_realloc_fn(void)
+{
+       void *root, *p1;
+
+       printf("test: realloc_fn\n# talloc_realloc_fn\n");
+
+       root = talloc_new(NULL);
+
+       p1 = talloc_realloc_fn(root, NULL, 10);
+       CHECK_BLOCKS("realloc_fn", root, 2);
+       CHECK_SIZE("realloc_fn", root, 10);
+       p1 = talloc_realloc_fn(root, p1, 20);
+       CHECK_BLOCKS("realloc_fn", root, 2);
+       CHECK_SIZE("realloc_fn", root, 20);
+       p1 = talloc_realloc_fn(root, p1, 0);
+       CHECK_BLOCKS("realloc_fn", root, 1);
+       CHECK_SIZE("realloc_fn", root, 0);
+
+       talloc_free(root);
+
+       printf("success: realloc_fn\n");
+       return true;
+}
+
+
+/*
+  test unreferencing after the original parent was freed
+
+  c1 is a child of p1 and is additionally referenced from p2.  After p1
+  is freed, c1 is expected to report p2 as its parent; unlinking c1 from
+  p2 must then leave only p2's single byte below root.
+
+  Returns true on success; the CHECK_* macros return false on failure.
+*/
+static bool test_unref_reparent(void)
+{
+       void *root, *p1, *p2, *c1;
+
+       printf("test: unref_reparent\n# UNREFERENCE AFTER PARENT FREED\n");
+
+       root = talloc_named_const(NULL, 0, "root");
+       p1 = talloc_named_const(root, 1, "orig parent");
+       p2 = talloc_named_const(root, 1, "parent by reference");
+
+       c1 = talloc_named_const(p1, 1, "child");
+       talloc_reference(p2, c1);
+
+       CHECK_PARENT("unref_reparent", c1, p1);
+
+       talloc_free(p1);
+
+       CHECK_PARENT("unref_reparent", c1, p2);
+
+       talloc_unlink(p2, c1);
+
+       CHECK_SIZE("unref_reparent", root, 1);
+
+       talloc_free(p2);
+       talloc_free(root);
+
+       printf("success: unref_reparent\n");
+       return true;
+}
+
+/*
+  measure the speed of talloc versus malloc
+
+  Runs three timed loops (plain talloc, talloc_pool, malloc/free), each
+  repeated until at least 5 seconds have elapsed, and prints the
+  resulting operations per second to stderr.  Every inner iteration
+  performs three allocations: a small one whose size varies with the
+  iteration (0..99 bytes), a string duplicate and a 300 byte block.
+
+  This is a benchmark only: nothing is asserted and true is always
+  returned.
+*/
+static bool test_speed(void)
+{
+       void *ctx = talloc_new(NULL);
+       unsigned count;
+       const int loop = 1000;
+       int i;
+       struct timeval tv;
+
+       printf("test: speed\n# TALLOC VS MALLOC SPEED\n");
+
+       tv = timeval_current();
+       count = 0;
+       do {
+               void *p1;
+               for (i=0;i<loop;i++) {
+                       /* size depends on the iteration, not on the constant 'loop' */
+                       p1 = talloc_size(ctx, i % 100);
+                       /* both children are released together with p1 */
+                       (void)talloc_strdup(p1, "foo bar");
+                       (void)talloc_size(p1, 300);
+                       talloc_free(p1);
+               }
+               count += 3 * loop;
+       } while (timeval_elapsed(&tv) < 5.0);
+
+       fprintf(stderr, "talloc: %.0f ops/sec\n", count/timeval_elapsed(&tv));
+
+       talloc_free(ctx);
+
+       ctx = talloc_pool(NULL, 1024);
+
+       tv = timeval_current();
+       count = 0;
+       do {
+               void *p1;
+               for (i=0;i<loop;i++) {
+                       p1 = talloc_size(ctx, i % 100);
+                       (void)talloc_strdup(p1, "foo bar");
+                       (void)talloc_size(p1, 300);
+                       talloc_free(p1);
+               }
+               count += 3 * loop;
+       } while (timeval_elapsed(&tv) < 5.0);
+
+       /* report before tearing the pool down so the teardown is not timed */
+       fprintf(stderr, "talloc_pool: %.0f ops/sec\n", count/timeval_elapsed(&tv));
+
+       talloc_free(ctx);
+
+       tv = timeval_current();
+       count = 0;
+       do {
+               void *p1, *p2, *p3;
+               for (i=0;i<loop;i++) {
+                       p1 = malloc(i % 100);
+                       p2 = strdup("foo bar");
+                       p3 = malloc(300);
+                       free(p1);
+                       free(p2);
+                       free(p3);
+               }
+               count += 3 * loop;
+       } while (timeval_elapsed(&tv) < 5.0);
+       fprintf(stderr, "malloc: %.0f ops/sec\n", count/timeval_elapsed(&tv));
+
+       printf("success: speed\n");
+
+       return true;
+}
+
+/*
+  test talloc_unlink() on a reference loop
+
+  child is a talloc child of parent and at the same time holds a
+  reference to parent, so the two form a loop; child_owner additionally
+  references child.  The test unlinks both from top and then frees top,
+  child_owner and child.  There are no explicit checks: the function
+  always returns true if it runs to completion.
+*/
+static bool test_lifeless(void)
+{
+       void *top = talloc_new(NULL);
+       char *parent, *child; 
+       void *child_owner = talloc_new(NULL);
+
+       printf("test: lifeless\n# TALLOC_UNLINK LOOP\n");
+
+       parent = talloc_strdup(top, "parent");
+       child = talloc_strdup(parent, "child");  
+       (void)talloc_reference(child, parent);
+       (void)talloc_reference(child_owner, child); 
+       talloc_report_full(top, stderr);
+       talloc_unlink(top, parent);
+       talloc_unlink(top, child);
+       talloc_report_full(top, stderr);
+       talloc_free(top);
+       talloc_free(child_owner);
+       talloc_free(child);
+
+       printf("success: lifeless\n");
+       return true;
+}
+
+/* how many times test_loop_destructor() has been invoked */
+static int loop_destructor_count;
+
+/*
+  destructor that records its invocation and lets the free proceed
+  (returns 0); 'ptr' is ignored
+*/
+static int test_loop_destructor(char *ptr)
+{
+       loop_destructor_count += 1;
+
+       return 0;
+}
+
+/*
+  test destruction of a reference loop
+
+  req1 is a child of parent; req3 is a child of req1 and at the same
+  time holds a reference to req1, forming a loop.  req2, another child
+  of req1, carries test_loop_destructor().  After parent and top are
+  freed, the destructor must have fired exactly once.
+
+  Returns true on success; torture_assert() returns false on failure.
+  The destructor counter is reset to 0 on success.
+*/
+static bool test_loop(void)
+{
+       void *top = talloc_new(NULL);
+       char *parent;
+       struct req1 {
+               char *req2, *req3;
+       } *req1;
+
+       printf("test: loop\n# TALLOC LOOP DESTRUCTION\n");
+
+       parent = talloc_strdup(top, "parent");
+       req1 = talloc(parent, struct req1);
+       req1->req2 = talloc_strdup(req1, "req2");  
+       talloc_set_destructor(req1->req2, test_loop_destructor);
+       req1->req3 = talloc_strdup(req1, "req3");
+       (void)talloc_reference(req1->req3, req1);
+       talloc_report_full(top, stderr);
+       talloc_free(parent);
+       talloc_report_full(top, stderr);
+       talloc_report_full(NULL, stderr);
+       talloc_free(top);
+
+       torture_assert("loop", loop_destructor_count == 1, 
+                                  "FAILED TO FIRE LOOP DESTRUCTOR\n");
+       loop_destructor_count = 0;
+
+       printf("success: loop\n");
+       return true;
+}
+
+/*
+  char* flavoured destructor that always reports failure (-1);
+  'ptr' is ignored
+*/
+static int fail_destructor_str(char *ptr)
+{
+       const int refuse = -1;
+
+       return refuse;
+}
+
+/*
+  test freeing a parent while a grandchild refuses to be freed
+
+  level3 (below level2, below level1) gets a destructor that returns -1.
+  After level1 is freed, level3 is expected to report top as its parent.
+
+  Returns true on success; CHECK_PARENT() returns false on failure.
+*/
+static bool test_free_parent_deny_child(void)
+{
+       void *top = talloc_new(NULL);
+       char *level1;
+       char *level2;
+       char *level3;
+
+       printf("test: free_parent_deny_child\n# TALLOC FREE PARENT DENY CHILD\n");
+
+       level1 = talloc_strdup(top, "level1");
+       level2 = talloc_strdup(level1, "level2");
+       level3 = talloc_strdup(level2, "level3");
+
+       talloc_set_destructor(level3, fail_destructor_str);
+       talloc_free(level1);
+       /* drop the failing destructor again so the final talloc_free(top) is not refused */
+       talloc_set_destructor(level3, NULL);
+
+       CHECK_PARENT("free_parent_deny_child", level3, top);
+
+       talloc_free(top);
+
+       printf("success: free_parent_deny_child\n");
+       return true;
+}
+
+/*
+  test talloc_ptrtype() and talloc_array_ptrtype()
+
+  For a struct pointer and for arrays at one, two and three levels of
+  indirection, checks that the allocation has the size of the pointed-to
+  type (times the element count) and that the block is named after the
+  source location of the allocating call.
+
+  Each "locationN = __location__" must stay on the same source line as
+  the allocation it belongs to, otherwise the name comparison fails.
+
+  Returns true on success, false (after printing a failure record) on
+  the first mismatch.
+*/
+static bool test_talloc_ptrtype(void)
+{
+       void *top = talloc_new(NULL);
+       struct struct1 {
+               int foo;
+               int bar;
+       } *s1, *s2, **s3, ***s4;
+       const char *location1;
+       const char *location2;
+       const char *location3;
+       const char *location4;
+
+       printf("test: ptrtype\n# TALLOC PTRTYPE\n");
+
+       s1 = talloc_ptrtype(top, s1);location1 = __location__;
+
+       if (talloc_get_size(s1) != sizeof(struct struct1)) {
+               printf("failure: ptrtype [\n"
+                 "talloc_ptrtype() allocated the wrong size %lu (should be %lu)\n"
+                 "]\n", (unsigned long)talloc_get_size(s1),
+                          (unsigned long)sizeof(struct struct1));
+               return false;
+       }
+
+       if (strcmp(location1, talloc_get_name(s1)) != 0) {
+               printf("failure: ptrtype [\n"
+                 "talloc_ptrtype() sets the wrong name '%s' (should be '%s')\n]\n",
+                       talloc_get_name(s1), location1);
+               return false;
+       }
+
+       s2 = talloc_array_ptrtype(top, s2, 10);location2 = __location__;
+
+       if (talloc_get_size(s2) != (sizeof(struct struct1) * 10)) {
+               printf("failure: ptrtype [\n"
+                          "talloc_array_ptrtype() allocated the wrong size "
+                      "%lu (should be %lu)\n]\n",
+                       (unsigned long)talloc_get_size(s2),
+                   (unsigned long)(sizeof(struct struct1)*10));
+               return false;
+       }
+
+       if (strcmp(location2, talloc_get_name(s2)) != 0) {
+               printf("failure: ptrtype [\n"
+               "talloc_array_ptrtype() sets the wrong name '%s' (should be '%s')\n]\n",
+                       talloc_get_name(s2), location2);
+               return false;
+       }
+
+       /* s3 is a pointer to pointers, so each element is one pointer wide */
+       s3 = talloc_array_ptrtype(top, s3, 10);location3 = __location__;
+
+       if (talloc_get_size(s3) != (sizeof(struct struct1 *) * 10)) {
+               printf("failure: ptrtype [\n"
+                          "talloc_array_ptrtype() allocated the wrong size "
+                      "%lu (should be %lu)\n]\n",
+                          (unsigned long)talloc_get_size(s3),
+                      (unsigned long)(sizeof(struct struct1 *)*10));
+               return false;
+       }
+
+       torture_assert_str_equal("ptrtype", location3, talloc_get_name(s3),
+               "talloc_array_ptrtype() sets the wrong name");
+
+       s4 = talloc_array_ptrtype(top, s4, 10);location4 = __location__;
+
+       if (talloc_get_size(s4) != (sizeof(struct struct1 **) * 10)) {
+               printf("failure: ptrtype [\n"
+                     "talloc_array_ptrtype() allocated the wrong size "
+                      "%lu (should be %lu)\n]\n",
+                          (unsigned long)talloc_get_size(s4),
+                      (unsigned long)(sizeof(struct struct1 **)*10));
+               return false;
+       }
+
+       torture_assert_str_equal("ptrtype", location4, talloc_get_name(s4),
+               "talloc_array_ptrtype() sets the wrong name");
+
+       talloc_free(top);
+
+       printf("success: ptrtype\n");
+       return true;
+}
+
+/*
+  destructor that frees the context stored in the block being destroyed
+  and then lets the free proceed (returns 0)
+*/
+static int _test_talloc_free_in_destructor(void **ptr)
+{
+       void *stored = *ptr;
+
+       talloc_free(stored);
+
+       return 0;
+}
+
+/*
+  test calling talloc_free() from inside a destructor
+
+  Builds a chain level0 -> level1 -> ... -> level5.  level5 stores a
+  pointer to level3 and carries a destructor that frees that pointer;
+  level3 is additionally referenced from level0, from itself and from
+  level5.  level1 and then level0 are freed.  There are no explicit
+  checks: the function always returns true if it runs to completion.
+*/
+static bool test_talloc_free_in_destructor(void)
+{
+       void *level0;
+       void *level1;
+       void *level2;
+       void *level3;
+       void *level4;
+       void **level5;
+
+       printf("test: free_in_destructor\n# TALLOC FREE IN DESTRUCTOR\n");
+
+       level0 = talloc_new(NULL);
+       level1 = talloc_new(level0);
+       level2 = talloc_new(level1);
+       level3 = talloc_new(level2);
+       level4 = talloc_new(level3);
+       level5 = talloc(level4, void *);
+
+       /* the destructor of level5 will free whatever is stored here */
+       *level5 = level3;
+       (void)talloc_reference(level0, level3);
+       (void)talloc_reference(level3, level3);
+       (void)talloc_reference(level5, level3);
+
+       talloc_set_destructor(level5, _test_talloc_free_in_destructor);
+
+       talloc_free(level1);
+
+       talloc_free(level0);
+
+       printf("success: free_in_destructor\n");
+       return true;
+}
+
+/*
+ * Fetch and free the autofree context twice in a row.  The body is only
+ * compiled when _SAMBA_BUILD_ < 4; otherwise the function just returns
+ * true.
+ */
+static bool test_autofree(void)
+{
+#if _SAMBA_BUILD_ < 4
+       /* autofree test would kill smbtorture */
+       int round;
+
+       printf("test: autofree\n# TALLOC AUTOFREE CONTEXT\n");
+
+       for (round = 0; round < 2; round++) {
+               void *autofree_ctx = talloc_autofree_context();
+
+               talloc_free(autofree_ctx);
+       }
+
+       printf("success: autofree\n");
+#endif
+       return true;
+}
+
+/*
+ * Exercise a 1024 byte talloc_pool(): allocate several chunks from it,
+ * then grow and shrink one of them with talloc_realloc_size() and check
+ * whether the returned pointer stays put, moves to a previously freed
+ * slot, or moves elsewhere, as stated by each assertion.  Every chunk is
+ * filled with 0x11 after each (re)allocation.
+ *
+ * The pointer expectations only hold when ALWAYS_REALLOC == 0 in
+ * talloc.c, as noted on the #if below.
+ */
+static bool test_pool(void)
+{
+       void *pool;
+       void *p1, *p2, *p3, *p4;
+       void *p2_2;
+
+       pool = talloc_pool(NULL, 1024);
+
+       /* p1 and p2 hang off the pool, p3 off p1 and p4 off p3 */
+       p1 = talloc_size(pool, 80);
+       memset(p1, 0x11, talloc_get_size(p1));
+       p2 = talloc_size(pool, 20);
+       memset(p2, 0x11, talloc_get_size(p2));
+       p3 = talloc_size(p1, 50);
+       memset(p3, 0x11, talloc_get_size(p3));
+       p4 = talloc_size(p3, 1000);
+       memset(p4, 0x11, talloc_get_size(p4));
+
+#if 1 /* this relies on ALWAYS_REALLOC == 0 in talloc.c */
+       /* small size changes of p2 are expected to keep the pointer */
+       p2_2 = talloc_realloc_size(pool, p2, 20+1);
+       torture_assert("pool realloc 20+1", p2_2 == p2, "failed: pointer changed");
+       memset(p2, 0x11, talloc_get_size(p2));
+       p2_2 = talloc_realloc_size(pool, p2, 20-1);
+       torture_assert("pool realloc 20-1", p2_2 == p2, "failed: pointer changed");
+       memset(p2, 0x11, talloc_get_size(p2));
+       p2_2 = talloc_realloc_size(pool, p2, 20-1);
+       torture_assert("pool realloc 20-1", p2_2 == p2, "failed: pointer changed");
+       memset(p2, 0x11, talloc_get_size(p2));
+
+       talloc_free(p3);
+
+       /* this should reclaim the memory of p4 and p3 */
+       p2_2 = talloc_realloc_size(pool, p2, 400);
+       torture_assert("pool realloc 400", p2_2 == p2, "failed: pointer changed");
+       memset(p2, 0x11, talloc_get_size(p2));
+
+       talloc_free(p1);
+
+       /* this should reclaim the memory of p1 */
+       p2_2 = talloc_realloc_size(pool, p2, 800);
+       torture_assert("pool realloc 800", p2_2 == p1, "failed: pointer not changed");
+       p2 = p2_2;
+       memset(p2, 0x11, talloc_get_size(p2));
+
+       /* this should do a malloc */
+       p2_2 = talloc_realloc_size(pool, p2, 1800);
+       torture_assert("pool realloc 1800", p2_2 != p2, "failed: pointer not changed");
+       p2 = p2_2;
+       memset(p2, 0x11, talloc_get_size(p2));
+
+       /* this should reclaim the memory from the pool */
+       p3 = talloc_size(pool, 80);
+       torture_assert("pool alloc 80", p3 == p1, "failed: pointer changed");
+       memset(p3, 0x11, talloc_get_size(p3));
+
+       talloc_free(p2);
+       talloc_free(p3);
+
+       /* second round: start again with two fresh chunks in the pool */
+       p1 = talloc_size(pool, 80);
+       memset(p1, 0x11, talloc_get_size(p1));
+       p2 = talloc_size(pool, 20);
+       memset(p2, 0x11, talloc_get_size(p2));
+
+       /* p1 is only kept as an address to compare against from here on */
+       talloc_free(p1);
+
+       p2_2 = talloc_realloc_size(pool, p2, 20-1);
+       torture_assert("pool realloc 20-1", p2_2 == p2, "failed: pointer changed");
+       memset(p2, 0x11, talloc_get_size(p2));
+       p2_2 = talloc_realloc_size(pool, p2, 20-1);
+       torture_assert("pool realloc 20-1", p2_2 == p2, "failed: pointer changed");
+       memset(p2, 0x11, talloc_get_size(p2));
+
+       /* this should do a malloc */
+       p2_2 = talloc_realloc_size(pool, p2, 1800);
+       torture_assert("pool realloc 1800", p2_2 != p2, "failed: pointer not changed");
+       p2 = p2_2;
+       memset(p2, 0x11, talloc_get_size(p2));
+
+       /* this should reclaim the memory from the pool */
+       p3 = talloc_size(pool, 800);
+       torture_assert("pool alloc 800", p3 == p1, "failed: pointer changed");
+       memset(p3, 0x11, talloc_get_size(p3));
+
+#endif /* this relies on ALWAYS_REALLOC == 0 in talloc.c */
+
+       talloc_free(pool);
+
+       return true;
+}
+
+static bool test_pool_steal(void)
+{
+       void *root;
+       void *pool;
+       void *p1, *p2;
+       void *p1_2, *p2_2;
+       size_t hdr;
+       size_t ofs1, ofs2;
+
+       root = talloc_new(NULL);
+       pool = talloc_pool(root, 1024);
+
+       p1 = talloc_size(pool, 4 * 16);
+       torture_assert("pool allocate 4 * 16", p1 != NULL, "failed ");
+       memset(p1, 0x11, talloc_get_size(p1));
+       p2 = talloc_size(pool, 4 * 16);
+       torture_assert("pool allocate 4 * 16", p2 > p1, "failed: !(p2 > p1) ");
+       memset(p2, 0x11, talloc_get_size(p2));
+
+       ofs1 = PTR_DIFF(p2, p1);
+       hdr = ofs1 - talloc_get_size(p1);
+
+       talloc_steal(root, p1);
+       talloc_steal(root, p2);
+
+       talloc_free(pool);
+
+       p1_2 = p1;
+
+#if 1 /* this relies on ALWAYS_REALLOC == 0 in talloc.c */
+       p1_2 = talloc_realloc_size(root, p1, 5 * 16);
+       torture_assert("pool realloc 5 * 16", p1_2 > p2, "failed: pointer not changed");
+       memset(p1_2, 0x11, talloc_get_size(p1_2));
+       ofs1 = PTR_DIFF(p1_2, p2);
+       ofs2 = talloc_get_size(p2) + hdr;
+
+       torture_assert("pool realloc ", ofs1 == ofs2, "failed: pointer offset unexpected");
+
+       p2_2 = talloc_realloc_size(root, p2, 3 * 16);
+       torture_assert("pool realloc 5 * 16", p2_2 == p2, "failed: pointer changed");
+       memset(p2_2, 0x11, talloc_get_size(p2_2));
+#endif /* this relies on ALWAYS_REALLOC == 0 in talloc.c */
+
+       talloc_free(p1_2);
+
+       p2_2 = p2;
+
+#if 1 /* this relies on ALWAYS_REALLOC == 0 in talloc.c */
+       /* now we should reclaim the full pool */
+       p2_2 = talloc_realloc_size(root, p2, 8 * 16);
+       torture_assert("pool realloc 8 * 16", p2_2 == p1, "failed: pointer not expected");
+       p2 = p2_2;
+       memset(p2_2, 0x11, talloc_get_size(p2_2));
+
+       /* now we malloc and free the full pool space */
+       p2_2 = talloc_realloc_size(root, p2, 2 * 1024);
+       torture_assert("pool realloc 2 * 1024", p2_2 != p1, "failed: pointer not expected");
+       memset(p2_2, 0x11, talloc_get_size(p2_2));
+
+#endif /* this relies on ALWAYS_REALLOC == 0 in talloc.c */
+
+       talloc_free(p2_2);
+
+       talloc_free(root);
+
+       return true;
+}
+
+/*
+ * Check that a context created with a NULL parent can be freed while
+ * another context still holds a reference to it.  The same sequence is
+ * run twice: once with null tracking disabled and once with it enabled
+ * (without autofree).  In both cases talloc_free() must return 0.
+ */
+static bool test_free_ref_null_context(void)
+{
+       void *p1, *p2, *p3;
+       int ret;
+
+       /* first pass: null tracking disabled */
+       talloc_disable_null_tracking();
+       p1 = talloc_new(NULL);
+       p2 = talloc_new(NULL);
+
+       /* p2 takes a reference to p1; the returned pointer must be p1 */
+       p3 = talloc_reference(p2, p1);
+       torture_assert("reference", p3 == p1, "failed: reference on null");
+
+       ret = talloc_free(p1);
+       torture_assert("ref free with null parent", ret == 0, "failed: free with null parent");
+       talloc_free(p2);
+
+       /* second pass: null tracking enabled */
+       talloc_enable_null_tracking_no_autofree();
+       p1 = talloc_new(NULL);
+       p2 = talloc_new(NULL);
+
+       p3 = talloc_reference(p2, p1);
+       torture_assert("reference", p3 == p1, "failed: reference on null");
+
+       ret = talloc_free(p1);
+       torture_assert("ref free with null tracked parent", ret == 0, "failed: free with null parent");
+       talloc_free(p2);
+
+       return true;
+}
+
+/*
+ * Allocate a string under a fresh root context, increase its reference
+ * count, print a full report and free the root.  Afterwards the block
+ * count of the NULL context is checked against 2 via CHECK_BLOCKS (a
+ * macro defined elsewhere in this file).
+ */
+static bool test_rusty(void)
+{
+       void *root;
+       const char *p1;
+
+       talloc_enable_null_tracking();
+       root = talloc_new(NULL);
+       p1 = talloc_strdup(root, "foo");
+       /* extra reference taken before the parent goes away */
+       talloc_increase_ref_count(p1);
+       talloc_report_full(root, stdout);
+       talloc_free(root);
+       CHECK_BLOCKS("null_context", NULL, 2);
+       return true;
+}
+
+/*
+ * Check that talloc_free_children() leaves the parent itself valid and
+ * keeps its name intact.
+ *
+ * p1 is given a name with talloc_set_name(); after freeing its children
+ * the name must still read "testname" and, on the second pass, still be
+ * the very same pointer.  The name is then replaced with a constant via
+ * talloc_set_name_const() and the block count of p1 is checked before
+ * and after another talloc_free_children() call.
+ *
+ * Returns true on success; every failed check is reported through
+ * torture_assert()/CHECK_BLOCKS.
+ */
+static bool test_free_children(void)
+{
+       void *root;
+       char *p1, *p2;
+       const char *name, *name2;
+
+       talloc_enable_null_tracking();
+       root = talloc_new(NULL);
+       p1 = talloc_strdup(root, "foo1");
+       /* p2 only exists to give p1 a child that can be freed */
+       p2 = talloc_strdup(p1, "foo2");
+       (void)p2;
+
+       talloc_set_name(p1, "%s", "testname");
+       talloc_free_children(p1);
+       /* check it's still a valid talloc ptr */
+       talloc_get_size(talloc_get_name(p1));
+       /* report a mismatch like the later checks instead of failing silently */
+       torture_assert("namecheck", strcmp(talloc_get_name(p1), "testname") == 0,
+                      "wrong name");
+
+       talloc_set_name(p1, "%s", "testname");
+       name = talloc_get_name(p1);
+       talloc_free_children(p1);
+       /* check it's still a valid talloc ptr */
+       talloc_get_size(talloc_get_name(p1));
+       torture_assert("name", name == talloc_get_name(p1), "name ptr changed");
+       torture_assert("namecheck", strcmp(talloc_get_name(p1), "testname") == 0,
+                      "wrong name");
+       CHECK_BLOCKS("name1", p1, 2);
+
+       /* note that this does not free the old child name */
+       talloc_set_name_const(p1, "testname2");
+       name2 = talloc_get_name(p1);
+       /* the lookup is kept, but its result is intentionally unused */
+       (void)name2;
+       /* but this does */
+       talloc_free_children(p1);
+       torture_assert("namecheck", strcmp(talloc_get_name(p1), "testname2") == 0,
+                      "wrong name");
+       CHECK_BLOCKS("name1", p1, 1);
+
+       talloc_report_full(root, stdout);
+       talloc_free(root);
+       return true;
+}
+
+/*
+ * Walk through a series of allocations below contexts that carry memory
+ * limits set with talloc_set_memlimit(), and check for each step whether
+ * the allocation succeeds or fails as stated by the assertion message.
+ *
+ * Each step is announced on stdout with a "====" line naming the call
+ * that follows, and a full report of "root" is printed after each step.
+ *
+ * Returns true on success; a failed expectation is reported through
+ * torture_assert().
+ */
+static bool test_memlimit(void)
+{
+       void *root;
+       char *l1, *l2, *l3, *l4, *l5, *t;
+
+       printf("test: memlimit\n# MEMORY LIMITS\n");
+
+       printf("==== talloc_new(NULL)\n");
+       root = talloc_new(NULL);
+
+       talloc_report_full(root, stdout);
+
+       /* no limit is set yet, so this must succeed */
+       printf("==== talloc_size(root, 2048)\n");
+       l1 = talloc_size(root, 2048);
+       torture_assert("memlimit", l1 != NULL,
+               "failed: alloc should not fail due to memory limit\n");
+
+       talloc_report_full(root, stdout);
+
+       printf("==== talloc_free(l1)\n");
+       talloc_free(l1);
+
+       talloc_report_full(root, stdout);
+
+       printf("==== talloc_strdup(root, level 1)\n");
+       l1 = talloc_strdup(root, "level 1");
+       torture_assert("memlimit", l1 != NULL,
+               "failed: alloc should not fail due to memory limit\n");
+
+       talloc_report_full(root, stdout);
+
+       printf("==== talloc_set_memlimit(l1, 2048)\n");
+       torture_assert("memlimit", talloc_set_memlimit(l1, 2048) == 0,
+               "failed: setting memlimit should never fail\n");
+
+       talloc_report_full(root, stdout);
+
+       /* the allocation is made below l1, which now carries the limit */
+       printf("==== talloc_size(l1, 2048)\n");
+       l2 = talloc_size(l1, 2048);
+       torture_assert("memlimit", l2 == NULL,
+               "failed: alloc should fail due to memory limit\n");
+
+       talloc_report_full(root, stdout);
+
+       printf("==== talloc_strdup(l1, level 2)\n");
+       l2 = talloc_strdup(l1, "level 2");
+       torture_assert("memlimit", l2 != NULL,
+               "failed: alloc should not fail due to memory limit\n");
+
+       talloc_report_full(root, stdout);
+
+       printf("==== talloc_free(l2)\n");
+       talloc_free(l2);
+
+       talloc_report_full(root, stdout);
+
+       /* allocate outside the limited tree, then move the chunk below l1 */
+       printf("==== talloc_size(NULL, 2048)\n");
+       l2 = talloc_size(NULL, 2048);
+
+       talloc_report_full(root, stdout);
+
+       printf("==== talloc_steal(l1, l2)\n");
+       talloc_steal(l1, l2);
+
+       talloc_report_full(root, stdout);
+
+       printf("==== talloc_strdup(l2, level 3)\n");
+       l3 = talloc_strdup(l2, "level 3");
+       torture_assert("memlimit", l3 == NULL,
+               "failed: alloc should fail due to memory limit\n");
+
+       talloc_report_full(root, stdout);
+
+       printf("==== talloc_free(l2)\n");
+       talloc_free(l2);
+
+       talloc_report_full(root, stdout);
+
+       printf("==== talloc_strdup(NULL, level 2)\n");
+       l2 = talloc_strdup(NULL, "level 2");
+       talloc_steal(l1, l2);
+
+       talloc_report_full(root, stdout);
+
+       printf("==== talloc_strdup(l2, level 3)\n");
+       l3 = talloc_strdup(l2, "level 3");
+       torture_assert("memlimit", l3 != NULL,
+               "failed: alloc should not fail due to memory limit\n");
+
+       talloc_report_full(root, stdout);
+
+       /* nested limits: 1024 on l3 and 512 on l4 */
+       printf("==== talloc_set_memlimit(l3, 1024)\n");
+       torture_assert("memlimit", talloc_set_memlimit(l3, 1024) == 0,
+               "failed: setting memlimit should never fail\n");
+
+       talloc_report_full(root, stdout);
+
+       printf("==== talloc_strdup(l3, level 4)\n");
+       l4 = talloc_strdup(l3, "level 4");
+       torture_assert("memlimit", l4 != NULL,
+               "failed: alloc should not fail due to memory limit\n");
+
+       talloc_report_full(root, stdout);
+
+       printf("==== talloc_set_memlimit(l4, 512)\n");
+       torture_assert("memlimit", talloc_set_memlimit(l4, 512) == 0,
+               "failed: setting memlimit should never fail\n");
+
+       talloc_report_full(root, stdout);
+
+       printf("==== talloc_strdup(l4, level 5)\n");
+       l5 = talloc_strdup(l4, "level 5");
+       torture_assert("memlimit", l5 != NULL,
+               "failed: alloc should not fail due to memory limit\n");
+
+       talloc_report_full(root, stdout);
+
+       /* growing l5 to 600 bytes is expected to be refused */
+       printf("==== talloc_realloc(NULL, l5, char, 600)\n");
+       t = talloc_realloc(NULL, l5, char, 600);
+       torture_assert("memlimit", t == NULL,
+               "failed: alloc should fail due to memory limit\n");
+
+       talloc_report_full(root, stdout);
+
+       printf("==== talloc_realloc(NULL, l5, char, 5)\n");
+       l5 = talloc_realloc(NULL, l5, char, 5);
+       torture_assert("memlimit", l5 != NULL,
+               "failed: alloc should not fail due to memory limit\n");
+
+       talloc_report_full(root, stdout);
+
+       printf("==== talloc_strdup(l3, level 4)\n");
+       l4 = talloc_strdup(l3, "level 4");
+       torture_assert("memlimit", l4 != NULL,
+               "failed: alloc should not fail due to memory limit\n");
+
+       talloc_report_full(root, stdout);
+
+       printf("==== talloc_set_memlimit(l4, 512)\n");
+       torture_assert("memlimit", talloc_set_memlimit(l4, 512) == 0,
+               "failed: setting memlimit should never fail\n");
+
+       talloc_report_full(root, stdout);
+
+       printf("==== talloc_strdup(l4, level 5)\n");
+       l5 = talloc_strdup(l4, "level 5");
+       torture_assert("memlimit", l5 != NULL,
+               "failed: alloc should not fail due to memory limit\n");
+
+       talloc_report_full(root, stdout);
+
+       /* t hangs directly off root; l5 is moved below it */
+       printf("==== Make new temp context and steal l5\n");
+       t = talloc_new(root);
+       talloc_steal(t, l5);
+
+       talloc_report_full(root, stdout);
+
+       printf("==== talloc_size(t, 2048)\n");
+       l1 = talloc_size(t, 2048);
+       torture_assert("memlimit", l1 != NULL,
+               "failed: alloc should not fail due to memory limit\n");
+
+       talloc_report_full(root, stdout);
+       talloc_free(root);
+
+       printf("success: memlimit\n");
+
+       return true;
+}
+
+/*
+ * Bring the shared test state back to a known baseline; called before
+ * each test in torture_local_talloc().
+ */
+static void test_reset(void)
+{
+       talloc_set_log_fn(test_log_stdout);
+       test_abort_stop();
+       /* turn null tracking off and back on again (without autofree) */
+       talloc_disable_null_tracking();
+       talloc_enable_null_tracking_no_autofree();
+}
+
+/*
+ * Entry point of the talloc test suite.
+ *
+ * Runs every test in a fixed order, calling test_reset() before each
+ * one, and accumulates the results.  The speed test is only run when all
+ * earlier tests have passed; the autofree test always runs last.
+ *
+ * Returns true only if every executed test returned true.  tctx is not
+ * used.
+ */
+bool torture_local_talloc(struct torture_context *tctx)
+{
+       /* tests that always run, in execution order */
+       static bool (* const basic_tests[])(void) = {
+               test_ref1,
+               test_ref2,
+               test_ref3,
+               test_ref4,
+               test_unlink1,
+               test_misc,
+               test_realloc,
+               test_realloc_child,
+               test_steal,
+               test_move,
+               test_unref_reparent,
+               test_realloc_fn,
+               test_type,
+               test_lifeless,
+               test_loop,
+               test_free_parent_deny_child,
+               test_talloc_ptrtype,
+               test_talloc_free_in_destructor,
+               test_pool,
+               test_pool_steal,
+               test_free_ref_null_context,
+               test_rusty,
+               test_free_children,
+               test_memlimit,
+       };
+       size_t idx;
+       bool ret = true;
+
+       setlinebuf(stdout);
+
+       for (idx = 0; idx < sizeof(basic_tests) / sizeof(basic_tests[0]); idx++) {
+               test_reset();
+               ret &= basic_tests[idx]();
+       }
+
+       /* the speed test is skipped once anything has failed */
+       if (ret) {
+               test_reset();
+               ret &= test_speed();
+       }
+
+       test_reset();
+       ret &= test_autofree();
+
+       test_reset();
+       talloc_disable_null_tracking();
+       return ret;
+}
diff --git a/ctdb/lib/talloc/testsuite_main.c b/ctdb/lib/talloc/testsuite_main.c
new file mode 100644 (file)
index 0000000..50ce0f8
--- /dev/null
@@ -0,0 +1,36 @@
+/*
+   Unix SMB/CIFS implementation.
+
+   local testing of talloc routines.
+
+   Copyright (C) Andrew Tridgell 2004
+
+     ** NOTE! The following LGPL license applies to the talloc
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+
+#include "talloc_testsuite.h"
+
+/*
+ * Run the talloc test suite without a torture context.
+ * Returns 0 when the suite passes and -1 otherwise.
+ */
+int main(void)
+{
+       return torture_local_talloc(NULL) ? 0 : -1;
+}
diff --git a/ctdb/lib/talloc/web/index.html b/ctdb/lib/talloc/web/index.html
new file mode 100644 (file)
index 0000000..388ec2c
--- /dev/null
@@ -0,0 +1,51 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
+<HTML>
+<HEAD>
+<TITLE>talloc</TITLE>
+</HEAD>
+<BODY BGCOLOR="#ffffff" TEXT="#000000" VLINK="#292555" LINK="#292555" ALINK="#cc0033">
+
+<h1>talloc</h1>
+
+talloc is a hierarchical pool based memory allocator with
+destructors. It is the core memory allocator used in Samba, and has
+made a huge difference in many aspects of Samba4 development.<p>
+
+To get started with talloc, I would recommend you read the <a
+href="http://samba.org/ftp/unpacked/talloc/talloc_guide.txt">talloc guide</a>.
+
+<h2>Download</h2>
+You can download the latest releases of talloc from the <a
+href="http://samba.org/ftp/talloc">talloc directory</a> on the samba public
+source archive.
+
+<h2>Discussion and bug reports</h2>
+
+talloc does not currently have its own mailing list or bug tracking
+system. For now, please use the <a
+href="https://lists.samba.org/mailman/listinfo/samba-technical">samba-technical</a>
+mailing list, and the <a href="http://bugzilla.samba.org/">Samba
+bugzilla</a> bug tracking system.
+
+<h2>Development</h2>
+
+You can download the latest code either via git or rsync.<br>
+<br>
+To fetch via git see the following guide:<br>
+<a href="http://wiki.samba.org/index.php/Using_Git_for_Samba_Development">Using Git for Samba Development</a><br>
+Once you have cloned the tree, switch to the master branch and cd into the lib/talloc directory.<br>
+<br>
+To fetch via rsync use this command:
+
+<pre>
+  rsync -Pavz samba.org::ftp/unpacked/standalone_projects/lib/talloc .
+</pre>
+
+<hr>
+<small>
+<a href="http://samba.org/~tridge/">Andrew Tridgell</a><br>
+talloc AT tridgell.net
+</small>
+
+</BODY>
+</HTML>
diff --git a/ctdb/lib/talloc/wscript b/ctdb/lib/talloc/wscript
new file mode 100644 (file)
index 0000000..8d3246b
--- /dev/null
@@ -0,0 +1,154 @@
+#!/usr/bin/env python
+
+APPNAME = 'talloc'
+VERSION = '2.0.8'
+
+
+blddir = 'bin'
+
+import Logs
+import os, sys
+
+# find the buildtools directory
+srcdir = '.'
+while not os.path.exists(srcdir+'/buildtools') and len(srcdir.split('/')) < 5:
+    srcdir = '../' + srcdir
+sys.path.insert(0, srcdir + '/buildtools/wafsamba')
+
+import sys
+sys.path.insert(0, srcdir+"/buildtools/wafsamba")
+import wafsamba, samba_dist, Options
+
+# setup what directories to put in a tarball
+samba_dist.DIST_DIRS('lib/talloc:. lib/replace:lib/replace buildtools:buildtools')
+
+
+# Register the command line options of the talloc build: defaults for the
+# builtin/private-extension handling, the options of lib/replace, and the
+# talloc specific switches.
+def set_options(opt):
+    opt.BUILTIN_DEFAULT('replace')
+    opt.PRIVATE_EXTENSION_DEFAULT('talloc', noextension='talloc')
+    opt.RECURSE('lib/replace')
+    # stored as Options.options.TALLOC_COMPAT1 and read back in configure()
+    opt.add_option('--enable-talloc-compat1',
+                   help=("Build talloc 1.x.x compat library [False]"),
+                   action="store_true", dest='TALLOC_COMPAT1', default=False)
+    # --disable-python is only offered when IN_LAUNCH_DIR() is true
+    if opt.IN_LAUNCH_DIR():
+        opt.add_option('--disable-python',
+                       help=("disable the pytalloc module"),
+                       action="store_true", dest='disable_python', default=False)
+
+
+# Configure step: configures lib/replace, records whether this is a
+# standalone talloc build, looks for system copies of talloc and
+# pytalloc-util when it is not, and checks for python support.
+def configure(conf):
+    conf.RECURSE('lib/replace')
+
+    conf.env.standalone_talloc = conf.IN_LAUNCH_DIR()
+
+    # the option only exists when set_options() registered it, hence getattr
+    conf.env.disable_python = getattr(Options.options, 'disable_python', False)
+
+    if not conf.env.standalone_talloc:
+        # a system package is only accepted if it is at least VERSION
+        if conf.CHECK_BUNDLED_SYSTEM_PKG('talloc', minversion=VERSION,
+                                     implied_deps='replace'):
+            conf.define('USING_SYSTEM_TALLOC', 1)
+        if conf.CHECK_BUNDLED_SYSTEM_PKG('pytalloc-util', minversion=VERSION,
+                                     implied_deps='talloc replace'):
+            conf.define('USING_SYSTEM_PYTALLOC_UTIL', 1)
+
+    conf.env.TALLOC_COMPAT1 = Options.options.TALLOC_COMPAT1
+
+    conf.CHECK_XSLTPROC_MANPAGES()
+
+    if not conf.env.disable_python:
+        # also disable if we don't have the python libs installed
+        conf.find_program('python', var='PYTHON')
+        conf.check_tool('python')
+        conf.check_python_version((2,4,2))
+        conf.SAMBA_CHECK_PYTHON_HEADERS(mandatory=False)
+        if not conf.env.HAVE_PYTHON_H:
+            Logs.warn('Disabling pytalloc-util as python devel libs not found')
+            conf.env.disable_python = True
+
+    conf.SAMBA_CONFIG_H()
+
+    conf.SAMBA_CHECK_UNDEFINED_SYMBOL_FLAGS()
+
+
+# Build step: declares the talloc library, the optional 1.x compat
+# library and the testsuite binary (standalone builds only), and the
+# python bindings unless they are disabled or taken from the system.
+def build(bld):
+    bld.RECURSE('lib/replace')
+
+    if bld.env.standalone_talloc:
+        bld.env.PKGCONFIGDIR = '${LIBDIR}/pkgconfig'
+        bld.env.TALLOC_VERSION = VERSION
+        private_library = False
+
+        # should we also install the symlink to libtalloc1.so here?
+        bld.SAMBA_LIBRARY('talloc-compat1-%s' % (VERSION),
+                          'compat/talloc_compat1.c',
+                          public_deps='talloc',
+                          soname='libtalloc.so.1',
+                          pc_files=[],
+                          public_headers=[],
+                          enabled=bld.env.TALLOC_COMPAT1)
+
+        # the testsuite is built but not installed
+        bld.SAMBA_BINARY('talloc_testsuite',
+                         'testsuite_main.c testsuite.c',
+                         deps='talloc',
+                         install=False)
+
+    else:
+        private_library = True
+
+    # skip our own copy when configure() selected the system talloc
+    if not bld.CONFIG_SET('USING_SYSTEM_TALLOC'):
+
+        bld.SAMBA_LIBRARY('talloc',
+                          'talloc.c',
+                          deps='replace',
+                          abi_directory='ABI',
+                          abi_match='talloc* _talloc*',
+                          hide_symbols=True,
+                          vnum=VERSION,
+                          public_headers='talloc.h',
+                          pc_files='talloc.pc',
+                          public_headers_install=not private_library,
+                          private_library=private_library,
+                          manpages='talloc.3')
+
+    # python bindings: helper library plus the 'talloc' python module
+    if not bld.CONFIG_SET('USING_SYSTEM_PYTALLOC_UTIL') and not bld.env.disable_python:
+        bld.SAMBA_LIBRARY('pytalloc-util',
+            source='pytalloc_util.c',
+            public_deps='talloc',
+            pyembed=True,
+            vnum=VERSION,
+            hide_symbols=True,
+            abi_directory='ABI',
+            abi_match='pytalloc_*',
+            private_library=private_library,
+            public_headers='pytalloc.h',
+            pc_files='pytalloc-util.pc'
+            )
+        bld.SAMBA_PYTHON('pytalloc',
+                         'pytalloc.c',
+                         deps='talloc pytalloc-util',
+                         enabled=True,
+                         realname='talloc.so')
+
+def test(ctx):
+    '''run talloc testsuite'''
+    import Utils, samba_utils
+    cmd = os.path.join(Utils.g_module.blddir, 'talloc_testsuite')
+    ret = samba_utils.RUN_COMMAND(cmd)
+    print("testsuite returned %d" % ret)
+    sys.exit(ret)
+
+# Delegates to samba_dist.dist(); the directories to package were
+# registered with samba_dist.DIST_DIRS() at the top of this file.
+def dist():
+    '''makes a tarball for distribution'''
+    samba_dist.dist()
+
+# Thin wrapper: hands the context to samba_utils.reconfigure().
+def reconfigure(ctx):
+    '''reconfigure if config scripts have changed'''
+    # imported here rather than at module level, as in test() above
+    import samba_utils
+    samba_utils.reconfigure(ctx)
+
+
+def pydoctor(ctx):
+    '''build python apidocs'''
+    cmd='PYTHONPATH=bin/python pydoctor --project-name=talloc --project-url=http://talloc.samba.org/ --make-html --docformat=restructuredtext --introspect-c-modules --add-module bin/python/talloc.*'
+    print("Running: %s" % cmd)
+    os.system(cmd)
diff --git a/ctdb/lib/tdb/ABI/tdb-1.2.1.sigs b/ctdb/lib/tdb/ABI/tdb-1.2.1.sigs
new file mode 100644 (file)
index 0000000..84f2007
--- /dev/null
@@ -0,0 +1,95 @@
+tdb_add_flags: void (struct tdb_context *, unsigned int)
+tdb_alloc_read: unsigned char *(struct tdb_context *, tdb_off_t, tdb_len_t)
+tdb_allocate: tdb_off_t (struct tdb_context *, tdb_len_t, struct tdb_record *)
+tdb_allrecord_lock: int (struct tdb_context *, int, enum tdb_lock_flags, bool)
+tdb_allrecord_unlock: int (struct tdb_context *, int, bool)
+tdb_allrecord_upgrade: int (struct tdb_context *)
+tdb_append: int (struct tdb_context *, TDB_DATA, TDB_DATA)
+tdb_brlock: int (struct tdb_context *, int, tdb_off_t, size_t, enum tdb_lock_flags)
+tdb_brunlock: int (struct tdb_context *, int, tdb_off_t, size_t)
+tdb_chainlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_mark: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_nonblock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_unmark: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_check: int (struct tdb_context *, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_close: int (struct tdb_context *)
+tdb_convert: void *(void *, uint32_t)
+tdb_delete: int (struct tdb_context *, TDB_DATA)
+tdb_do_delete: int (struct tdb_context *, tdb_off_t, struct tdb_record *)
+tdb_dump_all: void (struct tdb_context *)
+tdb_enable_seqnum: void (struct tdb_context *)
+tdb_error: enum TDB_ERROR (struct tdb_context *)
+tdb_errorstr: const char *(struct tdb_context *)
+tdb_exists: int (struct tdb_context *, TDB_DATA)
+tdb_expand: int (struct tdb_context *, tdb_off_t)
+tdb_fd: int (struct tdb_context *)
+tdb_fetch: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_find_lock_hash: tdb_off_t (struct tdb_context *, TDB_DATA, uint32_t, int, struct tdb_record *)
+tdb_firstkey: TDB_DATA (struct tdb_context *)
+tdb_free: int (struct tdb_context *, tdb_off_t, struct tdb_record *)
+tdb_freelist_size: int (struct tdb_context *)
+tdb_get_flags: int (struct tdb_context *)
+tdb_get_logging_private: void *(struct tdb_context *)
+tdb_get_seqnum: int (struct tdb_context *)
+tdb_hash_size: int (struct tdb_context *)
+tdb_have_extra_locks: bool (struct tdb_context *)
+tdb_increment_seqnum_nonblock: void (struct tdb_context *)
+tdb_io_init: void (struct tdb_context *)
+tdb_lock: int (struct tdb_context *, int, int)
+tdb_lock_nonblock: int (struct tdb_context *, int, int)
+tdb_lock_record: int (struct tdb_context *, tdb_off_t)
+tdb_lockall: int (struct tdb_context *)
+tdb_lockall_mark: int (struct tdb_context *)
+tdb_lockall_nonblock: int (struct tdb_context *)
+tdb_lockall_read: int (struct tdb_context *)
+tdb_lockall_read_nonblock: int (struct tdb_context *)
+tdb_lockall_unmark: int (struct tdb_context *)
+tdb_log_fn: tdb_log_func (struct tdb_context *)
+tdb_map_size: size_t (struct tdb_context *)
+tdb_mmap: void (struct tdb_context *)
+tdb_munmap: int (struct tdb_context *)
+tdb_name: const char *(struct tdb_context *)
+tdb_needs_recovery: bool (struct tdb_context *)
+tdb_nest_lock: int (struct tdb_context *, uint32_t, int, enum tdb_lock_flags)
+tdb_nest_unlock: int (struct tdb_context *, uint32_t, int, bool)
+tdb_nextkey: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_ofs_read: int (struct tdb_context *, tdb_off_t, tdb_off_t *)
+tdb_ofs_write: int (struct tdb_context *, tdb_off_t, tdb_off_t *)
+tdb_open: struct tdb_context *(const char *, int, int, int, mode_t)
+tdb_open_ex: struct tdb_context *(const char *, int, int, int, mode_t, const struct tdb_logging_context *, tdb_hash_func)
+tdb_parse_data: int (struct tdb_context *, TDB_DATA, tdb_off_t, tdb_len_t, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_parse_record: int (struct tdb_context *, TDB_DATA, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_printfreelist: int (struct tdb_context *)
+tdb_rec_free_read: int (struct tdb_context *, tdb_off_t, struct tdb_record *)
+tdb_rec_read: int (struct tdb_context *, tdb_off_t, struct tdb_record *)
+tdb_rec_write: int (struct tdb_context *, tdb_off_t, struct tdb_record *)
+tdb_release_transaction_locks: void (struct tdb_context *)
+tdb_remove_flags: void (struct tdb_context *, unsigned int)
+tdb_reopen: int (struct tdb_context *)
+tdb_reopen_all: int (int)
+tdb_repack: int (struct tdb_context *)
+tdb_set_logging_function: void (struct tdb_context *, const struct tdb_logging_context *)
+tdb_set_max_dead: void (struct tdb_context *, int)
+tdb_setalarm_sigptr: void (struct tdb_context *, volatile sig_atomic_t *)
+tdb_store: int (struct tdb_context *, TDB_DATA, TDB_DATA, int)
+tdb_transaction_cancel: int (struct tdb_context *)
+tdb_transaction_commit: int (struct tdb_context *)
+tdb_transaction_lock: int (struct tdb_context *, int, enum tdb_lock_flags)
+tdb_transaction_prepare_commit: int (struct tdb_context *)
+tdb_transaction_recover: int (struct tdb_context *)
+tdb_transaction_start: int (struct tdb_context *)
+tdb_transaction_start_nonblock: int (struct tdb_context *)
+tdb_transaction_unlock: int (struct tdb_context *, int)
+tdb_traverse: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_traverse_read: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_unlock: int (struct tdb_context *, int, int)
+tdb_unlock_record: int (struct tdb_context *, tdb_off_t)
+tdb_unlockall: int (struct tdb_context *)
+tdb_unlockall_read: int (struct tdb_context *)
+tdb_validate_freelist: int (struct tdb_context *, int *)
+tdb_wipe_all: int (struct tdb_context *)
+tdb_write_lock_record: int (struct tdb_context *, tdb_off_t)
+tdb_write_unlock_record: int (struct tdb_context *, tdb_off_t)
diff --git a/ctdb/lib/tdb/ABI/tdb-1.2.10.sigs b/ctdb/lib/tdb/ABI/tdb-1.2.10.sigs
new file mode 100644 (file)
index 0000000..61f6c19
--- /dev/null
@@ -0,0 +1,66 @@
+tdb_add_flags: void (struct tdb_context *, unsigned int)
+tdb_append: int (struct tdb_context *, TDB_DATA, TDB_DATA)
+tdb_chainlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_mark: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_nonblock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_unmark: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_check: int (struct tdb_context *, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_close: int (struct tdb_context *)
+tdb_delete: int (struct tdb_context *, TDB_DATA)
+tdb_dump_all: void (struct tdb_context *)
+tdb_enable_seqnum: void (struct tdb_context *)
+tdb_error: enum TDB_ERROR (struct tdb_context *)
+tdb_errorstr: const char *(struct tdb_context *)
+tdb_exists: int (struct tdb_context *, TDB_DATA)
+tdb_fd: int (struct tdb_context *)
+tdb_fetch: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_firstkey: TDB_DATA (struct tdb_context *)
+tdb_freelist_size: int (struct tdb_context *)
+tdb_get_flags: int (struct tdb_context *)
+tdb_get_logging_private: void *(struct tdb_context *)
+tdb_get_seqnum: int (struct tdb_context *)
+tdb_hash_size: int (struct tdb_context *)
+tdb_increment_seqnum_nonblock: void (struct tdb_context *)
+tdb_jenkins_hash: unsigned int (TDB_DATA *)
+tdb_lock_nonblock: int (struct tdb_context *, int, int)
+tdb_lockall: int (struct tdb_context *)
+tdb_lockall_mark: int (struct tdb_context *)
+tdb_lockall_nonblock: int (struct tdb_context *)
+tdb_lockall_read: int (struct tdb_context *)
+tdb_lockall_read_nonblock: int (struct tdb_context *)
+tdb_lockall_unmark: int (struct tdb_context *)
+tdb_log_fn: tdb_log_func (struct tdb_context *)
+tdb_map_size: size_t (struct tdb_context *)
+tdb_name: const char *(struct tdb_context *)
+tdb_nextkey: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_null: dptr = 0xXXXX, dsize = 0
+tdb_open: struct tdb_context *(const char *, int, int, int, mode_t)
+tdb_open_ex: struct tdb_context *(const char *, int, int, int, mode_t, const struct tdb_logging_context *, tdb_hash_func)
+tdb_parse_record: int (struct tdb_context *, TDB_DATA, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_printfreelist: int (struct tdb_context *)
+tdb_remove_flags: void (struct tdb_context *, unsigned int)
+tdb_reopen: int (struct tdb_context *)
+tdb_reopen_all: int (int)
+tdb_repack: int (struct tdb_context *)
+tdb_set_logging_function: void (struct tdb_context *, const struct tdb_logging_context *)
+tdb_set_max_dead: void (struct tdb_context *, int)
+tdb_setalarm_sigptr: void (struct tdb_context *, volatile sig_atomic_t *)
+tdb_store: int (struct tdb_context *, TDB_DATA, TDB_DATA, int)
+tdb_summary: char *(struct tdb_context *)
+tdb_transaction_cancel: int (struct tdb_context *)
+tdb_transaction_commit: int (struct tdb_context *)
+tdb_transaction_prepare_commit: int (struct tdb_context *)
+tdb_transaction_start: int (struct tdb_context *)
+tdb_transaction_start_nonblock: int (struct tdb_context *)
+tdb_transaction_write_lock_mark: int (struct tdb_context *)
+tdb_transaction_write_lock_unmark: int (struct tdb_context *)
+tdb_traverse: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_traverse_read: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_unlock: int (struct tdb_context *, int, int)
+tdb_unlockall: int (struct tdb_context *)
+tdb_unlockall_read: int (struct tdb_context *)
+tdb_validate_freelist: int (struct tdb_context *, int *)
+tdb_wipe_all: int (struct tdb_context *)
diff --git a/ctdb/lib/tdb/ABI/tdb-1.2.11.sigs b/ctdb/lib/tdb/ABI/tdb-1.2.11.sigs
new file mode 100644 (file)
index 0000000..d727f21
--- /dev/null
@@ -0,0 +1,67 @@
+tdb_add_flags: void (struct tdb_context *, unsigned int)
+tdb_append: int (struct tdb_context *, TDB_DATA, TDB_DATA)
+tdb_chainlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_mark: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_nonblock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_unmark: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_check: int (struct tdb_context *, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_close: int (struct tdb_context *)
+tdb_delete: int (struct tdb_context *, TDB_DATA)
+tdb_dump_all: void (struct tdb_context *)
+tdb_enable_seqnum: void (struct tdb_context *)
+tdb_error: enum TDB_ERROR (struct tdb_context *)
+tdb_errorstr: const char *(struct tdb_context *)
+tdb_exists: int (struct tdb_context *, TDB_DATA)
+tdb_fd: int (struct tdb_context *)
+tdb_fetch: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_firstkey: TDB_DATA (struct tdb_context *)
+tdb_freelist_size: int (struct tdb_context *)
+tdb_get_flags: int (struct tdb_context *)
+tdb_get_logging_private: void *(struct tdb_context *)
+tdb_get_seqnum: int (struct tdb_context *)
+tdb_hash_size: int (struct tdb_context *)
+tdb_increment_seqnum_nonblock: void (struct tdb_context *)
+tdb_jenkins_hash: unsigned int (TDB_DATA *)
+tdb_lock_nonblock: int (struct tdb_context *, int, int)
+tdb_lockall: int (struct tdb_context *)
+tdb_lockall_mark: int (struct tdb_context *)
+tdb_lockall_nonblock: int (struct tdb_context *)
+tdb_lockall_read: int (struct tdb_context *)
+tdb_lockall_read_nonblock: int (struct tdb_context *)
+tdb_lockall_unmark: int (struct tdb_context *)
+tdb_log_fn: tdb_log_func (struct tdb_context *)
+tdb_map_size: size_t (struct tdb_context *)
+tdb_name: const char *(struct tdb_context *)
+tdb_nextkey: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_null: dptr = 0xXXXX, dsize = 0
+tdb_open: struct tdb_context *(const char *, int, int, int, mode_t)
+tdb_open_ex: struct tdb_context *(const char *, int, int, int, mode_t, const struct tdb_logging_context *, tdb_hash_func)
+tdb_parse_record: int (struct tdb_context *, TDB_DATA, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_printfreelist: int (struct tdb_context *)
+tdb_remove_flags: void (struct tdb_context *, unsigned int)
+tdb_reopen: int (struct tdb_context *)
+tdb_reopen_all: int (int)
+tdb_repack: int (struct tdb_context *)
+tdb_rescue: int (struct tdb_context *, void (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_set_logging_function: void (struct tdb_context *, const struct tdb_logging_context *)
+tdb_set_max_dead: void (struct tdb_context *, int)
+tdb_setalarm_sigptr: void (struct tdb_context *, volatile sig_atomic_t *)
+tdb_store: int (struct tdb_context *, TDB_DATA, TDB_DATA, int)
+tdb_summary: char *(struct tdb_context *)
+tdb_transaction_cancel: int (struct tdb_context *)
+tdb_transaction_commit: int (struct tdb_context *)
+tdb_transaction_prepare_commit: int (struct tdb_context *)
+tdb_transaction_start: int (struct tdb_context *)
+tdb_transaction_start_nonblock: int (struct tdb_context *)
+tdb_transaction_write_lock_mark: int (struct tdb_context *)
+tdb_transaction_write_lock_unmark: int (struct tdb_context *)
+tdb_traverse: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_traverse_read: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_unlock: int (struct tdb_context *, int, int)
+tdb_unlockall: int (struct tdb_context *)
+tdb_unlockall_read: int (struct tdb_context *)
+tdb_validate_freelist: int (struct tdb_context *, int *)
+tdb_wipe_all: int (struct tdb_context *)
diff --git a/ctdb/lib/tdb/ABI/tdb-1.2.2.sigs b/ctdb/lib/tdb/ABI/tdb-1.2.2.sigs
new file mode 100644 (file)
index 0000000..043790d
--- /dev/null
@@ -0,0 +1,60 @@
+tdb_add_flags: void (struct tdb_context *, unsigned int)
+tdb_append: int (struct tdb_context *, TDB_DATA, TDB_DATA)
+tdb_chainlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_mark: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_nonblock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_unmark: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_check: int (struct tdb_context *, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_close: int (struct tdb_context *)
+tdb_delete: int (struct tdb_context *, TDB_DATA)
+tdb_dump_all: void (struct tdb_context *)
+tdb_enable_seqnum: void (struct tdb_context *)
+tdb_error: enum TDB_ERROR (struct tdb_context *)
+tdb_errorstr: const char *(struct tdb_context *)
+tdb_exists: int (struct tdb_context *, TDB_DATA)
+tdb_fd: int (struct tdb_context *)
+tdb_fetch: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_firstkey: TDB_DATA (struct tdb_context *)
+tdb_freelist_size: int (struct tdb_context *)
+tdb_get_flags: int (struct tdb_context *)
+tdb_get_logging_private: void *(struct tdb_context *)
+tdb_get_seqnum: int (struct tdb_context *)
+tdb_hash_size: int (struct tdb_context *)
+tdb_increment_seqnum_nonblock: void (struct tdb_context *)
+tdb_lockall: int (struct tdb_context *)
+tdb_lockall_mark: int (struct tdb_context *)
+tdb_lockall_nonblock: int (struct tdb_context *)
+tdb_lockall_read: int (struct tdb_context *)
+tdb_lockall_read_nonblock: int (struct tdb_context *)
+tdb_lockall_unmark: int (struct tdb_context *)
+tdb_log_fn: tdb_log_func (struct tdb_context *)
+tdb_map_size: size_t (struct tdb_context *)
+tdb_name: const char *(struct tdb_context *)
+tdb_nextkey: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_null: dptr = 0xXXXX, dsize = 0
+tdb_open: struct tdb_context *(const char *, int, int, int, mode_t)
+tdb_open_ex: struct tdb_context *(const char *, int, int, int, mode_t, const struct tdb_logging_context *, tdb_hash_func)
+tdb_parse_record: int (struct tdb_context *, TDB_DATA, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_printfreelist: int (struct tdb_context *)
+tdb_remove_flags: void (struct tdb_context *, unsigned int)
+tdb_reopen: int (struct tdb_context *)
+tdb_reopen_all: int (int)
+tdb_repack: int (struct tdb_context *)
+tdb_set_logging_function: void (struct tdb_context *, const struct tdb_logging_context *)
+tdb_set_max_dead: void (struct tdb_context *, int)
+tdb_setalarm_sigptr: void (struct tdb_context *, volatile sig_atomic_t *)
+tdb_store: int (struct tdb_context *, TDB_DATA, TDB_DATA, int)
+tdb_transaction_cancel: int (struct tdb_context *)
+tdb_transaction_commit: int (struct tdb_context *)
+tdb_transaction_prepare_commit: int (struct tdb_context *)
+tdb_transaction_start: int (struct tdb_context *)
+tdb_transaction_start_nonblock: int (struct tdb_context *)
+tdb_traverse: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_traverse_read: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_unlockall: int (struct tdb_context *)
+tdb_unlockall_read: int (struct tdb_context *)
+tdb_validate_freelist: int (struct tdb_context *, int *)
+tdb_wipe_all: int (struct tdb_context *)
diff --git a/ctdb/lib/tdb/ABI/tdb-1.2.3.sigs b/ctdb/lib/tdb/ABI/tdb-1.2.3.sigs
new file mode 100644 (file)
index 0000000..043790d
--- /dev/null
@@ -0,0 +1,60 @@
+tdb_add_flags: void (struct tdb_context *, unsigned int)
+tdb_append: int (struct tdb_context *, TDB_DATA, TDB_DATA)
+tdb_chainlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_mark: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_nonblock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_unmark: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_check: int (struct tdb_context *, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_close: int (struct tdb_context *)
+tdb_delete: int (struct tdb_context *, TDB_DATA)
+tdb_dump_all: void (struct tdb_context *)
+tdb_enable_seqnum: void (struct tdb_context *)
+tdb_error: enum TDB_ERROR (struct tdb_context *)
+tdb_errorstr: const char *(struct tdb_context *)
+tdb_exists: int (struct tdb_context *, TDB_DATA)
+tdb_fd: int (struct tdb_context *)
+tdb_fetch: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_firstkey: TDB_DATA (struct tdb_context *)
+tdb_freelist_size: int (struct tdb_context *)
+tdb_get_flags: int (struct tdb_context *)
+tdb_get_logging_private: void *(struct tdb_context *)
+tdb_get_seqnum: int (struct tdb_context *)
+tdb_hash_size: int (struct tdb_context *)
+tdb_increment_seqnum_nonblock: void (struct tdb_context *)
+tdb_lockall: int (struct tdb_context *)
+tdb_lockall_mark: int (struct tdb_context *)
+tdb_lockall_nonblock: int (struct tdb_context *)
+tdb_lockall_read: int (struct tdb_context *)
+tdb_lockall_read_nonblock: int (struct tdb_context *)
+tdb_lockall_unmark: int (struct tdb_context *)
+tdb_log_fn: tdb_log_func (struct tdb_context *)
+tdb_map_size: size_t (struct tdb_context *)
+tdb_name: const char *(struct tdb_context *)
+tdb_nextkey: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_null: dptr = 0xXXXX, dsize = 0
+tdb_open: struct tdb_context *(const char *, int, int, int, mode_t)
+tdb_open_ex: struct tdb_context *(const char *, int, int, int, mode_t, const struct tdb_logging_context *, tdb_hash_func)
+tdb_parse_record: int (struct tdb_context *, TDB_DATA, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_printfreelist: int (struct tdb_context *)
+tdb_remove_flags: void (struct tdb_context *, unsigned int)
+tdb_reopen: int (struct tdb_context *)
+tdb_reopen_all: int (int)
+tdb_repack: int (struct tdb_context *)
+tdb_set_logging_function: void (struct tdb_context *, const struct tdb_logging_context *)
+tdb_set_max_dead: void (struct tdb_context *, int)
+tdb_setalarm_sigptr: void (struct tdb_context *, volatile sig_atomic_t *)
+tdb_store: int (struct tdb_context *, TDB_DATA, TDB_DATA, int)
+tdb_transaction_cancel: int (struct tdb_context *)
+tdb_transaction_commit: int (struct tdb_context *)
+tdb_transaction_prepare_commit: int (struct tdb_context *)
+tdb_transaction_start: int (struct tdb_context *)
+tdb_transaction_start_nonblock: int (struct tdb_context *)
+tdb_traverse: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_traverse_read: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_unlockall: int (struct tdb_context *)
+tdb_unlockall_read: int (struct tdb_context *)
+tdb_validate_freelist: int (struct tdb_context *, int *)
+tdb_wipe_all: int (struct tdb_context *)
diff --git a/ctdb/lib/tdb/ABI/tdb-1.2.4.sigs b/ctdb/lib/tdb/ABI/tdb-1.2.4.sigs
new file mode 100644 (file)
index 0000000..043790d
--- /dev/null
@@ -0,0 +1,60 @@
+tdb_add_flags: void (struct tdb_context *, unsigned int)
+tdb_append: int (struct tdb_context *, TDB_DATA, TDB_DATA)
+tdb_chainlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_mark: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_nonblock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_unmark: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_check: int (struct tdb_context *, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_close: int (struct tdb_context *)
+tdb_delete: int (struct tdb_context *, TDB_DATA)
+tdb_dump_all: void (struct tdb_context *)
+tdb_enable_seqnum: void (struct tdb_context *)
+tdb_error: enum TDB_ERROR (struct tdb_context *)
+tdb_errorstr: const char *(struct tdb_context *)
+tdb_exists: int (struct tdb_context *, TDB_DATA)
+tdb_fd: int (struct tdb_context *)
+tdb_fetch: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_firstkey: TDB_DATA (struct tdb_context *)
+tdb_freelist_size: int (struct tdb_context *)
+tdb_get_flags: int (struct tdb_context *)
+tdb_get_logging_private: void *(struct tdb_context *)
+tdb_get_seqnum: int (struct tdb_context *)
+tdb_hash_size: int (struct tdb_context *)
+tdb_increment_seqnum_nonblock: void (struct tdb_context *)
+tdb_lockall: int (struct tdb_context *)
+tdb_lockall_mark: int (struct tdb_context *)
+tdb_lockall_nonblock: int (struct tdb_context *)
+tdb_lockall_read: int (struct tdb_context *)
+tdb_lockall_read_nonblock: int (struct tdb_context *)
+tdb_lockall_unmark: int (struct tdb_context *)
+tdb_log_fn: tdb_log_func (struct tdb_context *)
+tdb_map_size: size_t (struct tdb_context *)
+tdb_name: const char *(struct tdb_context *)
+tdb_nextkey: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_null: dptr = 0xXXXX, dsize = 0
+tdb_open: struct tdb_context *(const char *, int, int, int, mode_t)
+tdb_open_ex: struct tdb_context *(const char *, int, int, int, mode_t, const struct tdb_logging_context *, tdb_hash_func)
+tdb_parse_record: int (struct tdb_context *, TDB_DATA, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_printfreelist: int (struct tdb_context *)
+tdb_remove_flags: void (struct tdb_context *, unsigned int)
+tdb_reopen: int (struct tdb_context *)
+tdb_reopen_all: int (int)
+tdb_repack: int (struct tdb_context *)
+tdb_set_logging_function: void (struct tdb_context *, const struct tdb_logging_context *)
+tdb_set_max_dead: void (struct tdb_context *, int)
+tdb_setalarm_sigptr: void (struct tdb_context *, volatile sig_atomic_t *)
+tdb_store: int (struct tdb_context *, TDB_DATA, TDB_DATA, int)
+tdb_transaction_cancel: int (struct tdb_context *)
+tdb_transaction_commit: int (struct tdb_context *)
+tdb_transaction_prepare_commit: int (struct tdb_context *)
+tdb_transaction_start: int (struct tdb_context *)
+tdb_transaction_start_nonblock: int (struct tdb_context *)
+tdb_traverse: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_traverse_read: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_unlockall: int (struct tdb_context *)
+tdb_unlockall_read: int (struct tdb_context *)
+tdb_validate_freelist: int (struct tdb_context *, int *)
+tdb_wipe_all: int (struct tdb_context *)
diff --git a/ctdb/lib/tdb/ABI/tdb-1.2.5.sigs b/ctdb/lib/tdb/ABI/tdb-1.2.5.sigs
new file mode 100644 (file)
index 0000000..1e01f3b
--- /dev/null
@@ -0,0 +1,61 @@
+tdb_add_flags: void (struct tdb_context *, unsigned int)
+tdb_append: int (struct tdb_context *, TDB_DATA, TDB_DATA)
+tdb_chainlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_mark: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_nonblock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_unmark: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_check: int (struct tdb_context *, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_close: int (struct tdb_context *)
+tdb_delete: int (struct tdb_context *, TDB_DATA)
+tdb_dump_all: void (struct tdb_context *)
+tdb_enable_seqnum: void (struct tdb_context *)
+tdb_error: enum TDB_ERROR (struct tdb_context *)
+tdb_errorstr: const char *(struct tdb_context *)
+tdb_exists: int (struct tdb_context *, TDB_DATA)
+tdb_fd: int (struct tdb_context *)
+tdb_fetch: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_firstkey: TDB_DATA (struct tdb_context *)
+tdb_freelist_size: int (struct tdb_context *)
+tdb_get_flags: int (struct tdb_context *)
+tdb_get_logging_private: void *(struct tdb_context *)
+tdb_get_seqnum: int (struct tdb_context *)
+tdb_hash_size: int (struct tdb_context *)
+tdb_increment_seqnum_nonblock: void (struct tdb_context *)
+tdb_jenkins_hash: unsigned int (TDB_DATA *)
+tdb_lockall: int (struct tdb_context *)
+tdb_lockall_mark: int (struct tdb_context *)
+tdb_lockall_nonblock: int (struct tdb_context *)
+tdb_lockall_read: int (struct tdb_context *)
+tdb_lockall_read_nonblock: int (struct tdb_context *)
+tdb_lockall_unmark: int (struct tdb_context *)
+tdb_log_fn: tdb_log_func (struct tdb_context *)
+tdb_map_size: size_t (struct tdb_context *)
+tdb_name: const char *(struct tdb_context *)
+tdb_nextkey: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_null: dptr = 0xXXXX, dsize = 0
+tdb_open: struct tdb_context *(const char *, int, int, int, mode_t)
+tdb_open_ex: struct tdb_context *(const char *, int, int, int, mode_t, const struct tdb_logging_context *, tdb_hash_func)
+tdb_parse_record: int (struct tdb_context *, TDB_DATA, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_printfreelist: int (struct tdb_context *)
+tdb_remove_flags: void (struct tdb_context *, unsigned int)
+tdb_reopen: int (struct tdb_context *)
+tdb_reopen_all: int (int)
+tdb_repack: int (struct tdb_context *)
+tdb_set_logging_function: void (struct tdb_context *, const struct tdb_logging_context *)
+tdb_set_max_dead: void (struct tdb_context *, int)
+tdb_setalarm_sigptr: void (struct tdb_context *, volatile sig_atomic_t *)
+tdb_store: int (struct tdb_context *, TDB_DATA, TDB_DATA, int)
+tdb_transaction_cancel: int (struct tdb_context *)
+tdb_transaction_commit: int (struct tdb_context *)
+tdb_transaction_prepare_commit: int (struct tdb_context *)
+tdb_transaction_start: int (struct tdb_context *)
+tdb_transaction_start_nonblock: int (struct tdb_context *)
+tdb_traverse: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_traverse_read: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_unlockall: int (struct tdb_context *)
+tdb_unlockall_read: int (struct tdb_context *)
+tdb_validate_freelist: int (struct tdb_context *, int *)
+tdb_wipe_all: int (struct tdb_context *)
diff --git a/ctdb/lib/tdb/ABI/tdb-1.2.6.sigs b/ctdb/lib/tdb/ABI/tdb-1.2.6.sigs
new file mode 100644 (file)
index 0000000..1e01f3b
--- /dev/null
@@ -0,0 +1,61 @@
+tdb_add_flags: void (struct tdb_context *, unsigned int)
+tdb_append: int (struct tdb_context *, TDB_DATA, TDB_DATA)
+tdb_chainlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_mark: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_nonblock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_unmark: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_check: int (struct tdb_context *, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_close: int (struct tdb_context *)
+tdb_delete: int (struct tdb_context *, TDB_DATA)
+tdb_dump_all: void (struct tdb_context *)
+tdb_enable_seqnum: void (struct tdb_context *)
+tdb_error: enum TDB_ERROR (struct tdb_context *)
+tdb_errorstr: const char *(struct tdb_context *)
+tdb_exists: int (struct tdb_context *, TDB_DATA)
+tdb_fd: int (struct tdb_context *)
+tdb_fetch: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_firstkey: TDB_DATA (struct tdb_context *)
+tdb_freelist_size: int (struct tdb_context *)
+tdb_get_flags: int (struct tdb_context *)
+tdb_get_logging_private: void *(struct tdb_context *)
+tdb_get_seqnum: int (struct tdb_context *)
+tdb_hash_size: int (struct tdb_context *)
+tdb_increment_seqnum_nonblock: void (struct tdb_context *)
+tdb_jenkins_hash: unsigned int (TDB_DATA *)
+tdb_lockall: int (struct tdb_context *)
+tdb_lockall_mark: int (struct tdb_context *)
+tdb_lockall_nonblock: int (struct tdb_context *)
+tdb_lockall_read: int (struct tdb_context *)
+tdb_lockall_read_nonblock: int (struct tdb_context *)
+tdb_lockall_unmark: int (struct tdb_context *)
+tdb_log_fn: tdb_log_func (struct tdb_context *)
+tdb_map_size: size_t (struct tdb_context *)
+tdb_name: const char *(struct tdb_context *)
+tdb_nextkey: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_null: dptr = 0xXXXX, dsize = 0
+tdb_open: struct tdb_context *(const char *, int, int, int, mode_t)
+tdb_open_ex: struct tdb_context *(const char *, int, int, int, mode_t, const struct tdb_logging_context *, tdb_hash_func)
+tdb_parse_record: int (struct tdb_context *, TDB_DATA, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_printfreelist: int (struct tdb_context *)
+tdb_remove_flags: void (struct tdb_context *, unsigned int)
+tdb_reopen: int (struct tdb_context *)
+tdb_reopen_all: int (int)
+tdb_repack: int (struct tdb_context *)
+tdb_set_logging_function: void (struct tdb_context *, const struct tdb_logging_context *)
+tdb_set_max_dead: void (struct tdb_context *, int)
+tdb_setalarm_sigptr: void (struct tdb_context *, volatile sig_atomic_t *)
+tdb_store: int (struct tdb_context *, TDB_DATA, TDB_DATA, int)
+tdb_transaction_cancel: int (struct tdb_context *)
+tdb_transaction_commit: int (struct tdb_context *)
+tdb_transaction_prepare_commit: int (struct tdb_context *)
+tdb_transaction_start: int (struct tdb_context *)
+tdb_transaction_start_nonblock: int (struct tdb_context *)
+tdb_traverse: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_traverse_read: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_unlockall: int (struct tdb_context *)
+tdb_unlockall_read: int (struct tdb_context *)
+tdb_validate_freelist: int (struct tdb_context *, int *)
+tdb_wipe_all: int (struct tdb_context *)
diff --git a/ctdb/lib/tdb/ABI/tdb-1.2.7.sigs b/ctdb/lib/tdb/ABI/tdb-1.2.7.sigs
new file mode 100644 (file)
index 0000000..1e01f3b
--- /dev/null
@@ -0,0 +1,61 @@
+tdb_add_flags: void (struct tdb_context *, unsigned int)
+tdb_append: int (struct tdb_context *, TDB_DATA, TDB_DATA)
+tdb_chainlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_mark: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_nonblock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_unmark: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_check: int (struct tdb_context *, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_close: int (struct tdb_context *)
+tdb_delete: int (struct tdb_context *, TDB_DATA)
+tdb_dump_all: void (struct tdb_context *)
+tdb_enable_seqnum: void (struct tdb_context *)
+tdb_error: enum TDB_ERROR (struct tdb_context *)
+tdb_errorstr: const char *(struct tdb_context *)
+tdb_exists: int (struct tdb_context *, TDB_DATA)
+tdb_fd: int (struct tdb_context *)
+tdb_fetch: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_firstkey: TDB_DATA (struct tdb_context *)
+tdb_freelist_size: int (struct tdb_context *)
+tdb_get_flags: int (struct tdb_context *)
+tdb_get_logging_private: void *(struct tdb_context *)
+tdb_get_seqnum: int (struct tdb_context *)
+tdb_hash_size: int (struct tdb_context *)
+tdb_increment_seqnum_nonblock: void (struct tdb_context *)
+tdb_jenkins_hash: unsigned int (TDB_DATA *)
+tdb_lockall: int (struct tdb_context *)
+tdb_lockall_mark: int (struct tdb_context *)
+tdb_lockall_nonblock: int (struct tdb_context *)
+tdb_lockall_read: int (struct tdb_context *)
+tdb_lockall_read_nonblock: int (struct tdb_context *)
+tdb_lockall_unmark: int (struct tdb_context *)
+tdb_log_fn: tdb_log_func (struct tdb_context *)
+tdb_map_size: size_t (struct tdb_context *)
+tdb_name: const char *(struct tdb_context *)
+tdb_nextkey: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_null: dptr = 0xXXXX, dsize = 0
+tdb_open: struct tdb_context *(const char *, int, int, int, mode_t)
+tdb_open_ex: struct tdb_context *(const char *, int, int, int, mode_t, const struct tdb_logging_context *, tdb_hash_func)
+tdb_parse_record: int (struct tdb_context *, TDB_DATA, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_printfreelist: int (struct tdb_context *)
+tdb_remove_flags: void (struct tdb_context *, unsigned int)
+tdb_reopen: int (struct tdb_context *)
+tdb_reopen_all: int (int)
+tdb_repack: int (struct tdb_context *)
+tdb_set_logging_function: void (struct tdb_context *, const struct tdb_logging_context *)
+tdb_set_max_dead: void (struct tdb_context *, int)
+tdb_setalarm_sigptr: void (struct tdb_context *, volatile sig_atomic_t *)
+tdb_store: int (struct tdb_context *, TDB_DATA, TDB_DATA, int)
+tdb_transaction_cancel: int (struct tdb_context *)
+tdb_transaction_commit: int (struct tdb_context *)
+tdb_transaction_prepare_commit: int (struct tdb_context *)
+tdb_transaction_start: int (struct tdb_context *)
+tdb_transaction_start_nonblock: int (struct tdb_context *)
+tdb_traverse: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_traverse_read: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_unlockall: int (struct tdb_context *)
+tdb_unlockall_read: int (struct tdb_context *)
+tdb_validate_freelist: int (struct tdb_context *, int *)
+tdb_wipe_all: int (struct tdb_context *)
diff --git a/ctdb/lib/tdb/ABI/tdb-1.2.8.sigs b/ctdb/lib/tdb/ABI/tdb-1.2.8.sigs
new file mode 100644 (file)
index 0000000..1e01f3b
--- /dev/null
@@ -0,0 +1,61 @@
+tdb_add_flags: void (struct tdb_context *, unsigned int)
+tdb_append: int (struct tdb_context *, TDB_DATA, TDB_DATA)
+tdb_chainlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_mark: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_nonblock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_unmark: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_check: int (struct tdb_context *, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_close: int (struct tdb_context *)
+tdb_delete: int (struct tdb_context *, TDB_DATA)
+tdb_dump_all: void (struct tdb_context *)
+tdb_enable_seqnum: void (struct tdb_context *)
+tdb_error: enum TDB_ERROR (struct tdb_context *)
+tdb_errorstr: const char *(struct tdb_context *)
+tdb_exists: int (struct tdb_context *, TDB_DATA)
+tdb_fd: int (struct tdb_context *)
+tdb_fetch: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_firstkey: TDB_DATA (struct tdb_context *)
+tdb_freelist_size: int (struct tdb_context *)
+tdb_get_flags: int (struct tdb_context *)
+tdb_get_logging_private: void *(struct tdb_context *)
+tdb_get_seqnum: int (struct tdb_context *)
+tdb_hash_size: int (struct tdb_context *)
+tdb_increment_seqnum_nonblock: void (struct tdb_context *)
+tdb_jenkins_hash: unsigned int (TDB_DATA *)
+tdb_lockall: int (struct tdb_context *)
+tdb_lockall_mark: int (struct tdb_context *)
+tdb_lockall_nonblock: int (struct tdb_context *)
+tdb_lockall_read: int (struct tdb_context *)
+tdb_lockall_read_nonblock: int (struct tdb_context *)
+tdb_lockall_unmark: int (struct tdb_context *)
+tdb_log_fn: tdb_log_func (struct tdb_context *)
+tdb_map_size: size_t (struct tdb_context *)
+tdb_name: const char *(struct tdb_context *)
+tdb_nextkey: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_null: dptr = 0xXXXX, dsize = 0
+tdb_open: struct tdb_context *(const char *, int, int, int, mode_t)
+tdb_open_ex: struct tdb_context *(const char *, int, int, int, mode_t, const struct tdb_logging_context *, tdb_hash_func)
+tdb_parse_record: int (struct tdb_context *, TDB_DATA, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_printfreelist: int (struct tdb_context *)
+tdb_remove_flags: void (struct tdb_context *, unsigned int)
+tdb_reopen: int (struct tdb_context *)
+tdb_reopen_all: int (int)
+tdb_repack: int (struct tdb_context *)
+tdb_set_logging_function: void (struct tdb_context *, const struct tdb_logging_context *)
+tdb_set_max_dead: void (struct tdb_context *, int)
+tdb_setalarm_sigptr: void (struct tdb_context *, volatile sig_atomic_t *)
+tdb_store: int (struct tdb_context *, TDB_DATA, TDB_DATA, int)
+tdb_transaction_cancel: int (struct tdb_context *)
+tdb_transaction_commit: int (struct tdb_context *)
+tdb_transaction_prepare_commit: int (struct tdb_context *)
+tdb_transaction_start: int (struct tdb_context *)
+tdb_transaction_start_nonblock: int (struct tdb_context *)
+tdb_traverse: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_traverse_read: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_unlockall: int (struct tdb_context *)
+tdb_unlockall_read: int (struct tdb_context *)
+tdb_validate_freelist: int (struct tdb_context *, int *)
+tdb_wipe_all: int (struct tdb_context *)
diff --git a/ctdb/lib/tdb/ABI/tdb-1.2.9.sigs b/ctdb/lib/tdb/ABI/tdb-1.2.9.sigs
new file mode 100644 (file)
index 0000000..9e4149b
--- /dev/null
@@ -0,0 +1,62 @@
+tdb_add_flags: void (struct tdb_context *, unsigned int)
+tdb_append: int (struct tdb_context *, TDB_DATA, TDB_DATA)
+tdb_chainlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_mark: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_nonblock: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_chainlock_unmark: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock: int (struct tdb_context *, TDB_DATA)
+tdb_chainunlock_read: int (struct tdb_context *, TDB_DATA)
+tdb_check: int (struct tdb_context *, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_close: int (struct tdb_context *)
+tdb_delete: int (struct tdb_context *, TDB_DATA)
+tdb_dump_all: void (struct tdb_context *)
+tdb_enable_seqnum: void (struct tdb_context *)
+tdb_error: enum TDB_ERROR (struct tdb_context *)
+tdb_errorstr: const char *(struct tdb_context *)
+tdb_exists: int (struct tdb_context *, TDB_DATA)
+tdb_fd: int (struct tdb_context *)
+tdb_fetch: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_firstkey: TDB_DATA (struct tdb_context *)
+tdb_freelist_size: int (struct tdb_context *)
+tdb_get_flags: int (struct tdb_context *)
+tdb_get_logging_private: void *(struct tdb_context *)
+tdb_get_seqnum: int (struct tdb_context *)
+tdb_hash_size: int (struct tdb_context *)
+tdb_increment_seqnum_nonblock: void (struct tdb_context *)
+tdb_jenkins_hash: unsigned int (TDB_DATA *)
+tdb_lockall: int (struct tdb_context *)
+tdb_lockall_mark: int (struct tdb_context *)
+tdb_lockall_nonblock: int (struct tdb_context *)
+tdb_lockall_read: int (struct tdb_context *)
+tdb_lockall_read_nonblock: int (struct tdb_context *)
+tdb_lockall_unmark: int (struct tdb_context *)
+tdb_log_fn: tdb_log_func (struct tdb_context *)
+tdb_map_size: size_t (struct tdb_context *)
+tdb_name: const char *(struct tdb_context *)
+tdb_nextkey: TDB_DATA (struct tdb_context *, TDB_DATA)
+tdb_null: dptr = 0xXXXX, dsize = 0
+tdb_open: struct tdb_context *(const char *, int, int, int, mode_t)
+tdb_open_ex: struct tdb_context *(const char *, int, int, int, mode_t, const struct tdb_logging_context *, tdb_hash_func)
+tdb_parse_record: int (struct tdb_context *, TDB_DATA, int (*)(TDB_DATA, TDB_DATA, void *), void *)
+tdb_printfreelist: int (struct tdb_context *)
+tdb_remove_flags: void (struct tdb_context *, unsigned int)
+tdb_reopen: int (struct tdb_context *)
+tdb_reopen_all: int (int)
+tdb_repack: int (struct tdb_context *)
+tdb_set_logging_function: void (struct tdb_context *, const struct tdb_logging_context *)
+tdb_set_max_dead: void (struct tdb_context *, int)
+tdb_setalarm_sigptr: void (struct tdb_context *, volatile sig_atomic_t *)
+tdb_store: int (struct tdb_context *, TDB_DATA, TDB_DATA, int)
+tdb_summary: char *(struct tdb_context *)
+tdb_transaction_cancel: int (struct tdb_context *)
+tdb_transaction_commit: int (struct tdb_context *)
+tdb_transaction_prepare_commit: int (struct tdb_context *)
+tdb_transaction_start: int (struct tdb_context *)
+tdb_transaction_start_nonblock: int (struct tdb_context *)
+tdb_traverse: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_traverse_read: int (struct tdb_context *, tdb_traverse_func, void *)
+tdb_unlockall: int (struct tdb_context *)
+tdb_unlockall_read: int (struct tdb_context *)
+tdb_validate_freelist: int (struct tdb_context *, int *)
+tdb_wipe_all: int (struct tdb_context *)
diff --git a/ctdb/lib/tdb/common/check.c b/ctdb/lib/tdb/common/check.c
new file mode 100644 (file)
index 0000000..313f55c
--- /dev/null
@@ -0,0 +1,472 @@
+ /*
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Rusty Russell            2009
+
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "tdb_private.h"
+
+/* Re-read and validate the on-disk header; on success store hdr.recovery_start in *recovery.  Since we opened it, these shouldn't fail unless it's recent corruption. */
+static bool tdb_check_header(struct tdb_context *tdb, tdb_off_t *recovery)
+{
+       struct tdb_header hdr;
+       uint32_t h1, h2;
+
+       if (tdb->methods->tdb_read(tdb, 0, &hdr, sizeof(hdr), 0) == -1)
+               return false;   /* read failure: bypasses the "corrupt" label, so no log and no ecode set here */
+       if (strcmp(hdr.magic_food, TDB_MAGIC_FOOD) != 0)
+               goto corrupt;
+
+       CONVERT(hdr);   /* NOTE(review): presumably byte-order conversion of hdr -- confirm in tdb_private.h */
+       if (hdr.version != TDB_VERSION)
+               goto corrupt;
+
+       if (hdr.rwlocks != 0 && hdr.rwlocks != TDB_HASH_RWLOCK_MAGIC)
+               goto corrupt;
+
+       tdb_header_hash(tdb, &h1, &h2); /* stored magic hashes are only compared when both are non-zero */
+       if (hdr.magic1_hash && hdr.magic2_hash &&
+           (hdr.magic1_hash != h1 || hdr.magic2_hash != h2))
+               goto corrupt;
+
+       if (hdr.hash_size == 0)
+               goto corrupt;
+
+       if (hdr.hash_size != tdb->header.hash_size)     /* must agree with the header cached in the context */
+               goto corrupt;
+
+       if (hdr.recovery_start != 0 &&  /* non-zero recovery_start must lie at or after TDB_DATA_START */
+           hdr.recovery_start < TDB_DATA_START(tdb->header.hash_size))
+               goto corrupt;
+
+       *recovery = hdr.recovery_start;
+       return true;
+
+corrupt:
+       tdb->ecode = TDB_ERR_CORRUPT;
+       TDB_LOG((tdb, TDB_DEBUG_ERROR, "Header is corrupt\n"));
+       return false;
+}
+
+/* Generic record header check: validates rec->next, rec->rec_len and the tailer of the record at off; sets TDB_ERR_CORRUPT and returns false on failure. */
+static bool tdb_check_record(struct tdb_context *tdb,
+                            tdb_off_t off,
+                            const struct tdb_record *rec)
+{
+       tdb_off_t tailer;
+
+       /* Check rec->next: 0 or points to record offset, aligned. */
+       if (rec->next > 0 && rec->next < TDB_DATA_START(tdb->header.hash_size)){
+               TDB_LOG((tdb, TDB_DEBUG_ERROR,
+                        "Record offset %d too small next %d\n",
+                        off, rec->next));
+               goto corrupt;
+       }
+       if (rec->next + sizeof(*rec) < rec->next) {     /* NOTE(review): if rec->next is 32-bit and size_t is 64-bit this sum cannot wrap, so the test never fires there -- confirm types */
+               TDB_LOG((tdb, TDB_DEBUG_ERROR,
+                        "Record offset %d too large next %d\n",
+                        off, rec->next));
+               goto corrupt;
+       }
+       if ((rec->next % TDB_ALIGNMENT) != 0) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR,
+                        "Record offset %d misaligned next %d\n",
+                        off, rec->next));
+               goto corrupt;
+       }
+       if (tdb->methods->tdb_oob(tdb, rec->next, sizeof(*rec), 0))     /* no message logged here on failure */
+               goto corrupt;
+
+       /* Check rec_len: similar to rec->next, implies next record. */
+       if ((rec->rec_len % TDB_ALIGNMENT) != 0) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR,
+                        "Record offset %d misaligned length %d\n",
+                        off, rec->rec_len));
+               goto corrupt;
+       }
+       /* Must fit tailer. */
+       if (rec->rec_len < sizeof(tailer)) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR,
+                        "Record offset %d too short length %d\n",
+                        off, rec->rec_len));
+               goto corrupt;
+       }
+       /* OOB allows "right at the end" access, so this works for last rec. */
+       if (tdb->methods->tdb_oob(tdb, off, sizeof(*rec)+rec->rec_len, 0))
+               goto corrupt;
+
+       /* Check tailer: the last tdb_off_t of the record must equal header size + rec_len. */
+       if (tdb_ofs_read(tdb, off+sizeof(*rec)+rec->rec_len-sizeof(tailer),
+                        &tailer) == -1)
+               goto corrupt;
+       if (tailer != sizeof(*rec) + rec->rec_len) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR,
+                        "Record offset %d invalid tailer\n", off));
+               goto corrupt;
+       }
+
+       return true;
+
+corrupt:
+       tdb->ecode = TDB_ERR_CORRUPT;
+       return false;
+}
+
+/* Return len bytes at off as TDB_DATA: a pointer straight into the mmap when
+   possible, otherwise a tdb_alloc_read() copy.  Caller has bounds-checked. */
+static TDB_DATA get_bytes(struct tdb_context *tdb,
+                         tdb_off_t off, tdb_len_t len)
+{
+       TDB_DATA out;
+       bool use_map = (tdb->transaction == NULL && tdb->map_ptr != NULL);
+
+       out.dsize = len;
+       if (use_map)
+               out.dptr = (unsigned char *)tdb->map_ptr + off;
+       else
+               out.dptr = tdb_alloc_read(tdb, off, out.dsize);
+       return out;
+}
+
+/* Undo get_bytes(): free the buffer unless it points into the mmap. */
+static void put_bytes(struct tdb_context *tdb, TDB_DATA d)
+{
+       bool mapped = (tdb->transaction == NULL && tdb->map_ptr != NULL);
+       if (!mapped)
+               free(d.dptr);
+}
+
+/* We use the excellent Jenkins lookup3 hash; this is based on hash_word2.
+ * See: http://burtleburtle.net/bob/c/lookup3.c
+ * Mixes one 32-bit key with the seeds in *pc and *pb and writes two hash values back to them. */
+#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))        /* 32-bit left-rotate of x by k bits */
+static void hash(uint32_t key, uint32_t *pc, uint32_t *pb)
+{
+       uint32_t a,b,c;
+
+       /* Set up the internal state */
+       a = b = c = 0xdeadbeef + *pc;   /* *pc seeds all three lanes */
+       c += *pb;                       /* *pb additionally seeds c */
+       a += key;
+       c ^= b; c -= rot(b,14);         /* mixing rounds: statement order matters */
+       a ^= c; a -= rot(c,11);
+       b ^= a; b -= rot(a,25);
+       c ^= b; c -= rot(b,16);
+       a ^= c; a -= rot(c,4);
+       b ^= a; b -= rot(a,14);
+       c ^= b; c -= rot(b,24);
+       *pc=c; *pb=b;                   /* both outputs overwrite the seeds */
+}
+
+/*
+  We want to check that all free records are in the free list
+  (only once), and all free list entries are free records.  Similarly
+  for each hash chain of used records.
+
+  Doing that naively (without walking hash chains, since we want to be
+  linear) means keeping a list of records which have been seen in each
+  hash chain, and another of records pointed to (ie. next pointers
+  from records and the initial hash chain heads).  These two lists
+  should be equal.  This will take 8 bytes per record, and require
+  sorting at the end.
+
+  So instead, we record each offset in a bitmap in such a way that
+  recording it twice will cancel out.  Since each offset should appear
+  exactly twice, the bitmap should be zero at the end.
+
+  The approach was inspired by Bloom Filters (see Wikipedia).  For
+  each value, we flip K bits in a bitmap of size N.  The number of
+  distinct arrangements is:
+
+       N! / (K! * (N-K)!)
+
+  Of course, not all arrangements are actually distinct, but testing
+  shows this formula to be close enough.
+
+  So, if K == 8 and N == 256, the probability of two things flipping the same
+  bits is 1 in 409,663,695,276,000.
+
+  Given that ldb uses a hash size of 10000, using 32 bytes per hash chain
+  (320k) seems reasonable.
+*/
+#define NUM_HASHES 8
+#define BITMAP_BITS 256
+
+static void bit_flip(unsigned char bits[], unsigned int idx)   /* toggle bit number idx in the byte array bits */
+{
+       bits[idx / CHAR_BIT] ^= (1 << (idx % CHAR_BIT));        /* byte idx/CHAR_BIT, bit idx%CHAR_BIT; flipping twice restores it */
+}
+
+/* We record offsets in a bitmap for the particular chain it should be in: flips NUM_HASHES bits derived from off, so recording the same off twice cancels out. */
+static void record_offset(unsigned char bits[], tdb_off_t off)
+{
+       uint32_t h1 = off, h2 = 0;
+       unsigned int i;
+
+       /* We get two good hash values out of jhash2, so we use both.  Then
+        * we keep going to produce further hash values. */
+       for (i = 0; i < NUM_HASHES / 2; i++) {
+               hash(off, &h1, &h2);    /* h1/h2 carry over as seeds for the next round */
+               bit_flip(bits, h1 % BITMAP_BITS);
+               bit_flip(bits, h2 % BITMAP_BITS);
+               h2++;                   /* perturb the seed so the next round differs */
+       }
+}
+
+/* Check that an in-use record is valid: header, length budget, key hash; record it in its chain bitmap; optionally feed key/data to check(). */
+static bool tdb_check_used_record(struct tdb_context *tdb,
+                                 tdb_off_t off,
+                                 const struct tdb_record *rec,
+                                 unsigned char **hashes,
+                                 int (*check)(TDB_DATA, TDB_DATA, void *),
+                                 void *private_data)
+{
+       TDB_DATA key, data;
+
+       if (!tdb_check_record(tdb, off, rec))
+               return false;
+
+       /* key + data + tailer must fit in record; sum in 64 bits so huge corrupt lengths cannot wrap past this check */
+       if ((uint64_t)rec->key_len + rec->data_len + sizeof(tdb_off_t) > rec->rec_len) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR,
+                        "Record offset %d too short for contents\n", off));
+               return false;
+       }
+
+       key = get_bytes(tdb, off + sizeof(*rec), rec->key_len);        /* key bytes follow the record header */
+       if (!key.dptr)
+               return false;
+
+       if (tdb->hash_fn(&key) != rec->full_hash) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR,
+                        "Record offset %d has incorrect hash\n", off));
+               goto fail_put_key;
+       }
+
+       /* Mark this offset as a known value for this hash bucket (slot 0 of hashes is the free list, hence +1). */
+       record_offset(hashes[BUCKET(rec->full_hash)+1], off);
+       /* And similarly if the next pointer is valid. */
+       if (rec->next)
+               record_offset(hashes[BUCKET(rec->full_hash)+1], rec->next);
+
+       /* If they supply a check function and this record isn't dead,
+          get data and feed it. */
+       if (check && rec->magic != TDB_DEAD_MAGIC) {
+               data = get_bytes(tdb, off + sizeof(*rec) + rec->key_len,
+                                rec->data_len);
+               if (!data.dptr)
+                       goto fail_put_key;
+
+               if (check(key, data, private_data) == -1)       /* callback veto fails the whole check */
+                       goto fail_put_data;
+               put_bytes(tdb, data);
+       }
+
+       put_bytes(tdb, key);
+       return true;
+
+fail_put_data:
+       put_bytes(tdb, data);
+fail_put_key:
+       put_bytes(tdb, key);
+       return false;
+}
+
+/* Check that an unused record is valid and account for it (and its next pointer) in the free-list bitmap. */
+static bool tdb_check_free_record(struct tdb_context *tdb,
+                                 tdb_off_t off,
+                                 const struct tdb_record *rec,
+                                 unsigned char **hashes)
+{
+       if (!tdb_check_record(tdb, off, rec))
+               return false;
+
+       /* Mark this offset as a known value for the free list (bitmap slot 0). */
+       record_offset(hashes[0], off);
+       /* And similarly if the next pointer is valid. */
+       if (rec->next)
+               record_offset(hashes[0], rec->next);
+       return true;
+}
+
+/* Slow, but should be very rare. */
+size_t tdb_dead_space(struct tdb_context *tdb, tdb_off_t off)
+{
+       size_t len;
+
+       for (len = 0; off + len < tdb->map_size; len++) {
+               char c;
+               if (tdb->methods->tdb_read(tdb, off, &c, 1, 0))
+                       return 0;
+               if (c != 0 && c != 0x42)
+                       break;
+       }
+       return len;
+}
+
+_PUBLIC_ int tdb_check(struct tdb_context *tdb,
+             int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
+             void *private_data)
+{      /* Whole-file consistency check: returns 0 if everything validates, -1 otherwise; check may be NULL. */
+       unsigned int h;
+       unsigned char **hashes;
+       tdb_off_t off, recovery_start;
+       struct tdb_record rec;
+       bool found_recovery = false;
+       tdb_len_t dead;
+       bool locked;
+
+       /* Read-only databases use no locking at all: it's best-effort.
+        * We may have a write lock already, so skip that case too. */
+       if (tdb->read_only || tdb->allrecord_lock.count != 0) {
+               locked = false;
+       } else {
+               if (tdb_lockall_read(tdb) == -1)
+                       return -1;
+               locked = true;  /* remembered so the exit paths release only what was taken here */
+       }
+
+       /* Make sure we know true size of the underlying file. */
+       tdb->methods->tdb_oob(tdb, tdb->map_size, 1, 1);
+
+       /* Header must be OK: also gets us the recovery ptr, if any. */
+       if (!tdb_check_header(tdb, &recovery_start))
+               goto unlock;
+
+       /* We should have the whole header, too. */
+       if (tdb->map_size < TDB_DATA_START(tdb->header.hash_size)) {
+               tdb->ecode = TDB_ERR_CORRUPT;
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "File too short for hashes\n"));
+               goto unlock;
+       }
+
+       /* One big malloc: (1+hash_size) pointers, then (1+hash_size) zeroed bitmaps of BITMAP_BITS bits each. */
+       hashes = (unsigned char **)calloc(
+                       1, sizeof(hashes[0]) * (1+tdb->header.hash_size)
+                       + BITMAP_BITS / CHAR_BIT * (1+tdb->header.hash_size));
+       if (!hashes) {
+               tdb->ecode = TDB_ERR_OOM;
+               goto unlock;
+       }
+
+       /* Initialize pointers: hashes[0] is used for the free list, hashes[BUCKET(hash)+1] for used records. */
+       hashes[0] = (unsigned char *)(&hashes[1+tdb->header.hash_size]);
+       for (h = 1; h < 1+tdb->header.hash_size; h++)
+               hashes[h] = hashes[h-1] + BITMAP_BITS / CHAR_BIT;
+
+       /* Freelist and hash headers are all in a row: read them. */
+       for (h = 0; h < 1+tdb->header.hash_size; h++) {
+               if (tdb_ofs_read(tdb, FREELIST_TOP + h*sizeof(tdb_off_t),
+                                &off) == -1)
+                       goto free;
+               if (off)
+                       record_offset(hashes[h], off);  /* chain head counts as one reference to that record */
+       }
+
+       /* For each record, read it in and check it's ok. */
+       for (off = TDB_DATA_START(tdb->header.hash_size);
+            off < tdb->map_size;
+            off += sizeof(rec) + rec.rec_len) {        /* rec.rec_len may have been rewritten by the dead-space case */
+               if (tdb->methods->tdb_read(tdb, off, &rec, sizeof(rec),
+                                          DOCONV()) == -1)
+                       goto free;
+               switch (rec.magic) {
+               case TDB_MAGIC:
+               case TDB_DEAD_MAGIC:
+                       if (!tdb_check_used_record(tdb, off, &rec, hashes,
+                                                  check, private_data))
+                               goto free;
+                       break;
+               case TDB_FREE_MAGIC:
+                       if (!tdb_check_free_record(tdb, off, &rec, hashes))
+                               goto free;
+                       break;
+               /* If we crash after ftruncate, we can get zeroes or fill. */
+               case TDB_RECOVERY_INVALID_MAGIC:
+               case 0x42424242:
+                       if (recovery_start == off) {
+                               found_recovery = true;
+                               break;
+                       }
+                       dead = tdb_dead_space(tdb, off);
+                       if (dead < sizeof(rec))
+                               goto corrupt;
+
+                       TDB_LOG((tdb, TDB_DEBUG_ERROR,
+                                "Dead space at %d-%d (of %u)\n",
+                                off, off + dead, tdb->map_size));
+                       rec.rec_len = dead - sizeof(rec);       /* make the loop increment skip exactly the dead span */
+                       break;
+               case TDB_RECOVERY_MAGIC:
+                       if (recovery_start != off) {
+                               TDB_LOG((tdb, TDB_DEBUG_ERROR,
+                                        "Unexpected recovery record at offset %d\n",
+                                        off));
+                               goto free;      /* NOTE(review): tdb->ecode is not set on this path */
+                       }
+                       found_recovery = true;
+                       break;
+               default: ;
+               corrupt:        /* also reached by goto from the dead-space case above */
+                       tdb->ecode = TDB_ERR_CORRUPT;
+                       TDB_LOG((tdb, TDB_DEBUG_ERROR,
+                                "Bad magic 0x%x at offset %d\n",
+                                rec.magic, off));
+                       goto free;
+               }
+       }
+
+       /* Now, hashes should all be empty: each record exists and is referred
+        * to by one other. */
+       for (h = 0; h < 1+tdb->header.hash_size; h++) {
+               unsigned int i;
+               for (i = 0; i < BITMAP_BITS / CHAR_BIT; i++) {
+                       if (hashes[h][i] != 0) {
+                               tdb->ecode = TDB_ERR_CORRUPT;
+                               TDB_LOG((tdb, TDB_DEBUG_ERROR,
+                                        "Hashes do not match records\n"));
+                               goto free;
+                       }
+               }
+       }
+
+       /* We must have found recovery area if there was one. */
+       if (recovery_start != 0 && !found_recovery) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR,
+                        "Expected a recovery area at %u\n",
+                        recovery_start));
+               goto free;      /* NOTE(review): tdb->ecode is not set on this path */
+       }
+
+       free(hashes);
+       if (locked) {
+               tdb_unlockall_read(tdb);
+       }
+       return 0;
+
+free:
+       free(hashes);
+unlock:
+       if (locked) {
+               tdb_unlockall_read(tdb);
+       }
+       return -1;
+}
diff --git a/ctdb/lib/tdb/common/dump.c b/ctdb/lib/tdb/common/dump.c
new file mode 100644 (file)
index 0000000..67de04e
--- /dev/null
@@ -0,0 +1,137 @@
+ /* 
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Andrew Tridgell              1999-2005
+   Copyright (C) Paul `Rusty' Russell             2000
+   Copyright (C) Jeremy Allison                           2000-2003
+
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "tdb_private.h"
+
+static tdb_off_t tdb_dump_record(struct tdb_context *tdb, int hash,
+                                tdb_off_t offset)
+{      /* Print the record header at offset and verify its tailer; returns rec.next, or 0 if the header cannot be read. */
+       struct tdb_record rec;
+       tdb_off_t tailer_ofs, tailer;
+
+       if (tdb->methods->tdb_read(tdb, offset, (char *)&rec, 
+                                  sizeof(rec), DOCONV()) == -1) {
+               printf("ERROR: failed to read record at %u\n", offset);
+               return 0;       /* 0 ends the caller's chain walk */
+       }
+
+       printf(" rec: hash=%d offset=0x%08x next=0x%08x rec_len=%d "
+              "key_len=%d data_len=%d full_hash=0x%x magic=0x%x\n",
+              hash, offset, rec.next, rec.rec_len, rec.key_len, rec.data_len,
+              rec.full_hash, rec.magic);
+
+       tailer_ofs = offset + sizeof(rec) + rec.rec_len - sizeof(tdb_off_t);    /* tailer is the last tdb_off_t of the record */
+
+       if (tdb_ofs_read(tdb, tailer_ofs, &tailer) == -1) {
+               printf("ERROR: failed to read tailer at %u\n", tailer_ofs);
+               return rec.next;        /* keep walking the chain even though the tailer is unreadable */
+       }
+
+       if (tailer != rec.rec_len + sizeof(rec)) {
+               printf("ERROR: tailer does not match record! tailer=%u totalsize=%u\n",
+                               (unsigned int)tailer, (unsigned int)(rec.rec_len + sizeof(rec)));
+       }
+       return rec.next;
+}
+
+static int tdb_dump_chain(struct tdb_context *tdb, int i)
+{      /* Dump every record on chain i under a write lock on that chain; returns -1 if the lock fails, else tdb_unlock()'s result. */
+       tdb_off_t rec_ptr, top;
+
+       top = TDB_HASH_TOP(i);
+
+       if (tdb_lock(tdb, i, F_WRLCK) != 0)
+               return -1;
+
+       if (tdb_ofs_read(tdb, top, &rec_ptr) == -1)
+               return tdb_unlock(tdb, i, F_WRLCK);     /* read failure is not reported separately from the unlock result */
+
+       if (rec_ptr)
+               printf("hash=%d\n", i);
+
+       while (rec_ptr) {
+               rec_ptr = tdb_dump_record(tdb, i, rec_ptr);     /* returns the next offset; 0 ends the walk */
+       }
+
+       return tdb_unlock(tdb, i, F_WRLCK);
+}
+
+_PUBLIC_ void tdb_dump_all(struct tdb_context *tdb)
+{      /* Print every hash chain, then the chain at index -1 under a "freelist:" heading; per-chain errors are ignored. */
+       int i;
+       for (i=0;i<tdb->header.hash_size;i++) {
+               tdb_dump_chain(tdb, i);
+       }
+       printf("freelist:\n");
+       tdb_dump_chain(tdb, -1);        /* index -1 is dumped as the free list */
+}
+
+_PUBLIC_ int tdb_printfreelist(struct tdb_context *tdb)
+{      /* Print each free-list entry and the total free rec_len; returns non-zero on lock/read/magic failure, else tdb_unlock()'s result. */
+       int ret;
+       long total_free = 0;
+       tdb_off_t offset, rec_ptr;
+       struct tdb_record rec;
+
+       if ((ret = tdb_lock(tdb, -1, F_WRLCK)) != 0)
+               return ret;
+
+       offset = FREELIST_TOP;
+
+       /* read in the freelist top */
+       if (tdb_ofs_read(tdb, offset, &rec_ptr) == -1) {
+               tdb_unlock(tdb, -1, F_WRLCK);
+               return -1;      /* a failed read is an error, like every other failure path below */
+       }
+
+       printf("freelist top=[0x%08x]\n", rec_ptr );
+       while (rec_ptr) {
+               if (tdb->methods->tdb_read(tdb, rec_ptr, (char *)&rec, 
+                                          sizeof(rec), DOCONV()) == -1) {
+                       tdb_unlock(tdb, -1, F_WRLCK);
+                       return -1;
+               }
+
+               if (rec.magic != TDB_FREE_MAGIC) {      /* anything else on the free list is treated as fatal here */
+                       printf("bad magic 0x%08x in free list\n", rec.magic);
+                       tdb_unlock(tdb, -1, F_WRLCK);
+                       return -1;
+               }
+
+               printf("entry offset=[0x%08x], rec.rec_len = [0x%08x (%d)] (end = 0x%08x)\n", 
+                      rec_ptr, rec.rec_len, rec.rec_len, rec_ptr + rec.rec_len);
+               total_free += rec.rec_len;
+
+               /* move to the next record */
+               rec_ptr = rec.next;
+       }
+       printf("total rec_len = [0x%08x (%d)]\n", (int)total_free, 
+               (int)total_free);
+
+       return tdb_unlock(tdb, -1, F_WRLCK);
+}
+
diff --git a/ctdb/lib/tdb/common/error.c b/ctdb/lib/tdb/common/error.c
new file mode 100644 (file)
index 0000000..2aaaa81
--- /dev/null
@@ -0,0 +1,57 @@
+ /* 
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Andrew Tridgell              1999-2005
+   Copyright (C) Paul `Rusty' Russell             2000
+   Copyright (C) Jeremy Allison                           2000-2003
+
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "tdb_private.h"
+
+_PUBLIC_ enum TDB_ERROR tdb_error(struct tdb_context *tdb)     /* accessor for the context's current error code */
+{
+       return tdb->ecode;      /* returned as stored; nothing is cleared or modified */
+}
+
+static struct tdb_errname {    /* one row per TDB_ERROR code; searched linearly by tdb_errorstr() */
+       enum TDB_ERROR ecode; const char *estring;      /* code and its human-readable message */
+} emap[] = { {TDB_SUCCESS, "Success"},
+            {TDB_ERR_CORRUPT, "Corrupt database"},
+            {TDB_ERR_IO, "IO Error"},
+            {TDB_ERR_LOCK, "Locking error"},
+            {TDB_ERR_OOM, "Out of memory"},
+            {TDB_ERR_EXISTS, "Record exists"},
+            {TDB_ERR_NOLOCK, "Lock exists on other keys"},
+            {TDB_ERR_EINVAL, "Invalid parameter"},
+            {TDB_ERR_NOEXIST, "Record does not exist"},
+            {TDB_ERR_RDONLY, "write not permitted"} }; /* codes missing from this table yield "Invalid error code" */
+
+/* Look up tdb->ecode in emap and return its message; unknown codes get a fixed fallback string. */
+_PUBLIC_ const char *tdb_errorstr(struct tdb_context *tdb)
+{
+       const struct tdb_errname *e, *end = emap + sizeof(emap) / sizeof(emap[0]);
+       for (e = emap; e != end; e++)
+               if (e->ecode == tdb->ecode)
+                       return e->estring;
+       return "Invalid error code";
+}
+
diff --git a/ctdb/lib/tdb/common/freelist.c b/ctdb/lib/tdb/common/freelist.c
new file mode 100644 (file)
index 0000000..6358f64
--- /dev/null
@@ -0,0 +1,386 @@
+ /* 
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Andrew Tridgell              1999-2005
+   Copyright (C) Paul `Rusty' Russell             2000
+   Copyright (C) Jeremy Allison                           2000-2003
+
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "tdb_private.h"
+
+/* 'right' merges can involve O(n^2) cost when combined with a
+   traverse, so they are disabled until we find a way to do them in 
+   O(1) time
+*/
+#define USE_RIGHT_MERGES 0
+
+/*
+   Read the freelist record header at 'off' into 'rec' and check it for
+   simple errors.
+
+   A header that still carries TDB_MAGIC (i.e. looks like a live record) is
+   repaired: its magic is rewritten as TDB_FREE_MAGIC, both in 'rec' and on
+   disk.  Any other unexpected magic sets TDB_ERR_CORRUPT.  Finally
+   rec->next is passed through the tdb_oob method so that callers can follow
+   the chain.
+
+   Returns 0 on success, -1 on read/write failure, bad magic or an
+   out-of-bounds next pointer.
+*/
+int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off, struct tdb_record *rec)
+{
+       if (tdb->methods->tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
+               return -1;
+
+       if (rec->magic == TDB_MAGIC) {
+               /* this happens when an app is shut down while deleting a record -
+                  we should not completely fail when this happens */
+               /* offsets are printed with %u, matching the other tdb_off_t
+                  log messages in this file */
+               TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_rec_free_read non-free magic 0x%x at offset=%u - fixing\n",
+                        rec->magic, off));
+               rec->magic = TDB_FREE_MAGIC;
+               if (tdb->methods->tdb_write(tdb, off, rec, sizeof(*rec)) == -1)
+                       return -1;
+       }
+
+       if (rec->magic != TDB_FREE_MAGIC) {
+               /* Ensure ecode is set for log fn. */
+               tdb->ecode = TDB_ERR_CORRUPT;
+               TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_rec_free_read bad magic 0x%x at offset=%u\n",
+                          rec->magic, off));
+               return -1;
+       }
+       /* make sure a whole record header fits at the next link */
+       if (tdb->methods->tdb_oob(tdb, rec->next, sizeof(*rec), 0) != 0)
+               return -1;
+       return 0;
+}
+
+
+#if USE_RIGHT_MERGES
+/* Remove an element from the freelist.  Must have alloc lock.
+
+   Walks the chain starting at FREELIST_TOP looking for the link whose
+   value is 'off' and overwrites that link with 'next'.  Returns the
+   result of tdb_ofs_write() when the element is found.  If the walk ends
+   without finding it (end of list, or a failed tdb_ofs_read()), sets
+   TDB_ERR_CORRUPT and returns -1.
+
+   Only compiled when USE_RIGHT_MERGES is non-zero. */
+static int remove_from_freelist(struct tdb_context *tdb, tdb_off_t off, tdb_off_t next)
+{
+       tdb_off_t last_ptr, i;
+
+       /* read in the freelist top */
+       last_ptr = FREELIST_TOP;
+       while (tdb_ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
+               if (i == off) {
+                       /* We've found it! */
+                       return tdb_ofs_write(tdb, last_ptr, &next);
+               }
+               /* Follow chain (next offset is at start of record) */
+               last_ptr = i;
+       }
+       tdb->ecode = TDB_ERR_CORRUPT;
+       TDB_LOG((tdb, TDB_DEBUG_FATAL,"remove_from_freelist: not on list at off=%d\n", off));
+       return -1;
+}
+#endif
+
+
+/* Rewrite the tailer of the record at 'offset' (caller must hold the
+   allocation lock).  The tailer is the last tdb_off_t of the record and
+   stores the record's total size: header plus rec->rec_len.
+   Returns the result of tdb_ofs_write(). */
+static int update_tailer(struct tdb_context *tdb, tdb_off_t offset,
+                        const struct tdb_record *rec)
+{
+       /* Distance from the record header to the end of the record */
+       tdb_off_t totalsize = sizeof(*rec) + rec->rec_len;
+       /* The tailer sits in the final word of the record */
+       tdb_off_t tailer_ofs = offset + totalsize - sizeof(tdb_off_t);
+
+       return tdb_ofs_write(tdb, tailer_ofs, &totalsize);
+}
+
+/* Add an element into the freelist. Merge adjacent records if
+   necessary.
+
+   'offset' is the file offset of the record header and 'rec' is its
+   in-memory copy.  If the record immediately to the left is free, this
+   record is absorbed into it; otherwise the record is marked free and
+   pushed onto the head of the freelist.  Right merges are only compiled
+   when USE_RIGHT_MERGES is non-zero.
+
+   Takes the allocation lock for the duration of the call.
+   Returns 0 on success, -1 on failure. */
+int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
+{
+       /* Allocation and tailer lock */
+       if (tdb_lock(tdb, -1, F_WRLCK) != 0)
+               return -1;
+
+       /* set an initial tailer, so if we fail we don't leave a bogus record */
+       if (update_tailer(tdb, offset, rec) != 0) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed!\n"));
+               goto fail;
+       }
+
+#if USE_RIGHT_MERGES
+       /* Look right first (I'm an Australian, dammit) */
+       if (offset + sizeof(*rec) + rec->rec_len + sizeof(*rec) <= tdb->map_size) {
+               tdb_off_t right = offset + sizeof(*rec) + rec->rec_len;
+               struct tdb_record r;
+
+               if (tdb->methods->tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: right read failed at %u\n", right));
+                       goto left;
+               }
+
+               /* If it's free, expand to include it. */
+               if (r.magic == TDB_FREE_MAGIC) {
+                       if (remove_from_freelist(tdb, right, r.next) == -1) {
+                               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: right free failed at %u\n", right));
+                               goto left;
+                       }
+                       rec->rec_len += sizeof(r) + r.rec_len;
+                       if (update_tailer(tdb, offset, rec) == -1) {
+                               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed at %u\n", offset));
+                               goto fail;
+                       }
+               }
+       }
+left:
+#endif
+
+       /* Look left: only if a tailer could exist between the start of the
+          data area and this record */
+       if (offset - sizeof(tdb_off_t) > TDB_DATA_START(tdb->header.hash_size)) {
+               tdb_off_t left = offset - sizeof(tdb_off_t);
+               struct tdb_record l;
+               tdb_off_t leftsize;
+
+               /* Read in tailer and jump back to header.  A failed read is
+                  not fatal: the merge is skipped and the record is simply
+                  added to the freelist. */
+               if (tdb_ofs_read(tdb, left, &leftsize) == -1) {
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: left offset read failed at %u\n", left));
+                       goto update;
+               }
+
+               /* it could be uninitialised data */
+               if (leftsize == 0 || leftsize == TDB_PAD_U32) {
+                       goto update;
+               }
+
+               left = offset - leftsize;
+
+               /* Reject a tailer value that would place the left record
+                  before the data area (or wrap around) */
+               if (leftsize > offset ||
+                   left < TDB_DATA_START(tdb->header.hash_size)) {
+                       goto update;
+               }
+
+               /* Now read in the left record */
+               if (tdb->methods->tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
+                       goto update;
+               }
+
+               /* If it's free, expand to include it. */
+               if (l.magic == TDB_FREE_MAGIC) {
+                       /* we now merge the new record into the left record, rather than the other
+                          way around. This makes the operation O(1) instead of O(n). This change
+                          prevents traverse from being O(n^2) after a lot of deletes */
+                       l.rec_len += sizeof(*rec) + rec->rec_len;
+                       if (tdb_rec_write(tdb, left, &l) == -1) {
+                               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_left failed at %u\n", left));
+                               goto fail;
+                       }
+                       /* NOTE(review): the tailer being updated belongs to the
+                          record at 'left', but the message below prints 'offset' */
+                       if (update_tailer(tdb, left, &l) == -1) {
+                               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free: update_tailer failed at %u\n", offset));
+                               goto fail;
+                       }
+                       /* The left record is already on the freelist, so
+                          there is nothing to link in */
+                       tdb_unlock(tdb, -1, F_WRLCK);
+                       return 0;
+               }
+       }
+
+update:
+
+       /* Now, prepend to free list */
+       rec->magic = TDB_FREE_MAGIC;
+
+       /* Point the record at the current head, write the record, then
+          make the head point at the record */
+       if (tdb_ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
+           tdb_rec_write(tdb, offset, rec) == -1 ||
+           tdb_ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_free record write failed at offset=%d\n", offset));
+               goto fail;
+       }
+
+       /* And we're done. */
+       tdb_unlock(tdb, -1, F_WRLCK);
+       return 0;
+
+ fail:
+       tdb_unlock(tdb, -1, F_WRLCK);
+       return -1;
+}
+
+
+
+/* 
+   the core of tdb_allocate - called when we have decided which
+   free list entry to use
+
+   Note that we try to allocate by grabbing data from the end of an existing record,
+   not the beginning. This is so the left merge in a free is more likely to be
+   able to free up the record without fragmentation
+ */
+static tdb_off_t tdb_allocate_ofs(struct tdb_context *tdb, 
+                                 tdb_len_t length, tdb_off_t rec_ptr,
+                                 struct tdb_record *rec, tdb_off_t last_ptr)
+{
+#define MIN_REC_SIZE (sizeof(struct tdb_record) + sizeof(tdb_off_t) + 8)
+
+       if (rec->rec_len < length + MIN_REC_SIZE) {
+               /* we have to grab the whole record */
+
+               /* unlink it from the previous record */
+               if (tdb_ofs_write(tdb, last_ptr, &rec->next) == -1) {
+                       return 0;
+               }
+
+               /* mark it not free */
+               rec->magic = TDB_MAGIC;
+               if (tdb_rec_write(tdb, rec_ptr, rec) == -1) {
+                       return 0;
+               }
+               return rec_ptr;
+       }
+
+       /* we're going to just shorten the existing record */
+       rec->rec_len -= (length + sizeof(*rec));
+       if (tdb_rec_write(tdb, rec_ptr, rec) == -1) {
+               return 0;
+       }
+       if (update_tailer(tdb, rec_ptr, rec) == -1) {
+               return 0;
+       }
+
+       /* and setup the new record */
+       rec_ptr += sizeof(*rec) + rec->rec_len; 
+
+       memset(rec, '\0', sizeof(*rec));
+       rec->rec_len = length;
+       rec->magic = TDB_MAGIC;
+
+       if (tdb_rec_write(tdb, rec_ptr, rec) == -1) {
+               return 0;
+       }
+
+       if (update_tailer(tdb, rec_ptr, rec) == -1) {
+               return 0;
+       }
+
+       return rec_ptr;
+}
+
+/* allocate some space from the free list. The offset returned points
+   to an unconnected tdb_record within the database with room for at
+   least length bytes of total data
+
+   The requested length is first padded by 25%, then increased by the
+   size of the tailer and rounded with TDB_ALIGN.  The freelist is then
+   searched for the best (smallest sufficient) fit; if none is found the
+   database is expanded via tdb_expand() and the search is repeated.
+
+   'rec' is used as scratch space while walking the list; on success it
+   is left as written by tdb_allocate_ofs().
+
+   Takes the allocation lock for the duration of the call.
+
+   0 is returned if the space could not be allocated
+ */
+tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct tdb_record *rec)
+{
+       tdb_off_t rec_ptr, last_ptr, newrec_ptr;
+       /* smallest sufficiently large free record seen so far, and the
+          link that points at it */
+       struct {
+               tdb_off_t rec_ptr, last_ptr;
+               tdb_len_t rec_len;
+       } bestfit;
+       /* tolerance for "good enough": grows by 5% per record scanned */
+       float multiplier = 1.0;
+
+       if (tdb_lock(tdb, -1, F_WRLCK) == -1)
+               return 0;
+
+       /* over-allocate to reduce fragmentation */
+       /* NOTE(review): this multiplication is done in floating point and
+          converted back to tdb_len_t; behaviour for very large lengths
+          should be confirmed */
+       length *= 1.25;
+
+       /* Extra bytes required for tailer */
+       length += sizeof(tdb_off_t);
+       length = TDB_ALIGN(length, TDB_ALIGNMENT);
+
+ again:
+       last_ptr = FREELIST_TOP;
+
+       /* read in the freelist top */
+       if (tdb_ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
+               goto fail;
+
+       bestfit.rec_ptr = 0;
+       bestfit.last_ptr = 0;
+       bestfit.rec_len = 0;
+
+       /*
+          this is a best fit allocation strategy. Originally we used
+          a first fit strategy, but it suffered from massive fragmentation
+          issues when faced with a slowly increasing record size.
+        */
+       while (rec_ptr) {
+               if (tdb_rec_free_read(tdb, rec_ptr, rec) == -1) {
+                       goto fail;
+               }
+
+               /* remember this record if it is big enough and smaller
+                  than the best candidate so far */
+               if (rec->rec_len >= length) {
+                       if (bestfit.rec_ptr == 0 ||
+                           rec->rec_len < bestfit.rec_len) {
+                               bestfit.rec_len = rec->rec_len;
+                               bestfit.rec_ptr = rec_ptr;
+                               bestfit.last_ptr = last_ptr;
+                       }
+               }
+
+               /* move to the next record */
+               last_ptr = rec_ptr;
+               rec_ptr = rec->next;
+
+               /* if we've found a record that is big enough, then
+                  stop searching if its also not too big. The
+                  definition of 'too big' changes as we scan
+                  through */
+               if (bestfit.rec_len > 0 &&
+                   bestfit.rec_len < length * multiplier) {
+                       break;
+               }
+
+               /* this multiplier means we only extremely rarely
+                  search more than 50 or so records. At 50 records we
+                  accept records up to 11 times larger than what we
+                  want */
+               multiplier *= 1.05;
+       }
+
+       if (bestfit.rec_ptr != 0) {
+               /* 'rec' currently holds the last record scanned, so
+                  re-read the chosen one before carving it up */
+               if (tdb_rec_free_read(tdb, bestfit.rec_ptr, rec) == -1) {
+                       goto fail;
+               }
+
+               newrec_ptr = tdb_allocate_ofs(tdb, length, bestfit.rec_ptr, 
+                                             rec, bestfit.last_ptr);
+               tdb_unlock(tdb, -1, F_WRLCK);
+               return newrec_ptr;
+       }
+
+       /* we didn't find enough space. See if we can expand the
+          database and if we can then try again */
+       if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
+               goto again;
+ fail:
+       tdb_unlock(tdb, -1, F_WRLCK);
+       return 0;
+}
+
+
+
+/*
+   Count the entries on the freelist - used to decide if we should repack.
+
+   Returns the number of entries, or -1 if the allocation lock cannot be
+   taken.  The walk ends at the end of the list or at the first offset
+   read that fails; in both cases the count so far is returned.
+*/
+_PUBLIC_ int tdb_freelist_size(struct tdb_context *tdb)
+{
+       tdb_off_t cursor;
+       int entries = 0;
+
+       if (tdb_lock(tdb, -1, F_RDLCK) == -1) {
+               return -1;
+       }
+
+       cursor = FREELIST_TOP;
+       for (;;) {
+               /* each link is replaced by the offset it points at */
+               if (tdb_ofs_read(tdb, cursor, &cursor) != 0) {
+                       break;
+               }
+               if (cursor == 0) {
+                       break;
+               }
+               entries++;
+       }
+
+       tdb_unlock(tdb, -1, F_RDLCK);
+       return entries;
+}
diff --git a/ctdb/lib/tdb/common/freelistcheck.c b/ctdb/lib/tdb/common/freelistcheck.c
new file mode 100644 (file)
index 0000000..ab6e78f
--- /dev/null
@@ -0,0 +1,109 @@
+/*
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Jeremy Allison                    2006
+
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "tdb_private.h"
+
+/* Check the freelist is good and contains no loops.
+   Very memory intensive - only do this as a consistency
+   checker. Heh heh - uses an in memory tdb as the storage
+   for the "seen" record list. For some reason this strikes
+   me as extremely clever as I don't have to write another tree
+   data structure implementation :-).
+ */
+
+/* Record 'rec_ptr' in the scratch database, keyed by the offset itself
+   with an empty value.  TDB_INSERT is used, so the return value of
+   tdb_store() tells the caller whether the insert succeeded. */
+static int seen_insert(struct tdb_context *mem_tdb, tdb_off_t rec_ptr)
+{
+       TDB_DATA key;
+       TDB_DATA empty;
+
+       key.dsize = sizeof(rec_ptr);
+       key.dptr = (unsigned char *)&rec_ptr;
+       memset(&empty, '\0', sizeof(empty));
+
+       return tdb_store(mem_tdb, key, empty, TDB_INSERT);
+}
+
+/*
+   Walk the freelist of 'tdb' and check that it contains no loops.
+
+   Every offset visited is inserted into a temporary TDB_INTERNAL
+   database; a failed insert is taken to mean the offset has been seen
+   before, in which case the list loops and TDB_ERR_CORRUPT is set.
+
+   *pnum_entries receives the number of freelist records walked (it is
+   reset to 0 on entry).
+
+   Returns 0 if the freelist is consistent and -1 on any failure,
+   including failure to create the scratch database or to take the
+   allocation lock.
+*/
+_PUBLIC_ int tdb_validate_freelist(struct tdb_context *tdb, int *pnum_entries)
+{
+       struct tdb_context *mem_tdb = NULL;
+       struct tdb_record rec;
+       tdb_off_t rec_ptr;
+       int ret = -1;
+
+       *pnum_entries = 0;
+
+       mem_tdb = tdb_open("flval", tdb->header.hash_size,
+                               TDB_INTERNAL, O_RDWR, 0600);
+       if (!mem_tdb) {
+               return -1;
+       }
+
+       if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
+               /* Nothing was validated, so this must not be reported
+                  as a good (and empty) freelist. */
+               tdb_close(mem_tdb);
+               return -1;
+       }
+
+       /* Store the FREELIST_TOP record. */
+       if (seen_insert(mem_tdb, FREELIST_TOP) == -1) {
+               tdb->ecode = TDB_ERR_CORRUPT;
+               goto fail;
+       }
+
+       /* read in the freelist top */
+       if (tdb_ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1) {
+               goto fail;
+       }
+
+       while (rec_ptr) {
+
+               /* If we can't store this record (we've seen it
+                  before) then the free list has a loop and must
+                  be corrupt. */
+
+               if (seen_insert(mem_tdb, rec_ptr)) {
+                       tdb->ecode = TDB_ERR_CORRUPT;
+                       goto fail;
+               }
+
+               if (tdb_rec_free_read(tdb, rec_ptr, &rec) == -1) {
+                       goto fail;
+               }
+
+               /* move to the next record */
+               rec_ptr = rec.next;
+               *pnum_entries += 1;
+       }
+
+       ret = 0;
+
+  fail:
+
+       tdb_close(mem_tdb);
+       tdb_unlock(tdb, -1, F_WRLCK);
+       return ret;
+}
diff --git a/ctdb/lib/tdb/common/hash.c b/ctdb/lib/tdb/common/hash.c
new file mode 100644 (file)
index 0000000..1eed722
--- /dev/null
@@ -0,0 +1,345 @@
+ /*
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Rusty Russell            2010
+
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "tdb_private.h"
+
+/* Legacy tdb hash function, based on the hash algorithm from gdbm. */
+unsigned int tdb_old_hash(TDB_DATA *key)
+{
+       uint32_t hash;  /* running hash value */
+       uint32_t pos;   /* index of the key byte being folded in */
+
+       /* Seed from the key size, then add every key byte shifted left by
+          a position-dependent amount. */
+       hash = 0x238F13AF * key->dsize;
+       for (pos = 0; pos < key->dsize; pos++) {
+               hash += key->dptr[pos] << (pos*5 % 24);
+       }
+
+       return (1103515243 * hash + 12345);
+}
+
+#ifndef WORDS_BIGENDIAN
+# define HASH_LITTLE_ENDIAN 1
+# define HASH_BIG_ENDIAN 0
+#else
+# define HASH_LITTLE_ENDIAN 0
+# define HASH_BIG_ENDIAN 1
+#endif
+
+/*
+-------------------------------------------------------------------------------
+lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+
+These are functions for producing 32-bit hashes for hash table lookup.
+hash_word(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
+are externally useful functions.  Routines to test the hash are included
+if SELF_TEST is defined.  You can use this free for any purpose.  It's in
+the public domain.  It has no warranty.
+
+You probably want to use hashlittle().  hashlittle() and hashbig()
+hash byte arrays.  hashlittle() is faster than hashbig() on
+little-endian machines.  Intel and AMD are little-endian machines.
+On second thought, you probably want hashlittle2(), which is identical to
+hashlittle() except it returns two 32-bit hashes for the price of one.
+You could implement hashbig2() if you wanted but I haven't bothered here.
+
+If you want to find a hash of, say, exactly 7 integers, do
+  a = i1;  b = i2;  c = i3;
+  mix(a,b,c);
+  a += i4; b += i5; c += i6;
+  mix(a,b,c);
+  a += i7;
+  final(a,b,c);
+then use c as the hash value.  If you have a variable length array of
+4-byte integers to hash, use hash_word().  If you have a byte array (like
+a character string), use hashlittle().  If you have several byte arrays, or
+a mix of things, see the comments above hashlittle().
+
+Why is this so big?  I read 12 bytes at a time into 3 4-byte integers,
+then mix those integers.  This is fast (you can do a lot more thorough
+mixing with 12*3 instructions on 3 integers than you can with 3 instructions
+on 1 byte), but shoehorning those bytes into integers efficiently is messy.
+*/
+
+#define hashsize(n) ((uint32_t)1<<(n))
+#define hashmask(n) (hashsize(n)-1)
+#define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
+
+/*
+-------------------------------------------------------------------------------
+mix -- mix 3 32-bit values reversibly.
+
+This is reversible, so any information in (a,b,c) before mix() is
+still in (a,b,c) after mix().
+
+If four pairs of (a,b,c) inputs are run through mix(), or through
+mix() in reverse, there are at least 32 bits of the output that
+are sometimes the same for one pair and different for another pair.
+This was tested for:
+* pairs that differed by one bit, by two bits, in any combination
+  of top bits of (a,b,c), or in any combination of bottom bits of
+  (a,b,c).
+* "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
+  the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+  is commonly produced by subtraction) look like a single 1-bit
+  difference.
+* the base values were pseudorandom, all zero but one bit set, or
+  all zero plus a counter that starts at zero.
+
+Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
+satisfy this are
+    4  6  8 16 19  4
+    9 15  3 18 27 15
+   14  9  3  7 17  3
+Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
+for "differ" defined as + with a one-bit base and a two-bit delta.  I
+used http://burtleburtle.net/bob/hash/avalanche.html to choose
+the operations, constants, and arrangements of the variables.
+
+This does not achieve avalanche.  There are input bits of (a,b,c)
+that fail to affect some output bits of (a,b,c), especially of a.  The
+most thoroughly mixed value is c, but it doesn't really even achieve
+avalanche in c.
+
+This allows some parallelism.  Read-after-writes are good at doubling
+the number of bits affected, so the goal of mixing pulls in the opposite
+direction as the goal of parallelism.  I did what I could.  Rotates
+seem to cost as much as shifts on every machine I could lay my hands
+on, and rotates are much kinder to the top and bottom bits, so I used
+rotates.
+-------------------------------------------------------------------------------
+*/
+#define mix(a,b,c) \
+{ \
+  a -= c;  a ^= rot(c, 4);  c += b; \
+  b -= a;  b ^= rot(a, 6);  a += c; \
+  c -= b;  c ^= rot(b, 8);  b += a; \
+  a -= c;  a ^= rot(c,16);  c += b; \
+  b -= a;  b ^= rot(a,19);  a += c; \
+  c -= b;  c ^= rot(b, 4);  b += a; \
+}
+
+/*
+-------------------------------------------------------------------------------
+final -- final mixing of 3 32-bit values (a,b,c) into c
+
+Pairs of (a,b,c) values differing in only a few bits will usually
+produce values of c that look totally different.  This was tested for
+* pairs that differed by one bit, by two bits, in any combination
+  of top bits of (a,b,c), or in any combination of bottom bits of
+  (a,b,c).
+* "differ" is defined as +, -, ^, or ~^.  For + and -, I transformed
+  the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+  is commonly produced by subtraction) look like a single 1-bit
+  difference.
+* the base values were pseudorandom, all zero but one bit set, or
+  all zero plus a counter that starts at zero.
+
+These constants passed:
+ 14 11 25 16 4 14 24
+ 12 14 25 16 4 14 24
+and these came close:
+  4  8 15 26 3 22 24
+ 10  8 15 26 3 22 24
+ 11  8 15 26 3 22 24
+-------------------------------------------------------------------------------
+*/
+#define final(a,b,c) \
+{ \
+  c ^= b; c -= rot(b,14); \
+  a ^= c; a -= rot(c,11); \
+  b ^= a; b -= rot(a,25); \
+  c ^= b; c -= rot(b,16); \
+  a ^= c; a -= rot(c,4);  \
+  b ^= a; b -= rot(a,14); \
+  c ^= b; c -= rot(b,24); \
+}
+
+
+/*
+-------------------------------------------------------------------------------
+hashlittle() -- hash a variable-length key into a 32-bit value
+  key     : the key (the unaligned variable-length array of bytes)
+  length  : the length of the key, counting by bytes
+  (this variant takes no val2/initval argument and produces a single hash)
+Returns a 32-bit value.  Every bit of the key affects every bit of
+the return value.  Two keys differing by one or two bits will have
+totally different hash values.
+
+The best hash table sizes are powers of 2.  There is no need to do
+mod a prime (mod is sooo slow!).  If you need less than 32 bits,
+use a bitmask.  For example, if you need only 10 bits, do
+  h = (h & hashmask(10));
+In which case, the hash table should have hashsize(10) elements.
+
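+The original lookup3 hashlittle() can hash n strings (uint8_t **)k by
+chaining the previous result through a third (initval) argument:
+  for (i=0, h=0; i<n; ++i) h = hashlittle( k[i], len[i], h);
+The two-argument variant below has no such parameter.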
+
+By Bob Jenkins, 2006.  bob_jenkins@burtleburtle.net.  You may use this
+code any way you wish, private, educational, or commercial.  It's free.
+
+Use for hash table lookup, or anything where one collision in 2^^32 is
+acceptable.  Do NOT use for cryptographic purposes.
+-------------------------------------------------------------------------------
+*/
+
+static uint32_t hashlittle( const void *key, size_t length )
+{
+  uint32_t a,b,c;                                          /* internal state */
+  union { const void *ptr; size_t i; } u;     /* needed for Mac Powerbook G4 */
+
+  /* Set up the internal state: all three words start from the same
+     length-dependent seed */
+  a = b = c = 0xdeadbeef + ((uint32_t)length);
+
+  /* The key's address (viewed as an integer through the union) selects one
+     of three read strategies: 32-bit reads when 4-byte aligned, 16-bit reads
+     when 2-byte aligned, byte reads otherwise.  The first two are only
+     taken when HASH_LITTLE_ENDIAN is non-zero. */
+  u.ptr = key;
+  if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) {
+    const uint32_t *k = (const uint32_t *)key;         /* read 32-bit chunks */
+    const uint8_t  *k8;
+
+    /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
+    while (length > 12)
+    {
+      a += k[0];
+      b += k[1];
+      c += k[2];
+      mix(a,b,c);
+      length -= 12;
+      k += 3;
+    }
+
+    /*----------------------------- handle the last (probably partial) block */
+    /* whole 32-bit words are read through k, leftover bytes through k8 */
+    k8 = (const uint8_t *)k;
+    switch(length)
+    {
+    case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
+    case 11: c+=((uint32_t)k8[10])<<16;  /* fall through */
+    case 10: c+=((uint32_t)k8[9])<<8;    /* fall through */
+    case 9 : c+=k8[8];                   /* fall through */
+    case 8 : b+=k[1]; a+=k[0]; break;
+    case 7 : b+=((uint32_t)k8[6])<<16;   /* fall through */
+    case 6 : b+=((uint32_t)k8[5])<<8;    /* fall through */
+    case 5 : b+=k8[4];                   /* fall through */
+    case 4 : a+=k[0]; break;
+    case 3 : a+=((uint32_t)k8[2])<<16;   /* fall through */
+    case 2 : a+=((uint32_t)k8[1])<<8;    /* fall through */
+    case 1 : a+=k8[0]; break;
+    case 0 : return c;                   /* nothing left: skip final() */
+    }
+  } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) {
+    const uint16_t *k = (const uint16_t *)key;         /* read 16-bit chunks */
+    const uint8_t  *k8;
+
+    /*--------------- all but last block: aligned reads and different mixing */
+    while (length > 12)
+    {
+      a += k[0] + (((uint32_t)k[1])<<16);
+      b += k[2] + (((uint32_t)k[3])<<16);
+      c += k[4] + (((uint32_t)k[5])<<16);
+      mix(a,b,c);
+      length -= 12;
+      k += 6;
+    }
+
+    /*----------------------------- handle the last (probably partial) block */
+    /* whole 16-bit halves are read through k, a trailing odd byte through k8 */
+    k8 = (const uint8_t *)k;
+    switch(length)
+    {
+    case 12: c+=k[4]+(((uint32_t)k[5])<<16);
+             b+=k[2]+(((uint32_t)k[3])<<16);
+             a+=k[0]+(((uint32_t)k[1])<<16);
+             break;
+    case 11: c+=((uint32_t)k8[10])<<16;     /* fall through */
+    case 10: c+=k[4];
+             b+=k[2]+(((uint32_t)k[3])<<16);
+             a+=k[0]+(((uint32_t)k[1])<<16);
+             break;
+    case 9 : c+=k8[8];                      /* fall through */
+    case 8 : b+=k[2]+(((uint32_t)k[3])<<16);
+             a+=k[0]+(((uint32_t)k[1])<<16);
+             break;
+    case 7 : b+=((uint32_t)k8[6])<<16;      /* fall through */
+    case 6 : b+=k[2];
+             a+=k[0]+(((uint32_t)k[1])<<16);
+             break;
+    case 5 : b+=k8[4];                      /* fall through */
+    case 4 : a+=k[0]+(((uint32_t)k[1])<<16);
+             break;
+    case 3 : a+=((uint32_t)k8[2])<<16;      /* fall through */
+    case 2 : a+=k[0];
+             break;
+    case 1 : a+=k8[0];
+             break;
+    case 0 : return c;                     /* zero length requires no mixing */
+    }
+
+  } else {                        /* need to read the key one byte at a time */
+    const uint8_t *k = (const uint8_t *)key;
+
+    /*--------------- all but the last block: affect some 32 bits of (a,b,c) */
+    /* each word is assembled from four bytes, lowest-order byte first */
+    while (length > 12)
+    {
+      a += k[0];
+      a += ((uint32_t)k[1])<<8;
+      a += ((uint32_t)k[2])<<16;
+      a += ((uint32_t)k[3])<<24;
+      b += k[4];
+      b += ((uint32_t)k[5])<<8;
+      b += ((uint32_t)k[6])<<16;
+      b += ((uint32_t)k[7])<<24;
+      c += k[8];
+      c += ((uint32_t)k[9])<<8;
+      c += ((uint32_t)k[10])<<16;
+      c += ((uint32_t)k[11])<<24;
+      mix(a,b,c);
+      length -= 12;
+      k += 12;
+    }
+
+    /*-------------------------------- last block: affect all 32 bits of (c) */
+    switch(length)                   /* all the case statements fall through */
+    {
+    case 12: c+=((uint32_t)k[11])<<24;
+    case 11: c+=((uint32_t)k[10])<<16;
+    case 10: c+=((uint32_t)k[9])<<8;
+    case 9 : c+=k[8];
+    case 8 : b+=((uint32_t)k[7])<<24;
+    case 7 : b+=((uint32_t)k[6])<<16;
+    case 6 : b+=((uint32_t)k[5])<<8;
+    case 5 : b+=k[4];
+    case 4 : a+=((uint32_t)k[3])<<24;
+    case 3 : a+=((uint32_t)k[2])<<16;
+    case 2 : a+=((uint32_t)k[1])<<8;
+    case 1 : a+=k[0];
+             break;
+    case 0 : return c;               /* nothing left: skip final() */
+    }
+  }
+
+  /* fold the last block into c and return it as the hash */
+  final(a,b,c);
+  return c;
+}
+
+/* Public hash entry point: runs hashlittle() over the key's bytes
+   (key->dptr, key->dsize) and returns the resulting 32-bit value. */
+_PUBLIC_ unsigned int tdb_jenkins_hash(TDB_DATA *key)
+{
+       return hashlittle(key->dptr, key->dsize);
+}
diff --git a/ctdb/lib/tdb/common/io.c b/ctdb/lib/tdb/common/io.c
new file mode 100644 (file)
index 0000000..25968bf
--- /dev/null
@@ -0,0 +1,534 @@
+ /* 
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Andrew Tridgell              1999-2005
+   Copyright (C) Paul `Rusty' Russell             2000
+   Copyright (C) Jeremy Allison                           2000-2003
+
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#include "tdb_private.h"
+
+/* check for an out of bounds access - if it is out of bounds then
+   see if the database has been expanded by someone else and expand
+   if necessary 
+
+   off/len describe the byte range about to be accessed.  A non-zero
+   probe suppresses the ecode update and the log message on the wrap,
+   internal-size and beyond-eof failures; the fstat, >4G and remap
+   failure paths do not consult probe.
+
+   Returns 0 when off+len fits inside the (possibly refreshed)
+   map_size, -1 otherwise.
+*/
+static int tdb_oob(struct tdb_context *tdb, tdb_off_t off, tdb_len_t len,
+                  int probe)
+{
+       struct stat st;
+       /* off + len wrapped around: the range cannot be valid. */
+       if (len + off < len) {
+               if (!probe) {
+                       /* Ensure ecode is set for log fn. */
+                       tdb->ecode = TDB_ERR_IO;
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob off %d len %d wrap\n",
+                                (int)off, (int)len));
+               }
+               return -1;
+       }
+
+       /* Fast path: the range is inside the size we already know about. */
+       if (off + len <= tdb->map_size)
+               return 0;
+       /* TDB_INTERNAL databases are not re-checked against a file: the
+          known map_size is final. */
+       if (tdb->flags & TDB_INTERNAL) {
+               if (!probe) {
+                       /* Ensure ecode is set for log fn. */
+                       tdb->ecode = TDB_ERR_IO;
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob len %u beyond internal malloc size %u\n",
+                                (int)(off + len), (int)tdb->map_size));
+               }
+               return -1;
+       }
+
+       /* Ask the file for its current size. */
+       if (fstat(tdb->fd, &st) == -1) {
+               tdb->ecode = TDB_ERR_IO;
+               return -1;
+       }
+
+       /* Beware >4G files! */
+       /* The round trip through tdb_off_t detects a file size that
+          tdb_off_t cannot represent. */
+       if ((tdb_off_t)st.st_size != st.st_size) {
+               /* Ensure ecode is set for log fn. */
+               tdb->ecode = TDB_ERR_IO;
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_oob len %llu too large!\n",
+                        (long long)st.st_size));
+               return -1;
+       }
+
+       /* Unmap, update size, remap.  We do this unconditionally, to handle
+        * the unusual case where the db is truncated.
+        *
+        * This can happen to a child using tdb_reopen_all(true) on a
+        * TDB_CLEAR_IF_FIRST tdb whose parent crashes: the next
+        * opener will truncate the database. */
+       if (tdb_munmap(tdb) == -1) {
+               tdb->ecode = TDB_ERR_IO;
+               return -1;
+       }
+       tdb->map_size = st.st_size;
+       if (tdb_mmap(tdb) != 0) {
+               return - 1;
+       }
+
+       /* Even after refreshing, the requested range may still lie
+          beyond the end of the file. */
+       if (st.st_size < (size_t)off + len) {
+               if (!probe) {
+                       /* Ensure ecode is set for log fn. */
+                       tdb->ecode = TDB_ERR_IO;
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_oob len %u beyond eof at %u\n",
+                                (int)(off + len), (int)st.st_size));
+               }
+               return -1;
+       }
+       return 0;
+}
+
+/* write a lump of data at a specified offset */
+static int tdb_write(struct tdb_context *tdb, tdb_off_t off, 
+                    const void *buf, tdb_len_t len)
+{
+       if (len == 0) {
+               return 0;
+       }
+
+       if (tdb->read_only || tdb->traverse_read) {
+               tdb->ecode = TDB_ERR_RDONLY;
+               return -1;
+       }
+
+       if (tdb->methods->tdb_oob(tdb, off, len, 0) != 0)
+               return -1;
+
+       if (tdb->map_ptr) {
+               memcpy(off + (char *)tdb->map_ptr, buf, len);
+       } else {
+#ifdef HAVE_INCOHERENT_MMAP
+               tdb->ecode = TDB_ERR_IO;
+               return -1;
+#else
+               ssize_t written = pwrite(tdb->fd, buf, len, off);
+               if ((written != (ssize_t)len) && (written != -1)) {
+                       /* try once more */
+                       tdb->ecode = TDB_ERR_IO;
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: wrote only "
+                                "%d of %d bytes at %d, trying once more\n",
+                                (int)written, len, off));
+                       written = pwrite(tdb->fd, (const char *)buf+written,
+                                        len-written,
+                                        off+written);
+               }
+               if (written == -1) {
+                       /* Ensure ecode is set for log fn. */
+                       tdb->ecode = TDB_ERR_IO;
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_write failed at %d "
+                                "len=%d (%s)\n", off, len, strerror(errno)));
+                       return -1;
+               } else if (written != (ssize_t)len) {
+                       tdb->ecode = TDB_ERR_IO;
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_write: failed to "
+                                "write %d bytes at %d in two attempts\n",
+                                len, off));
+                       return -1;
+               }
+#endif
+       }
+       return 0;
+}
+
+/* Endian conversion: byte-swap each complete 4 byte word of buf in
+   place.  size is a byte count; bytes beyond the last whole word are
+   left alone.  Returns buf. */
+void *tdb_convert(void *buf, uint32_t size)
+{
+       uint32_t *word = (uint32_t *)buf;
+       uint32_t remaining;
+
+       for (remaining = size / 4; remaining > 0; remaining--) {
+               *word = TDB_BYTEREV(*word);
+               word++;
+       }
+       return buf;
+}
+
+
+/* read a lump of data at a specified offset, maybe convert */
+static int tdb_read(struct tdb_context *tdb, tdb_off_t off, void *buf, 
+                   tdb_len_t len, int cv)
+{
+       if (tdb->methods->tdb_oob(tdb, off, len, 0) != 0) {
+               return -1;
+       }
+
+       if (tdb->map_ptr) {
+               memcpy(buf, off + (char *)tdb->map_ptr, len);
+       } else {
+#ifdef HAVE_INCOHERENT_MMAP
+               tdb->ecode = TDB_ERR_IO;
+               return -1;
+#else
+               ssize_t ret = pread(tdb->fd, buf, len, off);
+               if (ret != (ssize_t)len) {
+                       /* Ensure ecode is set for log fn. */
+                       tdb->ecode = TDB_ERR_IO;
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_read failed at %d "
+                                "len=%d ret=%d (%s) map_size=%d\n",
+                                (int)off, (int)len, (int)ret, strerror(errno),
+                                (int)tdb->map_size));
+                       return -1;
+               }
+#endif
+       }
+       if (cv) {
+               tdb_convert(buf, len);
+       }
+       return 0;
+}
+
+
+
+/*
+  Starting at *chain, advance to the first hash chain whose head is
+  non-zero, without taking any lock, and store that index back into
+  *chain (hash_size if none was found).  The value will then be
+  confirmed with the lock held.
+*/
+static void tdb_next_hash_chain(struct tdb_context *tdb, uint32_t *chain)
+{
+       uint32_t idx = *chain;
+
+       if (tdb->map_ptr) {
+               /* peek at the chain heads straight through the mapping */
+               unsigned char *base = (unsigned char *)tdb->map_ptr;
+
+               while (idx < tdb->header.hash_size &&
+                      *(uint32_t *)(TDB_HASH_TOP(idx) + base) == 0) {
+                       idx++;
+               }
+       } else {
+               uint32_t head = 0;
+
+               while (idx < tdb->header.hash_size) {
+                       /* a failed read stops the scan just like a used head */
+                       if (tdb_ofs_read(tdb, TDB_HASH_TOP(idx), &head) != 0) {
+                               break;
+                       }
+                       if (head != 0) {
+                               break;
+                       }
+                       idx++;
+               }
+       }
+       *chain = idx;
+}
+
+
+/* Drop the file mapping, if any.  Does nothing for TDB_INTERNAL
+   databases.  Returns 0 on success or munmap()'s non-zero result, in
+   which case map_ptr is left as it was. */
+int tdb_munmap(struct tdb_context *tdb)
+{
+       if (!(tdb->flags & TDB_INTERNAL)) {
+#ifdef HAVE_MMAP
+               if (tdb->map_ptr != NULL) {
+                       int rc = munmap(tdb->map_ptr, tdb->map_size);
+
+                       if (rc != 0) {
+                               return rc;
+                       }
+               }
+#endif
+               tdb->map_ptr = NULL;
+       }
+       return 0;
+}
+
+/* Decide whether this tdb is to be mapped.  If mmap isn't coherent,
+   *everyone* must always mmap; otherwise TDB_NOMMAP opts out. */
+static bool should_mmap(const struct tdb_context *tdb)
+{
+#ifndef HAVE_INCOHERENT_MMAP
+       return (tdb->flags & TDB_NOMMAP) == 0;
+#else
+       return true;
+#endif
+}
+
+/* (Re)establish the shared file mapping of map_size bytes, writable
+   unless the tdb is read-only.  Leaves map_ptr NULL when mapping is
+   not wanted, not available, or fails.  A failed mmap() is only an
+   error (-1, ecode set) when HAVE_INCOHERENT_MMAP is defined; in all
+   other cases 0 is returned. */
+int tdb_mmap(struct tdb_context *tdb)
+{
+       if (tdb->flags & TDB_INTERNAL)
+               return 0;
+
+#ifdef HAVE_MMAP
+       if (!should_mmap(tdb)) {
+               tdb->map_ptr = NULL;
+       } else {
+               const int prot = tdb->read_only ? PROT_READ
+                                               : (PROT_READ|PROT_WRITE);
+
+               tdb->map_ptr = mmap(NULL, tdb->map_size, prot,
+                                   MAP_SHARED|MAP_FILE, tdb->fd, 0);
+
+               /*
+                * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
+                */
+               if (tdb->map_ptr == MAP_FAILED) {
+                       tdb->map_ptr = NULL;
+                       TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_mmap failed for size %d (%s)\n",
+                                tdb->map_size, strerror(errno)));
+#ifdef HAVE_INCOHERENT_MMAP
+                       tdb->ecode = TDB_ERR_IO;
+                       return -1;
+#endif
+               }
+       }
+#else
+       tdb->map_ptr = NULL;
+#endif
+       return 0;
+}
+
+/* expand a file.  we prefer to use ftruncate, as that is what posix
+  says to use for mmap expansion
+
+  Grows the file from size to size+addition bytes and then overwrites
+  the new region with TDB_PAD_BYTE.  Returns 0 on success and -1 on
+  failure; every failure path sets tdb->ecode so callers do not see a
+  stale error code.
+*/
+static int tdb_expand_file(struct tdb_context *tdb, tdb_off_t size, tdb_off_t addition)
+{
+       char buf[8192];
+
+       if (tdb->read_only || tdb->traverse_read) {
+               tdb->ecode = TDB_ERR_RDONLY;
+               return -1;
+       }
+
+       /* size+addition is used as a file offset below; refuse a sum
+          that wraps instead of truncating the file to a small size. */
+       if (size + addition < size) {
+               /* Ensure ecode is set for log fn. */
+               tdb->ecode = TDB_ERR_OOM;
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file: size %u + "
+                        "addition %u wraps\n",
+                        (unsigned)size, (unsigned)addition));
+               return -1;
+       }
+
+       if (ftruncate(tdb->fd, size+addition) == -1) {
+               /* ftruncate could not grow the file: fall back to
+                  writing a single byte at the new last offset. */
+               char b = 0;
+               ssize_t written = pwrite(tdb->fd,  &b, 1, (size+addition) - 1);
+               if (written == 0) {
+                       /* try once more, potentially revealing errno */
+                       written = pwrite(tdb->fd,  &b, 1, (size+addition) - 1);
+               }
+               if (written == 0) {
+                       /* again - give up, guessing errno */
+                       errno = ENOSPC;
+               }
+               if (written != 1) {
+                       /* Ensure ecode is set for log fn. */
+                       tdb->ecode = TDB_ERR_OOM;
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file to %d failed (%s)\n", 
+                                size+addition, strerror(errno)));
+                       return -1;
+               }
+       }
+
+       /* now fill the file with something. This ensures that the
+          file isn't sparse, which would be very bad if we ran out of
+          disk. This must be done with write, not via mmap */
+       memset(buf, TDB_PAD_BYTE, sizeof(buf));
+       while (addition) {
+               size_t n = addition>sizeof(buf)?sizeof(buf):addition;
+               ssize_t written = pwrite(tdb->fd, buf, n, size);
+               if (written == 0) {
+                       /* prevent infinite loops: try _once_ more */
+                       written = pwrite(tdb->fd, buf, n, size);
+               }
+               if (written == 0) {
+                       /* give up, trying to provide a useful errno */
+                       tdb->ecode = TDB_ERR_OOM;
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write "
+                               "returned 0 twice: giving up!\n"));
+                       errno = ENOSPC;
+                       return -1;
+               } else if (written == -1) {
+                       /* Ensure ecode is set for log fn. */
+                       tdb->ecode = TDB_ERR_OOM;
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file write of "
+                                "%d bytes failed (%s)\n", (int)n,
+                                strerror(errno)));
+                       return -1;
+               } else if ((size_t)written != n) {
+                       /* short write: the loop continues from where
+                          this one stopped */
+                       TDB_LOG((tdb, TDB_DEBUG_WARNING, "expand_file: wrote "
+                                "only %d of %d bytes - retrying\n", (int)written,
+                                (int)n));
+               }
+               addition -= written;
+               size += written;
+       }
+       return 0;
+}
+
+
+/* You need 'size', this tells you how much you should expand by.
+
+   Two candidate totals are computed and the larger one wins:
+     - map_size plus room for 100 records of 'size' bytes (only 2 when
+       size exceeds 100KiB, so an oddball huge record does not use up
+       huge amounts of memory for in memory tdbs);
+     - map_size grown by 25%, or by 10% once the DB exceeds 100MiB.
+   The winner is aligned with TDB_ALIGN to page_size and the increase
+   over map_size is returned. */
+tdb_off_t tdb_expand_adjust(tdb_off_t map_size, tdb_off_t size, int page_size)
+{
+       tdb_off_t by_records, by_ratio, target;
+
+       by_records = map_size + size * ((size > 100 * 1024) ? 2 : 100);
+
+       by_ratio = (map_size > 100 * 1024 * 1024)
+               ? map_size * 1.10
+               : map_size * 1.25;
+
+       /* Round the database up to a multiple of the page size */
+       target = MAX(by_records, by_ratio);
+       return TDB_ALIGN(target, page_size) - map_size;
+}
+
+/* expand the database at least size bytes by expanding the underlying
+   file and doing the mmap again if necessary
+
+   Runs under the write lock on list -1 (the alloc list), which is
+   released on every exit path.  The added space is handed to
+   tdb_free() as one new record.  Returns 0 on success, -1 on failure.
+*/
+int tdb_expand(struct tdb_context *tdb, tdb_off_t size)
+{
+       struct tdb_record rec;
+       tdb_off_t offset;
+
+       if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "lock failed in tdb_expand\n"));
+               return -1;
+       }
+
+       /* must know about any previous expansions by another process */
+       /* called in probe mode; the result is not checked here */
+       tdb->methods->tdb_oob(tdb, tdb->map_size, 1, 1);
+
+       /* from here on 'size' is the amount to grow by, not the
+          amount the caller asked for */
+       size = tdb_expand_adjust(tdb->map_size, size, tdb->page_size);
+
+       /* expand the file itself */
+       if (!(tdb->flags & TDB_INTERNAL)) {
+               if (tdb->methods->tdb_expand_file(tdb, tdb->map_size, size) != 0)
+                       goto fail;
+       }
+
+       /* form a new freelist record */
+       /* it starts at the old end of the database; rec_len is the
+          added space minus the record header itself */
+       offset = tdb->map_size;
+       memset(&rec,'\0',sizeof(rec));
+       rec.rec_len = size - sizeof(rec);
+
+       if (tdb->flags & TDB_INTERNAL) {
+               /* TDB_INTERNAL: map_ptr is a heap buffer, grow it with
+                  realloc and keep the old buffer if that fails */
+               char *new_map_ptr = (char *)realloc(tdb->map_ptr,
+                                                   tdb->map_size + size);
+               if (!new_map_ptr) {
+                       goto fail;
+               }
+               tdb->map_ptr = new_map_ptr;
+               tdb->map_size += size;
+       } else {
+               /* Explicitly remap: if we're in a transaction, this won't
+                * happen automatically! */
+               /* the tdb_munmap() result is not checked here */
+               tdb_munmap(tdb);
+               tdb->map_size += size;
+               if (tdb_mmap(tdb) != 0) {
+                       goto fail;
+               }
+       }
+
+       /* link it into the free list */
+       if (tdb_free(tdb, offset, &rec) == -1)
+               goto fail;
+
+       tdb_unlock(tdb, -1, F_WRLCK);
+       return 0;
+ fail:
+       tdb_unlock(tdb, -1, F_WRLCK);
+       return -1;
+}
+
+/* read/write a tdb_off_t */
+/* Read the tdb_off_t stored at 'offset' into *d via the tdb_read
+   method, passing DOCONV() as the convert flag.  Returns that
+   method's result. */
+int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
+{
+       return tdb->methods->tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
+}
+
+/* Write the tdb_off_t *d at 'offset' via the tdb_write method.  The
+   value is copied into a local first and CONVERT() is applied to the
+   copy, so the caller's *d is never modified by this function.
+   Returns the method's result. */
+int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d)
+{
+       tdb_off_t off = *d;
+       return tdb->methods->tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
+}
+
+
+/* read a lump of data, allocating the space for it
+
+   Returns a malloc()ed buffer holding len bytes read from offset, or
+   NULL if the allocation or the read fails.  The caller owns the
+   buffer and has to free() it. */
+unsigned char *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len)
+{
+       /* some systems don't like zero length malloc */
+       tdb_len_t alloc_len = (len != 0) ? len : 1;
+       unsigned char *data = (unsigned char *)malloc(alloc_len);
+
+       if (data == NULL) {
+               /* Ensure ecode is set for log fn. */
+               tdb->ecode = TDB_ERR_OOM;
+               TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_alloc_read malloc failed len=%d (%s)\n",
+                          len, strerror(errno)));
+               return NULL;
+       }
+
+       if (tdb->methods->tdb_read(tdb, offset, data, len, 0) == -1) {
+               SAFE_FREE(data);
+               return NULL;
+       }
+       return data;
+}
+
+/* Give a piece of tdb data to a parser
+
+   Calls parser(key, data, private_data) with data describing the len
+   bytes at offset and returns the parser's result, or -1 if the data
+   could not be made available.  The data is only valid for the
+   duration of the parser call. */
+
+int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key,
+                  tdb_off_t offset, tdb_len_t len,
+                  int (*parser)(TDB_DATA key, TDB_DATA data,
+                                void *private_data),
+                  void *private_data)
+{
+       TDB_DATA data;
+       int result;
+
+       data.dsize = len;
+
+       if ((tdb->transaction != NULL) || (tdb->map_ptr == NULL)) {
+               /* In a transaction or without a mapping: hand the
+                * parser a temporary heap copy. */
+               data.dptr = tdb_alloc_read(tdb, offset, len);
+               if (data.dptr == NULL) {
+                       return -1;
+               }
+
+               result = parser(key, data, private_data);
+               free(data.dptr);
+               return result;
+       }
+
+       /*
+        * Optimize by avoiding the malloc/memcpy/free, point the
+        * parser directly at the mmap area.
+        */
+       if (tdb->methods->tdb_oob(tdb, offset, len, 0) != 0) {
+               return -1;
+       }
+       data.dptr = offset + (unsigned char *)tdb->map_ptr;
+       return parser(key, data, private_data);
+}
+
+/* read/write a record */
+/* Read the record header at offset into *rec.  Fails with
+   TDB_ERR_CORRUPT when the magic is bad; otherwise the result is the
+   bounds check on the header that rec->next points to. */
+int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
+{
+       int rc = tdb->methods->tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV());
+
+       if (rc == -1) {
+               return -1;
+       }
+
+       if (!TDB_BAD_MAGIC(rec)) {
+               return tdb->methods->tdb_oob(tdb, rec->next, sizeof(*rec), 0);
+       }
+
+       /* Ensure ecode is set for log fn. */
+       tdb->ecode = TDB_ERR_CORRUPT;
+       TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
+       return -1;
+}
+
+/* Write the record header *rec at offset via the tdb_write method.
+   CONVERT() is applied to a local copy, so the caller's *rec is never
+   modified by this function.  Returns the method's result. */
+int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec)
+{
+       struct tdb_record r = *rec;
+       return tdb->methods->tdb_write(tdb, offset, CONVERT(r), sizeof(r));
+}
+
+/* The default I/O method table.  The initialisers are positional, so
+   their order has to match the member order of struct tdb_methods,
+   which is declared elsewhere. */
+static const struct tdb_methods io_methods = {
+       tdb_read,
+       tdb_write,
+       tdb_next_hash_chain,
+       tdb_oob,
+       tdb_expand_file,
+};
+
+/*
+  initialise the default methods table: point tdb->methods at the
+  static io_methods table of this file
+*/
+void tdb_io_init(struct tdb_context *tdb)
+{
+       tdb->methods = &io_methods;
+}
diff --git a/ctdb/lib/tdb/common/lock.c b/ctdb/lib/tdb/common/lock.c
new file mode 100644 (file)
index 0000000..260fab6
--- /dev/null
@@ -0,0 +1,875 @@
+ /* 
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Andrew Tridgell              1999-2005
+   Copyright (C) Paul `Rusty' Russell             2000
+   Copyright (C) Jeremy Allison                           2000-2003
+
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "tdb_private.h"
+
+/* Register the flag that tdb_brlock() polls when a lock attempt fails
+   with EINTR: if *ptr is non-zero at that point the lock attempt is
+   abandoned instead of being retried. */
+_PUBLIC_ void tdb_setalarm_sigptr(struct tdb_context *tdb, volatile sig_atomic_t *ptr)
+{
+       tdb->interrupt_sig_ptr = ptr;
+}
+
+/* Take an fcntl() byte-range lock of type rw on [off, off+len) of the
+   tdb's fd, measured from the start of the file.  waitflag selects
+   the blocking F_SETLKW over the non-blocking F_SETLK.  Returns
+   fcntl()'s result. */
+static int fcntl_lock(struct tdb_context *tdb,
+                     int rw, off_t off, off_t len, bool waitflag)
+{
+       struct flock fl;
+       const int cmd = waitflag ? F_SETLKW : F_SETLK;
+
+       fl.l_whence = SEEK_SET;
+       fl.l_start = off;
+       fl.l_len = len;
+       fl.l_type = rw;
+       fl.l_pid = 0;
+
+       return fcntl(tdb->fd, cmd, &fl);
+}
+
+/* Release the fcntl() byte-range lock on [off, off+len) of the tdb's
+   fd.  rw is only used by the disabled consistency check below; the
+   live code always issues F_UNLCK.  Returns fcntl()'s result. */
+static int fcntl_unlock(struct tdb_context *tdb, int rw, off_t off, off_t len)
+{
+       struct flock fl;
+#if 0 /* Check they matched up locks and unlocks correctly. */
+       /* Disabled debugging aid: scans /proc/locks for an entry of this
+          process starting at 'off' and aborts if its length or type
+          differs from what is being unlocked, or if no entry is found. */
+       char line[80];
+       FILE *locks;
+       bool found = false;
+
+       locks = fopen("/proc/locks", "r");
+
+       while (fgets(line, 80, locks)) {
+               char *p;
+               int type, start, l;
+
+               /* eg. 1: FLOCK  ADVISORY  WRITE 2440 08:01:2180826 0 EOF */
+               p = strchr(line, ':') + 1;
+               if (strncmp(p, " POSIX  ADVISORY  ", strlen(" POSIX  ADVISORY  ")))
+                       continue;
+               /* the " FLOCK ..." literal is only used for its length,
+                  which equals that of the " POSIX ..." prefix matched above */
+               p += strlen(" FLOCK  ADVISORY  ");
+               if (strncmp(p, "READ  ", strlen("READ  ")) == 0)
+                       type = F_RDLCK;
+               else if (strncmp(p, "WRITE ", strlen("WRITE ")) == 0)
+                       type = F_WRLCK;
+               else
+                       abort();
+               p += 6;
+               if (atoi(p) != getpid())
+                       continue;
+               p = strchr(strchr(p, ' ') + 1, ' ') + 1;
+               start = atoi(p);
+               p = strchr(p, ' ') + 1;
+               if (strncmp(p, "EOF", 3) == 0)
+                       l = 0;
+               else
+                       l = atoi(p) - start + 1;
+
+               if (off == start) {
+                       if (len != l) {
+                               fprintf(stderr, "Len %u should be %u: %s",
+                                       (int)len, l, line);
+                               abort();
+                       }
+                       if (type != rw) {
+                               fprintf(stderr, "Type %s wrong: %s",
+                                       rw == F_RDLCK ? "READ" : "WRITE", line);
+                               abort();
+                       }
+                       found = true;
+                       break;
+               }
+       }
+
+       if (!found) {
+               fprintf(stderr, "Unlock on %u@%u not found!\n",
+                       (int)off, (int)len);
+               abort();
+       }
+
+       fclose(locks);
+#endif
+
+       fl.l_type = F_UNLCK;
+       fl.l_whence = SEEK_SET;
+       fl.l_start = off;
+       fl.l_len = len;
+       fl.l_pid = 0;
+
+       return fcntl(tdb->fd, F_SETLKW, &fl);
+}
+
+/* Map a list number to the offset used to lock it.
+   list -1 is the alloc list, otherwise a hash chain.  Each list gets
+   its own 4 byte slot starting at FREELIST_TOP, so list -1 lands 4
+   bytes below FREELIST_TOP. */
+static tdb_off_t lock_offset(int list)
+{
+       return FREELIST_TOP + 4*list;
+}
+
+/* a byte range locking function - return 0 on success
+   this function locks len bytes at the specified offset with an
+   fcntl lock of type rw_type.
+
+   On error, errno is also set so that errors are passed back properly
+   through tdb_open(). 
+
+   note that a len of zero means lock to end of file
+
+   Nothing is locked (and 0 is returned) when the tdb has TDB_NOLOCK
+   set or flags contains TDB_LOCK_MARK_ONLY.  TDB_LOCK_WAIT in flags
+   makes the request blocking; TDB_LOCK_PROBE suppresses the failure
+   log message.  Returns -1 with tdb->ecode set on failure.
+*/
+int tdb_brlock(struct tdb_context *tdb,
+              int rw_type, tdb_off_t offset, size_t len,
+              enum tdb_lock_flags flags)
+{
+       int ret;
+
+       if (tdb->flags & TDB_NOLOCK) {
+               return 0;
+       }
+
+       if (flags & TDB_LOCK_MARK_ONLY) {
+               return 0;
+       }
+
+       /* write locks are refused on a read-only tdb and during a
+          read-only traverse */
+       if ((rw_type == F_WRLCK) && (tdb->read_only || tdb->traverse_read)) {
+               tdb->ecode = TDB_ERR_RDONLY;
+               return -1;
+       }
+
+       /* Retry for as long as fcntl is interrupted, unless the flag
+          registered via interrupt_sig_ptr has been raised. */
+       do {
+               ret = fcntl_lock(tdb, rw_type, offset, len,
+                                flags & TDB_LOCK_WAIT);
+               /* Check for a sigalarm break. */
+               if (ret == -1 && errno == EINTR &&
+                               tdb->interrupt_sig_ptr &&
+                               *tdb->interrupt_sig_ptr) {
+                       break;
+               }
+       } while (ret == -1 && errno == EINTR);
+
+       if (ret == -1) {
+               tdb->ecode = TDB_ERR_LOCK;
+               /* Generic lock error. errno set by fcntl.
+                * EAGAIN is an expected return from non-blocking
+                * locks. */
+               if (!(flags & TDB_LOCK_PROBE) && errno != EAGAIN) {
+                       TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d flags=%d len=%d\n",
+                                tdb->fd, offset, rw_type, flags, (int)len));
+               }
+               return -1;
+       }
+       return 0;
+}
+
+/* Release the byte range lock on len bytes at offset, retrying while
+   fcntl is interrupted.  A no-op returning 0 when TDB_NOLOCK is set.
+   Returns the final fcntl result; a failure is logged. */
+int tdb_brunlock(struct tdb_context *tdb,
+                int rw_type, tdb_off_t offset, size_t len)
+{
+       int ret;
+
+       if (tdb->flags & TDB_NOLOCK) {
+               return 0;
+       }
+
+       for (;;) {
+               ret = fcntl_unlock(tdb, rw_type, offset, len);
+               if (ret != -1 || errno != EINTR) {
+                       break;
+               }
+       }
+
+       if (ret == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_brunlock failed (fd=%d) at offset %d rw_type=%d len=%d\n",
+                        tdb->fd, offset, rw_type, (int)len));
+       }
+       return ret;
+}
+
+/*
+  upgrade a read lock to a write lock. This needs to be handled in a
+  special way as some OSes (such as solaris) have too conservative
+  deadlock detection and claim a deadlock when progress can be
+  made. For those OSes we may loop for a while.  
+
+  Requires allrecord_lock.count == 1 and allrecord_lock.off == 1.  On
+  success the lock is recorded as F_WRLCK with off reset to 0 and 0
+  is returned; -1 is returned otherwise.
+*/
+int tdb_allrecord_upgrade(struct tdb_context *tdb)
+{
+       /* upper bound on the number of attempts */
+       int count = 1000;
+
+       if (tdb->allrecord_lock.count != 1) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR,
+                        "tdb_allrecord_upgrade failed: count %u too high\n",
+                        tdb->allrecord_lock.count));
+               return -1;
+       }
+
+       if (tdb->allrecord_lock.off != 1) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR,
+                        "tdb_allrecord_upgrade failed: already upgraded?\n"));
+               return -1;
+       }
+
+       while (count--) {
+               struct timeval tv;
+               /* len 0: lock from FREELIST_TOP to end of file */
+               if (tdb_brlock(tdb, F_WRLCK, FREELIST_TOP, 0,
+                              TDB_LOCK_WAIT|TDB_LOCK_PROBE) == 0) {
+                       tdb->allrecord_lock.ltype = F_WRLCK;
+                       tdb->allrecord_lock.off = 0;
+                       return 0;
+               }
+               /* only a reported deadlock is worth retrying */
+               if (errno != EDEADLK) {
+                       break;
+               }
+               /* sleep for as short a time as we can - more portable than usleep() */
+               tv.tv_sec = 0;
+               tv.tv_usec = 1;
+               select(0, NULL, NULL, NULL, &tv);
+       }
+       TDB_LOG((tdb, TDB_DEBUG_TRACE,"tdb_allrecord_upgrade failed\n"));
+       return -1;
+}
+
+/* Look up the in-memory lock record for offset.  Returns a pointer
+   into tdb->lockrecs, or NULL when no lock is recorded there. */
+static struct tdb_lock_type *find_nestlock(struct tdb_context *tdb,
+                                          tdb_off_t offset)
+{
+       unsigned int idx = 0;
+
+       while (idx < tdb->num_lockrecs) {
+               struct tdb_lock_type *cand = &tdb->lockrecs[idx];
+
+               if (cand->off == offset) {
+                       return cand;
+               }
+               idx++;
+       }
+       return NULL;
+}
+
+/* lock an offset in the database.
+
+   Locks nest: if a lock is already recorded for offset only its
+   in-memory count is bumped; otherwise a 1 byte lock of type ltype is
+   taken with tdb_brlock() and recorded in tdb->lockrecs.  Returns 0
+   on success (or when TDB_NOLOCK is set), -1 on failure. */
+int tdb_nest_lock(struct tdb_context *tdb, uint32_t offset, int ltype,
+                 enum tdb_lock_flags flags)
+{
+       struct tdb_lock_type *new_lck;
+
+       /* offsets at or beyond the slot of the last hash chain are
+          rejected */
+       if (offset >= lock_offset(tdb->header.hash_size)) {
+               tdb->ecode = TDB_ERR_LOCK;
+               TDB_LOG((tdb, TDB_DEBUG_ERROR,"tdb_lock: invalid offset %u for ltype=%d\n",
+                        offset, ltype));
+               return -1;
+       }
+       if (tdb->flags & TDB_NOLOCK)
+               return 0;
+
+       new_lck = find_nestlock(tdb, offset);
+       if (new_lck) {
+               /*
+                * Just increment the in-memory struct, posix locks
+                * don't stack.
+                */
+               new_lck->count++;
+               return 0;
+       }
+
+       /* Make room for one more record before taking the lock. */
+       new_lck = (struct tdb_lock_type *)realloc(
+               tdb->lockrecs,
+               sizeof(*tdb->lockrecs) * (tdb->num_lockrecs+1));
+       if (new_lck == NULL) {
+               errno = ENOMEM;
+               return -1;
+       }
+       tdb->lockrecs = new_lck;
+
+       /* Since fcntl locks don't nest, we do a lock for the first one,
+          and simply bump the count for future ones */
+       /* on failure the enlarged array is kept, but num_lockrecs is
+          not incremented */
+       if (tdb_brlock(tdb, ltype, offset, 1, flags)) {
+               return -1;
+       }
+
+       tdb->lockrecs[tdb->num_lockrecs].off = offset;
+       tdb->lockrecs[tdb->num_lockrecs].count = 1;
+       tdb->lockrecs[tdb->num_lockrecs].ltype = ltype;
+       tdb->num_lockrecs++;
+
+       return 0;
+}
+
+/* Run tdb_transaction_recover() while holding write locks on the
+   range starting at FREELIST_TOP and on OPEN_LOCK.  Returns -1 if
+   either lock cannot be taken, otherwise the recovery result. */
+static int tdb_lock_and_recover(struct tdb_context *tdb)
+{
+       int ret;
+
+       /* We need to match locking order in transaction commit. */
+       if (tdb_brlock(tdb, F_WRLCK, FREELIST_TOP, 0, TDB_LOCK_WAIT) != 0) {
+               return -1;
+       }
+
+       if (tdb_brlock(tdb, F_WRLCK, OPEN_LOCK, 1, TDB_LOCK_WAIT) == 0) {
+               ret = tdb_transaction_recover(tdb);
+               tdb_brunlock(tdb, F_WRLCK, OPEN_LOCK, 1);
+       } else {
+               ret = -1;
+       }
+
+       /* the range lock is released last, whichever way it went */
+       tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0);
+       return ret;
+}
+
+/* True if any recorded lock sits at or above lock_offset(-1), i.e. on
+   the alloc list or on a hash chain. */
+static bool have_data_locks(const struct tdb_context *tdb)
+{
+       const tdb_off_t first_data = lock_offset(-1);
+       unsigned int remaining = tdb->num_lockrecs;
+
+       while (remaining > 0) {
+               remaining--;
+               if (tdb->lockrecs[remaining].off >= first_data) {
+                       return true;
+               }
+       }
+       return false;
+}
+
+/* Lock one list (-1 is the alloc list, otherwise a hash chain) with
+   lock type ltype, using waitflag as the tdb_lock_flags.  When this
+   is the first data lock taken and the tdb needs recovery, the lock
+   is dropped, recovery is run and the lock is attempted again.
+   Returns 0 on success, -1 on failure. */
+static int tdb_lock_list(struct tdb_context *tdb, int list, int ltype,
+                        enum tdb_lock_flags waitflag)
+{
+       int ret;
+       bool check = false;
+
+       /* an allrecord lock allows us to avoid per chain locks */
+       if (tdb->allrecord_lock.count &&
+           (ltype == tdb->allrecord_lock.ltype || ltype == F_RDLCK)) {
+               return 0;
+       }
+
+       /* an allrecord lock is held but does not cover ltype */
+       if (tdb->allrecord_lock.count) {
+               tdb->ecode = TDB_ERR_LOCK;
+               ret = -1;
+       } else {
+               /* Only check when we grab first data lock. */
+               check = !have_data_locks(tdb);
+               ret = tdb_nest_lock(tdb, lock_offset(list), ltype, waitflag);
+
+               if (ret == 0 && check && tdb_needs_recovery(tdb)) {
+                       /* give the lock back before recovering, then
+                          start over */
+                       tdb_nest_unlock(tdb, lock_offset(list), ltype, false);
+
+                       if (tdb_lock_and_recover(tdb) == -1) {
+                               return -1;
+                       }
+                       return tdb_lock_list(tdb, list, ltype, waitflag);
+               }
+       }
+       return ret;
+}
+
+/* lock a list in the database. list -1 is the alloc list
+   Blocking variant: waits for the lock and logs a failure.  Returns
+   the tdb_lock_list() result. */
+int tdb_lock(struct tdb_context *tdb, int list, int ltype)
+{
+       const int ret = tdb_lock_list(tdb, list, ltype, TDB_LOCK_WAIT);
+
+       if (ret == 0) {
+               return 0;
+       }
+
+       TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_lock failed on list %d "
+                "ltype=%d (%s)\n",  list, ltype, strerror(errno)));
+       return ret;
+}
+
+/* lock a list in the database. list -1 is the alloc list. non-blocking lock
+   Same as tdb_lock() but passes TDB_LOCK_NOWAIT and does not log a
+   failure itself.  Returns the tdb_lock_list() result. */
+_PUBLIC_ int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype)
+{
+       return tdb_lock_list(tdb, list, ltype, TDB_LOCK_NOWAIT);
+}
+
+
+/* Undo one tdb_nest_lock() on offset.  While the recorded count is
+   above 1 only the count is decremented.  For the last reference the
+   lock is released with tdb_brunlock() - unless mark_lock is true, in
+   which case only the in-memory record is dropped.  Returns 0 on
+   success (or when TDB_NOLOCK is set), non-zero on failure. */
+int tdb_nest_unlock(struct tdb_context *tdb, uint32_t offset, int ltype,
+                   bool mark_lock)
+{
+       int ret = -1;
+       struct tdb_lock_type *lck;
+
+       if (tdb->flags & TDB_NOLOCK)
+               return 0;
+
+       /* Sanity checks */
+       if (offset >= lock_offset(tdb->header.hash_size)) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: offset %u invalid (%d)\n", offset, tdb->header.hash_size));
+               return ret;
+       }
+
+       lck = find_nestlock(tdb, offset);
+       if ((lck == NULL) || (lck->count == 0)) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: count is 0\n"));
+               return -1;
+       }
+
+       if (lck->count > 1) {
+               lck->count--;
+               return 0;
+       }
+
+       /*
+        * This lock has count==1 left, so we need to unlock it in the
+        * kernel. We don't bother with decrementing the in-memory array
+        * element, we're about to overwrite it with the last array element
+        * anyway.
+        */
+
+       if (mark_lock) {
+               ret = 0;
+       } else {
+               ret = tdb_brunlock(tdb, ltype, offset, 1);
+       }
+
+       /*
+        * Shrink the array by overwriting the element just unlocked with the
+        * last array element.
+        */
+       /* the record is removed even if the unlock above failed */
+       *lck = tdb->lockrecs[--tdb->num_lockrecs];
+
+       /*
+        * We don't bother with realloc when the array shrinks, but if we have
+        * a completely idle tdb we should get rid of the locked array.
+        */
+
+       if (tdb->num_lockrecs == 0) {
+               SAFE_FREE(tdb->lockrecs);
+       }
+
+       if (ret)
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlock: An error occurred unlocking!\n")); 
+       return ret;
+}
+
+/*
+ * Release the lock on one list of the database.
+ *
+ * While an allrecord (global) lock is held no per-chain lock exists, so
+ * the call succeeds without doing anything if ltype matches the global
+ * lock type or is F_RDLCK, and fails with TDB_ERR_LOCK otherwise.
+ */
+_PUBLIC_ int tdb_unlock(struct tdb_context *tdb, int list, int ltype)
+{
+       if (tdb->allrecord_lock.count) {
+               /* a global lock allows us to avoid per chain locks */
+               if (ltype == tdb->allrecord_lock.ltype || ltype == F_RDLCK) {
+                       return 0;
+               }
+               tdb->ecode = TDB_ERR_LOCK;
+               return -1;
+       }
+
+       return tdb_nest_unlock(tdb, lock_offset(list), ltype, false);
+}
+
+/*
+ * Get the transaction lock: a nested lock of type ltype at
+ * TRANSACTION_LOCK, taken with the caller-supplied lock flags.
+ */
+int tdb_transaction_lock(struct tdb_context *tdb, int ltype,
+                        enum tdb_lock_flags lockflags)
+{
+       int rc;
+
+       rc = tdb_nest_lock(tdb, TRANSACTION_LOCK, ltype, lockflags);
+       return rc;
+}
+
+/*
+ * Release the transaction lock taken at TRANSACTION_LOCK; the byte is
+ * really unlocked (mark_lock is false).
+ */
+int tdb_transaction_unlock(struct tdb_context *tdb, int ltype)
+{
+       int rc;
+
+       rc = tdb_nest_unlock(tdb, TRANSACTION_LOCK, ltype, false);
+       return rc;
+}
+
+/*
+ * Decide whether an allrecord lock request still needs real locking.
+ *
+ * Returns 0 if all done (a lock of the same type was already held and
+ * its count has been bumped), -1 on error with tdb->ecode set to
+ * TDB_ERR_LOCK, and 1 if the caller should go on and take the locks.
+ * The flags argument is not examined here.
+ */
+static int tdb_allrecord_check(struct tdb_context *tdb, int ltype,
+                              enum tdb_lock_flags flags, bool upgradable)
+{
+       /* There are no locks on read-only dbs */
+       if (tdb->read_only || tdb->traverse_read) {
+               goto refuse;
+       }
+
+       if (tdb->allrecord_lock.count) {
+               if (tdb->allrecord_lock.ltype != ltype) {
+                       /* a global lock of a different type exists */
+                       goto refuse;
+               }
+               tdb->allrecord_lock.count++;
+               return 0;
+       }
+
+       /* can't combine global and chain locks */
+       if (tdb_have_extra_locks(tdb)) {
+               goto refuse;
+       }
+
+       /* tdb error: you can't upgrade a write lock! */
+       if (upgradable && ltype != F_RDLCK) {
+               goto refuse;
+       }
+
+       return 1;
+
+refuse:
+       tdb->ecode = TDB_ERR_LOCK;
+       return -1;
+}
+
+/*
+ * Lock the byte range [off, off + len) piece by piece.
+ *
+ * We only need to lock individual bytes, but Linux merges consecutive
+ * locks so we lock in contiguous ranges.  A range is first tried with
+ * TDB_LOCK_WAIT stripped from the flags; if that does not succeed it is
+ * split in two and each half is handled the same way.  Ranges of at
+ * most 4 bytes are locked directly with the caller's flags.
+ *
+ * Returns 0 on success and -1 on failure; when the second half fails,
+ * the first half is unlocked again before returning.
+ */
+static int tdb_chainlock_gradual(struct tdb_context *tdb,
+                                int ltype, enum tdb_lock_flags flags,
+                                size_t off, size_t len)
+{
+       enum tdb_lock_flags try_flags = (flags & ~TDB_LOCK_WAIT);
+       const size_t first_half = len / 2;
+
+       if (len <= 4) {
+               /* Single record.  Just do blocking lock. */
+               return tdb_brlock(tdb, ltype, off, len, flags);
+       }
+
+       /* First we try non-blocking. */
+       if (tdb_brlock(tdb, ltype, off, len, try_flags) == 0) {
+               return 0;
+       }
+
+       /* Try locking first half, then second. */
+       if (tdb_chainlock_gradual(tdb, ltype, flags, off, first_half) == -1) {
+               return -1;
+       }
+
+       if (tdb_chainlock_gradual(tdb, ltype, flags, off + first_half,
+                                 len - first_half) == -1) {
+               tdb_brunlock(tdb, ltype, off, first_half);
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * Lock the entire database.
+ *
+ * The lock can only be upgradable if you have some other way of
+ * guaranteeing exclusivity (ie. transaction write lock).  The chain
+ * locks are taken gradually to avoid being starved by smaller locks.
+ *
+ * If the database turns out to need recovery once the lock is held, the
+ * lock is dropped again, recovery is run and the lock is retried; with
+ * TDB_LOCK_MARK_ONLY that is not possible and the call fails.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
+                      enum tdb_lock_flags flags, bool upgradable)
+{
+       bool mark_only = flags & TDB_LOCK_MARK_ONLY;
+       int precheck = tdb_allrecord_check(tdb, ltype, flags, upgradable);
+
+       /* -1: refused, 0: already held and counted; anything else: go on. */
+       if (precheck == -1 || precheck == 0) {
+               return precheck;
+       }
+
+       /* We cover two kinds of locks:
+        * 1) Normal chain locks.  Taken for almost all operations.
+        * 2) Individual records locks.  Taken after normal or free
+        *    chain locks.
+        *
+        * It is (1) which cause the starvation problem, so we're only
+        * gradual for that. */
+       if (tdb_chainlock_gradual(tdb, ltype, flags, FREELIST_TOP,
+                                 tdb->header.hash_size * 4) == -1) {
+               return -1;
+       }
+
+       /* Grab individual record locks. */
+       if (tdb_brlock(tdb, ltype, lock_offset(tdb->header.hash_size), 0,
+                      flags) == -1) {
+               /* Undo the chain locks taken above. */
+               tdb_brunlock(tdb, ltype, FREELIST_TOP,
+                            tdb->header.hash_size * 4);
+               return -1;
+       }
+
+       tdb->allrecord_lock.count = 1;
+       /* If it's upgradable, it's actually exclusive so we can treat
+        * it as a write lock. */
+       tdb->allrecord_lock.ltype = upgradable ? F_WRLCK : ltype;
+       tdb->allrecord_lock.off = upgradable;
+
+       if (!tdb_needs_recovery(tdb)) {
+               return 0;
+       }
+
+       /* Recovery is needed: let go of the lock before dealing with it. */
+       tdb_allrecord_unlock(tdb, ltype, mark_only);
+       if (mark_only) {
+               tdb->ecode = TDB_ERR_LOCK;
+               TDB_LOG((tdb, TDB_DEBUG_ERROR,
+                        "tdb_lockall_mark cannot do recovery\n"));
+               return -1;
+       }
+       if (tdb_lock_and_recover(tdb) == -1) {
+               return -1;
+       }
+       return tdb_allrecord_lock(tdb, ltype, flags, upgradable);
+}
+
+
+
+/*
+ * Unlock the entire database.
+ *
+ * Fails with TDB_ERR_LOCK on read-only databases, when no allrecord
+ * lock is held, or when ltype does not match the held lock.  An
+ * upgradable lock is recorded as a write lock, so it may also be
+ * released with F_RDLCK.  Nested holds only decrement the count.  With
+ * mark_lock set the fcntl unlock is skipped and only the bookkeeping is
+ * cleared.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype, bool mark_lock)
+{
+       /* There are no locks on read-only dbs, and nothing to drop if unheld */
+       if (tdb->read_only || tdb->traverse_read ||
+           tdb->allrecord_lock.count == 0) {
+               tdb->ecode = TDB_ERR_LOCK;
+               return -1;
+       }
+
+       if (tdb->allrecord_lock.ltype != ltype) {
+               /* Upgradable locks are marked as write locks. */
+               bool upgradable_read =
+                       tdb->allrecord_lock.off && ltype == F_RDLCK;
+
+               if (!upgradable_read) {
+                       tdb->ecode = TDB_ERR_LOCK;
+                       return -1;
+               }
+       }
+
+       if (tdb->allrecord_lock.count > 1) {
+               tdb->allrecord_lock.count--;
+               return 0;
+       }
+
+       if (!mark_lock) {
+               if (tdb_brunlock(tdb, ltype, FREELIST_TOP, 0)) {
+                       TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_unlockall failed (%s)\n", strerror(errno)));
+                       return -1;
+               }
+       }
+
+       tdb->allrecord_lock.count = 0;
+       tdb->allrecord_lock.ltype = 0;
+
+       return 0;
+}
+
+/* Take a blocking, non-upgradable write lock on the entire database. */
+_PUBLIC_ int tdb_lockall(struct tdb_context *tdb)
+{
+       int rc;
+
+       tdb_trace(tdb, "tdb_lockall");
+       rc = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_WAIT, false);
+       return rc;
+}
+
+/*
+ * Write-lock the entire database in "mark only" mode: the request is
+ * passed on with TDB_LOCK_MARK_ONLY.
+ */
+_PUBLIC_ int tdb_lockall_mark(struct tdb_context *tdb)
+{
+       int rc;
+
+       tdb_trace(tdb, "tdb_lockall_mark");
+       rc = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_MARK_ONLY, false);
+       return rc;
+}
+
+/*
+ * Undo tdb_lockall_mark(): clear the write allrecord lock bookkeeping
+ * without issuing the fcntl unlock (mark_lock is true).
+ */
+_PUBLIC_ int tdb_lockall_unmark(struct tdb_context *tdb)
+{
+       int rc;
+
+       tdb_trace(tdb, "tdb_lockall_unmark");
+       rc = tdb_allrecord_unlock(tdb, F_WRLCK, true);
+       return rc;
+}
+
+/* lock entire database with write lock - nonblocking variant */
+_PUBLIC_ int tdb_lockall_nonblock(struct tdb_context *tdb)
+{
+       int rc;
+
+       rc = tdb_allrecord_lock(tdb, F_WRLCK, TDB_LOCK_NOWAIT, false);
+       /* The trace records the outcome, so it comes after the attempt. */
+       tdb_trace_ret(tdb, "tdb_lockall_nonblock", rc);
+       return rc;
+}
+
+/* Release the write lock on the entire database (real fcntl unlock). */
+_PUBLIC_ int tdb_unlockall(struct tdb_context *tdb)
+{
+       int rc;
+
+       tdb_trace(tdb, "tdb_unlockall");
+       rc = tdb_allrecord_unlock(tdb, F_WRLCK, false);
+       return rc;
+}
+
+/* Take a blocking, non-upgradable read lock on the entire database. */
+_PUBLIC_ int tdb_lockall_read(struct tdb_context *tdb)
+{
+       int rc;
+
+       tdb_trace(tdb, "tdb_lockall_read");
+       rc = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, false);
+       return rc;
+}
+
+/* lock entire database with read lock - nonblocking variant */
+_PUBLIC_ int tdb_lockall_read_nonblock(struct tdb_context *tdb)
+{
+       int rc;
+
+       rc = tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_NOWAIT, false);
+       /* The trace records the outcome, so it comes after the attempt. */
+       tdb_trace_ret(tdb, "tdb_lockall_read_nonblock", rc);
+       return rc;
+}
+
+/* Release the read lock on the entire database (real fcntl unlock). */
+_PUBLIC_ int tdb_unlockall_read(struct tdb_context *tdb)
+{
+       int rc;
+
+       tdb_trace(tdb, "tdb_unlockall_read");
+       rc = tdb_allrecord_unlock(tdb, F_RDLCK, false);
+       return rc;
+}
+
+/*
+ * Write-lock the hash chain that `key` hashes to (blocking).
+ *
+ * This is meant to be used to reduce contention - it cannot guarantee
+ * how many records will be locked.
+ */
+_PUBLIC_ int tdb_chainlock(struct tdb_context *tdb, TDB_DATA key)
+{
+       int chain = BUCKET(tdb->hash_fn(&key));
+       int rc;
+
+       rc = tdb_lock(tdb, chain, F_WRLCK);
+       tdb_trace_1rec(tdb, "tdb_chainlock", key);
+       return rc;
+}
+
+/*
+ * Write-lock the hash chain that `key` hashes to, without blocking.
+ *
+ * This is meant to be used to reduce contention - it cannot guarantee
+ * how many records will be locked.
+ */
+_PUBLIC_ int tdb_chainlock_nonblock(struct tdb_context *tdb, TDB_DATA key)
+{
+       int chain = BUCKET(tdb->hash_fn(&key));
+       int rc;
+
+       rc = tdb_lock_nonblock(tdb, chain, F_WRLCK);
+       tdb_trace_1rec_ret(tdb, "tdb_chainlock_nonblock", key, rc);
+       return rc;
+}
+
+/*
+ * Mark the hash chain of `key` as write-locked without actually locking
+ * it (the nested lock is requested with TDB_LOCK_MARK_ONLY).
+ * Warning! use with great caution!
+ */
+_PUBLIC_ int tdb_chainlock_mark(struct tdb_context *tdb, TDB_DATA key)
+{
+       int rc;
+
+       rc = tdb_nest_lock(tdb, lock_offset(BUCKET(tdb->hash_fn(&key))),
+                          F_WRLCK, TDB_LOCK_MARK_ONLY);
+       tdb_trace_1rec(tdb, "tdb_chainlock_mark", key);
+       return rc;
+}
+
+/*
+ * Remove the "locked" mark from the hash chain of `key` without
+ * actually unlocking it (mark_lock is true).
+ * Warning! use with great caution!
+ */
+_PUBLIC_ int tdb_chainlock_unmark(struct tdb_context *tdb, TDB_DATA key)
+{
+       int rc;
+
+       tdb_trace_1rec(tdb, "tdb_chainlock_unmark", key);
+       rc = tdb_nest_unlock(tdb, lock_offset(BUCKET(tdb->hash_fn(&key))),
+                            F_WRLCK, true);
+       return rc;
+}
+
+/* Release the write lock on the hash chain that `key` hashes to. */
+_PUBLIC_ int tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key)
+{
+       int chain;
+
+       tdb_trace_1rec(tdb, "tdb_chainunlock", key);
+       chain = BUCKET(tdb->hash_fn(&key));
+       return tdb_unlock(tdb, chain, F_WRLCK);
+}
+
+/* Read-lock the hash chain that `key` hashes to (blocking). */
+_PUBLIC_ int tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key)
+{
+       int chain = BUCKET(tdb->hash_fn(&key));
+       int rc = tdb_lock(tdb, chain, F_RDLCK);
+
+       tdb_trace_1rec(tdb, "tdb_chainlock_read", key);
+       return rc;
+}
+
+/* Release the read lock on the hash chain that `key` hashes to. */
+_PUBLIC_ int tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key)
+{
+       int chain;
+
+       tdb_trace_1rec(tdb, "tdb_chainunlock_read", key);
+       chain = BUCKET(tdb->hash_fn(&key));
+       return tdb_unlock(tdb, chain, F_RDLCK);
+}
+
+/*
+ * Take a blocking read lock on the single byte at `off`; the record
+ * lock stops a delete underneath us.  Nothing is locked when an
+ * allrecord lock is held or when off is 0.
+ */
+int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off)
+{
+       if (tdb->allrecord_lock.count || off == 0) {
+               return 0;
+       }
+       return tdb_brlock(tdb, F_RDLCK, off, 1, TDB_LOCK_WAIT);
+}
+
+/*
+  Try to write-lock the record at `off`.
+
+  Write locks override our own fcntl readlocks, so our own traverse
+  locks are checked first: if one of them sits on this record we fail.
+  Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
+  an error to fail to get the lock here.
+
+  While an allrecord lock is held no fcntl call is made: the result is
+  0 if that lock is a write lock and -1 otherwise.
+*/
+int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off)
+{
+       struct tdb_traverse_lock *trav = &tdb->travlocks;
+
+       while (trav != NULL) {
+               if (trav->off == off) {
+                       return -1;
+               }
+               trav = trav->next;
+       }
+
+       if (!tdb->allrecord_lock.count) {
+               return tdb_brlock(tdb, F_WRLCK, off, 1,
+                                 TDB_LOCK_NOWAIT|TDB_LOCK_PROBE);
+       }
+
+       return (tdb->allrecord_lock.ltype == F_WRLCK) ? 0 : -1;
+}
+
+/*
+ * Release the write lock on the single byte at `off`.  Under an
+ * allrecord lock there is no per-record lock to drop, so 0 is returned.
+ */
+int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off)
+{
+       if (!tdb->allrecord_lock.count) {
+               return tdb_brunlock(tdb, F_WRLCK, off, 1);
+       }
+       return 0;
+}
+
+/*
+ * Release the read lock on the record at `off`.
+ *
+ * fcntl locks don't stack: avoid unlocking someone else's.  The byte is
+ * only unlocked when exactly one entry of our traverse-lock list refers
+ * to this offset.  Nothing happens under an allrecord lock or for
+ * off == 0.
+ */
+int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off)
+{
+       struct tdb_traverse_lock *trav;
+       uint32_t holders = 0;
+
+       if (tdb->allrecord_lock.count || off == 0) {
+               return 0;
+       }
+
+       for (trav = &tdb->travlocks; trav != NULL; trav = trav->next) {
+               if (trav->off == off) {
+                       holders++;
+               }
+       }
+
+       if (holders != 1) {
+               return 0;
+       }
+       return tdb_brunlock(tdb, F_RDLCK, off, 1);
+}
+
+/*
+ * Report whether locks are held beyond the ones that are expected: the
+ * ACTIVE_LOCK and, inside a transaction, the TRANSACTION_LOCK and the
+ * allrecord lock.
+ */
+bool tdb_have_extra_locks(struct tdb_context *tdb)
+{
+       unsigned int extra;
+
+       /* A transaction holds the lock for all records. */
+       if (tdb->allrecord_lock.count && !tdb->transaction) {
+               return true;
+       }
+
+       extra = tdb->num_lockrecs;
+
+       /* We always hold the active lock if CLEAR_IF_FIRST. */
+       if (find_nestlock(tdb, ACTIVE_LOCK) != NULL) {
+               extra -= 1;
+       }
+
+       /* In a transaction, we expect to hold the transaction lock */
+       if (tdb->transaction) {
+               if (find_nestlock(tdb, TRANSACTION_LOCK) != NULL) {
+                       extra -= 1;
+               }
+       }
+
+       return extra;
+}
+
+/*
+ * The transaction code uses this to remove all locks.
+ *
+ * The allrecord lock (if any) and every nested lock are unlocked; the
+ * only one that survives is ACTIVE_LOCK, whose record is compacted to
+ * the front of tdb->lockrecs.  Unlock failures are not reported.
+ */
+void tdb_release_transaction_locks(struct tdb_context *tdb)
+{
+       unsigned int src = 0, kept = 0;
+
+       if (tdb->allrecord_lock.count != 0) {
+               tdb_brunlock(tdb, tdb->allrecord_lock.ltype, FREELIST_TOP, 0);
+               tdb->allrecord_lock.count = 0;
+       }
+
+       while (src < tdb->num_lockrecs) {
+               struct tdb_lock_type *lck = &tdb->lockrecs[src++];
+
+               if (lck->off != ACTIVE_LOCK) {
+                       tdb_brunlock(tdb, lck->ltype, lck->off, 1);
+                       continue;
+               }
+
+               /* Don't release the active lock!  Copy it to first entry. */
+               tdb->lockrecs[kept++] = *lck;
+       }
+
+       tdb->num_lockrecs = kept;
+       if (kept == 0) {
+               SAFE_FREE(tdb->lockrecs);
+       }
+}
+
+/* Following functions are added specifically to support CTDB. */
+
+/*
+ * Don't do actual fcntl locking, just mark the tdb as holding the
+ * transaction write lock (TDB_LOCK_MARK_ONLY).
+ */
+_PUBLIC_ int tdb_transaction_write_lock_mark(struct tdb_context *tdb)
+{
+       int rc;
+
+       rc = tdb_transaction_lock(tdb, F_WRLCK, TDB_LOCK_MARK_ONLY);
+       return rc;
+}
+
+/*
+ * Don't do actual fcntl unlocking, just mark the tdb as no longer
+ * holding the transaction write lock (mark_lock is true).
+ */
+_PUBLIC_ int tdb_transaction_write_lock_unmark(struct tdb_context *tdb)
+{
+       int rc;
+
+       rc = tdb_nest_unlock(tdb, TRANSACTION_LOCK, F_WRLCK, true);
+       return rc;
+}
diff --git a/ctdb/lib/tdb/common/open.c b/ctdb/lib/tdb/common/open.c
new file mode 100644 (file)
index 0000000..d9f76f0
--- /dev/null
@@ -0,0 +1,671 @@
+ /* 
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Andrew Tridgell              1999-2005
+   Copyright (C) Paul `Rusty' Russell             2000
+   Copyright (C) Jeremy Allison                           2000-2003
+
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "tdb_private.h"
+
+/* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
+static struct tdb_context *tdbs = NULL;
+
+/*
+ * Compute the two header check values with tdb->hash_fn.
+ *
+ * We use two hashes to double-check they're using the right hash
+ * function: one over TDB_MAGIC_FOOD and one over the TDB_MAGIC word as
+ * passed through CONVERT().  The results are stored through
+ * magic1_hash and magic2_hash and are never both zero.
+ */
+void tdb_header_hash(struct tdb_context *tdb,
+                    uint32_t *magic1_hash, uint32_t *magic2_hash)
+{
+       uint32_t magic_word = TDB_MAGIC;
+       TDB_DATA food_key;
+       TDB_DATA word_key;
+
+       food_key.dptr = discard_const_p(unsigned char, TDB_MAGIC_FOOD);
+       food_key.dsize = sizeof(TDB_MAGIC_FOOD);
+       *magic1_hash = tdb->hash_fn(&food_key);
+
+       word_key.dptr = (unsigned char *)CONVERT(magic_word);
+       word_key.dsize = sizeof(magic_word);
+       *magic2_hash = tdb->hash_fn(&word_key);
+
+       if (*magic1_hash != 0 || *magic2_hash != 0) {
+               return;
+       }
+
+       /* Make sure at least one hash is non-zero! */
+       *magic1_hash = 1;
+}
+
+/*
+ * Initialise a new database with a specified hash size.
+ *
+ * The header followed by hash_size+1 zeroed tdb_off_t slots is made up
+ * in memory.  For TDB_INTERNAL databases that buffer becomes
+ * tdb->map_ptr and is kept; otherwise the file is rewound and
+ * truncated, the buffer is written out and then freed.
+ *
+ * Returns 0 on success and -1 on failure.  tdb->ecode is set to
+ * TDB_ERR_EINVAL (with errno EINVAL) for an unusable hash_size and to
+ * TDB_ERR_OOM when the allocation fails.
+ */
+static int tdb_new_database(struct tdb_context *tdb, int hash_size)
+{
+       struct tdb_header *newdb;
+       size_t size;
+       int ret = -1;
+
+       /*
+        * A negative hash_size would make the size computation below wrap
+        * around, so that fewer bytes than a full header get allocated
+        * and the memcpy() of the header reads past the allocation.  Also
+        * refuse sizes whose table cannot be expressed in a size_t.
+        */
+       if (hash_size < 0 ||
+           (size_t)hash_size >=
+           ((size_t)-1 - sizeof(struct tdb_header)) / sizeof(tdb_off_t)) {
+               tdb->ecode = TDB_ERR_EINVAL;
+               errno = EINVAL;
+               return -1;
+       }
+
+       /* We make it up in memory, then write it out if not internal */
+       size = sizeof(struct tdb_header) +
+               ((size_t)hash_size + 1) * sizeof(tdb_off_t);
+       if (!(newdb = (struct tdb_header *)calloc(size, 1))) {
+               tdb->ecode = TDB_ERR_OOM;
+               return -1;
+       }
+
+       /* Fill in the header */
+       newdb->version = TDB_VERSION;
+       newdb->hash_size = hash_size;
+
+       tdb_header_hash(tdb, &newdb->magic1_hash, &newdb->magic2_hash);
+
+       /* Make sure older tdbs (which don't check the magic hash fields)
+        * will refuse to open this TDB. */
+       if (tdb->flags & TDB_INCOMPATIBLE_HASH)
+               newdb->rwlocks = TDB_HASH_RWLOCK_MAGIC;
+
+       if (tdb->flags & TDB_INTERNAL) {
+               /* The buffer is the database: ownership moves to the tdb. */
+               tdb->map_size = size;
+               tdb->map_ptr = (char *)newdb;
+               memcpy(&tdb->header, newdb, sizeof(tdb->header));
+               /* Convert the `ondisk' version if asked. */
+               CONVERT(*newdb);
+               return 0;
+       }
+       if (lseek(tdb->fd, 0, SEEK_SET) == -1)
+               goto fail;
+
+       if (ftruncate(tdb->fd, 0) == -1)
+               goto fail;
+
+       /* This creates an endian-converted header, as if read from disk */
+       CONVERT(*newdb);
+       memcpy(&tdb->header, newdb, sizeof(tdb->header));
+       /* Don't endian-convert the magic food! */
+       memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
+       /* we still have "ret == -1" here */
+       if (tdb_write_all(tdb->fd, newdb, size))
+               ret = 0;
+
+  fail:
+       SAFE_FREE(newdb);
+       return ret;
+}
+
+
+
+/*
+ * Return 1 if a context on the `tdbs` list already refers to the file
+ * identified by (device, ino), 0 otherwise.
+ */
+static int tdb_already_open(dev_t device,
+                           ino_t ino)
+{
+       struct tdb_context *ctx = tdbs;
+
+       while (ctx != NULL) {
+               if (ctx->device == device && ctx->inode == ino) {
+                       return 1;
+               }
+               ctx = ctx->next;
+       }
+
+       return 0;
+}
+
+/* Open the database, creating it if necessary.
+
+   This is tdb_open_ex() without a logging context and without a
+   user-supplied hash function.
+
+   The open_flags and mode are passed straight to the open call on the
+   database file. A flags value of O_WRONLY is invalid. The hash size
+   is advisory, use zero for a default value.
+
+   Return is NULL on error, in which case errno is also set.  Don't
+   try to call tdb_error or tdb_errname, just do strerror(errno).
+
+   @param name may be NULL for internal databases. */
+_PUBLIC_ struct tdb_context *tdb_open(const char *name, int hash_size, int tdb_flags,
+                     int open_flags, mode_t mode)
+{
+       struct tdb_context *db;
+
+       db = tdb_open_ex(name, hash_size, tdb_flags, open_flags, mode,
+                        NULL, NULL);
+       return db;
+}
+
+/*
+ * A default logging function: it accepts a printf-style message and
+ * discards it.  tdb_open_ex() installs it when the caller supplies no
+ * logging context.
+ */
+static void null_log_fn(struct tdb_context *tdb, enum tdb_debug_level level, const char *fmt, ...) PRINTF_ATTRIBUTE(3, 4);
+static void null_log_fn(struct tdb_context *tdb, enum tdb_debug_level level, const char *fmt, ...)
+{
+}
+
+/*
+ * Check that tdb->hash_fn reproduces the magic hashes stored in the
+ * header.
+ *
+ * The freshly computed values are left in *m1 and *m2.  If they do not
+ * match and default_hash is true (the caller did not pick a hash
+ * function), tdb->hash_fn is switched to the other inbuilt hash and
+ * the check is repeated once; the switched function stays in place
+ * whatever the outcome of that second check.
+ *
+ * Returns true on a match, false otherwise.
+ */
+static bool check_header_hash(struct tdb_context *tdb,
+                             bool default_hash, uint32_t *m1, uint32_t *m2)
+{
+       bool may_switch = default_hash;
+
+       for (;;) {
+               tdb_header_hash(tdb, m1, m2);
+               if (tdb->header.magic1_hash == *m1 &&
+                   tdb->header.magic2_hash == *m2) {
+                       return true;
+               }
+
+               /* If they explicitly set a hash, always respect it. */
+               if (!may_switch) {
+                       return false;
+               }
+
+               /* Otherwise, try the other inbuilt hash. */
+               tdb->hash_fn = (tdb->hash_fn == tdb_old_hash)
+                       ? tdb_jenkins_hash : tdb_old_hash;
+               may_switch = false;
+       }
+}
+
+_PUBLIC_ struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
+                               int open_flags, mode_t mode,
+                               const struct tdb_logging_context *log_ctx,
+                               tdb_hash_func hash_fn)
+{
+       struct tdb_context *tdb;
+       struct stat st;
+       int rev = 0, locked = 0;
+       unsigned char *vp;
+       uint32_t vertest;
+       unsigned v;
+       const char *hash_alg;
+       uint32_t magic1, magic2;
+
+       if (!(tdb = (struct tdb_context *)calloc(1, sizeof *tdb))) {
+               /* Can't log this */
+               errno = ENOMEM;
+               goto fail;
+       }
+       tdb_io_init(tdb);
+       tdb->fd = -1;
+#ifdef TDB_TRACE
+       tdb->tracefd = -1;
+#endif
+       tdb->name = NULL;
+       tdb->map_ptr = NULL;
+       tdb->flags = tdb_flags;
+       tdb->open_flags = open_flags;
+       if (log_ctx) {
+               tdb->log = *log_ctx;
+       } else {
+               tdb->log.log_fn = null_log_fn;
+               tdb->log.log_private = NULL;
+       }
+
+       if (name == NULL && (tdb_flags & TDB_INTERNAL)) {
+               name = "__TDB_INTERNAL__";
+       }
+
+       if (name == NULL) {
+               tdb->name = discard_const_p(char, "__NULL__");
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: called with name == NULL\n"));
+               tdb->name = NULL;
+               errno = EINVAL;
+               goto fail;
+       }
+
+       /* now make a copy of the name, as the caller's memory might go away */
+       if (!(tdb->name = (char *)strdup(name))) {
+               /*
+                * set the name as the given string, so that tdb_name() will
+                * work in case of an error.
+                */
+               tdb->name = discard_const_p(char, name);
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: can't strdup(%s)\n",
+                        name));
+               tdb->name = NULL;
+               errno = ENOMEM;
+               goto fail;
+       }
+
+       if (hash_fn) {
+               tdb->hash_fn = hash_fn;
+               hash_alg = "the user defined";
+       } else {
+               /* This controls what we use when creating a tdb. */
+               if (tdb->flags & TDB_INCOMPATIBLE_HASH) {
+                       tdb->hash_fn = tdb_jenkins_hash;
+               } else {
+                       tdb->hash_fn = tdb_old_hash;
+               }
+               hash_alg = "either default";
+       }
+
+       /* cache the page size */
+       tdb->page_size = getpagesize();
+       if (tdb->page_size <= 0) {
+               tdb->page_size = 0x2000;
+       }
+
+       tdb->max_dead_records = (tdb_flags & TDB_VOLATILE) ? 5 : 0;
+
+       if ((open_flags & O_ACCMODE) == O_WRONLY) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: can't open tdb %s write-only\n",
+                        name));
+               errno = EINVAL;
+               goto fail;
+       }
+
+       if (hash_size == 0)
+               hash_size = DEFAULT_HASH_SIZE;
+       if ((open_flags & O_ACCMODE) == O_RDONLY) {
+               tdb->read_only = 1;
+               /* read only databases don't do locking or clear if first */
+               tdb->flags |= TDB_NOLOCK;
+               tdb->flags &= ~TDB_CLEAR_IF_FIRST;
+       }
+
+       if ((tdb->flags & TDB_ALLOW_NESTING) &&
+           (tdb->flags & TDB_DISALLOW_NESTING)) {
+               tdb->ecode = TDB_ERR_NESTING;
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: "
+                       "allow_nesting and disallow_nesting are not allowed together!"));
+               errno = EINVAL;
+               goto fail;
+       }
+
+       if (getenv("TDB_NO_FSYNC")) {
+               tdb->flags |= TDB_NOSYNC;
+       }
+
+       /*
+        * TDB_ALLOW_NESTING is the default behavior.
+        * Note: this may change in future versions!
+        */
+       if (!(tdb->flags & TDB_DISALLOW_NESTING)) {
+               tdb->flags |= TDB_ALLOW_NESTING;
+       }
+
+       /* internal databases don't mmap or lock, and start off cleared */
+       if (tdb->flags & TDB_INTERNAL) {
+               tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
+               tdb->flags &= ~TDB_CLEAR_IF_FIRST;
+               if (tdb_new_database(tdb, hash_size) != 0) {
+                       TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: tdb_new_database failed!"));
+                       goto fail;
+               }
+               goto internal;
+       }
+
+       if ((tdb->fd = open(name, open_flags, mode)) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_open_ex: could not open file %s: %s\n",
+                        name, strerror(errno)));
+               goto fail;      /* errno set by open(2) */
+       }
+
+       /* on exec, don't inherit the fd */
+       v = fcntl(tdb->fd, F_GETFD, 0);
+        fcntl(tdb->fd, F_SETFD, v | FD_CLOEXEC);
+
+       /* ensure there is only one process initialising at once */
+       if (tdb_nest_lock(tdb, OPEN_LOCK, F_WRLCK, TDB_LOCK_WAIT) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: failed to get open lock on %s: %s\n",
+                        name, strerror(errno)));
+               goto fail;      /* errno set by tdb_brlock */
+       }
+
+       /* we need to zero database if we are the only one with it open */
+       if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
+           (!tdb->read_only) &&
+           (locked = (tdb_nest_lock(tdb, ACTIVE_LOCK, F_WRLCK, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE) == 0))) {
+               int ret;
+               ret = tdb_brlock(tdb, F_WRLCK, FREELIST_TOP, 0,
+                                TDB_LOCK_WAIT);
+               if (ret == -1) {
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: "
+                                "tdb_brlock failed for %s: %s\n",
+                                name, strerror(errno)));
+                       goto fail;
+               }
+               ret = tdb_new_database(tdb, hash_size);
+               if (ret == -1) {
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: "
+                                "tdb_new_database failed for %s: %s\n",
+                                name, strerror(errno)));
+                       tdb_unlockall(tdb);
+                       goto fail;
+               }
+               ret = tdb_brunlock(tdb, F_WRLCK, FREELIST_TOP, 0);
+               if (ret == -1) {
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: "
+                                "tdb_unlockall failed for %s: %s\n",
+                                name, strerror(errno)));
+                       goto fail;
+               }
+               ret = lseek(tdb->fd, 0, SEEK_SET);
+               if (ret == -1) {
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: "
+                                "lseek failed for %s: %s\n",
+                                name, strerror(errno)));
+                       goto fail;
+               }
+       }
+
+       errno = 0;
+       if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
+           || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0) {
+               if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
+                       if (errno == 0) {
+                               errno = EIO; /* ie bad format or something */
+                       }
+                       goto fail;
+               }
+               rev = (tdb->flags & TDB_CONVERT);
+       } else if (tdb->header.version != TDB_VERSION
+                  && !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION)))) {
+               /* wrong version */
+               errno = EIO;
+               goto fail;
+       }
+       vp = (unsigned char *)&tdb->header.version;
+       vertest = (((uint32_t)vp[0]) << 24) | (((uint32_t)vp[1]) << 16) |
+                 (((uint32_t)vp[2]) << 8) | (uint32_t)vp[3];
+       tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
+       if (!rev)
+               tdb->flags &= ~TDB_CONVERT;
+       else {
+               tdb->flags |= TDB_CONVERT;
+               tdb_convert(&tdb->header, sizeof(tdb->header));
+       }
+       if (fstat(tdb->fd, &st) == -1)
+               goto fail;
+
+       if (tdb->header.rwlocks != 0 &&
+           tdb->header.rwlocks != TDB_HASH_RWLOCK_MAGIC) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: spinlocks no longer supported\n"));
+               goto fail;
+       }
+
+       if ((tdb->header.magic1_hash == 0) && (tdb->header.magic2_hash == 0)) {
+               /* older TDB without magic hash references */
+               tdb->hash_fn = tdb_old_hash;
+       } else if (!check_header_hash(tdb, !hash_fn, &magic1, &magic2)) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: "
+                        "%s was not created with %s hash function we are using\n"
+                        "magic1_hash[0x%08X %s 0x%08X] "
+                        "magic2_hash[0x%08X %s 0x%08X]\n",
+                        name, hash_alg,
+                        tdb->header.magic1_hash,
+                        (tdb->header.magic1_hash == magic1) ? "==" : "!=",
+                        magic1,
+                        tdb->header.magic2_hash,
+                        (tdb->header.magic2_hash == magic2) ? "==" : "!=",
+                        magic2));
+               errno = EINVAL;
+               goto fail;
+       }
+
+       /* Is it already in the open list?  If so, fail. */
+       if (tdb_already_open(st.st_dev, st.st_ino)) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: "
+                        "%s (%d,%d) is already open in this process\n",
+                        name, (int)st.st_dev, (int)st.st_ino));
+               errno = EBUSY;
+               goto fail;
+       }
+
+       /* Beware truncation! */
+       tdb->map_size = st.st_size;
+       if (tdb->map_size != st.st_size) {
+               /* Ensure ecode is set for log fn. */
+               tdb->ecode = TDB_ERR_IO;
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_open_ex: "
+                        "len %llu too large!\n", (long long)st.st_size));
+               errno = EIO;
+               goto fail;
+       }
+
+       tdb->device = st.st_dev;
+       tdb->inode = st.st_ino;
+       tdb_mmap(tdb);
+       if (locked) {
+               if (tdb_nest_unlock(tdb, ACTIVE_LOCK, F_WRLCK, false) == -1) {
+                       TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: "
+                                "failed to release ACTIVE_LOCK on %s: %s\n",
+                                name, strerror(errno)));
+                       goto fail;
+               }
+
+       }
+
+       /* We always need to do this if the CLEAR_IF_FIRST flag is set, even if
+          we didn't get the initial exclusive lock as we need to let all other
+          users know we're using it. */
+
+       if (tdb_flags & TDB_CLEAR_IF_FIRST) {
+               /* leave this lock in place to indicate it's in use */
+               if (tdb_nest_lock(tdb, ACTIVE_LOCK, F_RDLCK, TDB_LOCK_WAIT) == -1) {
+                       goto fail;
+               }
+       }
+
+       /* if needed, run recovery */
+       if (tdb_transaction_recover(tdb) == -1) {
+               goto fail;
+       }
+
+#ifdef TDB_TRACE
+       {
+               char tracefile[strlen(name) + 32];
+
+               snprintf(tracefile, sizeof(tracefile),
+                        "%s.trace.%li", name, (long)getpid());
+               tdb->tracefd = open(tracefile, O_WRONLY|O_CREAT|O_EXCL, 0600);
+               if (tdb->tracefd >= 0) {
+                       tdb_enable_seqnum(tdb);
+                       tdb_trace_open(tdb, "tdb_open", hash_size, tdb_flags,
+                                      open_flags);
+               } else
+                       TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: failed to open trace file %s!\n", tracefile));
+       }
+#endif
+
+ internal:
+       /* Internal (memory-only) databases skip all the code above to
+        * do with disk files, and resume here by releasing their
+        * open lock and hooking into the active list. */
+       if (tdb_nest_unlock(tdb, OPEN_LOCK, F_WRLCK, false) == -1) {
+               goto fail;
+       }
+       tdb->next = tdbs;
+       tdbs = tdb;
+       return tdb;
+
+ fail:
+       { int save_errno = errno;
+
+       if (!tdb)
+               return NULL;
+
+#ifdef TDB_TRACE
+       close(tdb->tracefd);
+#endif
+       if (tdb->map_ptr) {
+               if (tdb->flags & TDB_INTERNAL)
+                       SAFE_FREE(tdb->map_ptr);
+               else
+                       tdb_munmap(tdb);
+       }
+       if (tdb->fd != -1)
+               if (close(tdb->fd) != 0)
+                       TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_open_ex: failed to close tdb->fd on error!\n"));
+       SAFE_FREE(tdb->lockrecs);
+       SAFE_FREE(tdb->name);
+       SAFE_FREE(tdb);
+       errno = save_errno;
+       return NULL;
+       }
+}
+
+/*
+ * Set the maximum number of dead records per hash chain
+ *
+ * The value is stored verbatim in tdb->max_dead_records; no range check
+ * is applied to max_dead here.
+ */
+
+_PUBLIC_ void tdb_set_max_dead(struct tdb_context *tdb, int max_dead)
+{
+       tdb->max_dead_records = max_dead;
+}
+
+/**
+ * Close a database.
+ *
+ * @returns -1 for error; 0 for success.
+ **/
+_PUBLIC_ int tdb_close(struct tdb_context *tdb)
+{
+       struct tdb_context **i;
+       int ret = 0;
+
+       if (tdb->transaction) {
+               tdb_transaction_cancel(tdb);
+       }
+       tdb_trace(tdb, "tdb_close");
+
+       if (tdb->map_ptr) {
+               if (tdb->flags & TDB_INTERNAL)
+                       SAFE_FREE(tdb->map_ptr);
+               else
+                       tdb_munmap(tdb);
+       }
+       SAFE_FREE(tdb->name);
+       if (tdb->fd != -1) {
+               ret = close(tdb->fd);
+               tdb->fd = -1;
+       }
+       SAFE_FREE(tdb->lockrecs);
+
+       /* Remove from contexts list */
+       for (i = &tdbs; *i; i = &(*i)->next) {
+               if (*i == tdb) {
+                       *i = tdb->next;
+                       break;
+               }
+       }
+
+#ifdef TDB_TRACE
+       close(tdb->tracefd);
+#endif
+       memset(tdb, 0, sizeof(*tdb));
+       SAFE_FREE(tdb);
+
+       return ret;
+}
+
+/*
+ * Register a logging function.
+ *
+ * The structure pointed to by log_ctx is copied by value into tdb->log.
+ */
+_PUBLIC_ void tdb_set_logging_function(struct tdb_context *tdb,
+                                       const struct tdb_logging_context *log_ctx)
+{
+        tdb->log = *log_ctx;
+}
+
+/* Return the log_private pointer held in this tdb's logging context. */
+_PUBLIC_ void *tdb_get_logging_private(struct tdb_context *tdb)
+{
+       return tdb->log.log_private;
+}
+
+/*
+ * Re-establish a tdb's file descriptor, mapping and (optionally) its
+ * ACTIVE_LOCK.
+ *
+ * TDB_INTERNAL databases need nothing and return 0 immediately.
+ * Reopening is refused while extra locks are held or a transaction is
+ * open.  Unless both LIBREPLACE_PREAD_NOT_REPLACED and
+ * LIBREPLACE_PWRITE_NOT_REPLACED are defined, the file is unmapped,
+ * closed, reopened by name (without O_CREAT/O_TRUNC), verified to still
+ * be the same device/inode, and mapped again.
+ *
+ * active_lock: when true, a read lock on ACTIVE_LOCK is taken after the
+ *              lock records have been reset.
+ *
+ * Returns 0 on success.  On any failure tdb_close() is called on the
+ * context and -1 is returned.
+ */
+static int tdb_reopen_internal(struct tdb_context *tdb, bool active_lock)
+{
+#if !defined(LIBREPLACE_PREAD_NOT_REPLACED) || \
+       !defined(LIBREPLACE_PWRITE_NOT_REPLACED)
+       struct stat st;
+#endif
+
+       if (tdb->flags & TDB_INTERNAL) {
+               return 0; /* Nothing to do. */
+       }
+
+       if (tdb_have_extra_locks(tdb)) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_reopen: reopen not allowed with locks held\n"));
+               goto fail;
+       }
+
+       if (tdb->transaction != 0) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_reopen: reopen not allowed inside a transaction\n"));
+               goto fail;
+       }
+
+/* If we have real pread & pwrite, we can skip reopen. */
+#if !defined(LIBREPLACE_PREAD_NOT_REPLACED) || \
+       !defined(LIBREPLACE_PWRITE_NOT_REPLACED)
+       if (tdb_munmap(tdb) != 0) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: munmap failed (%s)\n", strerror(errno)));
+               goto fail;
+       }
+       /* A failed close is only logged; the reopen is still attempted. */
+       if (close(tdb->fd) != 0)
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: WARNING closing tdb->fd failed!\n"));
+       /* Never create or truncate the file on reopen. */
+       tdb->fd = open(tdb->name, tdb->open_flags & ~(O_CREAT|O_TRUNC), 0);
+       if (tdb->fd == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: open failed (%s)\n", strerror(errno)));
+               goto fail;
+       }
+       if (fstat(tdb->fd, &st) != 0) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: fstat failed (%s)\n", strerror(errno)));
+               goto fail;
+       }
+       /* The name must still refer to the file recorded in the context. */
+       if (st.st_ino != tdb->inode || st.st_dev != tdb->device) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: file dev/inode has changed!\n"));
+               goto fail;
+       }
+       if (tdb_mmap(tdb) != 0) {
+               goto fail;
+       }
+#endif /* fake pread or pwrite */
+
+       /* We may still think we hold the active lock. */
+       tdb->num_lockrecs = 0;
+       SAFE_FREE(tdb->lockrecs);
+
+       if (active_lock && tdb_nest_lock(tdb, ACTIVE_LOCK, F_RDLCK, TDB_LOCK_WAIT) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_reopen: failed to obtain active lock\n"));
+               goto fail;
+       }
+
+       return 0;
+
+fail:
+       /* The context is closed on failure. */
+       tdb_close(tdb);
+       return -1;
+}
+
+/*
+ * Reopen a single tdb.  Intended for use after a fork, so that this
+ * process gets a seek pointer independent of its parent and has its
+ * locks re-established.  The active lock is re-taken only for databases
+ * opened with TDB_CLEAR_IF_FIRST.
+ *
+ * Returns whatever tdb_reopen_internal() returns.
+ */
+_PUBLIC_ int tdb_reopen(struct tdb_context *tdb)
+{
+       bool want_active = (tdb->flags & TDB_CLEAR_IF_FIRST);
+
+       return tdb_reopen_internal(tdb, want_active);
+}
+
+/*
+ * Reopen every tdb on the list of open databases.
+ *
+ * parent_longlived: non-zero when the parent process is a long-lived
+ * daemon.  Such a parent keeps its own active lock on any tdb opened
+ * with CLEAR_IF_FIRST, so children need not add another one.  Skipping
+ * it is essential for performance on systems that keep POSIX locks in a
+ * non-scalable kernel data structure.
+ *
+ * Returns 0 if every reopen succeeded, -1 as soon as one fails.
+ */
+_PUBLIC_ int tdb_reopen_all(int parent_longlived)
+{
+       struct tdb_context *ctx = tdbs;
+
+       while (ctx != NULL) {
+               /* Ensure no clear-if-first lock for a long-lived parent. */
+               bool want_active = false;
+
+               if (!parent_longlived) {
+                       want_active = (ctx->flags & TDB_CLEAR_IF_FIRST);
+               }
+
+               if (tdb_reopen_internal(ctx, want_active) != 0) {
+                       return -1;
+               }
+               ctx = ctx->next;
+       }
+
+       return 0;
+}
diff --git a/ctdb/lib/tdb/common/rescue.c b/ctdb/lib/tdb/common/rescue.c
new file mode 100644 (file)
index 0000000..03ae8d6
--- /dev/null
@@ -0,0 +1,349 @@
+ /*
+   Unix SMB/CIFS implementation.
+
+   trivial database library, rescue attempt code.
+
+   Copyright (C) Rusty Russell            2012
+
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "tdb_private.h"
+#include <assert.h>
+
+
+struct found {
+       tdb_off_t head; /* 0 -> invalid. */
+       struct tdb_record rec; /* Copy of the record header read at head. */
+       TDB_DATA key; /* Key bytes; key.dptr is released by free_table(). */
+       bool in_hash; /* Set by found_in_hashchain(). */
+       bool in_free; /* Set by mark_free_area(). */
+};
+
+struct found_table {
+       /* As an ordered array (by head offset). */
+       struct found *arr;
+       /* num: entries in use; max: entries allocated in arr. */
+       unsigned int num, max;
+};
+
+/*
+ * Heuristic check of whether the header in *rec, read from offset off,
+ * belongs to a live record.
+ *
+ * The header must carry TDB_MAGIC, its key and data must fit inside the
+ * record, rec_len must be a multiple of TDB_ALIGNMENT, the next pointer
+ * must be zero or point at or beyond the data start and inside the file,
+ * and the hash of the key read from the file must equal rec->full_hash.
+ *
+ * On success *key holds the key and the caller owns (and must free)
+ * key->dptr.  On failure nothing is left allocated.
+ */
+static bool looks_like_valid_record(struct tdb_context *tdb,
+                                   tdb_off_t off,
+                                   const struct tdb_record *rec,
+                                   TDB_DATA *key)
+{
+       unsigned int hval;
+
+       if (rec->magic != TDB_MAGIC)
+               return false;
+
+       /* Key and data must both fit in the record.  The lengths come from
+        * possibly corrupt data, so avoid adding them: the sum could wrap
+        * around and make a bogus record look plausible. */
+       if (rec->key_len > rec->rec_len)
+               return false;
+       if (rec->data_len > rec->rec_len - rec->key_len)
+               return false;
+
+       if (rec->rec_len % TDB_ALIGNMENT)
+               return false;
+
+       /* Next pointer must make some sense. */
+       if (rec->next > 0 && rec->next < TDB_DATA_START(tdb->header.hash_size))
+               return false;
+
+       if (tdb->methods->tdb_oob(tdb, rec->next, sizeof(*rec), 1))
+               return false;
+
+       key->dsize = rec->key_len;
+       key->dptr = tdb_alloc_read(tdb, off + sizeof(*rec), key->dsize);
+       if (!key->dptr)
+               return false;
+
+       hval = tdb->hash_fn(key);
+       if (hval != rec->full_hash) {
+               free(key->dptr);
+               return false;
+       }
+
+       /* Caller frees up key->dptr */
+       return true;
+}
+
+/*
+ * Append a candidate record to the table, growing the array as needed.
+ *
+ * The array starts at 128 entries and doubles each time it fills.  The
+ * new entry starts with in_hash and in_free cleared.  On success the
+ * table refers to key.dptr; on failure the table is left unchanged and
+ * the caller keeps ownership of the key.
+ *
+ * Returns true on success, false if the array could not be grown.
+ */
+static bool add_to_table(struct found_table *found,
+                        tdb_off_t off,
+                        struct tdb_record *rec,
+                        TDB_DATA key)
+{
+       struct found *slot;
+
+       if (found->num + 1 > found->max) {
+               unsigned int new_max = (found->max ? found->max * 2 : 128);
+               struct found *grown;
+
+               /* Refuse if doubling wrapped, or if the byte size would. */
+               if (new_max <= found->max)
+                       return false;
+               if (new_max > (size_t)-1 / sizeof(found->arr[0]))
+                       return false;
+
+               grown = realloc(found->arr, new_max * sizeof(found->arr[0]));
+               if (!grown)
+                       return false;
+
+               /* Commit the new capacity only once the memory exists. */
+               found->arr = grown;
+               found->max = new_max;
+       }
+
+       slot = &found->arr[found->num];
+       slot->head = off;
+       slot->rec = *rec;
+       slot->key = key;
+       slot->in_hash = false;
+       slot->in_free = false;
+
+       found->num++;
+       return true;
+}
+
+/*
+ * Read the data belonging to candidate f and hand key and data to the
+ * walk callback.  The data buffer is freed once the callback returns.
+ *
+ * Returns false only when reading failed with tdb->ecode set to
+ * TDB_ERR_OOM; any other read failure is tolerated and the record is
+ * silently skipped.
+ */
+static bool walk_record(struct tdb_context *tdb,
+                       const struct found *f,
+                       void (*walk)(TDB_DATA, TDB_DATA, void *private_data),
+                       void *private_data)
+{
+       TDB_DATA value;
+
+       value.dsize = f->rec.data_len;
+       value.dptr = tdb_alloc_read(tdb,
+                                   f->head + sizeof(f->rec) + f->rec.key_len,
+                                   value.dsize);
+       if (value.dptr) {
+               walk(f->key, value, private_data);
+               free(value.dptr);
+               return true;
+       }
+
+       /* I/O errors are expected; only running out of memory is fatal. */
+       return tdb->ecode != TDB_ERR_OOM;
+}
+
+/*
+ * Binary search of the table (ordered by head offset).
+ * Returns the index of the first entry whose offset is >= off, which is
+ * found->num when every entry lies below off.
+ */
+static unsigned int find_entry(struct found_table *found, tdb_off_t off)
+{
+       unsigned int lo = 0;
+       unsigned int hi = found->num;
+
+       while (lo < hi) {
+               /* We can't overflow here. */
+               unsigned int mid = (lo + hi) / 2;
+               tdb_off_t probe = found->arr[mid].head;
+
+               if (probe == off) {
+                       return mid;
+               }
+               if (off < probe) {
+                       hi = mid;
+               } else {
+                       lo = mid + 1;
+               }
+       }
+
+       assert(lo == hi);
+       return hi;
+}
+
+/*
+ * Flag the table entry located exactly at offset head, if there is one,
+ * as in_hash.
+ */
+static void found_in_hashchain(struct found_table *found, tdb_off_t head)
+{
+       unsigned int idx = find_entry(found, head);
+
+       if (idx >= found->num) {
+               return;
+       }
+       if (found->arr[idx].head != head) {
+               return;
+       }
+       found->arr[idx].in_hash = true;
+}
+
+/*
+ * Flag as in_free every table entry whose offset lies in the range
+ * [head, head + len).
+ */
+static void mark_free_area(struct found_table *found, tdb_off_t head,
+                          tdb_len_t len)
+{
+       unsigned int idx;
+
+       /* Start at the first entry at or after head and stop at the
+        * first one past the end of the free area. */
+       for (idx = find_entry(found, head);
+            idx < found->num && found->arr[idx].head < head + len;
+            idx++) {
+               found->arr[idx].in_free = true;
+       }
+}
+
+/*
+ * qsort() comparator for struct found: order by key length first, then
+ * by key bytes.
+ */
+static int cmp_key(const void *a, const void *b)
+{
+       const struct found *lhs = a;
+       const struct found *rhs = b;
+
+       if (lhs->key.dsize != rhs->key.dsize) {
+               return (lhs->key.dsize < rhs->key.dsize) ? -1 : 1;
+       }
+       return memcmp(lhs->key.dptr, rhs->key.dptr, lhs->key.dsize);
+}
+
+/* True when both keys have the same length and the same bytes. */
+static bool key_eq(TDB_DATA a, TDB_DATA b)
+{
+       if (a.dsize != b.dsize) {
+               return false;
+       }
+       return memcmp(a.dptr, b.dptr, a.dsize) == 0;
+}
+
+/* Release every key buffer held by the table, then the array itself. */
+static void free_table(struct found_table *found)
+{
+       unsigned int remaining = found->num;
+       struct found *entry = found->arr;
+
+       while (remaining > 0) {
+               free(entry->key.dptr);
+               entry++;
+               remaining--;
+       }
+       free(found->arr);
+}
+
+/*
+ * Log callback that discards every message.  tdb_rescue() installs it
+ * while scanning, because it anticipates errors.
+ */
+static void logging_suppressed(struct tdb_context *tdb,
+                              enum tdb_debug_level level, const char *fmt, ...)
+{
+       /* Intentionally empty. */
+}
+
+/*
+ * Best-effort salvage of records from a possibly corrupt database.
+ *
+ * The file is scanned in TDB_ALIGNMENT steps for anything that looks
+ * like a live record.  The candidates are then cross-checked against
+ * the hash chains, the free list and the recovery area.  For each
+ * distinct key, every candidate reachable from a hash chain is passed
+ * to walk(); if none is, every candidate not lying inside a free or
+ * recovery area is passed instead.
+ *
+ * Logging is suppressed while this runs, since errors are anticipated.
+ * All memory collected during the scan is released before returning,
+ * on success as well as on failure.
+ *
+ * walk:         callback invoked as walk(key, data, private_data).
+ * private_data: passed through to walk untouched.
+ *
+ * Returns 0 on success.  Returns -1 if the read lock could not be
+ * taken, or if an allocation failed (tdb->ecode is then set to
+ * TDB_ERR_OOM).
+ */
+_PUBLIC_ int tdb_rescue(struct tdb_context *tdb,
+                       void (*walk)(TDB_DATA, TDB_DATA, void *private_data),
+                       void *private_data)
+{
+       struct found_table found = { NULL, 0, 0 };
+       tdb_off_t h, off, i;
+       tdb_log_func oldlog = tdb->log.log_fn;
+       struct tdb_record rec;
+       TDB_DATA key;
+       bool locked;
+
+       /* Read-only databases use no locking at all: it's best-effort.
+        * We may have a write lock already, so skip that case too. */
+       if (tdb->read_only || tdb->allrecord_lock.count != 0) {
+               locked = false;
+       } else {
+               if (tdb_lockall_read(tdb) == -1)
+                       return -1;
+               locked = true;
+       }
+
+       /* Make sure we know true size of the underlying file. */
+       tdb->methods->tdb_oob(tdb, tdb->map_size, 1, 1);
+
+       /* Suppress logging, since we anticipate errors. */
+       tdb->log.log_fn = logging_suppressed;
+
+       /* Now walk entire db looking for records. */
+       for (off = TDB_DATA_START(tdb->header.hash_size);
+            off < tdb->map_size;
+            off += TDB_ALIGNMENT) {
+               if (tdb->methods->tdb_read(tdb, off, &rec, sizeof(rec),
+                                          DOCONV()) == -1)
+                       continue;
+
+               if (looks_like_valid_record(tdb, off, &rec, &key)) {
+                       if (!add_to_table(&found, off, &rec, key)) {
+                               /* The table never took ownership of
+                                * this key, so free_table() will not
+                                * release it for us. */
+                               free(key.dptr);
+                               goto oom;
+                       }
+               }
+       }
+
+       /* Walk hash chains to positive vet. */
+       for (h = 0; h < 1+tdb->header.hash_size; h++) {
+               bool slow_chase = false;
+               tdb_off_t slow_off = FREELIST_TOP + h*sizeof(tdb_off_t);
+
+               if (tdb_ofs_read(tdb, FREELIST_TOP + h*sizeof(tdb_off_t),
+                                &off) == -1)
+                       continue;
+
+               while (off && off != slow_off) {
+                       if (tdb->methods->tdb_read(tdb, off, &rec, sizeof(rec),
+                                                  DOCONV()) != 0) {
+                               break;
+                       }
+
+                       /* 0 is the free list, rest are hash chains. */
+                       if (h == 0) {
+                               /* Don't mark garbage as free. */
+                               if (rec.magic != TDB_FREE_MAGIC) {
+                                       break;
+                               }
+                               mark_free_area(&found, off,
+                                              sizeof(rec) + rec.rec_len);
+                       } else {
+                               found_in_hashchain(&found, off);
+                       }
+
+                       off = rec.next;
+
+                       /* Loop detection using second pointer at half-speed */
+                       if (slow_chase) {
+                               /* First entry happens to be next ptr */
+                               tdb_ofs_read(tdb, slow_off, &slow_off);
+                       }
+                       slow_chase = !slow_chase;
+               }
+       }
+
+       /* Recovery area: must be marked as free, since it often has old
+        * records in there! */
+       if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &off) == 0 && off != 0) {
+               if (tdb->methods->tdb_read(tdb, off, &rec, sizeof(rec),
+                                          DOCONV()) == 0) {
+                       mark_free_area(&found, off, sizeof(rec) + rec.rec_len);
+               }
+       }
+
+       /* Now sort by key!  (The offset ordering is no longer needed.) */
+       qsort(found.arr, found.num, sizeof(found.arr[0]), cmp_key);
+
+       for (i = 0; i < found.num; ) {
+               unsigned int num, num_in_hash = 0;
+
+               /* How many are identical? */
+               for (num = 0; num < found.num - i; num++) {
+                       if (!key_eq(found.arr[i].key, found.arr[i+num].key)) {
+                               break;
+                       }
+                       if (found.arr[i+num].in_hash) {
+                               if (!walk_record(tdb, &found.arr[i+num],
+                                                walk, private_data))
+                                       goto oom;
+                               num_in_hash++;
+                       }
+               }
+               assert(num);
+
+               /* If none were in the hash, we print any not in free list. */
+               if (num_in_hash == 0) {
+                       unsigned int j;
+
+                       for (j = i; j < i + num; j++) {
+                               if (!found.arr[j].in_free) {
+                                       if (!walk_record(tdb, &found.arr[j],
+                                                        walk, private_data))
+                                               goto oom;
+                               }
+                       }
+               }
+
+               i += num;
+       }
+
+       /* Success: the table and its keys are no longer needed. */
+       free_table(&found);
+       tdb->log.log_fn = oldlog;
+       if (locked) {
+               tdb_unlockall_read(tdb);
+       }
+       return 0;
+
+oom:
+       tdb->log.log_fn = oldlog;
+       tdb->ecode = TDB_ERR_OOM;
+       TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_rescue: failed allocating\n"));
+       free_table(&found);
+       if (locked) {
+               tdb_unlockall_read(tdb);
+       }
+       return -1;
+}
diff --git a/ctdb/lib/tdb/common/summary.c b/ctdb/lib/tdb/common/summary.c
new file mode 100644 (file)
index 0000000..171a1a2
--- /dev/null
@@ -0,0 +1,201 @@
+ /* 
+   Trivial Database: human-readable summary code
+   Copyright (C) Rusty Russell 2010
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "tdb_private.h"
+
+#define SUMMARY_FORMAT \
+       "Size of file/data: %u/%zu\n" \
+       "Number of records: %zu\n" \
+       "Smallest/average/largest keys: %zu/%zu/%zu\n" \
+       "Smallest/average/largest data: %zu/%zu/%zu\n" \
+       "Smallest/average/largest padding: %zu/%zu/%zu\n" \
+       "Number of dead records: %zu\n" \
+       "Smallest/average/largest dead records: %zu/%zu/%zu\n" \
+       "Number of free records: %zu\n" \
+       "Smallest/average/largest free records: %zu/%zu/%zu\n" \
+       "Number of hash chains: %zu\n" \
+       "Smallest/average/largest hash chains: %zu/%zu/%zu\n" \
+       "Number of uncoalesced records: %zu\n" \
+       "Smallest/average/largest uncoalesced runs: %zu/%zu/%zu\n" \
+       "Percentage keys/data/padding/free/dead/rechdrs&tailers/hashes: %.0f/%.0f/%.0f/%.0f/%.0f/%.0f/%.0f\n"
+
+/* We don't use tally module, to keep upstream happy. */
+struct tally {
+       size_t min, max, total; /* Smallest, largest and sum of samples. */
+       size_t num; /* Number of samples added so far. */
+};
+
+/* Reset a tally to the empty state: no samples, all statistics zero. */
+static void tally_init(struct tally *tally)
+{
+       tally->min = 0;
+       tally->max = 0;
+       tally->num = 0;
+       tally->total = 0;
+}
+
+/*
+ * Record one sample of size len: update min/max, bump the sample count
+ * and add len to the running total.
+ */
+static void tally_add(struct tally *tally, size_t len)
+{
+       if (tally->num == 0) {
+               /* First sample defines both extremes. */
+               tally->min = len;
+               tally->max = len;
+       } else {
+               if (len > tally->max) {
+                       tally->max = len;
+               } else if (len < tally->min) {
+                       tally->min = len;
+               }
+       }
+
+       tally->total += len;
+       tally->num++;
+}
+
+/* Integer mean of the samples (total / num), or 0 for an empty tally. */
+static size_t tally_mean(const struct tally *tally)
+{
+       return tally->num ? tally->total / tally->num : 0;
+}
+
+/*
+ * Count the records on hash chain i.
+ *
+ * Returns the chain length, or 0 if the chain head or any record on the
+ * chain cannot be read.  A chain that appears to hold more record
+ * headers than fit in the file must be cyclic (i.e. corrupt); it is
+ * reported as 0 as well instead of being followed forever.
+ */
+static size_t get_hash_length(struct tdb_context *tdb, unsigned int i)
+{
+       tdb_off_t rec_ptr;
+       size_t count = 0;
+
+       if (tdb_ofs_read(tdb, TDB_HASH_TOP(i), &rec_ptr) == -1)
+               return 0;
+
+       /* Follow the chain to its end, counting records as we go. */
+       while (rec_ptr) {
+               struct tdb_record r;
+
+               /* Each record header occupies sizeof(r) bytes of the
+                * file, so a longer chain than this revisits records. */
+               if (++count > tdb->map_size / sizeof(r))
+                       return 0;
+               if (tdb_rec_read(tdb, rec_ptr, &r) == -1)
+                       return 0;
+               rec_ptr = r.next;
+       }
+       return count;
+}
+
+/*
+ * Build a human-readable summary of the database.
+ *
+ * Every record from the start of the data area is visited and tallied
+ * by its magic value (used, free, dead, or invalid/filler space), then
+ * the length of each hash chain is tallied.  The figures are formatted
+ * with SUMMARY_FORMAT.
+ *
+ * Returns a malloc()ed string that the caller must free, or NULL if the
+ * read lock could not be taken, the recovery area could not be
+ * examined, a record could not be read, an unexpected magic value was
+ * met, or memory could not be allocated.
+ */
+_PUBLIC_ char *tdb_summary(struct tdb_context *tdb)
+{
+       tdb_off_t off, rec_off;
+       struct tally freet, keys, data, dead, extra, hash, uncoal;
+       struct tdb_record rec;
+       char *ret = NULL;
+       bool locked;
+       size_t len, unc = 0;
+       struct tdb_record recovery;
+
+       /* Read-only databases use no locking at all: it's best-effort.
+        * We may have a write lock already, so skip that case too. */
+       if (tdb->read_only || tdb->allrecord_lock.count != 0) {
+               locked = false;
+       } else {
+               if (tdb_lockall_read(tdb) == -1)
+                       return NULL;
+               locked = true;
+       }
+
+       if (tdb_recovery_area(tdb, tdb->methods, &rec_off, &recovery) != 0) {
+               goto unlock;
+       }
+
+       tally_init(&freet);
+       tally_init(&keys);
+       tally_init(&data);
+       tally_init(&dead);
+       tally_init(&extra);
+       tally_init(&hash);
+       tally_init(&uncoal);
+
+       /* Step from record to record using each header's rec_len. */
+       for (off = TDB_DATA_START(tdb->header.hash_size);
+            off < tdb->map_size - 1;
+            off += sizeof(rec) + rec.rec_len) {
+               if (tdb->methods->tdb_read(tdb, off, &rec, sizeof(rec),
+                                          DOCONV()) == -1)
+                       goto unlock;
+               switch (rec.magic) {
+               case TDB_MAGIC:
+                       tally_add(&keys, rec.key_len);
+                       tally_add(&data, rec.data_len);
+                       tally_add(&extra, rec.rec_len - (rec.key_len
+                                                        + rec.data_len));
+                       /* A used record ends any run of adjacent
+                        * unused records. */
+                       if (unc > 1)
+                               tally_add(&uncoal, unc - 1);
+                       unc = 0;
+                       break;
+               case TDB_FREE_MAGIC:
+                       tally_add(&freet, rec.rec_len);
+                       unc++;
+                       break;
+               /* If we crash after ftruncate, we can get zeroes or fill. */
+               case TDB_RECOVERY_INVALID_MAGIC:
+               case 0x42424242:
+                       unc++;
+                       /* If it's a valid recovery, we can trust rec_len. */
+                       if (off != rec_off) {
+                               rec.rec_len = tdb_dead_space(tdb, off)
+                                       - sizeof(rec);
+                       }
+                       /* Fall through */
+               case TDB_DEAD_MAGIC:
+                       tally_add(&dead, rec.rec_len);
+                       break;
+               default:
+                       /* off is an unsigned offset: print it with %u. */
+                       TDB_LOG((tdb, TDB_DEBUG_ERROR,
+                                "Unexpected record magic 0x%x at offset %u\n",
+                                rec.magic, off));
+                       goto unlock;
+               }
+       }
+       if (unc > 1)
+               tally_add(&uncoal, unc - 1);
+
+       for (off = 0; off < tdb->header.hash_size; off++)
+               tally_add(&hash, get_hash_length(tdb, off));
+
+       /* 20 is max length of a %zu. */
+       len = strlen(SUMMARY_FORMAT) + 35*20 + 1;
+       ret = (char *)malloc(len);
+       if (!ret)
+               goto unlock;
+
+       snprintf(ret, len, SUMMARY_FORMAT,
+                tdb->map_size, keys.total+data.total,
+                keys.num,
+                keys.min, tally_mean(&keys), keys.max,
+                data.min, tally_mean(&data), data.max,
+                extra.min, tally_mean(&extra), extra.max,
+                dead.num,
+                dead.min, tally_mean(&dead), dead.max,
+                freet.num,
+                freet.min, tally_mean(&freet), freet.max,
+                hash.num,
+                hash.min, tally_mean(&hash), hash.max,
+                uncoal.total,
+                uncoal.min, tally_mean(&uncoal), uncoal.max,
+                keys.total * 100.0 / tdb->map_size,
+                data.total * 100.0 / tdb->map_size,
+                extra.total * 100.0 / tdb->map_size,
+                freet.total * 100.0 / tdb->map_size,
+                dead.total * 100.0 / tdb->map_size,
+                (keys.num + freet.num + dead.num)
+                * (sizeof(struct tdb_record) + sizeof(uint32_t))
+                * 100.0 / tdb->map_size,
+                tdb->header.hash_size * sizeof(tdb_off_t)
+                * 100.0 / tdb->map_size);
+
+unlock:
+       if (locked) {
+               tdb_unlockall_read(tdb);
+       }
+       return ret;
+}
diff --git a/ctdb/lib/tdb/common/tdb.c b/ctdb/lib/tdb/common/tdb.c
new file mode 100644 (file)
index 0000000..fc1f560
--- /dev/null
@@ -0,0 +1,1154 @@
+ /* 
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Andrew Tridgell              1999-2005
+   Copyright (C) Paul `Rusty' Russell             2000
+   Copyright (C) Jeremy Allison                           2000-2003
+
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "tdb_private.h"
+
+_PUBLIC_ TDB_DATA tdb_null;
+
+/*
+  Bump the sequence number stored at TDB_SEQNUM_OFS without taking any
+  lock here.  Does nothing unless the tdb has the TDB_SEQNUM flag set.
+*/
+_PUBLIC_ void tdb_increment_seqnum_nonblock(struct tdb_context *tdb)
+{
+       tdb_off_t current = 0;
+
+       if ((tdb->flags & TDB_SEQNUM) == 0) {
+               return;
+       }
+
+       /* errors from the read and the write are ignored on purpose:
+          there is no sane way of dealing with them here.
+       */
+       tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &current);
+       current += 1;
+       tdb_ofs_write(tdb, TDB_SEQNUM_OFS, &current);
+}
+
+/*
+  Locked variant of the sequence number increment: the update is done
+  under a write lock on TDB_SEQNUM_OFS.  Does nothing unless the tdb has
+  the TDB_SEQNUM flag set, or when the lock cannot be obtained.
+*/
+static void tdb_increment_seqnum(struct tdb_context *tdb)
+{
+       int lock_rc;
+
+       if ((tdb->flags & TDB_SEQNUM) == 0) {
+               return;
+       }
+
+       lock_rc = tdb_nest_lock(tdb, TDB_SEQNUM_OFS, F_WRLCK,
+                               TDB_LOCK_WAIT|TDB_LOCK_PROBE);
+       if (lock_rc != 0) {
+               /* no lock, no increment */
+               return;
+       }
+
+       tdb_increment_seqnum_nonblock(tdb);
+
+       tdb_nest_unlock(tdb, TDB_SEQNUM_OFS, F_WRLCK, false);
+}
+
+/* Parser callback handed to tdb_parse_data() by tdb_find(): memcmp()
+   the stored bytes in 'data' against the wanted 'key' over data.dsize
+   bytes.  private_data is unused. */
+static int tdb_key_compare(TDB_DATA key, TDB_DATA data, void *private_data)
+{
+       const int diff = memcmp(data.dptr, key.dptr, data.dsize);
+
+       return diff;
+}
+
+/* Walk the hash chain for 'hash' looking for a record that is not dead
+   and whose full hash, key length and key bytes all match.
+
+   Returns 0 on fail.  On success, return offset of record, and fills
+   in rec (the record header is left in *r).
+
+   On failure *r may contain the last record header that was read.
+   tdb->ecode is set to TDB_ERR_NOEXIST when the end of the chain is
+   reached without a match, and to TDB_ERR_CORRUPT when a record's next
+   pointer refers to the record itself.  No lock is taken in here;
+   tdb_find_lock_hash() is the wrapper that takes the chain lock. */
+static tdb_off_t tdb_find(struct tdb_context *tdb, TDB_DATA key, uint32_t hash,
+                       struct tdb_record *r)
+{
+       tdb_off_t rec_ptr;
+
+       /* read in the hash top */
+       if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
+               return 0;
+
+       /* keep looking until we find the right record */
+       while (rec_ptr) {
+               if (tdb_rec_read(tdb, rec_ptr, r) == -1)
+                       return 0;
+
+               if (!TDB_DEAD(r) && hash==r->full_hash
+                   && key.dsize==r->key_len
+                   && tdb_parse_data(tdb, key, rec_ptr + sizeof(*r),
+                                     r->key_len, tdb_key_compare,
+                                     NULL) == 0) {
+                       return rec_ptr;
+               }
+               /* detect tight infinite loop (a record whose next
+                  pointer is its own offset); longer cycles are not
+                  detected by this check */
+               if (rec_ptr == r->next) {
+                       tdb->ecode = TDB_ERR_CORRUPT;
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_find: loop detected.\n"));
+                       return 0;
+               }
+               rec_ptr = r->next;
+       }
+       tdb->ecode = TDB_ERR_NOEXIST;
+       return 0;
+}
+
+/* As tdb_find, but if you succeed, keep the lock.
+
+   Takes the chain lock for BUCKET(hash) with the requested locktype and
+   then searches the chain.  On success the record offset is returned,
+   *rec is filled in and the chain lock is still held (the caller has to
+   tdb_unlock() it).  Returns 0, with the chain lock released, when the
+   lock could not be taken or tdb_find() did not find the key. */
+tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype,
+                          struct tdb_record *rec)
+{
+       /* tdb_off_t to match both tdb_find()'s return type and our own */
+       tdb_off_t rec_ptr;
+
+       if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
+               return 0;
+
+       rec_ptr = tdb_find(tdb, key, hash, rec);
+       if (rec_ptr == 0)
+               tdb_unlock(tdb, BUCKET(hash), locktype);
+       return rec_ptr;
+}
+
+static TDB_DATA _tdb_fetch(struct tdb_context *tdb, TDB_DATA key);
+
+/* Parser callback used by tdb_update_hash(): returns 0 when the stored
+   data has the same size and bytes as the TDB_DATA passed through
+   private_data, -1 otherwise.  'key' is unused. */
+static int tdb_update_hash_cmp(TDB_DATA key, TDB_DATA data, void *private_data)
+{
+       TDB_DATA *wanted = (TDB_DATA *)private_data;
+
+       if (wanted->dsize == data.dsize &&
+           memcmp(wanted->dptr, data.dptr, data.dsize) == 0) {
+               return 0;
+       }
+       return -1;
+}
+
+/* update an entry in place - this only works if the new data size
+   is <= the old data size and the key exists.
+   on failure return -1.
+
+   Returns 0 when the stored data already equals dbuf, or once the new
+   data (and, if the size changed, the record header) has been written.
+   When the existing record is too small for key, data and tailer, -1 is
+   returned with tdb->ecode reset to TDB_SUCCESS; a missing key returns
+   -1 through tdb_find(), which sets TDB_ERR_NOEXIST.  _tdb_store()
+   relies on that difference.
+*/
+static int tdb_update_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, TDB_DATA dbuf)
+{
+       struct tdb_record rec;
+       tdb_off_t rec_ptr;
+
+       /* find entry */
+       if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
+               return -1;
+
+       /* it could be an exact duplicate of what is there - this is
+        * surprisingly common (eg. with a ldb re-index).
+        * The cheap header comparisons run first; the stored data is
+        * only compared (via tdb_parse_record) when they all match. */
+       if (rec.key_len == key.dsize && 
+           rec.data_len == dbuf.dsize &&
+           rec.full_hash == hash &&
+           tdb_parse_record(tdb, key, tdb_update_hash_cmp, &dbuf) == 0) {
+               return 0;
+       }
+
+       /* must be long enough key, data and tailer */
+       if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off_t)) {
+               tdb->ecode = TDB_SUCCESS; /* Not really an error */
+               return -1;
+       }
+
+       /* overwrite the data area, which follows the header and key */
+       if (tdb->methods->tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
+                     dbuf.dptr, dbuf.dsize) == -1)
+               return -1;
+
+       if (dbuf.dsize != rec.data_len) {
+               /* update size */
+               rec.data_len = dbuf.dsize;
+               return tdb_rec_write(tdb, rec_ptr, &rec);
+       }
+
+       return 0;
+}
+
+/* Look up 'key' and return its data in a buffer obtained from
+ * tdb_alloc_read().
+ * If the entry doesn't exist tdb_null is returned (tdb_find() sets
+ * the error code to TDB_ERR_NOEXIST). If a key has no data attached
+ * then the TDB_DATA will have zero length but a non-zero pointer.
+ */
+static TDB_DATA _tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
+{
+       struct tdb_record rec;
+       TDB_DATA result;
+       tdb_off_t offset;
+       uint32_t hash = tdb->hash_fn(&key);
+
+       /* locate the record; on success the chain is left read-locked */
+       offset = tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec);
+       if (offset == 0) {
+               return tdb_null;
+       }
+
+       /* the data follows the record header and the key */
+       result.dsize = rec.data_len;
+       result.dptr = tdb_alloc_read(tdb, offset + sizeof(rec) + rec.key_len,
+                                    rec.data_len);
+       tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
+       return result;
+}
+
+/* Public fetch: _tdb_fetch() plus a trace record of the call. */
+_PUBLIC_ TDB_DATA tdb_fetch(struct tdb_context *tdb, TDB_DATA key)
+{
+       TDB_DATA fetched;
+
+       fetched = _tdb_fetch(tdb, key);
+       tdb_trace_1rec_retrec(tdb, "tdb_fetch", key, fetched);
+
+       return fetched;
+}
+
+/*
+ * Look up a record and pass its data to a caller-supplied parsing
+ * function. The parser runs while the chain read lock is held, so it
+ * should be fast and should not block on other syscalls.
+ *
+ * DON'T CALL OTHER TDB CALLS FROM THE PARSER, THIS MIGHT LEAD TO SEGFAULTS.
+ *
+ * For mmapped tdb's without an open transaction the parser is pointed
+ * directly at the mmap area, which avoids the malloc/memcpy. If a
+ * transaction is open or no mmap is available, the data has to go
+ * through malloc/read/parse/free.
+ *
+ * This is of interest to all readers of potentially large data
+ * structures in the tdb records, ldb indexes being one example.
+ *
+ * Returns -1 (with the error code set to TDB_ERR_NOEXIST) if the record
+ * was not found, otherwise the result of tdb_parse_data().
+ */
+
+_PUBLIC_ int tdb_parse_record(struct tdb_context *tdb, TDB_DATA key,
+                    int (*parser)(TDB_DATA key, TDB_DATA data,
+                                  void *private_data),
+                    void *private_data)
+{
+       struct tdb_record rec;
+       tdb_off_t offset;
+       int rc;
+       uint32_t hash = tdb->hash_fn(&key);
+
+       offset = tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec);
+       if (offset == 0) {
+               /* record not found */
+               tdb_trace_1rec_ret(tdb, "tdb_parse_record", key, -1);
+               tdb->ecode = TDB_ERR_NOEXIST;
+               return -1;
+       }
+       tdb_trace_1rec_ret(tdb, "tdb_parse_record", key, 0);
+
+       /* the data sits after the record header and the key */
+       rc = tdb_parse_data(tdb, key, offset + sizeof(rec) + rec.key_len,
+                           rec.data_len, parser, private_data);
+
+       tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
+
+       return rc;
+}
+
+/* Check whether a key is present in the database.
+
+   Returns 1 when the key is found and 0 when it is not.  This differs
+   from the conventions in the rest of this module, but is compatible
+   with gdbm.
+*/
+static int tdb_exists_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
+{
+       struct tdb_record rec;
+       tdb_off_t offset;
+
+       offset = tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec);
+       if (offset != 0) {
+               /* found: drop the chain lock the lookup left held */
+               tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
+               return 1;
+       }
+       return 0;
+}
+
+/* Public existence check: hash the key, look it up and trace the call.
+   Returns 1 if the key exists, 0 if it does not. */
+_PUBLIC_ int tdb_exists(struct tdb_context *tdb, TDB_DATA key)
+{
+       int found;
+       uint32_t hash;
+
+       hash = tdb->hash_fn(&key);
+       found = tdb_exists_hash(tdb, key, hash);
+       tdb_trace_1rec_ret(tdb, "tdb_exists", key, found);
+       return found;
+}
+
+/* actually delete an entry in the database given the offset
+
+   'rec' is the record header belonging to the record at rec_ptr.  If a
+   write traverse is active on this context and the record is not
+   already dead, or the record's write lock cannot be taken, the record
+   is only marked dead.  Otherwise it is unlinked from its hash chain
+   and passed to tdb_free().
+
+   Returns 0 on success; -1 on failure, or when the tdb is read-only or
+   inside a read traverse. */
+int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct tdb_record *rec)
+{
+       tdb_off_t last_ptr, i;
+       struct tdb_record lastrec;
+
+       if (tdb->read_only || tdb->traverse_read) return -1;
+
+       if (((tdb->traverse_write != 0) && (!TDB_DEAD(rec))) ||
+           tdb_write_lock_record(tdb, rec_ptr) == -1) {
+               /* Someone traversing here: mark it as dead */
+               rec->magic = TDB_DEAD_MAGIC;
+               return tdb_rec_write(tdb, rec_ptr, rec);
+       }
+       if (tdb_write_unlock_record(tdb, rec_ptr) != 0)
+               return -1;
+
+       /* find previous record in hash chain; last_ptr stays 0 when
+          rec_ptr is the first record of the chain */
+       if (tdb_ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
+               return -1;
+       for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
+               if (tdb_rec_read(tdb, i, &lastrec) == -1)
+                       return -1;
+
+       /* unlink it: next ptr is at start of record, so writing at
+          last_ptr replaces the previous record's next pointer (or the
+          hash top when there is no previous record). */
+       if (last_ptr == 0)
+               last_ptr = TDB_HASH_TOP(rec->full_hash);
+       if (tdb_ofs_write(tdb, last_ptr, &rec->next) == -1)
+               return -1;
+
+       /* recover the space */
+       if (tdb_free(tdb, rec_ptr, rec) == -1)
+               return -1;
+       return 0;
+}
+
+/* Count the records carrying TDB_DEAD_MAGIC on the hash chain for
+   'hash'.  Any read error makes the function return 0. */
+static int tdb_count_dead(struct tdb_context *tdb, uint32_t hash)
+{
+       struct tdb_record rec;
+       tdb_off_t offset;
+       int dead = 0;
+
+       /* start from the head of the chain */
+       if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &offset) == -1)
+               return 0;
+
+       for (; offset != 0; offset = rec.next) {
+               if (tdb_rec_read(tdb, offset, &rec) == -1)
+                       return 0;
+
+               if (rec.magic == TDB_DEAD_MAGIC)
+                       dead++;
+       }
+       return dead;
+}
+
+/*
+ * Purge all DEAD records from a hash chain
+ *
+ * The walk is done under a write lock on list -1 (the lock _tdb_store()
+ * takes before allocating from the freelist).  Every record whose magic
+ * is TDB_DEAD_MAGIC is handed to tdb_do_delete().
+ *
+ * Returns 0 on success, -1 if the lock cannot be taken or a read or
+ * delete fails; the lock is released on every path that acquired it.
+ */
+static int tdb_purge_dead(struct tdb_context *tdb, uint32_t hash)
+{
+       int res = -1;
+       struct tdb_record rec;
+       tdb_off_t rec_ptr;
+
+       if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
+               return -1;
+       }
+
+       /* read in the hash top */
+       if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
+               goto fail;
+
+       while (rec_ptr) {
+               tdb_off_t next;
+
+               if (tdb_rec_read(tdb, rec_ptr, &rec) == -1) {
+                       goto fail;
+               }
+
+               next = rec.next; /* remembered before the record is possibly deleted */
+
+               if (rec.magic == TDB_DEAD_MAGIC
+                   && tdb_do_delete(tdb, rec_ptr, &rec) == -1) {
+                       goto fail;
+               }
+               rec_ptr = next;
+       }
+       res = 0;
+ fail:
+       tdb_unlock(tdb, -1, F_WRLCK);
+       return res;
+}
+
+/* delete an entry in the database given a key
+
+   When tdb->max_dead_records is non-zero the record is only marked dead
+   (after purging the chain's dead records if it already holds at least
+   max_dead_records of them); otherwise the record is removed with
+   tdb_do_delete().  On success the sequence number is incremented.
+
+   Returns 0 on success, -1 if the chain lock could not be taken, the
+   key was not found, or the write/delete failed. */
+static int tdb_delete_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash)
+{
+       tdb_off_t rec_ptr;
+       struct tdb_record rec;
+       int ret;
+
+       if (tdb->max_dead_records != 0) {
+
+               /*
+                * Allow for some dead records per hash chain, mainly for
+                * tdb's with a very high create/delete rate like locking.tdb.
+                */
+
+               if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
+                       return -1;
+
+               if (tdb_count_dead(tdb, hash) >= tdb->max_dead_records) {
+                       /*
+                        * Don't let the per-chain freelist grow too large,
+                        * delete all existing dead records
+                        * (the return value of the purge is not checked)
+                        */
+                       tdb_purge_dead(tdb, hash);
+               }
+
+               if (!(rec_ptr = tdb_find(tdb, key, hash, &rec))) {
+                       tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
+                       return -1;
+               }
+
+               /*
+                * Just mark the record as dead.
+                * It stays linked in the chain.
+                */
+               rec.magic = TDB_DEAD_MAGIC;
+               ret = tdb_rec_write(tdb, rec_ptr, &rec);
+       }
+       else {
+               if (!(rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK,
+                                                  &rec)))
+                       return -1;
+
+               ret = tdb_do_delete(tdb, rec_ptr, &rec);
+       }
+
+       if (ret == 0) {
+               tdb_increment_seqnum(tdb);
+       }
+
+       /* both branches arrive here with the chain write lock held */
+       if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
+               TDB_LOG((tdb, TDB_DEBUG_WARNING, "tdb_delete: WARNING tdb_unlock failed!\n"));
+       return ret;
+}
+
+/* Public delete: hash the key, delete the record and trace the call.
+   Returns the result of tdb_delete_hash(). */
+_PUBLIC_ int tdb_delete(struct tdb_context *tdb, TDB_DATA key)
+{
+       int rc;
+       uint32_t hash;
+
+       hash = tdb->hash_fn(&key);
+       rc = tdb_delete_hash(tdb, key, hash);
+       tdb_trace_1rec_ret(tdb, "tdb_delete", key, rc);
+       return rc;
+}
+
+/*
+ * Search the hash chain for a dead record whose rec_len is at least
+ * 'length'.  Returns its offset with *r filled in, or 0 when there is
+ * none or a read fails.
+ */
+static tdb_off_t tdb_find_dead(struct tdb_context *tdb, uint32_t hash,
+                              struct tdb_record *r, tdb_len_t length)
+{
+       tdb_off_t offset;
+
+       /* start from the head of the chain */
+       if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &offset) == -1)
+               return 0;
+
+       for (; offset != 0; offset = r->next) {
+               if (tdb_rec_read(tdb, offset, r) == -1)
+                       return 0;
+
+               /*
+                * First fit for simple coding, TODO: change to best
+                * fit
+                */
+               if (TDB_DEAD(r) && r->rec_len >= length)
+                       return offset;
+       }
+       return 0;
+}
+
+static int _tdb_store(struct tdb_context *tdb, TDB_DATA key,
+                      TDB_DATA dbuf, int flag, uint32_t hash)
+{
+       struct tdb_record rec;
+       tdb_off_t rec_ptr;
+       int ret = -1;
+
+       /*
+        * Store key/dbuf on the chain for 'hash'.  Returns 0 on success
+        * and -1 on failure; the sequence number is incremented on
+        * success.  Both callers visible in this file (tdb_store() and
+        * tdb_append()) take the chain write lock for BUCKET(hash)
+        * around this call.
+        *
+        * Order of attempts: in-place update (unless TDB_INSERT), reuse
+        * of a dead record on the chain (when max_dead_records is set),
+        * and finally a new record allocated from the freelist.
+        *
+        * check for it existing, on insert.
+        */
+       if (flag == TDB_INSERT) {
+               if (tdb_exists_hash(tdb, key, hash)) {
+                       tdb->ecode = TDB_ERR_EXISTS;
+                       goto fail;
+               }
+       } else {
+               /* first try in-place update, on modify or replace. */
+               if (tdb_update_hash(tdb, key, hash, dbuf) == 0) {
+                       goto done;
+               }
+               if (tdb->ecode == TDB_ERR_NOEXIST &&
+                   flag == TDB_MODIFY) {
+                       /* if the record doesn't exist and we are in TDB_MODIFY mode then
+                        we should fail the store */
+                       goto fail;
+               }
+       }
+       /* reset the error code potentially set by the tdb_update_hash() */
+       tdb->ecode = TDB_SUCCESS;
+
+       /* delete any existing record - if it doesn't exist we don't
+           care.  Doing this first reduces fragmentation, and avoids
+           coalescing with `allocated' block before it's updated. */
+       if (flag != TDB_INSERT)
+               tdb_delete_hash(tdb, key, hash);
+
+       if (tdb->max_dead_records != 0) {
+               /*
+                * Allow for some dead records per hash chain, look if we can
+                * find one that can hold the new record. We need enough space
+                * for key, data and tailer. If we find one, we don't have to
+                * consult the central freelist.
+                */
+               rec_ptr = tdb_find_dead(
+                       tdb, hash, &rec,
+                       key.dsize + dbuf.dsize + sizeof(tdb_off_t));
+
+               if (rec_ptr != 0) {
+                       rec.key_len = key.dsize;
+                       rec.data_len = dbuf.dsize;
+                       rec.full_hash = hash;
+                       rec.magic = TDB_MAGIC; /* the dead record becomes live again, in place */
+                       if (tdb_rec_write(tdb, rec_ptr, &rec) == -1
+                           || tdb->methods->tdb_write(
+                                   tdb, rec_ptr + sizeof(rec),
+                                   key.dptr, key.dsize) == -1
+                           || tdb->methods->tdb_write(
+                                   tdb, rec_ptr + sizeof(rec) + key.dsize,
+                                   dbuf.dptr, dbuf.dsize) == -1) {
+                               goto fail;
+                       }
+                       goto done;
+               }
+       }
+
+       /*
+        * We have to allocate some space from the freelist, so this means we
+        * have to lock it. Use the chance to purge all the DEAD records from
+        * the hash chain under the freelist lock.
+        */
+
+       if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
+               goto fail;
+       }
+
+       if ((tdb->max_dead_records != 0)
+           && (tdb_purge_dead(tdb, hash) == -1)) {
+               tdb_unlock(tdb, -1, F_WRLCK);
+               goto fail;
+       }
+
+       /* we have to allocate some space */
+       rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec);
+
+       tdb_unlock(tdb, -1, F_WRLCK);
+
+       if (rec_ptr == 0) {
+               goto fail;
+       }
+
+       /* Read hash top into next ptr, so that the new record can be
+          linked in at the head of the chain */
+       if (tdb_ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
+               goto fail;
+
+       rec.key_len = key.dsize;
+       rec.data_len = dbuf.dsize;
+       rec.full_hash = hash;
+       rec.magic = TDB_MAGIC;
+
+       /* write out and point the top of the hash chain at it */
+       if (tdb_rec_write(tdb, rec_ptr, &rec) == -1
+           || tdb->methods->tdb_write(tdb, rec_ptr+sizeof(rec),
+                                      key.dptr, key.dsize) == -1
+           || tdb->methods->tdb_write(tdb, rec_ptr+sizeof(rec)+key.dsize,
+                                      dbuf.dptr, dbuf.dsize) == -1
+           || tdb_ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
+               /* Need to tdb_unallocate() here
+                  (as written, the allocated record is not handed back
+                  on this path) */
+               goto fail;
+       }
+
+ done:
+       ret = 0;
+ fail:
+       if (ret == 0) { /* only reached with ret == 0 through 'done' */
+               tdb_increment_seqnum(tdb);
+       }
+       return ret;
+}
+
+/* Store an element in the database; what happens to an existing element
+   with the same key depends on 'flag' (see _tdb_store()).
+
+   Returns 0 on success, -1 on failure.
+*/
+_PUBLIC_ int tdb_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
+{
+       uint32_t hash;
+       int rc;
+
+       /* refuse to modify a read-only tdb or one in a read traverse */
+       if (tdb->read_only || tdb->traverse_read) {
+               tdb->ecode = TDB_ERR_RDONLY;
+               tdb_trace_2rec_flag_ret(tdb, "tdb_store", key, dbuf, flag, -1);
+               return -1;
+       }
+
+       /* take the write lock on the key's hash chain */
+       hash = tdb->hash_fn(&key);
+       if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1) {
+               return -1;
+       }
+
+       rc = _tdb_store(tdb, key, dbuf, flag, hash);
+       tdb_trace_2rec_flag_ret(tdb, "tdb_store", key, dbuf, flag, rc);
+       tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
+       return rc;
+}
+
+/* Append to an entry. Create if not exist.
+
+   The existing data (if any) is fetched under the chain write lock, the
+   new bytes are appended in memory and the combined buffer is written
+   back with _tdb_store().
+
+   Returns 0 on success, -1 on failure.  tdb->ecode is set to
+   TDB_ERR_OOM when the combined buffer cannot be allocated or its
+   length would overflow. */
+_PUBLIC_ int tdb_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf)
+{
+       uint32_t hash;
+       TDB_DATA dbuf;
+       size_t new_len;
+       unsigned char *new_dptr;
+       int ret = -1;
+
+       /* find which hash bucket it is in */
+       hash = tdb->hash_fn(&key);
+       if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
+               return -1;
+
+       dbuf = _tdb_fetch(tdb, key);
+
+       /* _tdb_fetch() sets dsize from the record header even when the
+          data buffer could not be obtained.  A NULL pointer with a
+          non-zero size therefore means the record exists but could not
+          be read: fail rather than copy at offset dsize into a buffer
+          that is too small. */
+       if (dbuf.dptr == NULL && dbuf.dsize != 0) {
+               goto failed;
+       }
+
+       /* refuse lengths that wrap around */
+       new_len = dbuf.dsize + new_dbuf.dsize;
+       if (new_len < dbuf.dsize) {
+               tdb->ecode = TDB_ERR_OOM;
+               goto failed;
+       }
+
+       /* realloc(NULL, n) behaves like malloc(n), so one call covers
+          both the create and the append case.  A size of 0 is special
+          for both: don't do that. */
+       new_dptr = (unsigned char *)realloc(dbuf.dptr,
+                                           new_len != 0 ? new_len : 1);
+       if (new_dptr == NULL) {
+               /* the old buffer is still valid; it is freed below */
+               tdb->ecode = TDB_ERR_OOM;
+               goto failed;
+       }
+       dbuf.dptr = new_dptr;
+
+       if (new_dbuf.dsize != 0) {
+               memcpy(dbuf.dptr + dbuf.dsize, new_dbuf.dptr, new_dbuf.dsize);
+       }
+       dbuf.dsize = new_len;
+
+       ret = _tdb_store(tdb, key, dbuf, 0, hash);
+       tdb_trace_2rec_retrec(tdb, "tdb_append", key, new_dbuf, dbuf);
+
+failed:
+       tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
+       SAFE_FREE(dbuf.dptr);
+       return ret;
+}
+
+
+/*
+  return the name of the current tdb file
+  useful for external logging functions
+
+  The pointer returned is tdb->name itself, not a copy.
+*/
+_PUBLIC_ const char *tdb_name(struct tdb_context *tdb)
+{
+       return tdb->name;
+}
+
+/*
+  return the underlying file descriptor being used by tdb, or -1
+  useful for external routines that want to check the device/inode
+  of the fd
+
+  The value of tdb->fd is returned unchanged.
+*/
+_PUBLIC_ int tdb_fd(struct tdb_context *tdb)
+{
+       return tdb->fd;
+}
+
+/*
+  return the current logging function
+  useful for external tdb routines that wish to log tdb errors
+
+  The value returned is tdb->log.log_fn as stored in the context.
+*/
+_PUBLIC_ tdb_log_func tdb_log_fn(struct tdb_context *tdb)
+{
+       return tdb->log.log_fn;
+}
+
+
+/*
+  get the tdb sequence number. Only makes sense if the writers opened
+  with TDB_SEQNUM set. Note that this sequence number will wrap quite
+  quickly, so it should only be used for a 'has something changed'
+  test, not for code that relies on the count of the number of changes
+  made. If you want a counter then use a tdb record.
+
+  The aim of this sequence number is to allow for a very lightweight
+  test of a possible tdb change.
+
+  The result of the read is not checked; seqnum starts out as 0, so
+  presumably 0 is returned when the read fails (TODO confirm that
+  tdb_ofs_read leaves its output untouched on failure).  The tdb_off_t
+  value is converted to int on return.
+*/
+_PUBLIC_ int tdb_get_seqnum(struct tdb_context *tdb)
+{
+       tdb_off_t seqnum=0;
+
+       tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &seqnum);
+       return seqnum;
+}
+
+_PUBLIC_ int tdb_hash_size(struct tdb_context *tdb)
+{
+       return tdb->header.hash_size; /* hash_size field of the tdb header, returned as int */
+}
+
+_PUBLIC_ size_t tdb_map_size(struct tdb_context *tdb)
+{
+       return tdb->map_size; /* map_size as currently recorded in the context */
+}
+
+_PUBLIC_ int tdb_get_flags(struct tdb_context *tdb)
+{
+       return tdb->flags; /* the flag bits as currently set on the context */
+}
+
+/*
+  Set additional flag bits on an open tdb.
+
+  TDB_ALLOW_NESTING and TDB_DISALLOW_NESTING are mutually exclusive:
+  asking for both at once is rejected with TDB_ERR_NESTING and leaves
+  the flags unchanged; asking for one of them clears the other.
+*/
+_PUBLIC_ void tdb_add_flags(struct tdb_context *tdb, unsigned flags)
+{
+       if ((flags & TDB_ALLOW_NESTING) &&
+           (flags & TDB_DISALLOW_NESTING)) {
+               tdb->ecode = TDB_ERR_NESTING;
+               /* newline-terminated like the other log messages */
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_add_flags: "
+                       "allow_nesting and disallow_nesting are not allowed together!\n"));
+               return;
+       }
+
+       if (flags & TDB_ALLOW_NESTING) {
+               tdb->flags &= ~TDB_DISALLOW_NESTING;
+       }
+       if (flags & TDB_DISALLOW_NESTING) {
+               tdb->flags &= ~TDB_ALLOW_NESTING;
+       }
+
+       tdb->flags |= flags;
+}
+
+/*
+  Clear flag bits on an open tdb.
+
+  TDB_ALLOW_NESTING and TDB_DISALLOW_NESTING are mutually exclusive:
+  asking to remove both at once is rejected with TDB_ERR_NESTING and
+  leaves the flags unchanged; removing one of them sets the other.
+*/
+_PUBLIC_ void tdb_remove_flags(struct tdb_context *tdb, unsigned flags)
+{
+       if ((flags & TDB_ALLOW_NESTING) &&
+           (flags & TDB_DISALLOW_NESTING)) {
+               tdb->ecode = TDB_ERR_NESTING;
+               /* newline-terminated like the other log messages */
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_remove_flags: "
+                       "allow_nesting and disallow_nesting are not allowed together!\n"));
+               return;
+       }
+
+       if (flags & TDB_ALLOW_NESTING) {
+               tdb->flags |= TDB_DISALLOW_NESTING;
+       }
+       if (flags & TDB_DISALLOW_NESTING) {
+               tdb->flags |= TDB_ALLOW_NESTING;
+       }
+
+       tdb->flags &= ~flags;
+}
+
+
+/*
+  enable sequence number handling on an open tdb
+
+  Only sets TDB_SEQNUM in tdb->flags; nothing is written to the file
+  by this function.
+*/
+_PUBLIC_ void tdb_enable_seqnum(struct tdb_context *tdb)
+{
+       tdb->flags |= TDB_SEQNUM;
+}
+
+
+/*
+  add a region of the file to the freelist. Length is the size of the region in bytes,
+  which includes the free list header that needs to be added
+
+  Regions no larger than a record header (which includes zero and
+  negative lengths) are silently skipped.  Returns 0 on success or skip,
+  -1 if the region reaches beyond the end of the file or tdb_free()
+  fails.
+ */
+static int tdb_free_region(struct tdb_context *tdb, tdb_off_t offset, ssize_t length)
+{
+       struct tdb_record rec;
+       /* compare as signed: sizeof() is unsigned, and without the cast
+          a negative length would be converted to a huge positive value
+          and slip past this check */
+       if (length <= (ssize_t)sizeof(rec)) {
+               /* the region is not worth adding */
+               return 0;
+       }
+       if (length + offset > tdb->map_size) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_free_region: adding region beyond end of file\n"));
+               return -1;
+       }
+       memset(&rec,'\0',sizeof(rec));
+       rec.rec_len = length - sizeof(rec);
+       if (tdb_free(tdb, offset, &rec) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_free_region: failed to add free record\n"));
+               return -1;
+       }
+       return 0;
+}
+
+/*
+  wipe the entire database, deleting all records. This can be done
+  very fast by using a allrecord lock. The entire data portion of the
+  file becomes a single entry in the freelist.
+
+  This code carefully steps around the recovery area, leaving it alone
+
+  Returns 0 on success and -1 on failure.  The lock taken by
+  tdb_lockall() is released on every path out of the function.
+ */
+_PUBLIC_ int tdb_wipe_all(struct tdb_context *tdb)
+{
+       int i;
+       tdb_off_t offset = 0;
+       ssize_t data_len;
+       tdb_off_t recovery_head;
+       tdb_len_t recovery_size = 0;
+
+       if (tdb_lockall(tdb) != 0) {
+               return -1;
+       }
+
+       tdb_trace(tdb, "tdb_wipe_all");
+
+       /* see if the tdb has a recovery area, and remember its size
+          if so. We don't want to lose this as otherwise each
+          tdb_wipe_all() in a transaction will increase the size of
+          the tdb by the size of the recovery area */
+       if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_wipe_all: failed to read recovery head\n"));
+               goto failed;
+       }
+
+       if (recovery_head != 0) {
+               struct tdb_record rec;
+               if (tdb->methods->tdb_read(tdb, recovery_head, &rec, sizeof(rec), DOCONV()) == -1) {
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_wipe_all: failed to read recovery record\n"));
+                       /* must go through 'failed': the lock taken
+                          above is still held at this point */
+                       goto failed;
+               }
+               recovery_size = rec.rec_len + sizeof(rec);
+       }
+
+       /* wipe the hashes */
+       for (i=0;i<tdb->header.hash_size;i++) {
+               if (tdb_ofs_write(tdb, TDB_HASH_TOP(i), &offset) == -1) {
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write hash %d\n", i));
+                       goto failed;
+               }
+       }
+
+       /* wipe the freelist */
+       if (tdb_ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to write freelist\n"));
+               goto failed;
+       }
+
+       /* add all the rest of the file to the freelist, possibly leaving a gap
+          for the recovery area */
+       if (recovery_size == 0) {
+               /* the simple case - the whole file can be used as a freelist */
+               data_len = (tdb->map_size - TDB_DATA_START(tdb->header.hash_size));
+               if (tdb_free_region(tdb, TDB_DATA_START(tdb->header.hash_size), data_len) != 0) {
+                       goto failed;
+               }
+       } else {
+               /* we need to add two freelist entries - one on either
+                  side of the recovery area
+
+                  Note that we cannot shift the recovery area during
+                  this operation. Only the transaction.c code may
+                  move the recovery area or we risk subtle data
+                  corruption
+               */
+               data_len = (recovery_head - TDB_DATA_START(tdb->header.hash_size));
+               if (tdb_free_region(tdb, TDB_DATA_START(tdb->header.hash_size), data_len) != 0) {
+                       goto failed;
+               }
+               /* and the 2nd free list entry after the recovery area - if any */
+               data_len = tdb->map_size - (recovery_head+recovery_size);
+               if (tdb_free_region(tdb, recovery_head+recovery_size, data_len) != 0) {
+                       goto failed;
+               }
+       }
+
+       tdb_increment_seqnum_nonblock(tdb);
+
+       if (tdb_unlockall(tdb) != 0) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL,"tdb_wipe_all: failed to unlock\n"));
+               goto failed;
+       }
+
+       return 0;
+
+failed:
+       tdb_unlockall(tdb);
+       return -1;
+}
+
+struct traverse_state {
+       bool error; /* set to true by repack_traverse() when a store fails */
+       struct tdb_context *dest_db; /* the tdb that repack_traverse() stores into */
+};
+
+/*
+  Traverse callback used by tdb_repack(): copy one record into the
+  destination tdb with TDB_INSERT.  When the store fails the error flag
+  in the traverse state is raised and -1 is returned.
+ */
+static int repack_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private_data)
+{
+       struct traverse_state *state = (struct traverse_state *)private_data;
+       int rc = tdb_store(state->dest_db, key, data, TDB_INSERT);
+
+       if (rc == 0) {
+               return 0;
+       }
+       state->error = true;
+       return -1;
+}
+
+/*
+  Repack a tdb: inside a transaction, copy every record into a temporary
+  TDB_INTERNAL database, wipe the original, copy everything back and
+  commit.  Returns 0 on success and -1 on failure; every failure between
+  the transaction start and the commit cancels the transaction.
+ */
+_PUBLIC_ int tdb_repack(struct tdb_context *tdb)
+{
+       struct traverse_state state;
+       struct tdb_context *tmp_db;
+
+       tdb_trace(tdb, "tdb_repack");
+
+       if (tdb_transaction_start(tdb) != 0) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to start transaction\n"));
+               return -1;
+       }
+
+       tmp_db = tdb_open("tmpdb", tdb_hash_size(tdb), TDB_INTERNAL, O_RDWR|O_CREAT, 0);
+       if (tmp_db == NULL) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to create tmp_db\n"));
+               tdb_transaction_cancel(tdb);
+               return -1;
+       }
+
+       /* first pass: tdb -> tmp_db */
+       state.dest_db = tmp_db;
+       state.error = false;
+
+       if (tdb_traverse_read(tdb, repack_traverse, &state) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to traverse copying out\n"));
+               goto abort_repack;
+       }
+       if (state.error) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Error during traversal\n"));
+               goto abort_repack;
+       }
+
+       if (tdb_wipe_all(tdb) != 0) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to wipe database\n"));
+               goto abort_repack;
+       }
+
+       /* second pass: tmp_db -> tdb */
+       state.dest_db = tdb;
+       state.error = false;
+
+       if (tdb_traverse_read(tmp_db, repack_traverse, &state) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to traverse copying back\n"));
+               goto abort_repack;
+       }
+       if (state.error) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Error during second traversal\n"));
+               goto abort_repack;
+       }
+
+       tdb_close(tmp_db);
+
+       if (tdb_transaction_commit(tdb) != 0) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, __location__ " Failed to commit\n"));
+               return -1;
+       }
+
+       return 0;
+
+abort_repack:
+       /* common failure exit once tmp_db exists: cancel, then close */
+       tdb_transaction_cancel(tdb);
+       tdb_close(tmp_db);
+       return -1;
+}
+
+/* Keep calling write() until all count bytes of buf have been written
+ * to fd: even on regular files a signal can cut a single write() short.
+ * Returns true once everything is written, false as soon as write()
+ * reports an error. */
+bool tdb_write_all(int fd, const void *buf, size_t count)
+{
+       const char *cursor = (const char *)buf;
+       size_t remaining;
+
+       for (remaining = count; remaining != 0; ) {
+               ssize_t written = write(fd, cursor, remaining);
+
+               if (written < 0) {
+                       return false;
+               }
+               cursor += written;
+               remaining -= written;
+       }
+       return true;
+}
+
+#ifdef TDB_TRACE
+/* Append str to the trace file.  If the write fails, the trace
+ * descriptor is closed and tdb->tracefd is set to -1. */
+static void tdb_trace_write(struct tdb_context *tdb, const char *str)
+{
+       size_t nbytes = strlen(str);
+
+       if (tdb_write_all(tdb->tracefd, str, nbytes)) {
+               return;
+       }
+
+       close(tdb->tracefd);
+       tdb->tracefd = -1;
+}
+
+/* Begin a trace line: read the value stored at TDB_SEQNUM_OFS and write
+ * it followed by a space.  The result of tdb_ofs_read() is not checked;
+ * the local starts out as 0. */
+static void tdb_trace_start(struct tdb_context *tdb)
+{
+       char prefix[sizeof(tdb_off_t) * 4 + 1];
+       tdb_off_t current = 0;
+
+       tdb_ofs_read(tdb, TDB_SEQNUM_OFS, &current);
+
+       snprintf(prefix, sizeof(prefix), "%u ", current);
+       tdb_trace_write(tdb, prefix);
+}
+
+/* Finish the current trace line by writing the terminating newline. */
+static void tdb_trace_end(struct tdb_context *tdb)
+{
+       tdb_trace_write(tdb, "\n");
+}
+
+/* Finish the current trace line with " = <ret>" and a newline. */
+static void tdb_trace_end_ret(struct tdb_context *tdb, int ret)
+{
+       char suffix[sizeof(ret) * 4 + 4];
+
+       snprintf(suffix, sizeof(suffix), " = %i\n", ret);
+
+       tdb_trace_write(tdb, suffix);
+}
+
+/* Append one record to the current trace line as " <dsize>:<hex bytes>",
+ * or as " NULL" when rec.dptr is NULL, so that zero-length records can
+ * be told apart from non-existent ones.
+ *
+ * The hex digits are produced directly rather than with
+ * snprintf(p, 2, "%02x", ...): a size of 2 only leaves room for one
+ * digit plus the NUL, which terminated the string after the first digit
+ * and lost the rest of the dump.  Output goes through a fixed-size
+ * buffer that is flushed whenever it fills, so stack use no longer grows
+ * with rec.dsize. */
+static void tdb_trace_record(struct tdb_context *tdb, TDB_DATA rec)
+{
+       static const char hexdigits[] = "0123456789abcdef";
+       char chunk[256];
+       size_t used, i;
+
+       /* We differentiate zero-length records from non-existent ones. */
+       if (rec.dptr == NULL) {
+               tdb_trace_write(tdb, " NULL");
+               return;
+       }
+
+       /* the length prefix always fits into the buffer */
+       used = (size_t)snprintf(chunk, sizeof(chunk), " %zu:", rec.dsize);
+
+       for (i = 0; i < rec.dsize; i++) {
+               unsigned char byte = (unsigned char)rec.dptr[i];
+
+               /* need room for two digits plus the terminating NUL */
+               if (used + 3 > sizeof(chunk)) {
+                       chunk[used] = '\0';
+                       tdb_trace_write(tdb, chunk);
+                       used = 0;
+               }
+               chunk[used++] = hexdigits[byte >> 4];
+               chunk[used++] = hexdigits[byte & 0x0f];
+       }
+
+       chunk[used] = '\0';
+       tdb_trace_write(tdb, chunk);
+}
+
+/* Write one complete trace line: the sequence-number prefix from
+ * tdb_trace_start(), then op, then a newline. */
+void tdb_trace(struct tdb_context *tdb, const char *op)
+{
+       tdb_trace_start(tdb);
+       tdb_trace_write(tdb, op);
+       tdb_trace_end(tdb);
+}
+
+/* Write a trace line for op that is prefixed with the caller-supplied
+ * seqnum instead of the value tdb_trace_start() would read. */
+void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op)
+{
+       char prefix[sizeof(tdb_off_t) * 4 + 1];
+
+       snprintf(prefix, sizeof(prefix), "%u ", seqnum);
+
+       tdb_trace_write(tdb, prefix);
+       tdb_trace_write(tdb, op);
+       tdb_trace_end(tdb);
+}
+
+/* Write a trace line of the form
+ * "<seqnum> <op> <hash_size> 0x<tdb_flags> 0x<open_flags>". */
+void tdb_trace_open(struct tdb_context *tdb, const char *op,
+                   unsigned hash_size, unsigned tdb_flags, unsigned open_flags)
+{
+       char details[128];
+
+       tdb_trace_start(tdb);
+
+       snprintf(details, sizeof(details),
+                "%s %u 0x%x 0x%x", op, hash_size, tdb_flags, open_flags);
+       tdb_trace_write(tdb, details);
+
+       tdb_trace_end(tdb);
+}
+
+/* Write a trace line for op followed by its integer result,
+ * i.e. "<seqnum> <op> = <ret>". */
+void tdb_trace_ret(struct tdb_context *tdb, const char *op, int ret)
+{
+       tdb_trace_start(tdb);
+       tdb_trace_write(tdb, op);
+       tdb_trace_end_ret(tdb, ret);
+}
+
+/* Write a trace line for op followed by " =" and the returned record
+ * as formatted by tdb_trace_record(). */
+void tdb_trace_retrec(struct tdb_context *tdb, const char *op, TDB_DATA ret)
+{
+       tdb_trace_start(tdb);
+       tdb_trace_write(tdb, op);
+       tdb_trace_write(tdb, " =");
+       tdb_trace_record(tdb, ret);
+       tdb_trace_end(tdb);
+}
+
+/* Write a trace line for op with one record argument and no result. */
+void tdb_trace_1rec(struct tdb_context *tdb, const char *op,
+                   TDB_DATA rec)
+{
+       tdb_trace_start(tdb);
+       tdb_trace_write(tdb, op);
+       tdb_trace_record(tdb, rec);
+       tdb_trace_end(tdb);
+}
+
+/* Write a trace line for op with one record argument and an integer
+ * result. */
+void tdb_trace_1rec_ret(struct tdb_context *tdb, const char *op,
+                       TDB_DATA rec, int ret)
+{
+       tdb_trace_start(tdb);
+       tdb_trace_write(tdb, op);
+       tdb_trace_record(tdb, rec);
+       tdb_trace_end_ret(tdb, ret);
+}
+
+/* Write a trace line for op with one record argument, then " =" and the
+ * returned record. */
+void tdb_trace_1rec_retrec(struct tdb_context *tdb, const char *op,
+                          TDB_DATA rec, TDB_DATA ret)
+{
+       tdb_trace_start(tdb);
+       tdb_trace_write(tdb, op);
+       tdb_trace_record(tdb, rec);
+       tdb_trace_write(tdb, " =");
+       tdb_trace_record(tdb, ret);
+       tdb_trace_end(tdb);
+}
+
+/* Write a trace line for op with two record arguments, a flag printed
+ * in "%#x" form, and an integer result. */
+void tdb_trace_2rec_flag_ret(struct tdb_context *tdb, const char *op,
+                            TDB_DATA rec1, TDB_DATA rec2, unsigned flag,
+                            int ret)
+{
+       char flagtext[1 + sizeof(ret) * 4];
+
+       tdb_trace_start(tdb);
+       tdb_trace_write(tdb, op);
+       tdb_trace_record(tdb, rec1);
+       tdb_trace_record(tdb, rec2);
+
+       snprintf(flagtext, sizeof(flagtext), " %#x", flag);
+       tdb_trace_write(tdb, flagtext);
+
+       tdb_trace_end_ret(tdb, ret);
+}
+
+/* Write a trace line for op with two record arguments, then " =" and
+ * the returned record. */
+void tdb_trace_2rec_retrec(struct tdb_context *tdb, const char *op,
+                          TDB_DATA rec1, TDB_DATA rec2, TDB_DATA ret)
+{
+       tdb_trace_start(tdb);
+       tdb_trace_write(tdb, op);
+       tdb_trace_record(tdb, rec1);
+       tdb_trace_record(tdb, rec2);
+       tdb_trace_write(tdb, " =");
+       tdb_trace_record(tdb, ret);
+       tdb_trace_end(tdb);
+}
+#endif
diff --git a/ctdb/lib/tdb/common/tdb_private.h b/ctdb/lib/tdb/common/tdb_private.h
new file mode 100644 (file)
index 0000000..0441fb2
--- /dev/null
@@ -0,0 +1,285 @@
+#ifndef TDB_PRIVATE_H
+#define TDB_PRIVATE_H
+ /* 
+   Unix SMB/CIFS implementation.
+
+   trivial database library - private includes
+
+   Copyright (C) Andrew Tridgell              2005
+
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/time.h"
+#include "system/shmem.h"
+#include "system/select.h"
+#include "system/wait.h"
+#include "tdb.h"
+
+/* #define TDB_TRACE 1 */
+#ifndef HAVE_GETPAGESIZE
+#define getpagesize() 0x2000
+#endif
+
+/* Lengths and file offsets are 32-bit quantities. */
+typedef uint32_t tdb_len_t;
+typedef uint32_t tdb_off_t;
+
+/* Fallback for platforms whose headers do not provide offsetof().  The
+ * result is cast to size_t, like the standard macro; casting the address
+ * to unsigned int would truncate it where pointers are wider than int. */
+#ifndef offsetof
+#define offsetof(t,f) ((size_t)&((t *)0)->f)
+#endif
+
+/* Magic values and layout constants. */
+#define TDB_MAGIC_FOOD "TDB file\n"
+#define TDB_VERSION (0x26011967 + 6)
+#define TDB_MAGIC (0x26011999U)
+#define TDB_FREE_MAGIC (~TDB_MAGIC)
+#define TDB_DEAD_MAGIC (0xFEE1DEAD)
+#define TDB_RECOVERY_MAGIC (0xf53bc0e7U)
+#define TDB_RECOVERY_INVALID_MAGIC (0x0)
+#define TDB_HASH_RWLOCK_MAGIC (0xbad1a51U)
+#define TDB_ALIGNMENT 4
+#define DEFAULT_HASH_SIZE 131
+/* the freelist head sits directly after the header */
+#define FREELIST_TOP (sizeof(struct tdb_header))
+/* round x up to a multiple of a (a must be a power of two) */
+#define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
+/* byte-swap a 32-bit value */
+#define TDB_BYTEREV(x) ((((x)&0xff)<<24)|(((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
+#define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
+#define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
+/* Offset of the chain head for hash; the +1 skips the freelist head.
+ * Relies on BUCKET(), which needs a local variable called "tdb". */
+#define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off_t))
+/* size in bytes of the freelist head plus all hash chain heads */
+#define TDB_HASHTABLE_SIZE(tdb) (((tdb)->header.hash_size+1)*sizeof(tdb_off_t))
+#define TDB_DATA_START(hash_size) (TDB_HASH_TOP((hash_size)-1) + sizeof(tdb_off_t))
+#define TDB_RECOVERY_HEAD offsetof(struct tdb_header, recovery_start)
+#define TDB_SEQNUM_OFS    offsetof(struct tdb_header, sequence_number)
+#define TDB_PAD_BYTE 0x42
+#define TDB_PAD_U32  0x42424242
+
+/* NB assumes there is a local variable called "tdb" that is the
+ * current context, also takes doubly-parenthesized print-style
+ * argument. */
+#define TDB_LOG(x) tdb->log.log_fn x
+
+/* Tracing hooks.  With TDB_TRACE defined these are real functions that
+ * write to tdb->tracefd; otherwise each call expands to nothing, so
+ * callers never need their own #ifdef. */
+#ifdef TDB_TRACE
+void tdb_trace(struct tdb_context *tdb, const char *op);
+void tdb_trace_seqnum(struct tdb_context *tdb, uint32_t seqnum, const char *op);
+void tdb_trace_open(struct tdb_context *tdb, const char *op,
+                   unsigned hash_size, unsigned tdb_flags, unsigned open_flags);
+void tdb_trace_ret(struct tdb_context *tdb, const char *op, int ret);
+void tdb_trace_retrec(struct tdb_context *tdb, const char *op, TDB_DATA ret);
+void tdb_trace_1rec(struct tdb_context *tdb, const char *op,
+                   TDB_DATA rec);
+void tdb_trace_1rec_ret(struct tdb_context *tdb, const char *op,
+                       TDB_DATA rec, int ret);
+void tdb_trace_1rec_retrec(struct tdb_context *tdb, const char *op,
+                          TDB_DATA rec, TDB_DATA ret);
+void tdb_trace_2rec_flag_ret(struct tdb_context *tdb, const char *op,
+                            TDB_DATA rec1, TDB_DATA rec2, unsigned flag,
+                            int ret);
+void tdb_trace_2rec_retrec(struct tdb_context *tdb, const char *op,
+                          TDB_DATA rec1, TDB_DATA rec2, TDB_DATA ret);
+#else
+#define tdb_trace(tdb, op)
+#define tdb_trace_seqnum(tdb, seqnum, op)
+#define tdb_trace_open(tdb, op, hash_size, tdb_flags, open_flags)
+#define tdb_trace_ret(tdb, op, ret)
+#define tdb_trace_retrec(tdb, op, ret)
+#define tdb_trace_1rec(tdb, op, rec)
+#define tdb_trace_1rec_ret(tdb, op, rec, ret)
+#define tdb_trace_1rec_retrec(tdb, op, rec, ret)
+#define tdb_trace_2rec_flag_ret(tdb, op, rec1, rec2, flag, ret)
+#define tdb_trace_2rec_retrec(tdb, op, rec1, rec2, ret)
+#endif /* !TDB_TRACE */
+
+/* lock offsets */
+#define OPEN_LOCK        0
+#define ACTIVE_LOCK      4
+#define TRANSACTION_LOCK 8
+
+/* free memory if the pointer is valid and zero the pointer */
+#ifndef SAFE_FREE
+#define SAFE_FREE(x) do { if ((x) != NULL) {free(x); (x)=NULL;} } while(0)
+#endif
+
+/* The macros below all assume a local variable called "tdb". */
+
+/* map a full hash value onto a hash chain index */
+#define BUCKET(hash) ((hash) % tdb->header.hash_size)
+
+/* Run x through tdb_convert() when the TDB_CONVERT flag is set on the
+ * database; yields a pointer to x either way.  The argument is
+ * parenthesised so that expressions such as CONVERT(*p) take the address
+ * of the whole expression. */
+#define DOCONV() (tdb->flags & TDB_CONVERT)
+#define CONVERT(x) (DOCONV() ? tdb_convert(&(x), sizeof(x)) : &(x))
+
+
+/* the body of the database is made of one tdb_record for the free space
+   plus a separate data list for each hash value.
+
+   This struct is only the fixed-size record header; the variable-length
+   part described in the comment at the end follows it.  The magic field
+   is what TDB_DEAD() and TDB_BAD_MAGIC() in this header test. */
+struct tdb_record {
+       tdb_off_t next; /* offset of the next record in the list */
+       tdb_len_t rec_len; /* total byte length of record */
+       tdb_len_t key_len; /* byte length of key */
+       tdb_len_t data_len; /* byte length of data */
+       uint32_t full_hash; /* the full 32 bit hash of the key */
+       uint32_t magic;   /* try to catch errors */
+       /* the following union is implied:
+               union {
+                       char record[rec_len];
+                       struct {
+                               char key[key_len];
+                               char data[data_len];
+                       }
+                       uint32_t totalsize; (tailer)
+               }
+       */
+};
+
+
+/* this is stored at the front of every database.
+
+   FREELIST_TOP is defined as sizeof(struct tdb_header), and
+   TDB_HASH_TOP() places the hash chain heads after that offset, so the
+   size of this struct determines where the hash table starts.
+   TDB_RECOVERY_HEAD and TDB_SEQNUM_OFS are the offsets of the
+   recovery_start and sequence_number fields. */
+struct tdb_header {
+       char magic_food[32]; /* for /etc/magic */
+       uint32_t version; /* version of the code */
+       uint32_t hash_size; /* number of hash entries */
+       tdb_off_t rwlocks; /* obsolete - kept to detect old formats */
+       tdb_off_t recovery_start; /* offset of transaction recovery region */
+       tdb_off_t sequence_number; /* used when TDB_SEQNUM is set */
+       uint32_t magic1_hash; /* hash of TDB_MAGIC_FOOD. */
+       uint32_t magic2_hash; /* hash of TDB_MAGIC. */
+       tdb_off_t reserved[27];
+};
+
+/* Bookkeeping for one lock; used for tdb_context.allrecord_lock and for
+ * the entries of the tdb_context.lockrecs array. */
+struct tdb_lock_type {
+       /* NOTE(review): presumably the offset the lock applies to (compare
+        * the offset argument of tdb_nest_lock()) - confirm in lock.c */
+       uint32_t off;
+       /* entries kept in tdb_context.lockrecs all have count > 0 */
+       uint32_t count;
+       /* NOTE(review): presumably the ltype value handed to the locking
+        * functions - confirm in lock.c */
+       uint32_t ltype;
+};
+
+/* Node of the singly linked list embedded in tdb_context.travlocks
+ * ("current traversal locks"). */
+struct tdb_traverse_lock {
+       struct tdb_traverse_lock *next;
+       /* NOTE(review): off/hash presumably identify the record and hash
+        * chain the traversal is currently at - confirm in traverse.c */
+       uint32_t off;
+       uint32_t hash;
+       /* NOTE(review): presumably the lock type used by this traversal */
+       int lock_rw;
+};
+
+/* Flags taken by the locking functions declared below (tdb_nest_lock(),
+ * tdb_brlock(), tdb_transaction_lock(), tdb_allrecord_lock()).  The
+ * non-zero values are distinct bits. */
+enum tdb_lock_flags {
+       /* WAIT == F_SETLKW, NOWAIT == F_SETLK */
+       TDB_LOCK_NOWAIT = 0,
+       TDB_LOCK_WAIT = 1,
+       /* If set, don't log an error on failure. */
+       TDB_LOCK_PROBE = 2,
+       /* If set, don't actually lock at all. */
+       TDB_LOCK_MARK_ONLY = 4,
+};
+
+/* Table of low-level I/O operations.  tdb_context.methods points at the
+ * table in use; transaction.c provides a transaction-aware table and
+ * keeps the original one in tdb_transaction.io_methods. */
+struct tdb_methods {
+       /* (tdb, offset, buffer, length, cv): read length bytes at offset;
+        * the transaction version applies tdb_convert() when cv is set */
+       int (*tdb_read)(struct tdb_context *, tdb_off_t , void *, tdb_len_t , int );
+       /* (tdb, offset, buffer, length): write length bytes at offset */
+       int (*tdb_write)(struct tdb_context *, tdb_off_t, const void *, tdb_len_t);
+       /* advance *chain to the next non-empty hash chain */
+       void (*next_hash_chain)(struct tdb_context *, uint32_t *);
+       /* (tdb, offset, length, probe): out-of-bounds check */
+       int (*tdb_oob)(struct tdb_context *, tdb_off_t , tdb_len_t, int );
+       /* (tdb, size, addition): grow the file by addition bytes from size */
+       int (*tdb_expand_file)(struct tdb_context *, tdb_off_t , tdb_off_t );
+};
+
+/* Per-open-database state; every internal function takes a pointer to
+ * one of these. */
+struct tdb_context {
+       char *name; /* the name of the database */
+       void *map_ptr; /* where it is currently mapped */
+       int fd; /* open file descriptor for the database */
+       tdb_len_t map_size; /* how much space has been mapped */
+       int read_only; /* opened read-only */
+       int traverse_read; /* read-only traversal */
+       int traverse_write; /* read-write traversal */
+       struct tdb_lock_type allrecord_lock; /* .off == upgradable */
+       int num_lockrecs;
+       struct tdb_lock_type *lockrecs; /* only real locks, all with count>0 */
+       enum TDB_ERROR ecode; /* error code for last tdb error */
+       struct tdb_header header; /* a cached copy of the header */
+       uint32_t flags; /* the flags passed to tdb_open */
+       struct tdb_traverse_lock travlocks; /* current traversal locks */
+       struct tdb_context *next; /* all tdbs to avoid multiple opens */
+       dev_t device;   /* uniquely identifies this tdb */
+       ino_t inode;    /* uniquely identifies this tdb */
+       struct tdb_logging_context log;
+       unsigned int (*hash_fn)(TDB_DATA *key);
+       int open_flags; /* flags used in the open - needed by reopen */
+       const struct tdb_methods *methods; /* I/O method table in use */
+       struct tdb_transaction *transaction; /* state of the current transaction */
+       int page_size;
+       int max_dead_records;
+#ifdef TDB_TRACE
+       int tracefd; /* descriptor the tdb_trace_*() functions write to */
+#endif
+       volatile sig_atomic_t *interrupt_sig_ptr;
+};
+
+
+/*
+  internal prototypes
+
+  (tdb_ofs_read() and tdb_ofs_write() used to be declared twice in this
+  list; each is now declared once)
+*/
+int tdb_munmap(struct tdb_context *tdb);
+int tdb_mmap(struct tdb_context *tdb);
+int tdb_lock(struct tdb_context *tdb, int list, int ltype);
+int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype);
+int tdb_nest_lock(struct tdb_context *tdb, uint32_t offset, int ltype,
+                 enum tdb_lock_flags flags);
+int tdb_nest_unlock(struct tdb_context *tdb, uint32_t offset, int ltype,
+                   bool mark_lock);
+int tdb_unlock(struct tdb_context *tdb, int list, int ltype);
+int tdb_brlock(struct tdb_context *tdb,
+              int rw_type, tdb_off_t offset, size_t len,
+              enum tdb_lock_flags flags);
+int tdb_brunlock(struct tdb_context *tdb,
+                int rw_type, tdb_off_t offset, size_t len);
+bool tdb_have_extra_locks(struct tdb_context *tdb);
+void tdb_release_transaction_locks(struct tdb_context *tdb);
+int tdb_transaction_lock(struct tdb_context *tdb, int ltype,
+                        enum tdb_lock_flags lockflags);
+int tdb_transaction_unlock(struct tdb_context *tdb, int ltype);
+int tdb_recovery_area(struct tdb_context *tdb,
+                     const struct tdb_methods *methods,
+                     tdb_off_t *recovery_offset,
+                     struct tdb_record *rec);
+int tdb_allrecord_lock(struct tdb_context *tdb, int ltype,
+                      enum tdb_lock_flags flags, bool upgradable);
+int tdb_allrecord_unlock(struct tdb_context *tdb, int ltype, bool mark_lock);
+int tdb_allrecord_upgrade(struct tdb_context *tdb);
+int tdb_write_lock_record(struct tdb_context *tdb, tdb_off_t off);
+int tdb_write_unlock_record(struct tdb_context *tdb, tdb_off_t off);
+int tdb_ofs_read(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d);
+int tdb_ofs_write(struct tdb_context *tdb, tdb_off_t offset, tdb_off_t *d);
+void *tdb_convert(void *buf, uint32_t size);
+int tdb_free(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec);
+tdb_off_t tdb_allocate(struct tdb_context *tdb, tdb_len_t length, struct tdb_record *rec);
+int tdb_lock_record(struct tdb_context *tdb, tdb_off_t off);
+int tdb_unlock_record(struct tdb_context *tdb, tdb_off_t off);
+bool tdb_needs_recovery(struct tdb_context *tdb);
+int tdb_rec_read(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec);
+int tdb_rec_write(struct tdb_context *tdb, tdb_off_t offset, struct tdb_record *rec);
+int tdb_do_delete(struct tdb_context *tdb, tdb_off_t rec_ptr, struct tdb_record *rec);
+unsigned char *tdb_alloc_read(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t len);
+int tdb_parse_data(struct tdb_context *tdb, TDB_DATA key,
+                  tdb_off_t offset, tdb_len_t len,
+                  int (*parser)(TDB_DATA key, TDB_DATA data,
+                                void *private_data),
+                  void *private_data);
+tdb_off_t tdb_find_lock_hash(struct tdb_context *tdb, TDB_DATA key, uint32_t hash, int locktype,
+                          struct tdb_record *rec);
+void tdb_io_init(struct tdb_context *tdb);
+int tdb_expand(struct tdb_context *tdb, tdb_off_t size);
+tdb_off_t tdb_expand_adjust(tdb_off_t map_size, tdb_off_t size, int page_size);
+int tdb_rec_free_read(struct tdb_context *tdb, tdb_off_t off,
+                     struct tdb_record *rec);
+/* write all count bytes of buf to fd, retrying after short writes */
+bool tdb_write_all(int fd, const void *buf, size_t count);
+int tdb_transaction_recover(struct tdb_context *tdb);
+void tdb_header_hash(struct tdb_context *tdb,
+                    uint32_t *magic1_hash, uint32_t *magic2_hash);
+unsigned int tdb_old_hash(TDB_DATA *key);
+size_t tdb_dead_space(struct tdb_context *tdb, tdb_off_t off);
+#endif /* TDB_PRIVATE_H */
diff --git a/ctdb/lib/tdb/common/transaction.c b/ctdb/lib/tdb/common/transaction.c
new file mode 100644 (file)
index 0000000..f18b4c2
--- /dev/null
@@ -0,0 +1,1294 @@
+ /* 
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Andrew Tridgell              2005
+
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "tdb_private.h"
+
+/*
+  transaction design:
+
+  - only allow a single transaction at a time per database. This makes
+    using the transaction API simpler, as otherwise the caller would
+    have to cope with temporary failures in transactions that conflict
+    with other current transactions
+
+  - keep the transaction recovery information in the same file as the
+    database, using a special 'transaction recovery' record pointed at
+    by the header. This removes the need for extra journal files as
+    used by some other databases
+
+  - dynamically allocate the transaction recovery record, re-using it
+    for subsequent transactions. If a larger record is needed then
+    tdb_free() the old record to place it on the normal tdb freelist
+    before allocating the new record
+
+  - during transactions, keep a linked list of all writes that have
+    been performed by intercepting all tdb_write() calls. The hooked
+    transaction versions of tdb_read() and tdb_write() check this
+    linked list and try to use the elements of the list in preference
+    to the real database.
+
+  - don't allow any locks to be held when a transaction starts,
+    otherwise we can end up with deadlock (plus lack of lock nesting
+    in posix locks would mean the lock is lost)
+
+  - if the caller gains a lock during the transaction but doesn't
+    release it then fail the commit
+
+  - allow for nested calls to tdb_transaction_start(), re-using the
+    existing transaction record. If the inner transaction is cancelled
+    then a subsequent commit will fail
+
+  - keep a mirrored copy of the tdb hash chain heads to allow for the
+    fast hash heads scan on traverse, updating the mirrored copy in
+    the transaction version of tdb_write
+
+  - allow callers to mix transaction and non-transaction use of tdb,
+    although once a transaction is started then an exclusive lock is
+    gained until the transaction is committed or cancelled
+
+  - the commit strategy involves first saving away all modified data
+    into a linearised buffer in the transaction recovery area, then
+    marking the transaction recovery area with a magic value to
+    indicate a valid recovery record. In total 4 fsync/msync calls are
+    needed per commit to prevent race conditions. It might be possible
+    to reduce this to 3 or even 2 with some more work.
+
+  - check for a valid recovery record on open of the tdb, while the
+    open lock is held. Automatically recover from the transaction
+    recovery area if needed, then continue with the open as
+    usual. This allows for smooth crash recovery with no administrator
+    intervention.
+
+  - if TDB_NOSYNC is passed to flags in tdb_open then transactions are
+    still available, but no fsync/msync calls are made.  This means we
+    are still proof against a process dying during transaction commit,
+    but not against machine reboot.
+
+  - if TDB_ALLOW_NESTING is passed to flags in tdb open, or added using
+    tdb_add_flags() transaction nesting is enabled.
+    It resets the TDB_DISALLOW_NESTING flag, as both cannot be used together.
+    The default is that transaction nesting is allowed.
+    Note: this default may change in future versions of tdb.
+
+    Beware: when transactions are nested, a transaction successfully
+    completed with tdb_transaction_commit() can be silently unrolled later.
+
+  - if TDB_DISALLOW_NESTING is passed to flags in tdb open, or added using
+    tdb_add_flags() transaction nesting is disabled.
+    It resets the TDB_ALLOW_NESTING flag, as both cannot be used together.
+    An attempt to create a nested transaction will fail with TDB_ERR_NESTING.
+    The default is that transaction nesting is allowed.
+    Note: this default may change in future versions of tdb.
+*/
+
+
+/*
+  hold the context of any current transaction
+
+  Pending writes are kept in fixed-size blocks: blocks[] is indexed by
+  (file offset / block_size) and an entry stays NULL until that block is
+  first written to.
+*/
+struct tdb_transaction {
+       /* we keep a mirrored copy of the tdb hash heads here so
+          tdb_next_hash_chain() can operate efficiently.
+          Entry 0 is the freelist, so hash chain N is at index N+1 */
+       uint32_t *hash_heads;
+
+       /* the original io methods - used to do IOs to the real db */
+       const struct tdb_methods *io_methods;
+
+       /* the list of transaction blocks. When a block is first
+          written to, it gets created in this list */
+       uint8_t **blocks;
+       uint32_t num_blocks;
+       uint32_t block_size;      /* bytes in each block */
+       uint32_t last_block_size; /* number of valid bytes in the last block */
+
+       /* non-zero when an internal transaction error has
+          occurred. All write operations will then fail until the
+          transaction is ended */
+       int transaction_error;
+
+       /* when inside a transaction we need to keep track of any
+          nested tdb_transaction_start() calls, as these are allowed,
+          but don't create a new transaction */
+       int nesting;
+
+       /* set when a prepare has already occurred */
+       bool prepared;
+       /* NOTE(review): presumably the file offset of the recovery magic
+          written during prepare - confirm against the prepare/commit code */
+       tdb_off_t magic_offset;
+
+       /* old file size before transaction */
+       tdb_len_t old_map_size;
+
+       /* did we expand in this transaction */
+       bool expanded;
+};
+
+
+/*
+  read while in a transaction. We need to check first if the data is in our list
+  of transaction elements, then if not do a real read
+
+  Requests that cross a block boundary are split into per-block pieces.
+  When cv is non-zero the bytes copied out of a transaction block are
+  passed through tdb_convert(); reads that fall through to the real
+  database hand cv on to the original tdb_read method.
+
+  Returns 0 on success. On failure it logs, sets tdb->ecode to
+  TDB_ERR_IO, marks the transaction as failed and returns -1.
+*/
+static int transaction_read(struct tdb_context *tdb, tdb_off_t off, void *buf, 
+                           tdb_len_t len, int cv)
+{
+       uint32_t blk;
+
+       /* break it down into block sized ops */
+       while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
+               tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
+               if (transaction_read(tdb, off, buf, len2, cv) != 0) {
+                       return -1;
+               }
+               len -= len2;
+               off += len2;
+               buf = (void *)(len2 + (char *)buf);
+       }
+
+       if (len == 0) {
+               return 0;
+       }
+
+       blk = off / tdb->transaction->block_size;
+
+       /* see if we have it in the block list */
+       if (tdb->transaction->num_blocks <= blk ||
+           tdb->transaction->blocks[blk] == NULL) {
+               /* nope, do a real read */
+               if (tdb->transaction->io_methods->tdb_read(tdb, off, buf, len, cv) != 0) {
+                       goto fail;
+               }
+               return 0;
+       }
+
+       /* it is in the block list. Now check for the last block */
+       if (blk == tdb->transaction->num_blocks-1) {
+               if (len > tdb->transaction->last_block_size) {
+                       goto fail;
+               }
+       }
+
+       /* now copy it out of this block */
+       memcpy(buf, tdb->transaction->blocks[blk] + (off % tdb->transaction->block_size), len);
+       if (cv) {
+               tdb_convert(buf, len);
+       }
+       return 0;
+
+fail:
+       /* off and len are unsigned 32-bit values, so print them with %u;
+          %d showed offsets of 2GB and above as negative numbers */
+       TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_read: failed at off=%u len=%u\n", off, len));
+       tdb->ecode = TDB_ERR_IO;
+       tdb->transaction->transaction_error = 1;
+       return -1;
+}
+
+
+/*
+  write while in a transaction
+
+  The data is not written to the database; it is stored in the
+  transaction's block list, creating blocks (pre-filled from the real
+  database where they overlap the old file) as needed. A NULL buf means
+  "write len zero bytes", which is how transaction_expand_file() uses it.
+  A write of exactly one tdb_off_t inside the hash table region also
+  updates the mirrored hash heads.
+
+  Returns 0 on success. On failure it marks the transaction as failed
+  and returns -1.
+*/
+static int transaction_write(struct tdb_context *tdb, tdb_off_t off, 
+                            const void *buf, tdb_len_t len)
+{
+       uint32_t blk;
+
+       /* Only a commit is allowed on a prepared transaction */
+       if (tdb->transaction->prepared) {
+               tdb->ecode = TDB_ERR_EINVAL;
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: transaction already prepared, write not allowed\n"));
+               tdb->transaction->transaction_error = 1;
+               return -1;
+       }
+
+       /* if the write is to a hash head, then update the transaction
+          hash heads */
+       if (len == sizeof(tdb_off_t) && off >= FREELIST_TOP &&
+           off < FREELIST_TOP+TDB_HASHTABLE_SIZE(tdb)) {
+               uint32_t chain = (off-FREELIST_TOP) / sizeof(tdb_off_t);
+               memcpy(&tdb->transaction->hash_heads[chain], buf, len);
+       }
+
+       /* break it up into block sized chunks */
+       while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
+               tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
+               if (transaction_write(tdb, off, buf, len2) != 0) {
+                       return -1;
+               }
+               len -= len2;
+               off += len2;
+               if (buf != NULL) {
+                       buf = (const void *)(len2 + (const char *)buf);
+               }
+       }
+
+       if (len == 0) {
+               return 0;
+       }
+
+       /* from here on off is the offset within block blk */
+       blk = off / tdb->transaction->block_size;
+       off = off % tdb->transaction->block_size;
+
+       if (tdb->transaction->num_blocks <= blk) {
+               uint8_t **new_blocks;
+               /* expand the blocks array */
+               if (tdb->transaction->blocks == NULL) {
+                       new_blocks = (uint8_t **)malloc(
+                               (blk+1)*sizeof(uint8_t *));
+               } else {
+                       new_blocks = (uint8_t **)realloc(
+                               tdb->transaction->blocks,
+                               (blk+1)*sizeof(uint8_t *));
+               }
+               if (new_blocks == NULL) {
+                       tdb->ecode = TDB_ERR_OOM;
+                       goto fail;
+               }
+               /* the newly added slots start out as "no block yet" */
+               memset(&new_blocks[tdb->transaction->num_blocks], 0, 
+                      (1+(blk - tdb->transaction->num_blocks))*sizeof(uint8_t *));
+               tdb->transaction->blocks = new_blocks;
+               tdb->transaction->num_blocks = blk+1;
+               tdb->transaction->last_block_size = 0;
+       }
+
+       /* allocate and fill a block? */
+       if (tdb->transaction->blocks[blk] == NULL) {
+               tdb->transaction->blocks[blk] = (uint8_t *)calloc(tdb->transaction->block_size, 1);
+               if (tdb->transaction->blocks[blk] == NULL) {
+                       /* report this like every other failure in here */
+                       tdb->ecode = TDB_ERR_OOM;
+                       goto fail;
+               }
+               if (tdb->transaction->old_map_size > blk * tdb->transaction->block_size) {
+                       /* the block overlaps the pre-transaction file:
+                          seed it with the current on-disk contents */
+                       tdb_len_t len2 = tdb->transaction->block_size;
+                       if (len2 + (blk * tdb->transaction->block_size) > tdb->transaction->old_map_size) {
+                               len2 = tdb->transaction->old_map_size - (blk * tdb->transaction->block_size);
+                       }
+                       if (tdb->transaction->io_methods->tdb_read(tdb, blk * tdb->transaction->block_size, 
+                                                                  tdb->transaction->blocks[blk], 
+                                                                  len2, 0) != 0) {
+                               SAFE_FREE(tdb->transaction->blocks[blk]);
+                               tdb->ecode = TDB_ERR_IO;
+                               goto fail;
+                       }
+                       if (blk == tdb->transaction->num_blocks-1) {
+                               tdb->transaction->last_block_size = len2;
+                       }
+               }
+       }
+
+       /* overwrite part of an existing block */
+       if (buf == NULL) {
+               memset(tdb->transaction->blocks[blk] + off, 0, len);
+       } else {
+               memcpy(tdb->transaction->blocks[blk] + off, buf, len);
+       }
+       if (blk == tdb->transaction->num_blocks-1) {
+               if (len + off > tdb->transaction->last_block_size) {
+                       tdb->transaction->last_block_size = len + off;
+               }
+       }
+
+       return 0;
+
+fail:
+       /* offset and length are unsigned 32-bit values, hence %u */
+       TDB_LOG((tdb, TDB_DEBUG_FATAL, "transaction_write: failed at off=%u len=%u\n", 
+                (blk*tdb->transaction->block_size) + off, len));
+       tdb->transaction->transaction_error = 1;
+       return -1;
+}
+
+
+/*
+  write while in a transaction - this variant never expands the transaction blocks, it only
+  updates existing blocks. This means it cannot change the recovery size
+
+  Parts of the range that fall into blocks the transaction does not hold,
+  or beyond the valid bytes of the last block, are silently skipped. As
+  in transaction_write(), a NULL buf writes zero bytes. Always returns 0
+  unless a recursive call for a leading chunk fails.
+*/
+static int transaction_write_existing(struct tdb_context *tdb, tdb_off_t off, 
+                                     const void *buf, tdb_len_t len)
+{
+       uint32_t blk;
+
+       /* break it up into block sized chunks */
+       while (len + (off % tdb->transaction->block_size) > tdb->transaction->block_size) {
+               tdb_len_t len2 = tdb->transaction->block_size - (off % tdb->transaction->block_size);
+               if (transaction_write_existing(tdb, off, buf, len2) != 0) {
+                       return -1;
+               }
+               len -= len2;
+               off += len2;
+               if (buf != NULL) {
+                       buf = (const void *)(len2 + (const char *)buf);
+               }
+       }
+
+       if (len == 0) {
+               return 0;
+       }
+
+       /* from here on off is the offset within block blk */
+       blk = off / tdb->transaction->block_size;
+       off = off % tdb->transaction->block_size;
+
+       /* nothing to update if the transaction does not hold this block */
+       if (tdb->transaction->num_blocks <= blk ||
+           tdb->transaction->blocks[blk] == NULL) {
+               return 0;
+       }
+
+       /* clip the write to the valid part of the last block */
+       if (blk == tdb->transaction->num_blocks-1 &&
+           off + len > tdb->transaction->last_block_size) {
+               if (off >= tdb->transaction->last_block_size) {
+                       return 0;
+               }
+               len = tdb->transaction->last_block_size - off;
+       }
+
+       /* overwrite part of an existing block; the chunking loop above
+          already allows buf to be NULL, so do not hand NULL to memcpy */
+       if (buf == NULL) {
+               memset(tdb->transaction->blocks[blk] + off, 0, len);
+       } else {
+               memcpy(tdb->transaction->blocks[blk] + off, buf, len);
+       }
+
+       return 0;
+}
+
+
+/*
+  accelerated hash chain head search: advance *chain to the first chain
+  at or after it whose cached head is non-zero, or to hash_size if there
+  is none
+*/
+static void transaction_next_hash_chain(struct tdb_context *tdb, uint32_t *chain)
+{
+       const uint32_t *heads = tdb->transaction->hash_heads;
+       uint32_t idx = *chain;
+
+       /* slot 0 of the cached table is the freelist, so chain N lives at N+1 */
+       while (idx < tdb->header.hash_size && heads[idx+1] == 0) {
+               idx++;
+       }
+
+       *chain = idx;
+}
+
+/*
+  out of bounds check during a transaction: the range [off, off+len) must
+  not wrap around and must end at or before map_size. Returns 0 when it
+  is in bounds, otherwise sets TDB_ERR_IO and returns -1. The probe
+  argument is not used here.
+*/
+static int transaction_oob(struct tdb_context *tdb, tdb_off_t off,
+                          tdb_len_t len, int probe)
+{
+       if (off + len < off || off + len > tdb->map_size) {
+               tdb->ecode = TDB_ERR_IO;
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  transaction version of tdb_expand(): nothing is written to the file
+  here, the growth is only recorded inside the transaction.
+  Returns 0 on success, -1 if the transaction write fails.
+*/
+static int transaction_expand_file(struct tdb_context *tdb, tdb_off_t size, 
+                                  tdb_off_t addition)
+{
+       /* a NULL buffer makes transaction_write() store zeros, so later
+          reads of the new region see zero data */
+       int rc = transaction_write(tdb, size, NULL, addition);
+
+       if (rc == 0) {
+               tdb->transaction->expanded = true;
+               return 0;
+       }
+
+       return -1;
+}
+
+/*
+  io method table installed in tdb->methods while a transaction is
+  active (see _tdb_transaction_start). The initialisers are positional,
+  so presumably they must stay in the member order of struct tdb_methods
+  - confirm against its declaration before reordering.
+*/
+static const struct tdb_methods transaction_methods = {
+       transaction_read,
+       transaction_write,
+       transaction_next_hash_chain,
+       transaction_oob,
+       transaction_expand_file,
+};
+
+
+/*
+  start a tdb transaction. No token is returned, as only a single
+  transaction is allowed to be pending per tdb_context
+
+  lockflags is passed to tdb_transaction_lock() when taking the
+  transaction write lock.
+
+  Returns 0 on success (including a permitted nested start, which only
+  bumps the nesting counter) and -1 on failure. The failure paths set
+  tdb->ecode themselves, except the two lock failures: a failed
+  transaction lock sets it here only when TDB_LOCK_WAIT is not in
+  lockflags, and a failed allrecord lock does not set it here at all.
+*/
+static int _tdb_transaction_start(struct tdb_context *tdb,
+                                 enum tdb_lock_flags lockflags)
+{
+       /* some sanity checks */
+       if (tdb->read_only || (tdb->flags & TDB_INTERNAL) || tdb->traverse_read) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction on a read-only or internal db\n"));
+               tdb->ecode = TDB_ERR_EINVAL;
+               return -1;
+       }
+
+       /* cope with nested tdb_transaction_start() calls */
+       if (tdb->transaction != NULL) {
+               if (!(tdb->flags & TDB_ALLOW_NESTING)) {
+                       tdb->ecode = TDB_ERR_NESTING;
+                       return -1;
+               }
+               tdb->transaction->nesting++;
+               TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_start: nesting %d\n", 
+                        tdb->transaction->nesting));
+               return 0;
+       }
+
+       if (tdb_have_extra_locks(tdb)) {
+               /* the caller must not have any locks when starting a
+                  transaction as otherwise we'll be screwed by lack
+                  of nested locks in posix */
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction with locks held\n"));
+               tdb->ecode = TDB_ERR_LOCK;
+               return -1;
+       }
+
+       if (tdb->travlocks.next != NULL) {
+               /* you cannot use transactions inside a traverse (although you can use
+                  traverse inside a transaction) as otherwise you can end up with
+                  deadlock */
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: cannot start a transaction within a traverse\n"));
+               tdb->ecode = TDB_ERR_LOCK;
+               return -1;
+       }
+
+       /* calloc() zero-fills the structure, so blocks and hash_heads
+          start out NULL and the cleanup paths below can free them
+          unconditionally */
+       tdb->transaction = (struct tdb_transaction *)
+               calloc(sizeof(struct tdb_transaction), 1);
+       if (tdb->transaction == NULL) {
+               tdb->ecode = TDB_ERR_OOM;
+               return -1;
+       }
+
+       /* a page at a time seems like a reasonable compromise between compactness and efficiency */
+       tdb->transaction->block_size = tdb->page_size;
+
+       /* get the transaction write lock. This is a blocking lock. As
+          discussed with Volker, there are a number of ways we could
+          make this async, which we will probably do in the future */
+       if (tdb_transaction_lock(tdb, F_WRLCK, lockflags) == -1) {
+               SAFE_FREE(tdb->transaction->blocks);
+               SAFE_FREE(tdb->transaction);
+               /* only a caller that asked not to wait gets TDB_ERR_NOLOCK */
+               if ((lockflags & TDB_LOCK_WAIT) == 0) {
+                       tdb->ecode = TDB_ERR_NOLOCK;
+               }
+               return -1;
+       }
+
+       /* get a read lock from the freelist to the end of file. This
+          is upgraded to a write lock during the commit */
+       if (tdb_allrecord_lock(tdb, F_RDLCK, TDB_LOCK_WAIT, true) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_start: failed to get hash locks\n"));
+               goto fail_allrecord_lock;
+       }
+
+       /* setup a copy of the hash table heads so the hash scan in
+          traverse can be fast. The extra slot is for the freelist
+          head, which comes first in the table read below. */
+       tdb->transaction->hash_heads = (uint32_t *)
+               calloc(tdb->header.hash_size+1, sizeof(uint32_t));
+       if (tdb->transaction->hash_heads == NULL) {
+               tdb->ecode = TDB_ERR_OOM;
+               goto fail;
+       }
+       if (tdb->methods->tdb_read(tdb, FREELIST_TOP, tdb->transaction->hash_heads,
+                                  TDB_HASHTABLE_SIZE(tdb), 0) != 0) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_start: failed to read hash heads\n"));
+               tdb->ecode = TDB_ERR_IO;
+               goto fail;
+       }
+
+       /* make sure we know about any file expansions already done by
+          anyone else */
+       tdb->methods->tdb_oob(tdb, tdb->map_size, 1, 1);
+       tdb->transaction->old_map_size = tdb->map_size;
+
+       /* finally hook the io methods, replacing them with
+          transaction specific methods */
+       tdb->transaction->io_methods = tdb->methods;
+       tdb->methods = &transaction_methods;
+
+       /* Trace at the end, so we get sequence number correct. */
+       tdb_trace(tdb, "tdb_transaction_start");
+       return 0;
+
+       /* error unwinding: "fail" is used once the allrecord read lock
+          is held, "fail_allrecord_lock" when only the transaction
+          lock is held */
+fail:
+       tdb_allrecord_unlock(tdb, F_RDLCK, false);
+fail_allrecord_lock:
+       tdb_transaction_unlock(tdb, F_WRLCK);
+       SAFE_FREE(tdb->transaction->blocks);
+       SAFE_FREE(tdb->transaction->hash_heads);
+       SAFE_FREE(tdb->transaction);
+       return -1;
+}
+
+/*
+  public entry point: start a transaction, passing TDB_LOCK_WAIT for the
+  transaction lock. Returns the result of _tdb_transaction_start().
+*/
+_PUBLIC_ int tdb_transaction_start(struct tdb_context *tdb)
+{
+       return _tdb_transaction_start(tdb, TDB_LOCK_WAIT);
+}
+
+/*
+  public entry point: start a transaction, passing
+  TDB_LOCK_NOWAIT|TDB_LOCK_PROBE for the transaction lock. Returns the
+  result of _tdb_transaction_start().
+
+  NOTE(review): only the transaction lock uses these flags; the
+  allrecord read lock in _tdb_transaction_start() is still requested
+  with TDB_LOCK_WAIT.
+*/
+_PUBLIC_ int tdb_transaction_start_nonblock(struct tdb_context *tdb)
+{
+       return _tdb_transaction_start(tdb, TDB_LOCK_NOWAIT|TDB_LOCK_PROBE);
+}
+
+/*
+  sync to disk
+
+  Does nothing and returns 0 when TDB_NOSYNC is set. Otherwise syncs the
+  file descriptor and, when the file is mapped, msync()s the range
+  given by offset and length. Returns 0 on success; on failure sets
+  TDB_ERR_IO and returns -1.
+*/
+static int transaction_sync(struct tdb_context *tdb, tdb_off_t offset, tdb_len_t length)
+{      
+       if (tdb->flags & TDB_NOSYNC) {
+               return 0;
+       }
+
+       /* the descriptor sync takes no range; offset and length are
+          only used by the msync() below. The two preprocessor
+          branches share the block that follows. */
+#ifdef HAVE_FDATASYNC
+       if (fdatasync(tdb->fd) != 0) {
+#else
+       if (fsync(tdb->fd) != 0) {
+#endif
+               tdb->ecode = TDB_ERR_IO;
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: fsync failed\n"));
+               return -1;
+       }
+#ifdef HAVE_MMAP
+       if (tdb->map_ptr) {
+               /* round the start down to a page boundary (this masking
+                  assumes page_size is a power of two) and lengthen the
+                  range by the amount rounded off */
+               tdb_off_t moffset = offset & ~(tdb->page_size-1);
+               if (msync(moffset + (char *)tdb->map_ptr, 
+                         length + (offset - moffset), MS_SYNC) != 0) {
+                       tdb->ecode = TDB_ERR_IO;
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction: msync failed - %s\n",
+                                strerror(errno)));
+                       return -1;
+               }
+       }
+#endif
+       return 0;
+}
+
+
+/*
+  Tear down the current transaction without applying it.
+
+  A nested cancel only marks the outer transaction as failed and drops
+  one nesting level. The outermost cancel restores map_size, frees the
+  cached blocks, invalidates any recovery magic already written,
+  releases the transaction locks and puts the original io methods back.
+
+  Returns 0 on success, -1 if there is no transaction or the recovery
+  magic could not be invalidated.
+*/
+static int _tdb_transaction_cancel(struct tdb_context *tdb)
+{
+       int idx;
+       int status = 0;
+
+       if (tdb->transaction == NULL) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_cancel: no transaction\n"));
+               return -1;
+       }
+
+       if (tdb->transaction->nesting != 0) {
+               /* inner cancel: poison the outer transaction, unwind one level */
+               tdb->transaction->transaction_error = 1;
+               tdb->transaction->nesting--;
+               return 0;
+       }
+
+       tdb->map_size = tdb->transaction->old_map_size;
+
+       /* release every cached block; free(NULL) is a no-op, so blocks
+          that were never allocated need no special case */
+       for (idx = 0; idx < tdb->transaction->num_blocks; idx++) {
+               free(tdb->transaction->blocks[idx]);
+       }
+       SAFE_FREE(tdb->transaction->blocks);
+
+       if (tdb->transaction->magic_offset != 0) {
+               const struct tdb_methods *io = tdb->transaction->io_methods;
+               const uint32_t invalid = TDB_RECOVERY_INVALID_MAGIC;
+               int rc;
+
+               /* remove the recovery marker; only sync if the write worked */
+               rc = io->tdb_write(tdb, tdb->transaction->magic_offset, &invalid, 4);
+               if (rc != -1) {
+                       rc = transaction_sync(tdb, tdb->transaction->magic_offset, 4);
+               }
+               if (rc == -1) {
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_cancel: failed to remove recovery magic\n"));
+                       status = -1;
+               }
+       }
+
+       /* This also removes the OPEN_LOCK, if we have it. */
+       tdb_release_transaction_locks(tdb);
+
+       /* restore the normal io methods */
+       tdb->methods = tdb->transaction->io_methods;
+
+       SAFE_FREE(tdb->transaction->hash_heads);
+       SAFE_FREE(tdb->transaction);
+
+       return status;
+}
+
+/*
+  cancel the current transaction: emits a trace record, then returns the
+  result of _tdb_transaction_cancel()
+*/
+_PUBLIC_ int tdb_transaction_cancel(struct tdb_context *tdb)
+{
+       tdb_trace(tdb, "tdb_transaction_cancel");
+       return _tdb_transaction_cancel(tdb);
+}
+
+/*
+  number of bytes the linearised recovery data will consume: one
+  uint32_t, plus for every cached block that starts inside the
+  pre-transaction file an (offset, length) pair and the block's data
+*/
+static tdb_len_t tdb_recovery_size(struct tdb_context *tdb)
+{
+       const struct tdb_transaction *txn = tdb->transaction;
+       tdb_len_t needed = sizeof(uint32_t);
+       int blk;
+
+       for (blk = 0; blk < txn->num_blocks; blk++) {
+               if (blk * txn->block_size >= txn->old_map_size) {
+                       /* this block starts at or past the old end of file */
+                       break;
+               }
+               if (txn->blocks[blk] != NULL) {
+                       needed += 2*sizeof(tdb_off_t);
+                       /* only the last block can be partially used */
+                       needed += (blk == txn->num_blocks-1)
+                               ? txn->last_block_size
+                               : txn->block_size;
+               }
+       }
+
+       return needed;
+}
+
+/*
+  locate the recovery area: read the recovery head offset into
+  *recovery_offset and, if it is non-zero, the record header found
+  there into *rec (using the supplied io methods).
+
+  When there is no recovery area, or its header carries neither the
+  valid nor the invalid recovery magic, rec->rec_len is set to 0 (and
+  *recovery_offset to 0 in the bad magic case).
+
+  Returns 0 on success, -1 if either read fails.
+*/
+int tdb_recovery_area(struct tdb_context *tdb,
+                     const struct tdb_methods *methods,
+                     tdb_off_t *recovery_offset,
+                     struct tdb_record *rec)
+{
+       if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, recovery_offset) == -1) {
+               return -1;
+       }
+
+       if (*recovery_offset != 0) {
+               if (methods->tdb_read(tdb, *recovery_offset, rec, sizeof(*rec),
+                                     DOCONV()) == -1) {
+                       return -1;
+               }
+
+               if (rec->magic == TDB_RECOVERY_MAGIC ||
+                   rec->magic == TDB_RECOVERY_INVALID_MAGIC) {
+                       /* a recognisable recovery record: hand it back as read */
+                       return 0;
+               }
+
+               /* ignore invalid recovery regions: can happen in crash */
+               *recovery_offset = 0;
+       }
+
+       rec->rec_len = 0;
+       return 0;
+}
+
+/*
+  allocate the recovery area, or use an existing recovery area if it is
+  large enough
+
+  Outputs:
+    *recovery_size     - bytes needed for the recovery data (from
+                         tdb_recovery_size())
+    *recovery_offset   - file offset of the recovery record
+    *recovery_max_size - capacity of the recovery area; the caller
+                         stores it as the record's rec_len
+
+  Returns 0 on success, -1 on failure.
+*/
+static int tdb_recovery_allocate(struct tdb_context *tdb, 
+                                tdb_len_t *recovery_size,
+                                tdb_off_t *recovery_offset,
+                                tdb_len_t *recovery_max_size)
+{
+       struct tdb_record rec;
+       const struct tdb_methods *methods = tdb->transaction->io_methods;
+       tdb_off_t recovery_head, new_end;
+
+       if (tdb_recovery_area(tdb, methods, &recovery_head, &rec) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to read recovery head\n"));
+               return -1;
+       }
+
+       *recovery_size = tdb_recovery_size(tdb);
+
+       /* Existing recovery area? */
+       if (recovery_head != 0 && *recovery_size <= rec.rec_len) {
+               /* it fits in the existing area */
+               *recovery_max_size = rec.rec_len;
+               *recovery_offset = recovery_head;
+               return 0;
+       }
+
+       /* If recovery area in middle of file, we need a new one.
+          (If it already ends exactly at map_size, recovery_head is
+          kept and the area is grown in place below.) */
+       if (recovery_head == 0
+           || recovery_head + sizeof(rec) + rec.rec_len != tdb->map_size) {
+               /* we need to free up the old recovery area, then allocate a
+                  new one at the end of the file. Note that we cannot use
+                  tdb_allocate() to allocate the new one as that might return
+                  us an area that is being currently used (as of the start of
+                  the transaction) */
+               if (recovery_head) {
+                       if (tdb_free(tdb, recovery_head, &rec) == -1) {
+                               TDB_LOG((tdb, TDB_DEBUG_FATAL,
+                                        "tdb_recovery_allocate: failed to"
+                                        " free previous recovery area\n"));
+                               return -1;
+                       }
+
+                       /* the tdb_free() call might have increased
+                        * the recovery size */
+                       *recovery_size = tdb_recovery_size(tdb);
+               }
+
+               /* New head will be at end of file. */
+               recovery_head = tdb->map_size;
+       }
+
+       /* Now we know where it will be. */
+       *recovery_offset = recovery_head;
+
+       /* Expand by more than we need, so we don't do it often.
+          NOTE(review): presumably tdb_expand_adjust() returns a total
+          size that includes room for the record header, hence the
+          subtraction - confirm against its definition. */
+       *recovery_max_size = tdb_expand_adjust(tdb->map_size,
+                                              *recovery_size,
+                                              tdb->page_size)
+               - sizeof(rec);
+
+       new_end = recovery_head + sizeof(rec) + *recovery_max_size;
+
+       /* grow the real file (original io methods, not the transaction
+          ones) from the pre-transaction size up to new_end */
+       if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size, 
+                                    new_end - tdb->transaction->old_map_size)
+           == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to create recovery area\n"));
+               return -1;
+       }
+
+       /* remap the file (if using mmap) */
+       methods->tdb_oob(tdb, tdb->map_size, 1, 1);
+
+       /* we have to reset the old map size so that we don't try to expand the file
+          again in the transaction commit, which would destroy the recovery area */
+       tdb->transaction->old_map_size = tdb->map_size;
+
+       /* write the recovery header offset and sync - we can sync without a race here
+          as the magic ptr in the recovery record has not been set */
+       CONVERT(recovery_head);
+       if (methods->tdb_write(tdb, TDB_RECOVERY_HEAD, 
+                              &recovery_head, sizeof(tdb_off_t)) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to write recovery head\n"));
+               return -1;
+       }
+       /* mirror the new head into any transaction block that already
+          covers it; the commit writes every cached block back to the
+          file, so a stale copy there would undo the write above */
+       if (transaction_write_existing(tdb, TDB_RECOVERY_HEAD, &recovery_head, sizeof(tdb_off_t)) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_recovery_allocate: failed to write recovery head\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+
+/*
+  setup the recovery data that will be used on a crash during commit
+
+  Builds one blob holding a recovery record header, the pre-transaction
+  contents of every cached block that lies inside the old file, and a
+  tailer, then writes it to the recovery area. The header is first
+  written with the invalid magic; the valid magic is written only after
+  the blob has been synced.
+
+  On success *magic_offset is the file offset of the record's magic
+  field. Returns 0 on success, -1 on failure.
+*/
+static int transaction_setup_recovery(struct tdb_context *tdb, 
+                                     tdb_off_t *magic_offset)
+{
+       tdb_len_t recovery_size;
+       unsigned char *data, *p;
+       const struct tdb_methods *methods = tdb->transaction->io_methods;
+       struct tdb_record *rec;
+       tdb_off_t recovery_offset, recovery_max_size;
+       tdb_off_t old_map_size = tdb->transaction->old_map_size;
+       uint32_t magic, tailer;
+       int i;
+
+       /*
+         check that the recovery area has enough space
+       */
+       if (tdb_recovery_allocate(tdb, &recovery_size, 
+                                 &recovery_offset, &recovery_max_size) == -1) {
+               return -1;
+       }
+
+       data = (unsigned char *)malloc(recovery_size + sizeof(*rec));
+       if (data == NULL) {
+               tdb->ecode = TDB_ERR_OOM;
+               return -1;
+       }
+
+       rec = (struct tdb_record *)data;
+       memset(rec, 0, sizeof(*rec));
+
+       /* key_len carries the file size captured at the top of this
+          function; tdb_transaction_recover() reads it back as the
+          recovered end of file */
+       rec->magic    = TDB_RECOVERY_INVALID_MAGIC;
+       rec->data_len = recovery_size;
+       rec->rec_len  = recovery_max_size;
+       rec->key_len  = old_map_size;
+       CONVERT(*rec);
+
+       /* build the recovery data into a single blob to allow us to do a single
+          large write, which should be more efficient */
+       p = data + sizeof(*rec);
+       for (i=0;i<tdb->transaction->num_blocks;i++) {
+               tdb_off_t offset;
+               tdb_len_t length;
+
+               if (tdb->transaction->blocks[i] == NULL) {
+                       continue;
+               }
+
+               offset = i * tdb->transaction->block_size;
+               length = tdb->transaction->block_size;
+               if (i == tdb->transaction->num_blocks-1) {
+                       length = tdb->transaction->last_block_size;
+               }
+
+               /* blocks wholly in the newly added part of the file have
+                  no old contents to save */
+               if (offset >= old_map_size) {
+                       continue;
+               }
+               if (offset + length > tdb->transaction->old_map_size) {
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: transaction data over new region boundary\n"));
+                       free(data);
+                       tdb->ecode = TDB_ERR_CORRUPT;
+                       return -1;
+               }
+               /* each entry is: 4 byte offset, 4 byte length, old data */
+               memcpy(p, &offset, 4);
+               memcpy(p+4, &length, 4);
+               if (DOCONV()) {
+                       tdb_convert(p, 8);
+               }
+               /* the recovery area contains the old data, not the
+                  new data, so we have to call the original tdb_read
+                  method to get it */
+               if (methods->tdb_read(tdb, offset, p + 8, length, 0) != 0) {
+                       free(data);
+                       tdb->ecode = TDB_ERR_IO;
+                       return -1;
+               }
+               p += 8 + length;
+       }
+
+       /* and the tailer */
+       tailer = sizeof(*rec) + recovery_max_size;
+       memcpy(p, &tailer, 4);
+       if (DOCONV()) {
+               tdb_convert(p, 4);
+       }
+
+       /* write the recovery data to the recovery area */
+       if (methods->tdb_write(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery data\n"));
+               free(data);
+               tdb->ecode = TDB_ERR_IO;
+               return -1;
+       }
+       /* keep any transaction blocks covering the recovery area in
+          step with what was just written to the file */
+       if (transaction_write_existing(tdb, recovery_offset, data, sizeof(*rec) + recovery_size) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write secondary recovery data\n"));
+               free(data);
+               tdb->ecode = TDB_ERR_IO;
+               return -1;
+       }
+
+       /* as we don't have ordered writes, we have to sync the recovery
+          data before we update the magic to indicate that the recovery
+          data is present */
+       if (transaction_sync(tdb, recovery_offset, sizeof(*rec) + recovery_size) == -1) {
+               free(data);
+               return -1;
+       }
+
+       free(data);
+
+       magic = TDB_RECOVERY_MAGIC;
+       CONVERT(magic);
+
+       *magic_offset = recovery_offset + offsetof(struct tdb_record, magic);
+
+       /* flip the record to "valid": from here until the magic is
+          invalidated again, tdb_needs_recovery() reports true */
+       if (methods->tdb_write(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write recovery magic\n"));
+               tdb->ecode = TDB_ERR_IO;
+               return -1;
+       }
+       if (transaction_write_existing(tdb, *magic_offset, &magic, sizeof(magic)) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_setup_recovery: failed to write secondary recovery magic\n"));
+               tdb->ecode = TDB_ERR_IO;
+               return -1;
+       }
+
+       /* ensure the recovery magic marker is on disk */
+       if (transaction_sync(tdb, *magic_offset, sizeof(magic)) == -1) {
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  first phase of a commit: upgrade the allrecord lock to a write lock,
+  take the open lock, write the recovery data and grow the file if the
+  transaction expanded it.
+
+  Returns 0 on success, and also immediately for a nested transaction
+  or one with no cached blocks. Returns -1 on failure; every failure
+  except "no transaction" cancels the transaction before returning.
+*/
+static int _tdb_transaction_prepare_commit(struct tdb_context *tdb)
+{      
+       const struct tdb_methods *methods;
+
+       if (tdb->transaction == NULL) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: no transaction\n"));
+               return -1;
+       }
+
+       if (tdb->transaction->prepared) {
+               tdb->ecode = TDB_ERR_EINVAL;
+               _tdb_transaction_cancel(tdb);
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: transaction already prepared\n"));
+               return -1;
+       }
+
+       if (tdb->transaction->transaction_error) {
+               tdb->ecode = TDB_ERR_IO;
+               _tdb_transaction_cancel(tdb);
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: transaction error pending\n"));
+               return -1;
+       }
+
+
+       /* only the outermost level does the real work */
+       if (tdb->transaction->nesting != 0) {
+               return 0;
+       }               
+
+       /* check for a null transaction */
+       if (tdb->transaction->blocks == NULL) {
+               return 0;
+       }
+
+       methods = tdb->transaction->io_methods;
+
+       /* if there are any locks pending then the caller has not
+          nested their locks properly, so fail the transaction */
+       if (tdb_have_extra_locks(tdb)) {
+               tdb->ecode = TDB_ERR_LOCK;
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: locks pending on commit\n"));
+               _tdb_transaction_cancel(tdb);
+               return -1;
+       }
+
+       /* upgrade the main transaction lock region to a write lock */
+       if (tdb_allrecord_upgrade(tdb) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: failed to upgrade hash locks\n"));
+               _tdb_transaction_cancel(tdb);
+               return -1;
+       }
+
+       /* get the open lock - this prevents new users attaching to the database
+          during the commit */
+       if (tdb_nest_lock(tdb, OPEN_LOCK, F_WRLCK, TDB_LOCK_WAIT) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_prepare_commit: failed to get open lock\n"));
+               _tdb_transaction_cancel(tdb);
+               return -1;
+       }
+
+       /* write the recovery data to the end of the file */
+       if (transaction_setup_recovery(tdb, &tdb->transaction->magic_offset) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_prepare_commit: failed to setup recovery data\n"));
+               _tdb_transaction_cancel(tdb);
+               return -1;
+       }
+
+       tdb->transaction->prepared = true;
+
+       /* expand the file to the new size if needed */
+       if (tdb->map_size != tdb->transaction->old_map_size) {
+               if (methods->tdb_expand_file(tdb, tdb->transaction->old_map_size, 
+                                            tdb->map_size - 
+                                            tdb->transaction->old_map_size) == -1) {
+                       tdb->ecode = TDB_ERR_IO;
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_prepare_commit: expansion failed\n"));
+                       _tdb_transaction_cancel(tdb);
+                       return -1;
+               }
+               /* NOTE(review): map_size is reset to the old size before
+                  the tdb_oob() call; presumably that call re-reads the
+                  real file size and remaps - confirm against the
+                  underlying io methods */
+               tdb->map_size = tdb->transaction->old_map_size;
+               methods->tdb_oob(tdb, tdb->map_size, 1, 1);
+       }
+
+       /* Keep the open lock until the actual commit */
+
+       return 0;
+}
+
+/*
+   prepare to commit the current transaction: emits a trace record, then
+   returns the result of _tdb_transaction_prepare_commit()
+*/
+_PUBLIC_ int tdb_transaction_prepare_commit(struct tdb_context *tdb)
+{
+       tdb_trace(tdb, "tdb_transaction_prepare_commit");
+       return _tdb_transaction_prepare_commit(tdb);
+}
+
+/* A repack is worthwhile if the largest free record is less than half
+   of the total free space. Returns false if the freelist head cannot
+   be read. */
+static bool repack_worthwhile(struct tdb_context *tdb)
+{
+       struct tdb_record free_rec;
+       tdb_len_t free_total = 0;
+       tdb_len_t biggest = 0;
+       tdb_off_t cur;
+
+       if (tdb_ofs_read(tdb, FREELIST_TOP, &cur) == -1) {
+               return false;
+       }
+
+       /* walk the freelist until it ends or a record cannot be read */
+       for (; cur != 0 && tdb_rec_free_read(tdb, cur, &free_rec) == 0;
+            cur = free_rec.next) {
+               free_total += free_rec.rec_len;
+               if (biggest < free_rec.rec_len) {
+                       biggest = free_rec.rec_len;
+               }
+       }
+
+       return biggest * 2 < free_total;
+}
+
+/*
+  commit the current transaction
+
+  A nested commit only drops one nesting level. The outermost commit
+  runs the prepare phase if the caller has not already done so, writes
+  every cached block to the file with the original io methods, syncs,
+  and then uses _tdb_transaction_cancel() to free the transaction and
+  drop its locks.
+
+  Returns 0 on success (or the result of tdb_repack() when a repack is
+  triggered), -1 on failure.
+
+  NOTE(review): when the sync after the block writes fails, this
+  returns -1 without cancelling, so tdb->transaction stays allocated -
+  confirm callers are expected to cancel in that case.
+*/
+_PUBLIC_ int tdb_transaction_commit(struct tdb_context *tdb)
+{
+       const struct tdb_methods *methods;
+       int i;
+       bool need_repack = false;
+
+       if (tdb->transaction == NULL) {
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: no transaction\n"));
+               return -1;
+       }
+
+       tdb_trace(tdb, "tdb_transaction_commit");
+
+       if (tdb->transaction->transaction_error) {
+               tdb->ecode = TDB_ERR_IO;
+               _tdb_transaction_cancel(tdb);
+               TDB_LOG((tdb, TDB_DEBUG_ERROR, "tdb_transaction_commit: transaction error pending\n"));
+               return -1;
+       }
+
+
+       if (tdb->transaction->nesting != 0) {
+               tdb->transaction->nesting--;
+               return 0;
+       }
+
+       /* check for a null transaction */
+       if (tdb->transaction->blocks == NULL) {
+               _tdb_transaction_cancel(tdb);
+               return 0;
+       }
+
+       /* the prepare step cancels the transaction itself on failure */
+       if (!tdb->transaction->prepared) {
+               int ret = _tdb_transaction_prepare_commit(tdb);
+               if (ret)
+                       return ret;
+       }
+
+       methods = tdb->transaction->io_methods;
+
+       /* perform all the writes */
+       for (i=0;i<tdb->transaction->num_blocks;i++) {
+               tdb_off_t offset;
+               tdb_len_t length;
+
+               if (tdb->transaction->blocks[i] == NULL) {
+                       continue;
+               }
+
+               offset = i * tdb->transaction->block_size;
+               length = tdb->transaction->block_size;
+               if (i == tdb->transaction->num_blocks-1) {
+                       length = tdb->transaction->last_block_size;
+               }
+
+               if (methods->tdb_write(tdb, offset, tdb->transaction->blocks[i], length) == -1) {
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed during commit\n"));
+
+                       /* we've overwritten part of the data and
+                          possibly expanded the file, so we need to
+                          run the crash recovery code */
+                       tdb->methods = methods;
+                       tdb_transaction_recover(tdb); 
+
+                       _tdb_transaction_cancel(tdb);
+
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_commit: write failed\n"));
+                       return -1;
+               }
+               SAFE_FREE(tdb->transaction->blocks[i]);
+       } 
+
+       /* Do this before we drop lock or blocks. */
+       if (tdb->transaction->expanded) {
+               need_repack = repack_worthwhile(tdb);
+       }
+
+       /* every block was freed in the loop above; zeroing num_blocks
+          keeps the later cancel from walking the freed array */
+       SAFE_FREE(tdb->transaction->blocks);
+       tdb->transaction->num_blocks = 0;
+
+       /* ensure the new data is on disk */
+       if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
+               return -1;
+       }
+
+       /*
+         TODO: maybe write to some dummy hdr field, or write to magic
+         offset without mmap, before the last sync, instead of the
+         utime() call
+       */
+
+       /* on some systems (like Linux 2.6.x) changes via mmap/msync
+          don't change the mtime of the file, this means the file may
+          not be backed up (as tdb rounding to block sizes means that
+          file size changes are quite rare too). The following forces
+          mtime changes when a transaction completes */
+#ifdef HAVE_UTIME
+       utime(tdb->name, NULL);
+#endif
+
+       /* use a transaction cancel to free memory and remove the
+          transaction locks */
+       _tdb_transaction_cancel(tdb);
+
+       if (need_repack) {
+               return tdb_repack(tdb);
+       }
+
+       return 0;
+}
+
+
+/*
+  recover from an aborted transaction. Must be called with exclusive
+  database write access already established (including the open
+  lock to prevent new processes attaching)
+
+  If the recovery record carries the valid recovery magic, every saved
+  (offset, length, data) entry is written back to the file, the file is
+  synced and the magic is cleared. The recovery head is also cleared
+  when the recovery area lies at or beyond the recovered end of file.
+
+  Returns 0 when there was nothing to recover or recovery succeeded,
+  -1 on failure with tdb->ecode set.
+*/
+int tdb_transaction_recover(struct tdb_context *tdb)
+{
+       tdb_off_t recovery_head, recovery_eof;
+       unsigned char *data, *p;
+       uint32_t zero = 0;
+       struct tdb_record rec;
+
+       /* find the recovery area */
+       if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &recovery_head) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery head\n"));
+               tdb->ecode = TDB_ERR_IO;
+               return -1;
+       }
+
+       if (recovery_head == 0) {
+               /* we have never allocated a recovery record */
+               return 0;
+       }
+
+       /* read the recovery record */
+       if (tdb->methods->tdb_read(tdb, recovery_head, &rec, 
+                                  sizeof(rec), DOCONV()) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery record\n"));           
+               tdb->ecode = TDB_ERR_IO;
+               return -1;
+       }
+
+       if (rec.magic != TDB_RECOVERY_MAGIC) {
+               /* there is no valid recovery data */
+               return 0;
+       }
+
+       if (tdb->read_only) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: attempt to recover read only database\n"));
+               tdb->ecode = TDB_ERR_CORRUPT;
+               return -1;
+       }
+
+       /* key_len of the recovery record holds the pre-transaction file size */
+       recovery_eof = rec.key_len;
+
+       data = (unsigned char *)malloc(rec.data_len);
+       if (data == NULL) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to allocate recovery data\n"));         
+               tdb->ecode = TDB_ERR_OOM;
+               return -1;
+       }
+
+       /* read the full recovery data */
+       if (tdb->methods->tdb_read(tdb, recovery_head + sizeof(rec), data,
+                                  rec.data_len, 0) == -1) {
+               /* do not leak the recovery buffer on this error path */
+               free(data);
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to read recovery data\n"));             
+               tdb->ecode = TDB_ERR_IO;
+               return -1;
+       }
+
+       /* recover the file data */
+       p = data;
+       while (p+8 < data + rec.data_len) {
+               uint32_t ofs, len;
+               if (DOCONV()) {
+                       tdb_convert(p, 8);
+               }
+               memcpy(&ofs, p, 4);
+               memcpy(&len, p+4, 4);
+
+               /* the length comes from disk: refuse an entry whose data
+                  would run past the end of the recovery buffer rather
+                  than read beyond the allocation */
+               if (len > (tdb_len_t)((data + rec.data_len) - (p + 8))) {
+                       free(data);
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: recovery entry of %d bytes at offset %d overruns recovery data\n", len, ofs));
+                       tdb->ecode = TDB_ERR_CORRUPT;
+                       return -1;
+               }
+
+               if (tdb->methods->tdb_write(tdb, ofs, p+8, len) == -1) {
+                       free(data);
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to recover %d bytes at offset %d\n", len, ofs));
+                       tdb->ecode = TDB_ERR_IO;
+                       return -1;
+               }
+               p += 8 + len;
+       }
+
+       free(data);
+
+       if (transaction_sync(tdb, 0, tdb->map_size) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync recovery\n"));
+               tdb->ecode = TDB_ERR_IO;
+               return -1;
+       }
+
+       /* if the recovery area is after the recovered eof then remove it */
+       if (recovery_eof <= recovery_head) {
+               if (tdb_ofs_write(tdb, TDB_RECOVERY_HEAD, &zero) == -1) {
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery head\n"));
+                       tdb->ecode = TDB_ERR_IO;
+                       return -1;                      
+               }
+       }
+
+       /* remove the recovery magic */
+       if (tdb_ofs_write(tdb, recovery_head + offsetof(struct tdb_record, magic),
+                         &zero) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to remove recovery magic\n"));
+               tdb->ecode = TDB_ERR_IO;
+               return -1;                      
+       }
+
+       if (transaction_sync(tdb, 0, recovery_eof) == -1) {
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_transaction_recover: failed to sync2 recovery\n"));
+               tdb->ecode = TDB_ERR_IO;
+               return -1;
+       }
+
+       TDB_LOG((tdb, TDB_DEBUG_TRACE, "tdb_transaction_recover: recovered %d byte database\n", 
+                recovery_eof));
+
+       /* all done */
+       return 0;
+}
+
+/* Report whether the database holds a recovery record with the valid
+   recovery magic. Any I/O failures we say "needs recovery". */
+bool tdb_needs_recovery(struct tdb_context *tdb)
+{
+       struct tdb_record hdr;
+       tdb_off_t head;
+       int rc;
+
+       /* find the recovery area */
+       if (tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &head) == -1) {
+               return true;
+       }
+
+       /* we have never allocated a recovery record */
+       if (head == 0) {
+               return false;
+       }
+
+       /* read the recovery record */
+       rc = tdb->methods->tdb_read(tdb, head, &hdr, sizeof(hdr), DOCONV());
+       if (rc == -1) {
+               return true;
+       }
+
+       return hdr.magic == TDB_RECOVERY_MAGIC;
+}
diff --git a/ctdb/lib/tdb/common/traverse.c b/ctdb/lib/tdb/common/traverse.c
new file mode 100644 (file)
index 0000000..517fecb
--- /dev/null
@@ -0,0 +1,366 @@
+ /* 
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Andrew Tridgell              1999-2005
+   Copyright (C) Paul `Rusty' Russell             2000
+   Copyright (C) Jeremy Allison                           2000-2003
+
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "tdb_private.h"
+
+#define TDB_NEXT_LOCK_ERR ((tdb_off_t)-1)
+
+/* Uses traverse lock: 0 = finish, TDB_NEXT_LOCK_ERR = error,
+   other = record offset
+
+   Advances tlock to the next live (non-dead) record, starting from
+   tlock->hash / tlock->off. If tlock->off is non-zero on entry it is
+   taken as the previously returned record: that record is unlocked
+   and the search resumes at its successor.
+
+   On success the header of the found record is left in *rec, the
+   record has been locked with tdb_lock_record() and the chain lock on
+   tlock->hash is still held; the callers in this file drop both.
+
+   On the "fail" path tlock->off is reset to 0 and the chain lock is
+   released. When all chains are exhausted tdb->ecode is set to
+   TDB_SUCCESS and 0 is returned. */
+static tdb_off_t tdb_next_lock(struct tdb_context *tdb, struct tdb_traverse_lock *tlock,
+                        struct tdb_record *rec)
+{
+       int want_next = (tlock->off != 0); /* non-zero: resume after tlock->off */
+
+       /* Lock each chain from the start one. */
+       for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
+               if (!tlock->off && tlock->hash != 0) {
+                       /* this is an optimisation for the common case where
+                          the hash chain is empty, which is particularly
+                          common for the use of tdb with ldb, where large
+                          hashes are used. In that case we spend most of our
+                          time in tdb_brlock(), locking empty hash chains.
+
+                          To avoid this, we do an unlocked pre-check to see
+                          if the hash chain is empty before starting to look
+                          inside it. If it is empty then we can avoid that
+                          hash chain. If it isn't empty then we can't believe
+                          the value we get back, as we read it without a
+                          lock, so instead we get the lock and re-fetch the
+                          value below.
+
+                          Notice that not doing this optimisation on the
+                          first hash chain is critical. We must guarantee
+                          that we have done at least one fcntl lock at the
+                          start of a search to guarantee that memory is
+                          coherent on SMP systems. If records are added by
+                          others during the search then that's OK, and we
+                          could possibly miss those with this trick, but we
+                          could miss them anyway without this trick, so the
+                          semantics don't change.
+
+                          With a non-indexed ldb search this trick gains us a
+                          factor of around 80 in speed on a linux 2.6.x
+                          system (testing using ldbtest).
+                       */
+                       tdb->methods->next_hash_chain(tdb, &tlock->hash);
+                       if (tlock->hash == tdb->header.hash_size) {
+                               continue; /* no chain left: the loop condition ends the search */
+                       }
+               }
+
+               if (tdb_lock(tdb, tlock->hash, tlock->lock_rw) == -1)
+                       return TDB_NEXT_LOCK_ERR; /* chain lock not taken, so nothing to undo via "fail" */
+
+               /* No previous record?  Start at top of chain. */
+               if (!tlock->off) {
+                       if (tdb_ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
+                                    &tlock->off) == -1)
+                               goto fail;
+               } else {
+                       /* Otherwise unlock the previous record (it was
+                          locked with tdb_lock_record() when it was
+                          returned by an earlier call). */
+                       if (tdb_unlock_record(tdb, tlock->off) != 0)
+                               goto fail;
+               }
+
+               if (want_next) {
+                       /* We have offset of old record: grab next */
+                       if (tdb_rec_read(tdb, tlock->off, rec) == -1)
+                               goto fail;
+                       tlock->off = rec->next;
+               }
+
+               /* Iterate through chain */
+               while( tlock->off) {
+                       tdb_off_t current;
+                       if (tdb_rec_read(tdb, tlock->off, rec) == -1)
+                               goto fail;
+
+                       /* Detect infinite loops. From "Shlomi Yaakobovich" <Shlomi@exanet.com>.
+                          Only a record whose next pointer refers to
+                          itself is caught here. */
+                       if (tlock->off == rec->next) {
+                               tdb->ecode = TDB_ERR_CORRUPT;
+                               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_next_lock: loop detected.\n"));
+                               goto fail;
+                       }
+
+                       if (!TDB_DEAD(rec)) {
+                               /* Woohoo: we found one! Lock the record
+                                  and return with the chain lock still
+                                  held. */
+                               if (tdb_lock_record(tdb, tlock->off) != 0)
+                                       goto fail;
+                               return tlock->off;
+                       }
+
+                       /* Try to clean dead ones from old traverses.
+                          tlock->off is advanced to rec->next first so
+                          the walk continues past the dead record; the
+                          delete itself is skipped when the tdb is read
+                          only or a read traverse is active. */
+                       current = tlock->off;
+                       tlock->off = rec->next;
+                       if (!(tdb->read_only || tdb->traverse_read) && 
+                           tdb_do_delete(tdb, current, rec) != 0)
+                               goto fail;
+               }
+               tdb_unlock(tdb, tlock->hash, tlock->lock_rw);
+               want_next = 0; /* following chains are scanned from their top */
+       }
+       /* We finished iteration without finding anything */
+       tdb->ecode = TDB_SUCCESS;
+       return 0;
+
+ fail:
+       tlock->off = 0; /* forget the current position */
+       if (tdb_unlock(tdb, tlock->hash, tlock->lock_rw) != 0)
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_next_lock: On error unlock failed!\n"));
+       return TDB_NEXT_LOCK_ERR;
+}
+
+/* traverse the entire database - calling fn(tdb, key, data, private_data)
+   on each element.
+   return -1 on error or the record count traversed
+   if fn is NULL then it is not called
+   a non-zero return value from fn() indicates that the traversal should stop
+
+   tl is the caller supplied traverse lock; it is linked into
+   tdb->travlocks for the duration of the call and unlinked again
+   before returning.
+
+   The key and data handed to fn() live in one buffer allocated here
+   and freed after fn() returns. fn() is called with the chain lock
+   dropped but the record lock still held.
+  */
+static int tdb_traverse_internal(struct tdb_context *tdb, 
+                                tdb_traverse_func fn, void *private_data,
+                                struct tdb_traverse_lock *tl)
+{
+       TDB_DATA key, dbuf;
+       struct tdb_record rec;
+       int ret = 0, count = 0;
+       tdb_off_t off;
+
+       /* This was in the initialization, above, but the IRIX compiler
+        * did not like it.  crh
+        */
+       tl->next = tdb->travlocks.next;
+
+       /* fcntl locks don't stack: beware traverse inside traverse */
+       tdb->travlocks.next = tl;
+
+       /* tdb_next_lock places locks on the record returned, and its chain */
+       while ((off = tdb_next_lock(tdb, tl, &rec)) != 0) {
+               if (off == TDB_NEXT_LOCK_ERR) {
+                       ret = -1;
+                       goto out;
+               }
+               count++;
+               /* now read the full record (key immediately followed by data) */
+               key.dptr = tdb_alloc_read(tdb, tl->off + sizeof(rec), 
+                                         rec.key_len + rec.data_len);
+               if (!key.dptr) {
+                       ret = -1;
+                       /* Release both the chain lock and the record lock.
+                          A failure to drop the first must not leave the
+                          second one held. */
+                       if (tdb_unlock(tdb, tl->hash, tl->lock_rw) != 0)
+                               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_traverse: key.dptr == NULL and unlock failed!\n"));
+                       if (tdb_unlock_record(tdb, tl->off) != 0)
+                               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
+                       goto out;
+               }
+               key.dsize = rec.key_len;
+               dbuf.dptr = key.dptr + rec.key_len;
+               dbuf.dsize = rec.data_len;
+
+               tdb_trace_1rec_retrec(tdb, "traverse", key, dbuf);
+
+               /* Drop chain lock, call out */
+               if (tdb_unlock(tdb, tl->hash, tl->lock_rw) != 0) {
+                       ret = -1;
+                       /* we are bailing out: do not keep the record locked */
+                       if (tdb_unlock_record(tdb, tl->off) != 0)
+                               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_traverse: unlock_record failed!\n"));
+                       SAFE_FREE(key.dptr);
+                       goto out;
+               }
+               if (fn && fn(tdb, key, dbuf, private_data)) {
+                       /* They want us to terminate traversal */
+                       tdb_trace_ret(tdb, "tdb_traverse_end", count);
+                       if (tdb_unlock_record(tdb, tl->off) != 0) {
+                               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_traverse: unlock_record failed!\n"));
+                               ret = -1;
+                       }
+                       SAFE_FREE(key.dptr);
+                       goto out;
+               }
+               SAFE_FREE(key.dptr);
+       }
+       tdb_trace(tdb, "tdb_traverse_end");
+out:
+       /* unlink our traverse lock again */
+       tdb->travlocks.next = tl->next;
+       if (ret < 0)
+               return -1;
+       else
+               return count;
+}
+
+
+/*
+  a read style traverse - walks the database with read (F_RDLCK)
+  locks and keeps tdb->traverse_read raised while the traversal runs
+
+  returns -1 on error, otherwise whatever tdb_traverse_internal()
+  returns
+*/
+_PUBLIC_ int tdb_traverse_read(struct tdb_context *tdb, 
+                     tdb_traverse_func fn, void *private_data)
+{
+       struct tdb_traverse_lock tlock = { NULL, 0, 0, F_RDLCK };
+       int nrecords;
+
+       /* take a read lock on the transaction lock before anything
+          else; this copes with the lock ordering semantics of
+          solaris10 */
+       if (tdb_transaction_lock(tdb, F_RDLCK, TDB_LOCK_WAIT) != 0)
+               return -1;
+
+       tdb->traverse_read += 1;
+       tdb_trace(tdb, "tdb_traverse_read_start");
+       nrecords = tdb_traverse_internal(tdb, fn, private_data, &tlock);
+       tdb->traverse_read -= 1;
+
+       tdb_transaction_unlock(tdb, F_RDLCK);
+       return nrecords;
+}
+
+/*
+  a write style traverse - holds the transaction lock in write mode
+  for the whole traversal to prevent deadlocks
+
+  if the tdb is read only, or a read traverse is already running, the
+  call is handed over to tdb_traverse_read() instead
+
+  WARNING: The data buffer given to the callback fn does NOT meet the
+  alignment restrictions malloc gives you.
+*/
+_PUBLIC_ int tdb_traverse(struct tdb_context *tdb, 
+                tdb_traverse_func fn, void *private_data)
+{
+       struct tdb_traverse_lock tlock = { NULL, 0, 0, F_WRLCK };
+       int nrecords;
+
+       if (!tdb->read_only && !tdb->traverse_read) {
+               /* writable and no read traverse active: do it with
+                  write locks */
+               if (tdb_transaction_lock(tdb, F_WRLCK, TDB_LOCK_WAIT) != 0)
+                       return -1;
+
+               tdb->traverse_write += 1;
+               tdb_trace(tdb, "tdb_traverse_start");
+               nrecords = tdb_traverse_internal(tdb, fn, private_data, &tlock);
+               tdb->traverse_write -= 1;
+
+               tdb_transaction_unlock(tdb, F_WRLCK);
+               return nrecords;
+       }
+
+       return tdb_traverse_read(tdb, fn, private_data);
+}
+
+
+/* find the first entry in the database and return its key
+
+   tdb_null is returned when the database has no record or on error.
+   The returned record stays locked and is remembered in
+   tdb->travlocks; only its hash chain is unlocked before returning. */
+_PUBLIC_ TDB_DATA tdb_firstkey(struct tdb_context *tdb)
+{
+       struct tdb_traverse_lock *tl = &tdb->travlocks;
+       struct tdb_record hdr;
+       TDB_DATA first;
+       tdb_off_t rec_off;
+
+       /* drop the record lock left over from an earlier walk */
+       if (tdb_unlock_record(tdb, tl->off) != 0)
+               return tdb_null;
+
+       /* restart at the top of the first chain, using read locks */
+       tl->off = 0;
+       tl->hash = 0;
+       tl->lock_rw = F_RDLCK;
+
+       /* Fetch the first record; this locks it and its chain. */
+       rec_off = tdb_next_lock(tdb, tl, &hdr);
+       if (rec_off == TDB_NEXT_LOCK_ERR || rec_off == 0) {
+               tdb_trace_retrec(tdb, "tdb_firstkey", tdb_null);
+               return tdb_null;
+       }
+
+       /* the key bytes follow the record header directly */
+       first.dsize = hdr.key_len;
+       first.dptr = tdb_alloc_read(tdb, tl->off + sizeof(hdr), first.dsize);
+
+       tdb_trace_retrec(tdb, "tdb_firstkey", first);
+
+       /* Release the chain lock; the record lock is kept. */
+       if (tdb_unlock(tdb, tl->hash, tl->lock_rw) != 0)
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
+       return first;
+}
+
+/* find the next entry in the database, returning its key
+
+   oldkey is the key returned by the previous tdb_firstkey() or
+   tdb_nextkey() call. If the record remembered in tdb->travlocks
+   still has exactly that key the walk continues from it, otherwise
+   oldkey is looked up again and the walk continues from there.
+
+   tdb_null is returned at the end of the database or on error. */
+_PUBLIC_ TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA oldkey)
+{
+       uint32_t oldhash;
+       TDB_DATA key = tdb_null;
+       struct tdb_record rec;
+       unsigned char *k = NULL;
+       tdb_off_t off;
+
+       /* Is locked key the old key?  If so, traverse will be reliable. */
+       if (tdb->travlocks.off) {
+               if (tdb_lock(tdb,tdb->travlocks.hash,tdb->travlocks.lock_rw))
+                       return tdb_null;
+               /* The lengths must match before the bytes are compared:
+                  k only holds rec.key_len bytes, so comparing
+                  oldkey.dsize bytes could read past the buffer, and a
+                  mere prefix match must not count as the same key. */
+               if (tdb_rec_read(tdb, tdb->travlocks.off, &rec) == -1
+                   || rec.key_len != oldkey.dsize
+                   || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
+                                           rec.key_len))
+                   || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
+                       /* No, it wasn't: unlock it and start from scratch */
+                       if (tdb_unlock_record(tdb, tdb->travlocks.off) != 0) {
+                               tdb_trace_1rec_retrec(tdb, "tdb_nextkey",
+                                                     oldkey, tdb_null);
+                               SAFE_FREE(k);
+                               return tdb_null;
+                       }
+                       if (tdb_unlock(tdb, tdb->travlocks.hash, tdb->travlocks.lock_rw) != 0) {
+                               SAFE_FREE(k);
+                               return tdb_null;
+                       }
+                       tdb->travlocks.off = 0;
+               }
+
+               SAFE_FREE(k);
+       }
+
+       if (!tdb->travlocks.off) {
+               /* No previous element: do normal find, and lock record */
+               tdb->travlocks.off = tdb_find_lock_hash(tdb, oldkey, tdb->hash_fn(&oldkey), tdb->travlocks.lock_rw, &rec);
+               if (!tdb->travlocks.off) {
+                       tdb_trace_1rec_retrec(tdb, "tdb_nextkey", oldkey, tdb_null);
+                       return tdb_null;
+               }
+               tdb->travlocks.hash = BUCKET(rec.full_hash);
+               if (tdb_lock_record(tdb, tdb->travlocks.off) != 0) {
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
+                       /* The find above left the chain locked (it is the
+                          lock released as "chain of old record" below).
+                          Drop it here, and forget the record we failed
+                          to lock so travlocks.off never names an
+                          unlocked record. */
+                       tdb->travlocks.off = 0;
+                       if (tdb_unlock(tdb, tdb->travlocks.hash, tdb->travlocks.lock_rw) != 0)
+                               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
+                       return tdb_null;
+               }
+       }
+       oldhash = tdb->travlocks.hash;
+
+       /* Grab next record: locks chain and returned record,
+          unlocks old record */
+       off = tdb_next_lock(tdb, &tdb->travlocks, &rec);
+       if (off != TDB_NEXT_LOCK_ERR && off != 0) {
+               key.dsize = rec.key_len;
+               key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
+                                         key.dsize);
+               /* Unlock the chain of this new record */
+               if (tdb_unlock(tdb, tdb->travlocks.hash, tdb->travlocks.lock_rw) != 0)
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
+       }
+       /* Unlock the chain of old record */
+       if (tdb_unlock(tdb, BUCKET(oldhash), tdb->travlocks.lock_rw) != 0)
+               TDB_LOG((tdb, TDB_DEBUG_FATAL, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
+       tdb_trace_1rec_retrec(tdb, "tdb_nextkey", oldkey, key);
+       return key;
+}
+
diff --git a/ctdb/lib/tdb/docs/README b/ctdb/lib/tdb/docs/README
new file mode 100644 (file)
index 0000000..fe0e258
--- /dev/null
@@ -0,0 +1,273 @@
+tdb - a trivial database system
+tridge@linuxcare.com December 1999
+==================================
+
+This is a simple database API. It was inspired by the realisation that
+in Samba we have several ad-hoc bits of code that essentially
+implement small databases for sharing structures between parts of
+Samba. As I was about to add another I realised that a generic
+database module was called for to replace all the ad-hoc bits.
+
+I based the interface on gdbm. I couldn't use gdbm as we need to be
+able to have multiple writers to the databases at one time.
+
+Compilation
+-----------
+
+add HAVE_MMAP=1 to use mmap instead of read/write
+add NOLOCK=1 to disable locking code
+
+Testing
+-------
+
+Compile tdbtest.c and link with gdbm for testing. tdbtest will perform
+identical operations via tdb and gdbm then make sure the result is the
+same
+
+Also included is tdbtool, which allows simple database manipulation
+on the commandline.
+
+tdbtest and tdbtool are not built as part of Samba, but are included
+for completeness.
+
+Interface
+---------
+
+The interface is very similar to gdbm except for the following:
+
+- different open interface. The tdb_open call is more similar to a
+  traditional open()
+- no tdbm_reorganise() function
+- no tdbm_sync() function. No operations are cached in the library anyway
+- added a tdb_traverse() function for traversing the whole database
+- added transactions support
+
+A general rule for using tdb is that the caller frees any returned
+TDB_DATA structures. Just call free(p.dptr) to free a TDB_DATA
+return value called p. This is the same as gdbm.
+
+here is a full list of tdb functions with brief descriptions.
+
+
+----------------------------------------------------------------------
+TDB_CONTEXT *tdb_open(char *name, int hash_size, int tdb_flags,
+                     int open_flags, mode_t mode)
+
+   open the database, creating it if necessary 
+
+   The open_flags and mode are passed straight to the open call on the database
+   file. A flags value of O_WRONLY is invalid
+
+   The hash size is advisory, use zero for a default value. 
+
+   return is NULL on error
+
+   possible tdb_flags are:
+    TDB_CLEAR_IF_FIRST - clear database if we are the only one with it open
+    TDB_INTERNAL - don't use a file, instead store the data in
+                   memory. The filename is ignored in this case.
+    TDB_NOLOCK - don't do any locking
+    TDB_NOMMAP - don't use mmap
+    TDB_NOSYNC - don't synchronise transactions to disk
+    TDB_SEQNUM - maintain a sequence number
+    TDB_VOLATILE - activate the per-hashchain freelist, default 5
+    TDB_ALLOW_NESTING - allow transactions to nest
+    TDB_DISALLOW_NESTING - disallow transactions to nest
+
+----------------------------------------------------------------------
+TDB_CONTEXT *tdb_open_ex(char *name, int hash_size, int tdb_flags,
+                        int open_flags, mode_t mode,
+                        const struct tdb_logging_context *log_ctx,
+                        tdb_hash_func hash_fn)
+
+This is like tdb_open(), but allows you to pass an initial logging and
+hash function. Be careful when passing a hash function - all users of
+the database must use the same hash function or you will get data
+corruption.
+
+
+----------------------------------------------------------------------
+char *tdb_error(TDB_CONTEXT *tdb);
+
+     return an error string for the last tdb error
+
+----------------------------------------------------------------------
+int tdb_close(TDB_CONTEXT *tdb);
+
+   close a database
+
+----------------------------------------------------------------------
+TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key);
+
+   fetch an entry in the database given a key 
+   if the return value has a null dptr then an error occurred
+
+   caller must free the resulting data
+
+----------------------------------------------------------------------
+int tdb_parse_record(struct tdb_context *tdb, TDB_DATA key,
+                    int (*parser)(TDB_DATA key, TDB_DATA data,
+                                  void *private_data),
+                    void *private_data);
+
+   Hand a record to a parser function without allocating it.
+
+   This function is meant as a fast tdb_fetch alternative for large records
+   that are frequently read. The "key" and "data" arguments point directly
+   into the tdb shared memory, they are not aligned at any boundary.
+
+   WARNING: The parser is called while tdb holds a lock on the record. DO NOT
+   call other tdb routines from within the parser. Also, for good performance
+   you should make the parser fast to allow parallel operations.
+
+   tdb_parse_record returns -1 if the record was not found.  If the record was
+   found, the return value of "parser" is passed up to the caller.
+
+----------------------------------------------------------------------
+int tdb_exists(TDB_CONTEXT *tdb, TDB_DATA key);
+
+   check if an entry in the database exists 
+
+   note that 1 is returned if the key is found and 0 is returned if not found
+   this doesn't match the conventions in the rest of this module, but is
+   compatible with gdbm
+
+----------------------------------------------------------------------
+int tdb_traverse(TDB_CONTEXT *tdb, int (*fn)(TDB_CONTEXT *tdb,
+                 TDB_DATA key, TDB_DATA dbuf, void *state), void *state);
+
+   traverse the entire database - calling fn(tdb, key, data, state) on each 
+   element.
+
+   return -1 on error or the record count traversed
+
+   if fn is NULL then it is not called
+
+   a non-zero return value from fn() indicates that the traversal
+   should stop. Traversal callbacks may not start transactions.
+
+   WARNING: The data buffer given to the callback fn does NOT meet the
+   alignment restrictions malloc gives you.
+
+----------------------------------------------------------------------
+int tdb_traverse_read(TDB_CONTEXT *tdb, int (*fn)(TDB_CONTEXT *tdb,
+                     TDB_DATA key, TDB_DATA dbuf, void *state), void *state);
+
+   traverse the entire database - calling fn(tdb, key, data, state) on
+   each element, but marking the database read only during the
+   traversal, so any write operations will fail. This allows tdb to
+   use read locks, which increases the parallelism possible during the
+   traversal.
+
+   return -1 on error or the record count traversed
+
+   if fn is NULL then it is not called
+
+   a non-zero return value from fn() indicates that the traversal
+   should stop. Traversal callbacks may not start transactions.
+
+----------------------------------------------------------------------
+TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb);
+
+   find the first entry in the database and return its key
+
+   the caller must free the returned data
+
+----------------------------------------------------------------------
+TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA key);
+
+   find the next entry in the database, returning its key
+
+   the caller must free the returned data
+
+----------------------------------------------------------------------
+int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key);
+
+   delete an entry in the database given a key
+
+----------------------------------------------------------------------
+int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag);
+
+   store an element in the database, replacing any existing element
+   with the same key 
+
+   If flag==TDB_INSERT then don't overwrite an existing entry
+   If flag==TDB_MODIFY then don't create a new entry
+
+   return 0 on success, -1 on failure
+
+----------------------------------------------------------------------
+int tdb_writelock(TDB_CONTEXT *tdb);
+
+   lock the database. If we already have it locked then don't do anything
+
+----------------------------------------------------------------------
+int tdb_writeunlock(TDB_CONTEXT *tdb);
+   unlock the database
+
+----------------------------------------------------------------------
+int tdb_lockchain(TDB_CONTEXT *tdb, TDB_DATA key);
+
+   lock one hash chain. This is meant to be used to reduce locking
+   contention - it cannot guarantee how many records will be locked
+
+----------------------------------------------------------------------
+int tdb_unlockchain(TDB_CONTEXT *tdb, TDB_DATA key);
+
+   unlock one hash chain
+
+----------------------------------------------------------------------
+int tdb_transaction_start(TDB_CONTEXT *tdb)
+
+   start a transaction. All operations after the transaction start can
+   either be committed with tdb_transaction_commit() or cancelled with
+   tdb_transaction_cancel(). 
+
+   If you call tdb_transaction_start() again on the same tdb context
+   while a transaction is in progress, then the same transaction
+   buffer is re-used. The number of tdb_transaction_{commit,cancel}
+   operations must match the number of successful
+   tdb_transaction_start() calls.
+
+   Note that transactions are by default disk synchronous, and use a
+   recover area in the database to automatically recover the database
+   on the next open if the system crashes during a transaction. You
+   can disable the synchronous transaction recovery setup using the
+   TDB_NOSYNC flag, which will greatly speed up operations at the risk
+   of corrupting your database if the system crashes.
+
+   Operations made within a transaction are not visible to other users
+   of the database until a successful commit.
+
+----------------------------------------------------------------------
+int tdb_transaction_cancel(TDB_CONTEXT *tdb)
+
+   cancel a current transaction, discarding all write and lock
+   operations that have been made since the transaction started.
+
+
+----------------------------------------------------------------------
+int tdb_transaction_commit(TDB_CONTEXT *tdb)
+
+   commit a current transaction, updating the database and releasing
+   the transaction locks.
+
+----------------------------------------------------------------------
+int tdb_transaction_prepare_commit(TDB_CONTEXT *tdb)
+
+   prepare to commit a current transaction, for two-phase commits.
+   Once prepared for commit, the only allowed calls are
+   tdb_transaction_commit() or tdb_transaction_cancel(). Preparing
+   allocates disk space for the pending updates, so a subsequent
+   commit should succeed (barring any hardware failures).
+
+----------------------------------------------------------------------
+int tdb_check(TDB_CONTEXT *tdb,
+             int (*check)(TDB_DATA key, TDB_DATA data, void *private_data),
+             void *private_data);
+
+   check the consistency of the database, calling back the check function
+   (if non-NULL) with each record.  If some consistency check fails, or
+   the supplied check function returns -1, tdb_check returns -1, otherwise
+   0.  Note that logging function (if set) will be called with additional
+   information on the corruption found.
diff --git a/ctdb/lib/tdb/docs/mainpage.dox b/ctdb/lib/tdb/docs/mainpage.dox
new file mode 100644 (file)
index 0000000..d130769
--- /dev/null
@@ -0,0 +1,61 @@
+/**
+
+@mainpage
+
+This is a simple database API. It was inspired by the realisation that in Samba
+we have several ad-hoc bits of code that essentially implement small databases
+for sharing structures between parts of Samba.
+
+The interface is based on gdbm. gdbm couldn't be used as we needed to be able to
+have multiple writers to the databases at one time.
+
+@section tdb_download Download
+
+You can download the latest releases of tdb from the
+<a href="http://samba.org/ftp/tdb">tdb directory</a> on the samba public source
+archive.
+
+You can download the latest code either via git or rsync.
+
+To fetch via git see the following guide:
+
+<a href="http://wiki.samba.org/index.php/Using_Git_for_Samba_Development">Using Git for Samba Development</a>
+Once you have cloned the tree switch to the master branch and cd into the source/lib/tdb directory.
+
+To fetch via rsync use these commands:
+
+<pre>
+  rsync -Pavz samba.org::ftp/unpacked/standalone_projects/lib/tdb .
+  rsync -Pavz samba.org::ftp/unpacked/standalone_projects/lib/replace .
+</pre>
+
+and build in tdb. It will find the replace library in the directory above
+automatically.
+
+@section tdb_bugs Discussion and bug reports
+
+tdb does not currently have its own mailing list or bug tracking system. For now,
+please use the
+<a href="https://lists.samba.org/mailman/listinfo/samba-technical">samba-technical</a>
+mailing list, and the <a href="http://bugzilla.samba.org/">Samba bugzilla</a> bug
+tracking system.
+
+
+@section tdb_compilation Compilation
+
+add HAVE_MMAP=1 to use mmap instead of read/write
+add NOLOCK=1 to disable locking code
+
+@section tdb_testing Testing
+
+Compile tdbtest.c and link with gdbm for testing. tdbtest will perform
+identical operations via tdb and gdbm then make sure the result is the
+same
+
+Also included is tdbtool, which allows simple database manipulation
+on the commandline.
+
+tdbtest and tdbtool are not built as part of Samba, but are included
+for completeness.
+
+*/
diff --git a/ctdb/lib/tdb/docs/tdb.magic b/ctdb/lib/tdb/docs/tdb.magic
new file mode 100644 (file)
index 0000000..f5619e7
--- /dev/null
@@ -0,0 +1,10 @@
+# Magic file(1) information about tdb files.
+#
+# Install this into /etc/magic or the corresponding location for your
+# system, or pass as a -m argument to file(1).
+
+# You may use and redistribute this file without restriction.
+
+0      string  TDB\ file               TDB database
+>32    lelong  =0x2601196D             version 6, little-endian
+>>36   lelong  x                       hash size %d bytes
diff --git a/ctdb/lib/tdb/docs/tracing.txt b/ctdb/lib/tdb/docs/tracing.txt
new file mode 100644 (file)
index 0000000..98c5db9
--- /dev/null
@@ -0,0 +1,46 @@
+How And Why To Use TDB Tracing
+==============================
+
+You can trace all TDB operations, using TDB_TRACE.  It is not complete
+(error conditions which expect to be logged will not always be traced
+correctly, so you should set up a logging function too), but is designed
+to collect benchmark-style traces to allow us to optimize TDB.
+
+Note: tracing is not efficient, and the trace files are huge: a
+traverse of the database is particularly large!  But they compress very
+well with rzip (http://rzip.samba.org)
+
+How to gather trace files:
+--------------------------
+1) Uncomment /* #define TDB_TRACE 1 */ in tdb_private.h.
+2) Rebuild TDB, and everything that uses it.
+3) Run something.
+
+Your trace files will be called <tdbname>.trace.<pid>.  These files
+will not be overwritten: if the same process reopens the same TDB, an
+error will be logged and tracing will be disabled.
+
+How to replay trace files:
+--------------------------
+1) For benchmarking, remember to rebuild tdb with #define TDB_TRACE commented
+   out again!
+2) Grab the latest "replay_trace.c" from CCAN's tdb module (tools/ dir):
+       http://ccan.ozlabs.org/tarballs/tdb.tar.bz2
+3) Compile up replay_trace, munging as necessary.
+4) Run replay_trace <scratch-tdb-name> <tracefiles>...
+
+If given more than one trace file (presumably from the same tdb)
+replay_trace will try to figure out the dependencies between the operations
+and fire off a child to run each trace.  Occasionally it gets stuck, in
+which case it will add another dependency and retry.  Eventually it will
+give a speed value.
+
+replay_trace can intuit the existence of previous data in the tdb (ie.
+activity prior to the trace(s) supplied) and will prepopulate as
+necessary.
+
+You can run --quiet for straight benchmark results, and -n to run multiple
+times (this saves time, since it need only calculate dependencies once).
+
+Good luck!
+Rusty Russell <rusty@rustcorp.com.au>
diff --git a/ctdb/lib/tdb/doxy.config b/ctdb/lib/tdb/doxy.config
new file mode 100644 (file)
index 0000000..f55e9c3
--- /dev/null
@@ -0,0 +1,1697 @@
+# Doxyfile 1.7.3
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a hash (#) is considered a comment and will be ignored.
+# The format is:
+#       TAG = value [value, ...]
+# For lists, items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
+# by quotes) that should identify the project.
+
+PROJECT_NAME           = tdb
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER         = 1.2.9
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description for a project that appears at the top of each page and should give the viewer a quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          =
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is
+# included in the documentation. The maximum height of the logo should not
+# exceed 55 pixels and the maximum width should not exceed 200 pixels.
+# Doxygen will copy the logo to the output directory.
+
+PROJECT_LOGO           =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = docs
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF       = "The $name class" \
+                         "The $name widget" \
+                         "The $name file" \
+                         is \
+                         provides \
+                         specifies \
+                         contains \
+                         represents \
+                         a \
+                         an \
+                         the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before file names in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES        = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF      = YES
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE               = 8
+
+# This tag can be used to specify a number of aliases that act
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES                =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C  = YES
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given extension.
+# Doxygen has a built-in mapping, but you can override or extend it using this
+# tag. The format is ext=language, where ext is a file extension, and language
+# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C,
+# C++, D, PHP, Objective-C, Python, Fortran, VHDL. For instance to make
+# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
+# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions
+# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen.
+
+EXTENSION_MAPPING      =
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); vs.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING            = YES
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols
+
+SYMBOL_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC         = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES  = NO
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS     = YES
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES     = YES
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
+# will list include files with double quotes in the documentation
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
+# will sort the (brief and detailed) documentation of class members so that
+# constructors and destructors are listed first. If set to NO (the default)
+# the constructors will appear in the respective orders defined by
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper type resolution of all parameters of a function it will reject a
+# match between the prototype and the implementation of a member function even if there is only one candidate or it is obvious which candidate to choose by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen
+# will still accept a match between prototype and implementation in such cases.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or macro consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and macros in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES        = YES
+
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES       = NO
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE            =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS               = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR      = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT                  = include \
+                         docs
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.for *.vhd *.vhdl
+
+FILE_PATTERNS          = *.cpp \
+                         *.cc \
+                         *.c \
+                         *.h \
+                         *.hh \
+                         *.hpp \
+                         *.dox
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE              = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# be excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE                =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       = */.git/*
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH           =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS       =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH             =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+# If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.
+# Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.
+# The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty or if
+# none of the patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERNS (if any)
+# and it is also possible to disable source filtering for a specific pattern
+# using *.ext= (so without naming a filter). This option only has effect when
+# FILTER_SOURCE_FILES is enabled.
+
+FILTER_SOURCE_PATTERNS =
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.
+# Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX     = NO
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header.
+
+HTML_HEADER            =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER            =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET        =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output.
+# Doxygen will adjust the colors in the stylesheet and background images
+# according to this color. Hue is specified as an angle on a colorwheel,
+# see http://en.wikipedia.org/wiki/Hue for more information.
+# For instance the value 0 represents red, 60 is yellow, 120 is green,
+# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
+# The allowed range is 0 to 359.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
+# the colors in the HTML output. For a value of 0 the output will use
+# grayscales only. A value of 255 will produce the most vivid colors.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
+# the luminance component of the colors in the HTML output. Values below
+# 100 gradually make the output lighter, whereas values above 100 make
+# the output darker. The value divided by 100 is the actual gamma applied,
+# so 80 represents a gamma of 0.8, the value 220 represents a gamma of 2.2,
+# and 100 does not change the gamma.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP         = NO
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS     = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox,
+# Netscape 6.0+, Internet explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+
+GENERATE_DOCSET        = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP      = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE               =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION           =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI           = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING     =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated
+# that can be used as input for Qt's qhelpgenerator to generate a
+# Qt Compressed Help (.qch) of the generated HTML documentation.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE          =
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
+# add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">
+# Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's
+# filter section matches.
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">
+# Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+# will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
+# top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it.
+
+DISABLE_INDEX          = NO
+
+# This tag can be used to set the number of enum values (range [0,1..20])
+# that doxygen will group on one line in the generated HTML documentation.
+# Note that a value of 0 will completely suppress the enum values from appearing in the overview section.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+
+GENERATE_TREEVIEW      = NONE
+
+# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories,
+# and Class Hierarchy pages using a tree view instead of an ordered list.
+
+USE_INLINE_TREES       = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH         = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open
+# links to external symbols imported via tag files in a separate window.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are
+# not supported properly for IE 6.0, but are supported on all modern browsers.
+# Note that when changing this option you need to delete any form_*.png files
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT    = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
+# (see http://www.mathjax.org) which uses client side Javascript for the
+# rendering instead of using prerendered bitmaps. Use this if you do not
+# have LaTeX installed or if you want formulas to look prettier in the HTML
+# output. When enabled you also need to install MathJax separately and
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX            = NO
+
+# When MathJax is enabled you need to specify the location relative to the
+# HTML output directory using the MATHJAX_RELPATH option. The destination
+# directory should contain the MathJax.js script. For instance, if the mathjax
+# directory is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the mathjax.org site, so you can quickly see the result without installing
+# MathJax, but it is strongly recommended to install a local copy of MathJax
+# before deployment.
+
+MATHJAX_RELPATH        = http://www.mathjax.org/mathjax
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
+
+SEARCHENGINE           = NO
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full text search. The disadvantages are that it is more difficult to setup
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX         = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+# Note that when enabling USE_PDFLATEX this option is only used for
+# generating bitmaps for formulas in the HTML output, but not in the
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE             = a4wide
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER           =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE        = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES     = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE      = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE    =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN           = YES
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION          = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA             =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD                =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader.
+# This is useful
+# if you want to understand what is going on.
+# On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION        = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF     = YES
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
+# in the INCLUDE_PATH (see below) will be searched if a #include is found.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED             = DOXYGEN \
+                         PRINTF_ATTRIBUTE(x,y)=
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition that overrules the definition found in the source code.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all references to function-like macros
+# that are alone on a line, have an all uppercase name, and do not end with a
+# semicolon, because these will confuse the parser if not removed.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+#
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS        = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option also works with HAVE_DOT disabled, but it is recommended to
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS         = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH            =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT               = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is
+# allowed to run in parallel. When set to 0 (the default) doxygen will
+# base this on the number of processors available in the system. You can set it
+# explicitly to a value larger than 0 to get control over the balance
+# between CPU load and processing speed.
+
+DOT_NUM_THREADS        = 0
+
+# By default doxygen will write a font called Helvetica to the output
+# directory and reference it in all dot files that doxygen generates.
+# When you want a differently looking font you can specify the font name
+# using DOT_FONTNAME. You need to make sure dot is able to find the font,
+# which can be done by putting it in a standard location or by setting the
+# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory
+# containing the font.
+
+DOT_FONTNAME           = FreeSans
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the output directory to look for the
+# FreeSans.ttf font (which doxygen will put there itself). If you specify a
+# different font using DOT_FONTNAME you can set the path where dot
+# can find it using this tag.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct groups dependencies
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK               = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH          = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are png, jpg, gif or svg.
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT       = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS           =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the
+# \mscfile command).
+
+MSCFILE_DIRS           =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lie further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT        = YES
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP            = YES
diff --git a/ctdb/lib/tdb/include/tdb.h b/ctdb/lib/tdb/include/tdb.h
new file mode 100644 (file)
index 0000000..d19439e
--- /dev/null
@@ -0,0 +1,869 @@
+#ifndef __TDB_H__
+#define __TDB_H__
+
+/* 
+   Unix SMB/CIFS implementation.
+
+   trivial database library
+
+   Copyright (C) Andrew Tridgell 1999-2004
+   
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#include <signal.h>
+
+/**
+ * @defgroup tdb The tdb API
+ *
+ * tdb is a Trivial database. In concept, it is very much like GDBM, and BSD's
+ * DB except that it allows multiple simultaneous writers and uses locking
+ * internally to keep writers from trampling on each other. tdb is also
+ * extremely small.
+ *
+ * @section tdb_interface Interface
+ *
+ * The interface is very similar to gdbm except for the following:
+ *
+ * <ul>
+ * <li>different open interface. The tdb_open call is more similar to a
+ * traditional open()</li>
+ * <li>no tdbm_reorganise() function</li>
+ * <li>no tdbm_sync() function. No operations are cached in the library
+ *     anyway</li>
+ * <li>added a tdb_traverse() function for traversing the whole database</li>
+ * <li>added transactions support</li>
+ * </ul>
+ *
+ * A general rule for using tdb is that the caller frees any returned TDB_DATA
+ * structures. Just call free(p.dptr) to free a TDB_DATA return value called p.
+ * This is the same as gdbm.
+ *
+ * @{
+ */
+
+/** Flags to tdb_store() */
+#define TDB_REPLACE 1          /** Unused */
+#define TDB_INSERT 2           /** Don't overwrite an existing entry */
+#define TDB_MODIFY 3           /** Don't create a new entry          */
+
+/** Flags for tdb_open() */
+#define TDB_DEFAULT 0 /** just a readability place holder */
+#define TDB_CLEAR_IF_FIRST 1 /** If this is the first open, wipe the db */
+#define TDB_INTERNAL 2 /** Don't store on disk */
+#define TDB_NOLOCK   4 /** Don't do any locking */
+#define TDB_NOMMAP   8 /** Don't use mmap */
+#define TDB_CONVERT 16 /** Convert endian (internal use) */
+#define TDB_BIGENDIAN 32 /** Header is big-endian (internal use) */
+#define TDB_NOSYNC   64 /** Don't use synchronous transactions */
+#define TDB_SEQNUM   128 /** Maintain a sequence number */
+#define TDB_VOLATILE   256 /** Activate the per-hashchain freelist, default 5 */
+#define TDB_ALLOW_NESTING 512 /** Allow transactions to nest */
+#define TDB_DISALLOW_NESTING 1024 /** Disallow transactions to nest */
+#define TDB_INCOMPATIBLE_HASH 2048 /** Better hashing: can't be opened by tdb < 1.2.6. */
+
+/** The tdb error codes */
+enum TDB_ERROR {TDB_SUCCESS=0, TDB_ERR_CORRUPT, TDB_ERR_IO, TDB_ERR_LOCK, 
+               TDB_ERR_OOM, TDB_ERR_EXISTS, TDB_ERR_NOLOCK, TDB_ERR_LOCK_TIMEOUT,
+               TDB_ERR_NOEXIST, TDB_ERR_EINVAL, TDB_ERR_RDONLY,
+               TDB_ERR_NESTING};
+
+/** Debugging uses one of the following levels */
+enum tdb_debug_level {TDB_DEBUG_FATAL = 0, TDB_DEBUG_ERROR, 
+                     TDB_DEBUG_WARNING, TDB_DEBUG_TRACE};
+
+/** The tdb data structure */
+typedef struct TDB_DATA {
+       unsigned char *dptr; /**< Data buffer; for TDB_DATA return values the caller releases it with free() */
+       size_t dsize; /**< Presumably the length of dptr in bytes -- TODO confirm */
+} TDB_DATA;
+
+#ifndef PRINTF_ATTRIBUTE
+#if (__GNUC__ >= 3)
+/** Use gcc attribute to check printf fns.  a1 is the 1-based index of
+ * the parameter containing the format, and a2 the index of the first
+ * argument. Note that some gcc 2.x versions don't handle this
+ * properly **/
+#define PRINTF_ATTRIBUTE(a1, a2) __attribute__ ((format (__printf__, a1, a2)))
+#else
+#define PRINTF_ATTRIBUTE(a1, a2)
+#endif
+#endif
+
+/** This is the context structure that is returned from a db open. */
+typedef struct tdb_context TDB_CONTEXT;
+
+typedef int (*tdb_traverse_func)(struct tdb_context *, TDB_DATA, TDB_DATA, void *);
+typedef void (*tdb_log_func)(struct tdb_context *, enum tdb_debug_level, const char *, ...) PRINTF_ATTRIBUTE(3, 4);
+typedef unsigned int (*tdb_hash_func)(TDB_DATA *key);
+
+struct tdb_logging_context {
+        tdb_log_func log_fn; /**< printf-style logging callback (see tdb_log_func) */
+        void *log_private; /**< Private data pointer of the logging function (see tdb_get_logging_private()) */
+};
+
+/**
+ * @brief Open the database, creating it if necessary.
+ *
+ * @param[in]  name     The name of the db to open.
+ *
+ * @param[in]  hash_size The hash size is advisory, use zero for a default
+ *                       value.
+ *
+ * @param[in]  tdb_flags The flags to use to open the db:\n\n
+ *                         TDB_CLEAR_IF_FIRST - Clear database if we are the
+ *                                              only one with it open\n
+ *                         TDB_INTERNAL - Don't use a file, instead store the
+ *                                        data in memory. The filename is
+ *                                        ignored in this case.\n
+ *                         TDB_NOLOCK - Don't do any locking\n
+ *                         TDB_NOMMAP - Don't use mmap\n
+ *                         TDB_NOSYNC - Don't synchronise transactions to disk\n
+ *                         TDB_SEQNUM - Maintain a sequence number\n
+ *                         TDB_VOLATILE - activate the per-hashchain freelist,
+ *                                        default 5.\n
+ *                         TDB_ALLOW_NESTING - Allow transactions to nest.\n
+ *                         TDB_DISALLOW_NESTING - Disallow transactions to nest.\n
+ *
+ * @param[in]  open_flags Flags for the open(2) function.
+ *
+ * @param[in]  mode     The mode for the open(2) function.
+ *
+ * @return              A tdb context structure, NULL on error.
+ */
+struct tdb_context *tdb_open(const char *name, int hash_size, int tdb_flags,
+                     int open_flags, mode_t mode);
+
+/**
+ * @brief Open the database, creating it if necessary.
+ *
+ * This is like tdb_open(), but allows you to pass an initial logging and
+ * hash function. Be careful when passing a hash function - all users of the
+ * database must use the same hash function or you will get data corruption.
+ *
+ * @param[in]  name     The name of the db to open.
+ *
+ * @param[in]  hash_size The hash size is advisory, use zero for a default
+ *                       value.
+ *
+ * @param[in]  tdb_flags The flags to use to open the db:\n\n
+ *                         TDB_CLEAR_IF_FIRST - Clear database if we are the
+ *                                              only one with it open\n
+ *                         TDB_INTERNAL - Don't use a file, instead store the
+ *                                        data in memory. The filename is
+ *                                        ignored in this case.\n
+ *                         TDB_NOLOCK - Don't do any locking\n
+ *                         TDB_NOMMAP - Don't use mmap\n
+ *                         TDB_NOSYNC - Don't synchronise transactions to disk\n
+ *                         TDB_SEQNUM - Maintain a sequence number\n
+ *                         TDB_VOLATILE - activate the per-hashchain freelist,
+ *                                        default 5.\n
+ *                         TDB_ALLOW_NESTING - Allow transactions to nest.\n
+ *                         TDB_DISALLOW_NESTING - Disallow transactions to nest.\n
+ *
+ * @param[in]  open_flags Flags for the open(2) function.
+ *
+ * @param[in]  mode     The mode for the open(2) function.
+ *
+ * @param[in]  log_ctx  The logging function to use.
+ *
+ * @param[in]  hash_fn  The hash function you want to use.
+ *
+ * @return              A tdb context structure, NULL on error.
+ *
+ * @see tdb_open()
+ */
+struct tdb_context *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
+                        int open_flags, mode_t mode,
+                        const struct tdb_logging_context *log_ctx,
+                        tdb_hash_func hash_fn);
+
+/**
+ * @brief Set the maximum number of dead records per hash chain.
+ *
+ * @param[in]  tdb      The database handle to set the maximum.
+ *
+ * @param[in]  max_dead The maximum number of dead records per hash chain.
+ */
+void tdb_set_max_dead(struct tdb_context *tdb, int max_dead);
+
+/**
+ * @brief Reopen a tdb.
+ *
+ * This can be used after a fork to ensure that we have an independent seek
+ * pointer from our parent and to re-establish locks.
+ *
+ * @param[in]  tdb      The database to reopen.
+ *
+ * @return              0 on success, -1 on error.
+ */
+int tdb_reopen(struct tdb_context *tdb);
+
+/**
+ * @brief Reopen all tdb's
+ *
+ * If the parent is longlived (ie. a parent daemon architecture), we know it
+ * will keep its active lock on a tdb opened with CLEAR_IF_FIRST. Thus for
+ * child processes we don't have to add an active lock. This is essential to
+ * improve performance on systems that keep POSIX locks as a non-scalable data
+ * structure in the kernel.
+ *
+ * @param[in]  parent_longlived Whether the parent is longlived or not.
+ *
+ * @return              0 on success, -1 on error.
+ */
+int tdb_reopen_all(int parent_longlived);
+
+/**
+ * @brief Set a different tdb logging function.
+ *
+ * @param[in]  tdb      The tdb to set the logging function.
+ *
+ * @param[in]  log_ctx  The logging function to set.
+ */
+void tdb_set_logging_function(struct tdb_context *tdb, const struct tdb_logging_context *log_ctx);
+
+/**
+ * @brief Get the tdb last error code.
+ *
+ * @param[in]  tdb      The tdb to get the error code from.
+ *
+ * @return              A TDB_ERROR code.
+ *
+ * @see TDB_ERROR
+ */
+enum TDB_ERROR tdb_error(struct tdb_context *tdb);
+
+/**
+ * @brief Get an error string for the last tdb error
+ *
+ * @param[in]  tdb      The tdb to get the error code from.
+ *
+ * @return              An error string.
+ */
+const char *tdb_errorstr(struct tdb_context *tdb);
+
+/**
+ * @brief Fetch an entry in the database given a key.
+ *
+ * The caller must free the resulting data.
+ *
+ * @param[in]  tdb      The tdb to fetch the key.
+ *
+ * @param[in]  key      The key to fetch.
+ *
+ * @return              The key entry found in the database, NULL on error with
+ *                      TDB_ERROR set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
+TDB_DATA tdb_fetch(struct tdb_context *tdb, TDB_DATA key);
+
+/**
+ * @brief Hand a record to a parser function without allocating it.
+ *
+ * This function is meant as a fast tdb_fetch alternative for large records
+ * that are frequently read. The "key" and "data" arguments point directly
+ * into the tdb shared memory, they are not aligned at any boundary.
+ *
+ * @warning The parser is called while tdb holds a lock on the record. DO NOT
+ * call other tdb routines from within the parser. Also, for good performance
+ * you should make the parser fast to allow parallel operations.
+ *
+ * @param[in]  tdb      The tdb to parse the record.
+ *
+ * @param[in]  key      The key to parse.
+ *
+ * @param[in]  parser   The parser to use to parse the data.
+ *
+ * @param[in]  private_data A private data pointer which is passed to the parser
+ *                          function.
+ *
+ * @return              -1 if the record was not found. If the record was found,
+ *                      the return value of "parser" is passed up to the caller.
+ */
+int tdb_parse_record(struct tdb_context *tdb, TDB_DATA key,
+                             int (*parser)(TDB_DATA key, TDB_DATA data,
+                                           void *private_data),
+                             void *private_data);
+
+/**
+ * @brief Delete an entry in the database given a key.
+ *
+ * @param[in]  tdb      The tdb to delete the key.
+ *
+ * @param[in]  key      The key to delete.
+ *
+ * @return              0 on success, -1 if the key doesn't exist.
+ */
+int tdb_delete(struct tdb_context *tdb, TDB_DATA key);
+
+/**
+ * @brief Store an element in the database.
+ *
+ * This replaces any existing element with the same key.
+ *
+ * @param[in]  tdb      The tdb to store the entry.
+ *
+ * @param[in]  key      The key to use to store the entry.
+ *
+ * @param[in]  dbuf     The data to store under the key.
+ *
+ * @param[in]  flag     The flags to store the key:\n\n
+ *                      TDB_INSERT: Don't overwrite an existing entry.\n
+ *                      TDB_MODIFY: Don't create a new entry\n
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
+int tdb_store(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, int flag);
+
+/**
+ * @brief Append data to an entry.
+ *
+ * If the entry doesn't exist, it will create a new one.
+ *
+ * @param[in]  tdb      The database to use.
+ *
+ * @param[in]  key      The key to append the data.
+ *
+ * @param[in]  new_dbuf The data to append to the key.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
+int tdb_append(struct tdb_context *tdb, TDB_DATA key, TDB_DATA new_dbuf);
+
+/**
+ * @brief Close a database.
+ *
+ * @param[in]  tdb      The database to close.
+ *
+ * @return              0 for success, -1 on error.
+ */
+int tdb_close(struct tdb_context *tdb);
+
+/**
+ * @brief Find the first entry in the database and return its key.
+ *
+ * The caller must free the returned data.
+ *
+ * @param[in]  tdb      The database to use.
+ *
+ * @return              The first entry of the database, an empty TDB_DATA entry
+ *                      if the database is empty.
+ */
+TDB_DATA tdb_firstkey(struct tdb_context *tdb);
+
+/**
+ * @brief Find the next entry in the database, returning its key.
+ *
+ * The caller must free the returned data.
+ *
+ * @param[in]  tdb      The database to use.
+ *
+ * @param[in]  key      The key from which you want the next key.
+ *
+ * @return              The next entry of the current key, an empty TDB_DATA
+ *                      entry if there is no entry.
+ */
+TDB_DATA tdb_nextkey(struct tdb_context *tdb, TDB_DATA key);
+
+/**
+ * @brief Traverse the entire database.
+ *
+ * While traversing, the function fn(tdb, key, data, state) is called on each
+ * element. If fn is NULL then it is not called. A non-zero return value from
+ * fn() indicates that the traversal should stop. Traversal callbacks may not
+ * start transactions.
+ *
+ * @warning The data buffer given to the callback fn does NOT meet the alignment
+ * restrictions malloc gives you.
+ *
+ * @param[in]  tdb      The database to traverse.
+ *
+ * @param[in]  fn       The function to call on each entry.
+ *
+ * @param[in]  private_data The private data which should be passed to the
+ *                          traversing function.
+ *
+ * @return              The record count traversed, -1 on error.
+ */
+int tdb_traverse(struct tdb_context *tdb, tdb_traverse_func fn, void *private_data);
+
+/**
+ * @brief Traverse the entire database.
+ *
+ * While traversing the database the function fn(tdb, key, data, state) is
+ * called on each element, but marking the database read only during the
+ * traversal, so any write operations will fail. This allows tdb to use read
+ * locks, which increases the parallelism possible during the traversal.
+ *
+ * @param[in]  tdb      The database to traverse.
+ *
+ * @param[in]  fn       The function to call on each entry.
+ *
+ * @param[in]  private_data The private data which should be passed to the
+ *                          traversing function.
+ *
+ * @return              The record count traversed, -1 on error.
+ */
+int tdb_traverse_read(struct tdb_context *tdb, tdb_traverse_func fn, void *private_data);
+
+/**
+ * @brief Check if an entry in the database exists.
+ *
+ * @note 1 is returned if the key is found and 0 is returned if not found. This
+ * doesn't match the conventions in the rest of this module, but is compatible
+ * with gdbm.
+ *
+ * @param[in]  tdb      The database to check if the entry exists.
+ *
+ * @param[in]  key      The key to check if the entry exists.
+ *
+ * @return              1 if the key is found, 0 if not.
+ */
+int tdb_exists(struct tdb_context *tdb, TDB_DATA key);
+
+/**
+ * @brief Lock entire database with a write lock.
+ *
+ * @param[in]  tdb      The database to lock.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
+int tdb_lockall(struct tdb_context *tdb);
+
+/**
+ * @brief Lock entire database with a write lock.
+ *
+ * This is the non-blocking call.
+ *
+ * @param[in]  tdb      The database to lock.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_lockall()
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
+int tdb_lockall_nonblock(struct tdb_context *tdb);
+
+/**
+ * @brief Unlock entire database with write lock.
+ *
+ * @param[in]  tdb      The database to unlock.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_lockall()
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
+int tdb_unlockall(struct tdb_context *tdb);
+
+/**
+ * @brief Lock entire database with a read lock.
+ *
+ * @param[in]  tdb      The database to lock.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
+int tdb_lockall_read(struct tdb_context *tdb);
+
+/**
+ * @brief Lock entire database with a read lock.
+ *
+ * This is the non-blocking call.
+ *
+ * @param[in]  tdb      The database to lock.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_lockall_read()
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
+int tdb_lockall_read_nonblock(struct tdb_context *tdb);
+
+/**
+ * @brief Unlock entire database with read lock.
+ *
+ * @param[in]  tdb      The database to unlock.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_lockall_read()
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
+int tdb_unlockall_read(struct tdb_context *tdb);
+
+/**
+ * @brief Lock entire database with write lock - mark only.
+ *
+ * @todo Add more details.
+ *
+ * @param[in]  tdb      The database to mark.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
+int tdb_lockall_mark(struct tdb_context *tdb);
+
+/**
+ * @brief Lock entire database with write lock - unmark only.
+ *
+ * @todo Add more details.
+ *
+ * @param[in]  tdb      The database to unmark.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
+int tdb_lockall_unmark(struct tdb_context *tdb);
+
+/**
+ * @brief Get the name of the current tdb file.
+ *
+ * This is useful for external logging functions.
+ *
+ * @param[in]  tdb      The database to get the name from.
+ *
+ * @return              The name of the database.
+ */
+const char *tdb_name(struct tdb_context *tdb);
+
+/**
+ * @brief Get the underlying file descriptor being used by tdb.
+ *
+ * This is useful for external routines that want to check the device/inode
+ * of the fd.
+ *
+ * @param[in]  tdb      The database to get the fd from.
+ *
+ * @return              The file descriptor or -1.
+ */
+int tdb_fd(struct tdb_context *tdb);
+
+/**
+ * @brief Get the current logging function.
+ *
+ * This is useful for external tdb routines that wish to log tdb errors.
+ *
+ * @param[in]  tdb      The database to get the logging function from.
+ *
+ * @return              The logging function of the database.
+ *
+ * @see tdb_get_logging_private()
+ */
+tdb_log_func tdb_log_fn(struct tdb_context *tdb);
+
+/**
+ * @brief Get the private data of the logging function.
+ *
+ * @param[in]  tdb      The database to get the data from.
+ *
+ * @return              The private data pointer of the logging function.
+ *
+ * @see tdb_log_fn()
+ */
+void *tdb_get_logging_private(struct tdb_context *tdb);
+
+/**
+ * @brief Start a transaction.
+ *
+ * All operations after the transaction start can either be committed with
+ * tdb_transaction_commit() or cancelled with tdb_transaction_cancel().
+ *
+ * If you call tdb_transaction_start() again on the same tdb context while a
+ * transaction is in progress, then the same transaction buffer is re-used. The
+ * number of tdb_transaction_{commit,cancel} operations must match the number
+ * of successful tdb_transaction_start() calls.
+ *
+ * Note that transactions are by default disk synchronous, and use a recover
+ * area in the database to automatically recover the database on the next open
+ * if the system crashes during a transaction. You can disable the synchronous
+ * transaction recovery setup using the TDB_NOSYNC flag, which will greatly
+ * speed up operations at the risk of corrupting your database if the system
+ * crashes.
+ *
+ * Operations made within a transaction are not visible to other users of the
+ * database until a successful commit.
+ *
+ * @param[in]  tdb      The database to start the transaction.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
+int tdb_transaction_start(struct tdb_context *tdb);
+
+/**
+ * @brief Start a transaction, non-blocking.
+ *
+ * @param[in]  tdb      The database to start the transaction.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ * @see tdb_transaction_start()
+ */
+int tdb_transaction_start_nonblock(struct tdb_context *tdb);
+
+/**
+ * @brief Prepare to commit a current transaction, for two-phase commits.
+ *
+ * Once prepared for commit, the only allowed calls are tdb_transaction_commit()
+ * or tdb_transaction_cancel(). Preparing allocates disk space for the pending
+ * updates, so a subsequent commit should succeed (barring any hardware
+ * failures).
+ *
+ * @param[in]  tdb      The database to prepare the commit.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
+int tdb_transaction_prepare_commit(struct tdb_context *tdb);
+
+/**
+ * @brief Commit a current transaction.
+ *
+ * This updates the database and releases the current transaction locks.
+ *
+ * @param[in]  tdb      The database to commit the transaction.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
+int tdb_transaction_commit(struct tdb_context *tdb);
+
+/**
+ * @brief Cancel a current transaction.
+ *
+ * This discards all write and lock operations that have been made since the
+ * transaction started.
+ *
+ * @param[in]  tdb      The tdb to cancel the transaction on.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
+int tdb_transaction_cancel(struct tdb_context *tdb);
+
+/**
+ * @brief Get the tdb sequence number.
+ *
+ * Only makes sense if the writers opened with TDB_SEQNUM set. Note that this
+ * sequence number will wrap quite quickly, so it should only be used for a
+ * 'has something changed' test, not for code that relies on the count of the
+ * number of changes made. If you want a counter then use a tdb record.
+ *
+ * The aim of this sequence number is to allow for a very lightweight test of a
+ * possible tdb change.
+ *
+ * @param[in]  tdb      The database to get the sequence number from.
+ *
+ * @return              The sequence number or 0.
+ *
+ * @see tdb_open()
+ * @see tdb_enable_seqnum()
+ */
+int tdb_get_seqnum(struct tdb_context *tdb);
+
+/**
+ * @brief Get the hash size.
+ *
+ * @param[in]  tdb      The database to get the hash size from.
+ *
+ * @return              The hash size.
+ */
+int tdb_hash_size(struct tdb_context *tdb);
+
+/**
+ * @brief Get the map size.
+ *
+ * @param[in]  tdb     The database to get the map size from.
+ *
+ * @return             The map size.
+ */
+size_t tdb_map_size(struct tdb_context *tdb);
+
+/**
+ * @brief Get the tdb flags set during open.
+ *
+ * @param[in]  tdb      The database to get the flags from.
+ *
+ * @return              The flags set on the database.
+ */
+int tdb_get_flags(struct tdb_context *tdb);
+
+/**
+ * @brief Add flags to the database.
+ *
+ * @param[in]  tdb      The database to add the flags.
+ *
+ * @param[in]  flag     The tdb flags to add.
+ */
+void tdb_add_flags(struct tdb_context *tdb, unsigned flag);
+
+/**
+ * @brief Remove flags from the database.
+ *
+ * @param[in]  tdb      The database to remove the flags.
+ *
+ * @param[in]  flag     The tdb flags to remove.
+ */
+void tdb_remove_flags(struct tdb_context *tdb, unsigned flag);
+
+/**
+ * @brief Enable sequence number handling on an open tdb.
+ *
+ * @param[in]  tdb      The database to enable sequence number handling.
+ *
+ * @see tdb_get_seqnum()
+ */
+void tdb_enable_seqnum(struct tdb_context *tdb);
+
+/**
+ * @brief Increment the tdb sequence number.
+ *
+ * This only works if the tdb has been opened using the TDB_SEQNUM flag or
+ * enabled using tdb_enable_seqnum().
+ *
+ * @param[in]  tdb      The database to increment the sequence number.
+ *
+ * @see tdb_enable_seqnum()
+ * @see tdb_get_seqnum()
+ */
+void tdb_increment_seqnum_nonblock(struct tdb_context *tdb);
+
+/**
+ * @brief Create a hash of the key.
+ *
+ * @param[in]  key      The key to hash
+ *
+ * @return              The hash.
+ */
+unsigned int tdb_jenkins_hash(TDB_DATA *key);
+
+/**
+ * @brief Check the consistency of the database.
+ *
+ * This checks the consistency of the database, calling back the check function
+ * (if non-NULL) on each record.  If some consistency check fails, or the
+ * supplied check function returns -1, tdb_check returns -1, otherwise 0.
+ *
+ * @note The logging function (if set) will be called with additional
+ * information on the corruption found.
+ *
+ * @param[in]  tdb      The database to check.
+ *
+ * @param[in]  check    The check function to use.
+ *
+ * @param[in]  private_data the private data to pass to the check function.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
+int tdb_check(struct tdb_context *tdb,
+             int (*check) (TDB_DATA key, TDB_DATA data, void *private_data),
+             void *private_data);
+
+/**
+ * @brief Dump all possible records in a corrupt database.
+ *
+ * This is the only way to get data out of a database where tdb_check() fails.
+ * It will call walk() with anything which looks like a database record; this
+ * may well include invalid, incomplete or duplicate records.
+ *
+ * @param[in]  tdb      The database to rescue records from.
+ *
+ * @param[in]  walk     The walk function to use.
+ *
+ * @param[in]  private_data the private data to pass to the walk function.
+ *
+ * @return              0 on success, -1 on error with error code set.
+ *
+ * @see tdb_error()
+ * @see tdb_errorstr()
+ */
+int tdb_rescue(struct tdb_context *tdb,
+              void (*walk) (TDB_DATA key, TDB_DATA data, void *private_data),
+              void *private_data);
+
+/* @} ******************************************************************/
+
+/* Low level locking functions: use with care */
+int tdb_chainlock(struct tdb_context *tdb, TDB_DATA key);
+int tdb_chainlock_nonblock(struct tdb_context *tdb, TDB_DATA key);
+int tdb_chainunlock(struct tdb_context *tdb, TDB_DATA key);
+int tdb_chainlock_read(struct tdb_context *tdb, TDB_DATA key);
+int tdb_chainunlock_read(struct tdb_context *tdb, TDB_DATA key);
+int tdb_chainlock_mark(struct tdb_context *tdb, TDB_DATA key);
+int tdb_chainlock_unmark(struct tdb_context *tdb, TDB_DATA key);
+
+void tdb_setalarm_sigptr(struct tdb_context *tdb, volatile sig_atomic_t *sigptr);
+
+/* wipe and repack */
+int tdb_wipe_all(struct tdb_context *tdb);
+int tdb_repack(struct tdb_context *tdb);
+
+/* Debug functions. Not used in production. */
+void tdb_dump_all(struct tdb_context *tdb);
+int tdb_printfreelist(struct tdb_context *tdb);
+int tdb_validate_freelist(struct tdb_context *tdb, int *pnum_entries);
+int tdb_freelist_size(struct tdb_context *tdb);
+char *tdb_summary(struct tdb_context *tdb);
+
+extern TDB_DATA tdb_null;
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif /* tdb.h */
diff --git a/ctdb/lib/tdb/libtdb.m4 b/ctdb/lib/tdb/libtdb.m4
new file mode 100644 (file)
index 0000000..6503bf1
--- /dev/null
@@ -0,0 +1,59 @@
+dnl Check to see if we should use the included tdb
+
+INCLUDED_TDB=auto
+AC_ARG_WITH(included-tdb,
+    [AC_HELP_STRING([--with-included-tdb], [use bundled tdb library, not from system])],
+    [ INCLUDED_TDB=$withval ])
+
+AC_SUBST(TDB_LIBS)
+AC_SUBST(TDB_CFLAGS)
+
+if test x"$INCLUDED_TDB" != x"yes" ; then
+    AC_CHECK_HEADERS(tdb.h)
+    AC_CHECK_LIB(tdb, tdb_transaction_write_lock_mark, [ TDB_LIBS="-ltdb" ])
+    if test x"$ac_cv_header_tdb_h" = x"no" -o x"$ac_cv_lib_tdb_tdb_transaction_write_lock_mark" = x"no" ; then
+        INCLUDED_TDB=yes
+        TDB_CFLAGS=""
+    else
+        INCLUDED_TDB=no
+    fi
+fi
+
+AC_MSG_CHECKING(whether to use included tdb)
+AC_MSG_RESULT($INCLUDED_TDB)
+if test x"$INCLUDED_TDB" != x"no" ; then
+    dnl find the tdb sources. This is meant to work both for 
+    dnl tdb standalone builds, and builds of packages using tdb
+    tdbdir=""
+    tdbpaths=". lib/tdb tdb ../tdb ../lib/tdb"
+    for d in $tdbpaths; do
+       if test -f "$srcdir/$d/common/tdb.c"; then
+               tdbdir="$d"             
+               AC_SUBST(tdbdir)
+               break;
+       fi
+    done
+    if test x"$tdbdir" = "x"; then
+       AC_MSG_ERROR([cannot find tdb source in $tdbpaths])
+    fi
+    TDB_OBJ="common/tdb.o common/dump.o common/transaction.o common/error.o common/traverse.o"
+    TDB_OBJ="$TDB_OBJ common/freelist.o common/freelistcheck.o common/io.o common/lock.o common/open.o common/check.o common/hash.o common/summary.o common/rescue.o"
+    AC_SUBST(TDB_OBJ)
+
+    TDB_LIBS=""
+    AC_SUBST(TDB_LIBS)
+
+    TDB_CFLAGS="-I$tdbdir/include"
+    AC_SUBST(TDB_CFLAGS)
+fi
+
+AC_CHECK_FUNCS(mmap pread pwrite getpagesize utime)
+AC_CHECK_HEADERS(getopt.h sys/select.h sys/time.h)
+
+AC_HAVE_DECL(pread, [#include <unistd.h>])
+AC_HAVE_DECL(pwrite, [#include <unistd.h>])
+
+if test x"$VERSIONSCRIPT" != "x"; then
+    EXPORTSFILE=tdb.exports
+    AC_SUBST(EXPORTSFILE)
+fi
diff --git a/ctdb/lib/tdb/manpages/tdbbackup.8.xml b/ctdb/lib/tdb/manpages/tdbbackup.8.xml
new file mode 100644 (file)
index 0000000..78fe32e
--- /dev/null
@@ -0,0 +1,136 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN" "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
+<refentry id="tdbbackup.8">
+
+<refmeta>
+       <refentrytitle>tdbbackup</refentrytitle>
+       <manvolnum>8</manvolnum>
+       <refmiscinfo class="source">Samba</refmiscinfo>
+       <refmiscinfo class="manual">System Administration tools</refmiscinfo>
+       <refmiscinfo class="version">3.6</refmiscinfo>
+</refmeta>
+
+
+<refnamediv>
+       <refname>tdbbackup</refname>
+       <refpurpose>tool for backing up and for validating the integrity of samba .tdb files</refpurpose>
+</refnamediv>
+
+<refsynopsisdiv>
+       <cmdsynopsis>
+               <command>tdbbackup</command>
+               <arg choice="opt">-s suffix</arg>
+               <arg choice="opt">-v</arg>
+               <arg choice="opt">-h</arg>
+       </cmdsynopsis>
+</refsynopsisdiv>
+
+<refsect1>
+       <title>DESCRIPTION</title>
+
+       <para>This tool is part of the <citerefentry><refentrytitle>samba</refentrytitle>
+       <manvolnum>1</manvolnum></citerefentry> suite.</para>
+
+       <para><command>tdbbackup</command> is a tool that may be used to back up samba .tdb
+       files. This tool may also be used to verify the integrity of the .tdb files prior
+       to samba startup or during normal operation. If it finds file damage and it finds 
+       a prior backup the backup file will be restored. 
+       </para>
+</refsect1>
+
+
+<refsect1>
+       <title>OPTIONS</title>
+
+       <variablelist>
+
+               <varlistentry>
+               <term>-h</term>
+               <listitem><para>
+               Get help information.
+               </para></listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term>-s suffix</term>
+               <listitem><para>
+               The <command>-s</command> option allows the administrator to specify a file
+               backup extension. This way it is possible to keep a history of tdb backup
+               files by using a new suffix for each backup.
+               </para> </listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term>-v</term>
+               <listitem><para>
+               The <command>-v</command> option will check the database for damage (corrupt data)
+               which if detected causes the backup to be restored.
+               </para></listitem>
+               </varlistentry>
+
+       </variablelist>
+</refsect1>
+
+
+<refsect1>
+       <title>COMMANDS</title>
+
+       <para><emphasis>GENERAL INFORMATION</emphasis></para>
+
+       <para>
+       The <command>tdbbackup</command> utility can safely be run at any time. It was designed so
+       that it can be used at any time to validate the integrity of tdb files, even during Samba
+       operation. Typical usage for the command will be:
+       </para>
+
+       <para>tdbbackup [-s suffix] *.tdb</para>
+
+       <para>
+       Before restarting samba the following command may be run to validate .tdb files:
+       </para>
+
+       <para>tdbbackup -v [-s suffix] *.tdb</para>
+
+       <para>
+       Samba .tdb files are stored in various locations; be sure to back up all
+       .tdb files on the system. Important files include:
+       </para>
+
+       <itemizedlist>
+               <listitem><para>
+               <command>secrets.tdb</command> - usual location is in the /usr/local/samba/private
+               directory, or on some systems in /etc/samba.
+               </para></listitem>
+
+               <listitem><para>
+               <command>passdb.tdb</command> - usual location is in the /usr/local/samba/private
+               directory, or on some systems in /etc/samba.
+               </para></listitem>
+
+               <listitem><para>
+               <command>*.tdb</command> located in the /usr/local/samba/var directory or on some
+               systems in the /var/cache or /var/lib/samba directories.
+               </para></listitem>
+       </itemizedlist>
+
+</refsect1>
+
+<refsect1>
+       <title>VERSION</title>
+
+       <para>This man page is correct for version 3 of the Samba suite.</para>
+</refsect1>
+
+<refsect1>
+       <title>AUTHOR</title>
+
+       <para>
+       The original Samba software and related utilities were created by Andrew Tridgell.
+       Samba is now developed by the Samba Team as an Open Source project similar to the way
+       the Linux kernel is developed.
+       </para> 
+
+       <para>The tdbbackup man page was written by John H Terpstra.</para>
+</refsect1>
+
+</refentry>
diff --git a/ctdb/lib/tdb/manpages/tdbdump.8.xml b/ctdb/lib/tdb/manpages/tdbdump.8.xml
new file mode 100644 (file)
index 0000000..3420193
--- /dev/null
@@ -0,0 +1,92 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN" "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
+<refentry id="tdbdump.8">
+
+<refmeta>
+       <refentrytitle>tdbdump</refentrytitle>
+       <manvolnum>8</manvolnum>
+       <refmiscinfo class="source">Samba</refmiscinfo>
+       <refmiscinfo class="manual">System Administration tools</refmiscinfo>
+       <refmiscinfo class="version">3.6</refmiscinfo>
+</refmeta>
+
+
+<refnamediv>
+       <refname>tdbdump</refname>
+       <refpurpose>tool for printing the contents of a TDB file</refpurpose>
+</refnamediv>
+
+<refsynopsisdiv>
+       <cmdsynopsis>
+               <command>tdbdump</command>
+               <arg choice="opt">-k <replaceable>keyname</replaceable></arg>
+               <arg choice="opt">-e</arg>
+               <arg choice="opt">-h</arg>
+               <arg choice="req">filename</arg>
+       </cmdsynopsis>
+</refsynopsisdiv>
+
+<refsect1>
+       <title>DESCRIPTION</title>
+
+       <para>This tool is part of the <citerefentry><refentrytitle>samba</refentrytitle>
+       <manvolnum>1</manvolnum></citerefentry> suite.</para>
+
+       <para><command>tdbdump</command> is a very simple utility that 'dumps' the 
+               contents of a TDB (Trivial DataBase) file to standard output in a 
+               human-readable format.
+       </para>
+
+       <para>This tool can be used when debugging problems with TDB files. It is 
+               intended for those who are somewhat familiar with Samba internals.
+       </para>
+</refsect1>
+
+<refsect1>
+       <title>OPTIONS</title>
+
+       <variablelist>
+
+               <varlistentry>
+               <term>-h</term>
+               <listitem><para>
+               Get help information.
+               </para></listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term>-k <replaceable>keyname</replaceable></term>
+               <listitem><para>
+               The <command>-k</command> option restricts dumping to a single key, if found.
+               </para> </listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term>-e</term>
+               <listitem><para>
+               The <command>-e</command> option tries to dump out records from a corrupt database.  Naturally, such a dump is unreliable, at best.
+               </para></listitem>
+               </varlistentry>
+
+       </variablelist>
+</refsect1>
+
+<refsect1>
+       <title>VERSION</title>
+
+       <para>This man page is correct for version 3 of the Samba suite.</para>
+</refsect1>
+
+<refsect1>
+       <title>AUTHOR</title>
+
+       <para>
+       The original Samba software and related utilities were created by Andrew Tridgell.
+       Samba is now developed by the Samba Team as an Open Source project similar to the way
+       the Linux kernel is developed.
+       </para> 
+
+       <para>The tdbdump man page was written by Jelmer Vernooij.</para>
+</refsect1>
+
+</refentry>
diff --git a/ctdb/lib/tdb/manpages/tdbrestore.8.xml b/ctdb/lib/tdb/manpages/tdbrestore.8.xml
new file mode 100644 (file)
index 0000000..64c0ba2
--- /dev/null
@@ -0,0 +1,66 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN" "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
+<refentry id="tdbrestore.8">
+
+<refmeta>
+       <refentrytitle>tdbrestore</refentrytitle>
+       <manvolnum>8</manvolnum>
+       <refmiscinfo class="source">Samba</refmiscinfo>
+       <refmiscinfo class="manual">System Administration tools</refmiscinfo>
+       <refmiscinfo class="version">3.6</refmiscinfo>
+</refmeta>
+
+
+<refnamediv>
+       <refname>tdbrestore</refname>
+       <refpurpose>tool for creating a TDB file out of a tdbdump output</refpurpose>
+</refnamediv>
+
+<refsynopsisdiv>
+       <cmdsynopsis>
+               <command>tdbrestore</command>
+               <arg choice="req">tdbfilename</arg>
+       </cmdsynopsis>
+</refsynopsisdiv>
+
+<refsect1>
+       <title>DESCRIPTION</title>
+
+       <para>This tool is part of the <citerefentry><refentrytitle>samba</refentrytitle>
+       <manvolnum>1</manvolnum></citerefentry> suite.</para>
+
+       <para><command>tdbrestore</command> is a very simple utility that 'restores' the
+               contents of a dump file into a TDB (Trivial DataBase) file. The dump file is obtained from the tdbdump
+               command.
+       </para>
+
+       <para>This tool waits on standard input for the content of the dump and writes the tdb to the file named by the tdbfilename
+  parameter.
+       </para>
+       <para>This tool can be used for unpacking the content of a tdb as a means of backup.
+       </para>
+</refsect1>
+
+
+<refsect1>
+       <title>VERSION</title>
+
+       <para>This man page is correct for version 3 of the Samba suite.</para>
+</refsect1>
+
+<refsect1>
+       <title>AUTHOR</title>
+
+       <para>
+       The original Samba software and related utilities were created by Andrew Tridgell.
+       Samba is now developed by the Samba Team as an Open Source project similar to the way
+       the Linux kernel is developed.
+
+        This tool was initially written by Volker Lendecke based on an
+        idea by Simon McVittie.
+       </para>
+
+       <para>The tdbrestore man page was written by Matthieu Patou.</para>
+</refsect1>
+
+</refentry>
diff --git a/ctdb/lib/tdb/manpages/tdbtool.8.xml b/ctdb/lib/tdb/manpages/tdbtool.8.xml
new file mode 100644 (file)
index 0000000..9f96db2
--- /dev/null
@@ -0,0 +1,235 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.2//EN" "http://www.oasis-open.org/docbook/xml/4.2/docbookx.dtd">
+<refentry id="tdbtool.8">
+
+<refmeta>
+       <refentrytitle>tdbtool</refentrytitle>
+       <manvolnum>8</manvolnum>
+       <refmiscinfo class="source">Samba</refmiscinfo>
+       <refmiscinfo class="manual">System Administration tools</refmiscinfo>
+       <refmiscinfo class="version">3.6</refmiscinfo>
+</refmeta>
+
+
+<refnamediv>
+       <refname>tdbtool</refname>
+       <refpurpose>manipulate the contents of TDB files</refpurpose>
+</refnamediv>
+
+<refsynopsisdiv>
+
+       <cmdsynopsis>
+               <command>tdbtool</command>
+       </cmdsynopsis>
+
+       <cmdsynopsis>
+               <command>tdbtool</command>
+               <arg choice="plain">
+               <replaceable>TDBFILE</replaceable>
+               </arg>
+               <arg rep="repeat" choice="opt">
+               <replaceable>COMMANDS</replaceable>
+               </arg>
+       </cmdsynopsis>
+
+</refsynopsisdiv>
+
+<refsect1>
+       <title>DESCRIPTION</title>
+
+       <para>This tool is part of the
+       <citerefentry><refentrytitle>samba</refentrytitle>
+       <manvolnum>1</manvolnum></citerefentry> suite.</para>
+
+       <para><command>tdbtool</command> is a tool for displaying and
+       altering the contents of Samba TDB (Trivial DataBase) files. Each
+       of the commands listed below can be entered interactively or
+       provided on the command line.</para>
+
+</refsect1>
+
+
+<refsect1>
+       <title>COMMANDS</title>
+
+       <variablelist>
+
+               <varlistentry>
+               <term><option>create</option>
+               <replaceable>TDBFILE</replaceable></term>
+               <listitem><para>Create a new database named
+               <replaceable>TDBFILE</replaceable>.
+               </para></listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term><option>open</option>
+               <replaceable>TDBFILE</replaceable></term>
+               <listitem><para>Open an existing database named
+               <replaceable>TDBFILE</replaceable>.
+               </para></listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term><option>erase</option></term>
+               <listitem><para>Erase the current database.
+               </para></listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term><option>dump</option></term>
+               <listitem><para>Dump the current database as strings.
+               </para></listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term><option>cdump</option></term>
+               <listitem><para>Dump the current database as connection records.
+               </para></listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term><option>keys</option></term>
+               <listitem><para>Dump the current database keys as strings.
+               </para></listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term><option>hexkeys</option></term>
+               <listitem><para>Dump the current database keys as hex values.
+               </para></listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term><option>info</option></term>
+               <listitem><para>Print summary information about the
+               current database.
+               </para></listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term><option>insert</option>
+               <replaceable>KEY</replaceable>
+               <replaceable>DATA</replaceable>
+               </term>
+               <listitem><para>Insert a record into the
+               current database.
+               </para></listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term><option>move</option>
+               <replaceable>KEY</replaceable>
+               <replaceable>TDBFILE</replaceable>
+               </term>
+               <listitem><para>Move a record from the 
+               current database into <replaceable>TDBFILE</replaceable>.
+               </para></listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term><option>store</option>
+               <replaceable>KEY</replaceable>
+               <replaceable>DATA</replaceable>
+               </term>
+               <listitem><para>Store (replace) a record in the
+               current database.
+               </para></listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term><option>show</option>
+               <replaceable>KEY</replaceable>
+               </term>
+               <listitem><para>Show a record by key.
+               </para></listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term><option>delete</option>
+               <replaceable>KEY</replaceable>
+               </term>
+               <listitem><para>Delete a record by key.
+               </para></listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term><option>list</option>
+               </term>
+               <listitem><para>Print the current database hash table and free list.
+               </para></listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term><option>free</option>
+               </term>
+               <listitem><para>Print the current database and free list.
+               </para></listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term><option>!</option>
+               <replaceable>COMMAND</replaceable>
+               </term>
+               <listitem><para>Execute the given system command.
+               </para></listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term>
+               <option>first</option>
+               </term>
+               <listitem><para>Print the first record in the current database.
+               </para></listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term>
+               <option>next</option>
+               </term>
+               <listitem><para>Print the next record in the current database.
+               </para></listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term>
+               <option>check</option>
+               </term>
+               <listitem><para>Check the integrity of the current database.
+               </para></listitem>
+               </varlistentry>
+
+               <varlistentry>
+               <term>
+               <option>quit</option>
+               </term>
+               <listitem><para>Exit <command>tdbtool</command>.
+               </para></listitem>
+               </varlistentry>
+
+       </variablelist>
+</refsect1>
+
+<refsect1>
+       <title>CAVEATS</title>
+       <para>The contents of the Samba TDB files are private
+       to the implementation and should not be altered with
+       <command>tdbtool</command>.
+       </para>
+</refsect1>
+
+<refsect1>
+       <title>VERSION</title>
+       <para>This man page is correct for version 3.0.25 of the Samba suite.</para>
+</refsect1>
+
+<refsect1>
+       <title>AUTHOR</title>
+
+       <para> The original Samba software and related utilities were
+       created by Andrew Tridgell.  Samba is now developed by the
+       Samba Team as an Open Source project similar to the way the
+       Linux kernel is developed.</para>
+</refsect1>
+
+</refentry>
diff --git a/ctdb/lib/tdb/pytdb.c b/ctdb/lib/tdb/pytdb.c
new file mode 100644 (file)
index 0000000..ae0e6f8
--- /dev/null
@@ -0,0 +1,689 @@
+/* 
+   Unix SMB/CIFS implementation.
+
+   Python interface to tdb.
+
+   Copyright (C) 2004-2006 Tim Potter <tpot@samba.org>
+   Copyright (C) 2007-2008 Jelmer Vernooij <jelmer@samba.org>
+
+     ** NOTE! The following LGPL license applies to the tdb
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <Python.h>
+#include "replace.h"
+#include "system/filesys.h"
+
+#ifndef Py_RETURN_NONE
+#define Py_RETURN_NONE return Py_INCREF(Py_None), Py_None
+#endif
+
+/* Include tdb headers */
+#include <tdb.h>
+
+/* Python object wrapping a TDB database handle. */
+typedef struct {
+       PyObject_HEAD
+       /* handle returned by tdb_open() and passed to the tdb_* calls */
+       TDB_CONTEXT *ctx;
+       /* set to true by close(); most methods test it before using ctx */
+       bool closed;
+} PyTdbObject;
+
+staticforward PyTypeObject PyTdb;
+
+static void PyErr_SetTDBError(TDB_CONTEXT *tdb)
+{
+       PyErr_SetObject(PyExc_RuntimeError, 
+               Py_BuildValue("(i,s)", tdb_error(tdb), tdb_errorstr(tdb)));
+}
+
+/* Describe the buffer of a Python string as a TDB_DATA.  The result only
+ * points at the string's storage; callers treat a NULL dptr as failure. */
+static TDB_DATA PyString_AsTDB_DATA(PyObject *data)
+{
+       TDB_DATA blob;
+
+       blob.dptr = (unsigned char *)PyString_AsString(data);
+       blob.dsize = PyString_Size(data);
+
+       return blob;
+}
+
+/* Turn a TDB_DATA into a Python string and free data.dptr afterwards.
+ * A NULL pointer with zero size is reported as None. */
+static PyObject *PyString_FromTDB_DATA(TDB_DATA data)
+{
+       PyObject *str;
+
+       if (data.dptr == NULL && data.dsize == 0) {
+               Py_RETURN_NONE;
+       }
+
+       str = PyString_FromStringAndSize((const char *)data.dptr, data.dsize);
+       free(data.dptr);
+       return str;
+}
+
+#define PyErr_TDB_ERROR_IS_ERR_RAISE(ret, tdb) \
+       if (ret != 0) { \
+               PyErr_SetTDBError(tdb); \
+               return NULL; \
+       }
+
+#define PyErr_TDB_RAISE_IF_CLOSED(self) \
+       if (self->closed) {                                             \
+               PyErr_SetObject(PyExc_RuntimeError,                             \
+                               Py_BuildValue("(i,s)", TDB_ERR_IO, "Database is already closed")); \
+               return NULL;                                            \
+       }
+
+#define PyErr_TDB_RAISE_RETURN_MINUS_1_IF_CLOSED(self) \
+       if (self->closed) {                                             \
+               PyErr_SetObject(PyExc_RuntimeError,                             \
+                               Py_BuildValue("(i,s)", TDB_ERR_IO, "Database is already closed")); \
+               return -1;                                              \
+       }
+
+/* Open a TDB database and wrap it in a new Tdb object.  When no name is
+ * given TDB_INTERNAL is added to the tdb flags.  A failing tdb_open() is
+ * reported as IOError built from errno. */
+static PyObject *py_tdb_open(PyTypeObject *type, PyObject *args, PyObject *kwargs)
+{
+       const char *kwnames[] = { "name", "hash_size", "tdb_flags", "flags", "mode", NULL };
+       char *name = NULL;
+       int hash_size = 0;
+       int tdb_flags = TDB_DEFAULT;
+       int flags = O_RDWR;
+       int mode = 0600;
+       TDB_CONTEXT *ctx;
+       PyTdbObject *obj;
+
+       if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|siiii", (char **)kwnames,
+                                        &name, &hash_size, &tdb_flags, &flags, &mode)) {
+               return NULL;
+       }
+
+       if (name == NULL) {
+               tdb_flags |= TDB_INTERNAL;
+       }
+
+       ctx = tdb_open(name, hash_size, tdb_flags, flags, mode);
+       if (ctx == NULL) {
+               PyErr_SetFromErrno(PyExc_IOError);
+               return NULL;
+       }
+
+       obj = PyObject_New(PyTdbObject, &PyTdb);
+       if (obj == NULL) {
+               /* do not leak the freshly opened database */
+               tdb_close(ctx);
+               return NULL;
+       }
+
+       obj->closed = false;
+       obj->ctx = ctx;
+       return (PyObject *)obj;
+}
+
+/* S.transaction_cancel(): cancel the active transaction.  Raises
+ * RuntimeError when the database is closed or tdb reports a failure. */
+static PyObject *obj_transaction_cancel(PyTdbObject *self)
+{
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       if (tdb_transaction_cancel(self->ctx) != 0) {
+               PyErr_SetTDBError(self->ctx);
+               return NULL;
+       }
+       Py_RETURN_NONE;
+}
+
+/* S.transaction_commit(): commit the active transaction.  Raises
+ * RuntimeError when the database is closed or tdb reports a failure. */
+static PyObject *obj_transaction_commit(PyTdbObject *self)
+{
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       if (tdb_transaction_commit(self->ctx) != 0) {
+               PyErr_SetTDBError(self->ctx);
+               return NULL;
+       }
+       Py_RETURN_NONE;
+}
+
+/* S.transaction_prepare_commit(): run the prepare phase of a commit.
+ * Raises RuntimeError when the database is closed or tdb reports a failure. */
+static PyObject *obj_transaction_prepare_commit(PyTdbObject *self)
+{
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       if (tdb_transaction_prepare_commit(self->ctx) != 0) {
+               PyErr_SetTDBError(self->ctx);
+               return NULL;
+       }
+       Py_RETURN_NONE;
+}
+
+/* S.transaction_start(): begin a new transaction.  Raises RuntimeError
+ * when the database is closed or tdb reports a failure. */
+static PyObject *obj_transaction_start(PyTdbObject *self)
+{
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       if (tdb_transaction_start(self->ctx) != 0) {
+               PyErr_SetTDBError(self->ctx);
+               return NULL;
+       }
+       Py_RETURN_NONE;
+}
+
+/* S.reopen(): call tdb_reopen() on the database.  Raises RuntimeError when
+ * the database is closed or tdb reports a failure. */
+static PyObject *obj_reopen(PyTdbObject *self)
+{
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       if (tdb_reopen(self->ctx) != 0) {
+               PyErr_SetTDBError(self->ctx);
+               return NULL;
+       }
+       Py_RETURN_NONE;
+}
+
+/* S.lock_all(): call tdb_lockall() on the database.  Raises RuntimeError
+ * when the database is closed or tdb reports a failure. */
+static PyObject *obj_lockall(PyTdbObject *self)
+{
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       if (tdb_lockall(self->ctx) != 0) {
+               PyErr_SetTDBError(self->ctx);
+               return NULL;
+       }
+       Py_RETURN_NONE;
+}
+
+/* S.unlock_all(): call tdb_unlockall() on the database.  Raises
+ * RuntimeError when the database is closed or tdb reports a failure. */
+static PyObject *obj_unlockall(PyTdbObject *self)
+{
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       if (tdb_unlockall(self->ctx) != 0) {
+               PyErr_SetTDBError(self->ctx);
+               return NULL;
+       }
+       Py_RETURN_NONE;
+}
+
+/* S.read_lock_all(): call tdb_lockall_read() on the database.  Raises
+ * RuntimeError when the database is closed or tdb reports a failure. */
+static PyObject *obj_lockall_read(PyTdbObject *self)
+{
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       if (tdb_lockall_read(self->ctx) != 0) {
+               PyErr_SetTDBError(self->ctx);
+               return NULL;
+       }
+       Py_RETURN_NONE;
+}
+
+/* S.read_unlock_all(): call tdb_unlockall_read() on the database.  Raises
+ * RuntimeError when the database is closed or tdb reports a failure. */
+static PyObject *obj_unlockall_read(PyTdbObject *self)
+{
+       int ret;
+       /* like every other method: never touch ctx after close() */
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+       ret = tdb_unlockall_read(self->ctx);
+       PyErr_TDB_ERROR_IS_ERR_RAISE(ret, self->ctx);
+       Py_RETURN_NONE;
+}
+
+/* S.close(): close the database.  Closing an already closed database is a
+ * no-op.  Raises RuntimeError when tdb_close() reports a failure; the
+ * object is marked closed in either case. */
+static PyObject *obj_close(PyTdbObject *self)
+{
+       int ret;
+
+       if (self->closed)
+               Py_RETURN_NONE;
+
+       ret = tdb_close(self->ctx);
+       self->closed = true;
+       /* tdb_close() releases the context even when it returns an error, so
+        * the pointer must not be used again - not even for error reporting. */
+       self->ctx = NULL;
+
+       if (ret != 0) {
+               PyObject *err = Py_BuildValue("(i,s)", TDB_ERR_IO,
+                                             "Failed to close database");
+               if (err != NULL) {
+                       PyErr_SetObject(PyExc_RuntimeError, err);
+                       Py_DECREF(err);
+               }
+               return NULL;
+       }
+       Py_RETURN_NONE;
+}
+
+/* S.get(key): fetch the record stored under key and return it converted
+ * by PyString_FromTDB_DATA(). */
+static PyObject *obj_get(PyTdbObject *self, PyObject *args)
+{
+       PyObject *py_key;
+       TDB_DATA key, value;
+
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       if (!PyArg_ParseTuple(args, "O", &py_key)) {
+               return NULL;
+       }
+
+       key = PyString_AsTDB_DATA(py_key);
+       if (key.dptr == NULL) {
+               return NULL;
+       }
+
+       value = tdb_fetch(self->ctx, key);
+       return PyString_FromTDB_DATA(value);
+}
+
+/* S.append(key, value): append value to the record stored under key.
+ * Raises RuntimeError when the database is closed or tdb reports a failure. */
+static PyObject *obj_append(PyTdbObject *self, PyObject *args)
+{
+       PyObject *py_key, *py_data;
+       TDB_DATA key, data;
+
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       if (!PyArg_ParseTuple(args, "OO", &py_key, &py_data)) {
+               return NULL;
+       }
+
+       key = PyString_AsTDB_DATA(py_key);
+       if (key.dptr == NULL) {
+               return NULL;
+       }
+
+       data = PyString_AsTDB_DATA(py_data);
+       if (data.dptr == NULL) {
+               return NULL;
+       }
+
+       if (tdb_append(self->ctx, key, data) != 0) {
+               PyErr_SetTDBError(self->ctx);
+               return NULL;
+       }
+       Py_RETURN_NONE;
+}
+
+/* S.firstkey(): return the result of tdb_firstkey() converted by
+ * PyString_FromTDB_DATA(). */
+static PyObject *obj_firstkey(PyTdbObject *self)
+{
+       TDB_DATA first;
+
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       first = tdb_firstkey(self->ctx);
+       return PyString_FromTDB_DATA(first);
+}
+
+/* S.nextkey(key): return the result of tdb_nextkey() for key, converted by
+ * PyString_FromTDB_DATA(). */
+static PyObject *obj_nextkey(PyTdbObject *self, PyObject *args)
+{
+       PyObject *py_key;
+       TDB_DATA key, following;
+
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       if (!PyArg_ParseTuple(args, "O", &py_key)) {
+               return NULL;
+       }
+
+       key = PyString_AsTDB_DATA(py_key);
+       if (key.dptr == NULL) {
+               return NULL;
+       }
+
+       following = tdb_nextkey(self->ctx, key);
+       return PyString_FromTDB_DATA(following);
+}
+
+/* S.delete(key): remove the record stored under key.  Raises RuntimeError
+ * when the database is closed or tdb reports a failure. */
+static PyObject *obj_delete(PyTdbObject *self, PyObject *args)
+{
+       PyObject *py_key;
+       TDB_DATA key;
+
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       if (!PyArg_ParseTuple(args, "O", &py_key)) {
+               return NULL;
+       }
+
+       key = PyString_AsTDB_DATA(py_key);
+       if (key.dptr == NULL) {
+               return NULL;
+       }
+
+       if (tdb_delete(self->ctx, key) != 0) {
+               PyErr_SetTDBError(self->ctx);
+               return NULL;
+       }
+       Py_RETURN_NONE;
+}
+
+/* S.has_key(key): return True when key is present in the database and
+ * False otherwise.  Raises RuntimeError when the database is closed. */
+static PyObject *obj_has_key(PyTdbObject *self, PyObject *args)
+{
+       TDB_DATA key;
+       PyObject *py_key;
+       PyObject *result;
+
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       if (!PyArg_ParseTuple(args, "O", &py_key))
+               return NULL;
+
+       key = PyString_AsTDB_DATA(py_key);
+       if (!key.dptr)
+               return NULL;
+
+       /* tdb_exists() answers 1 (found) or 0 (not found).  It is not a tdb
+        * error code, so it must neither be compared with TDB_ERR_NOEXIST
+        * nor be passed to the error-raising macro. */
+       result = tdb_exists(self->ctx, key) ? Py_True : Py_False;
+
+       /* Py_True/Py_False are shared objects: return a new reference */
+       Py_INCREF(result);
+       return result;
+}
+
+/* S.store(key, data, flag=REPLACE): store data under key using the given
+ * tdb store flag.  Raises RuntimeError when the database is closed or tdb
+ * reports a failure. */
+static PyObject *obj_store(PyTdbObject *self, PyObject *args)
+{
+       PyObject *py_key, *py_value;
+       TDB_DATA key, value;
+       int flag = TDB_REPLACE;
+
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       if (!PyArg_ParseTuple(args, "OO|i", &py_key, &py_value, &flag)) {
+               return NULL;
+       }
+
+       key = PyString_AsTDB_DATA(py_key);
+       if (key.dptr == NULL) {
+               return NULL;
+       }
+
+       value = PyString_AsTDB_DATA(py_value);
+       if (value.dptr == NULL) {
+               return NULL;
+       }
+
+       if (tdb_store(self->ctx, key, value, flag) != 0) {
+               PyErr_SetTDBError(self->ctx);
+               return NULL;
+       }
+       Py_RETURN_NONE;
+}
+
+/* S.add_flags(flags): pass the given flag bits to tdb_add_flags(). */
+static PyObject *obj_add_flags(PyTdbObject *self, PyObject *args)
+{
+       unsigned flag_bits;
+
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       if (!PyArg_ParseTuple(args, "I", &flag_bits)) {
+               return NULL;
+       }
+
+       tdb_add_flags(self->ctx, flag_bits);
+       Py_RETURN_NONE;
+}
+
+/* S.remove_flags(flags): pass the given flag bits to tdb_remove_flags(). */
+static PyObject *obj_remove_flags(PyTdbObject *self, PyObject *args)
+{
+       unsigned flag_bits;
+
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       if (!PyArg_ParseTuple(args, "I", &flag_bits)) {
+               return NULL;
+       }
+
+       tdb_remove_flags(self->ctx, flag_bits);
+       Py_RETURN_NONE;
+}
+
+/* State of one key iteration over a Tdb object. */
+typedef struct {
+       PyObject_HEAD
+       /* key that the next call to tdb_iter_next() will return */
+       TDB_DATA current;
+       /* database being iterated; a reference is held while the iterator lives */
+       PyTdbObject *iteratee;
+} PyTdbIteratorObject;
+
+/* tp_iternext: return the key fetched ahead of time and look up the one
+ * following it.  Returns NULL without an exception when the keys are
+ * exhausted, and raises RuntimeError when the database has been closed
+ * while the iteration was still in progress. */
+static PyObject *tdb_iter_next(PyTdbIteratorObject *self)
+{
+       TDB_DATA current;
+
+       if (self->current.dptr == NULL && self->current.dsize == 0)
+               return NULL;
+
+       /* the context is gone once the database was closed; do not use it */
+       PyErr_TDB_RAISE_IF_CLOSED(self->iteratee);
+
+       current = self->current;
+       self->current = tdb_nextkey(self->iteratee->ctx, current);
+       /* PyString_FromTDB_DATA() frees the buffer of the returned key */
+       return PyString_FromTDB_DATA(current);
+}
+
+/* tp_dealloc: release the look-ahead key, the reference on the iterated
+ * database and the iterator itself. */
+static void tdb_iter_dealloc(PyTdbIteratorObject *self)
+{
+       /* A key that was fetched ahead but never handed to Python is still
+        * owned by the iterator (free(NULL) is a no-op once exhausted). */
+       free(self->current.dptr);
+       Py_DECREF(self->iteratee);
+       PyObject_Del(self);
+}
+
+/* Type of the key iterator created by tdb_object_iter(). */
+PyTypeObject PyTdbIterator = {
+       .tp_name = "Iterator",
+       .tp_basicsize = sizeof(PyTdbIteratorObject),
+       .tp_iternext = (iternextfunc)tdb_iter_next,
+       .tp_dealloc = (destructor)tdb_iter_dealloc,
+       .tp_flags = Py_TPFLAGS_DEFAULT,
+       /* the iterator is its own iterator */
+       .tp_iter = PyObject_SelfIter,
+};
+
+/* Create a key iterator positioned on the first key of the database.
+ * The iterator holds a reference on self for its whole lifetime. */
+static PyObject *tdb_object_iter(PyTdbObject *self)
+{
+       PyTdbIteratorObject *iter;
+
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       iter = PyObject_New(PyTdbIteratorObject, &PyTdbIterator);
+       if (iter == NULL) {
+               return NULL;
+       }
+
+       Py_INCREF(self);
+       iter->iteratee = self;
+       iter->current = tdb_firstkey(self->ctx);
+       return (PyObject *)iter;
+}
+
+/* S.clear(): wipe the whole database via tdb_wipe_all().  Raises
+ * RuntimeError when the database is closed or tdb reports a failure. */
+static PyObject *obj_clear(PyTdbObject *self)
+{
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       if (tdb_wipe_all(self->ctx) != 0) {
+               PyErr_SetTDBError(self->ctx);
+               return NULL;
+       }
+       Py_RETURN_NONE;
+}
+
+/* S.repack(): repack the database via tdb_repack().  Raises RuntimeError
+ * when the database is closed or tdb reports a failure. */
+static PyObject *obj_repack(PyTdbObject *self)
+{
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       if (tdb_repack(self->ctx) != 0) {
+               PyErr_SetTDBError(self->ctx);
+               return NULL;
+       }
+       Py_RETURN_NONE;
+}
+
+/* S.enable_seqnum(): call tdb_enable_seqnum() on the database.  Raises
+ * RuntimeError when the database is closed. */
+static PyObject *obj_enable_seqnum(PyTdbObject *self)
+{
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+       tdb_enable_seqnum(self->ctx);
+       Py_RETURN_NONE;
+}
+
+/* S.increment_seqnum_nonblock(): call tdb_increment_seqnum_nonblock() on
+ * the database.  Raises RuntimeError when the database is closed. */
+static PyObject *obj_increment_seqnum_nonblock(PyTdbObject *self)
+{
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+       tdb_increment_seqnum_nonblock(self->ctx);
+       Py_RETURN_NONE;
+}
+
+/* Methods of the Tdb type.  The calling convention flag of every entry has
+ * to match the C signature: functions that parse an argument tuple need
+ * METH_VARARGS, functions taking only self use METH_NOARGS. */
+static PyMethodDef tdb_object_methods[] = {
+       { "transaction_cancel", (PyCFunction)obj_transaction_cancel, METH_NOARGS, 
+               "S.transaction_cancel() -> None\n"
+               "Cancel the currently active transaction." },
+       { "transaction_commit", (PyCFunction)obj_transaction_commit, METH_NOARGS,
+               "S.transaction_commit() -> None\n"
+               "Commit the currently active transaction." },
+       { "transaction_prepare_commit", (PyCFunction)obj_transaction_prepare_commit, METH_NOARGS,
+               "S.transaction_prepare_commit() -> None\n"
+               "Prepare to commit the currently active transaction" },
+       { "transaction_start", (PyCFunction)obj_transaction_start, METH_NOARGS,
+               "S.transaction_start() -> None\n"
+               "Start a new transaction." },
+       { "reopen", (PyCFunction)obj_reopen, METH_NOARGS, "Reopen this file." },
+       { "lock_all", (PyCFunction)obj_lockall, METH_NOARGS, NULL },
+       { "unlock_all", (PyCFunction)obj_unlockall, METH_NOARGS, NULL },
+       { "read_lock_all", (PyCFunction)obj_lockall_read, METH_NOARGS, NULL },
+       { "read_unlock_all", (PyCFunction)obj_unlockall_read, METH_NOARGS, NULL },
+       { "close", (PyCFunction)obj_close, METH_NOARGS, NULL },
+       { "get", (PyCFunction)obj_get, METH_VARARGS, "S.get(key) -> value\n"
+               "Fetch a value." },
+       { "append", (PyCFunction)obj_append, METH_VARARGS, "S.append(key, value) -> None\n"
+               "Append data to an existing key." },
+       { "firstkey", (PyCFunction)obj_firstkey, METH_NOARGS, "S.firstkey() -> data\n"
+               "Return the first key in this database." },
+       /* obj_nextkey() parses a key argument, so it must be METH_VARARGS */
+       { "nextkey", (PyCFunction)obj_nextkey, METH_VARARGS, "S.nextkey(key) -> data\n"
+               "Return the next key in this database." },
+       { "delete", (PyCFunction)obj_delete, METH_VARARGS, "S.delete(key) -> None\n"
+               "Delete an entry." },
+       { "has_key", (PyCFunction)obj_has_key, METH_VARARGS, "S.has_key(key) -> bool\n"
+               "Check whether key exists in this database." },
+       { "store", (PyCFunction)obj_store, METH_VARARGS, "S.store(key, data, flag=REPLACE) -> None\n"
+               "Store data." },
+       { "add_flags", (PyCFunction)obj_add_flags, METH_VARARGS, "S.add_flags(flags) -> None" },
+       { "remove_flags", (PyCFunction)obj_remove_flags, METH_VARARGS, "S.remove_flags(flags) -> None" },
+       { "iterkeys", (PyCFunction)tdb_object_iter, METH_NOARGS, "S.iterkeys() -> iterator" },
+       { "clear", (PyCFunction)obj_clear, METH_NOARGS, "S.clear() -> None\n"
+               "Wipe the entire database." },
+       { "repack", (PyCFunction)obj_repack, METH_NOARGS, "S.repack() -> None\n"
+               "Repack the entire database." },
+       { "enable_seqnum", (PyCFunction)obj_enable_seqnum, METH_NOARGS,
+               "S.enable_seqnum() -> None" },
+       { "increment_seqnum_nonblock", (PyCFunction)obj_increment_seqnum_nonblock, METH_NOARGS,
+               "S.increment_seqnum_nonblock() -> None" },
+       { NULL }
+};
+
+/* Getter for S.hash_size: the value of tdb_hash_size() as a Python int. */
+static PyObject *obj_get_hash_size(PyTdbObject *self, void *closure)
+{
+       long hash_size;
+
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       hash_size = tdb_hash_size(self->ctx);
+       return PyInt_FromLong(hash_size);
+}
+
+/* Setter for S.max_dead: forward the integer to tdb_set_max_dead().
+ * Returns 0 on success and -1 with an exception set otherwise:
+ * RuntimeError for a closed database, TypeError when the attribute is
+ * deleted or assigned a non-integer. */
+static int obj_set_max_dead(PyTdbObject *self, PyObject *max_dead, void *closure)
+{
+       PyErr_TDB_RAISE_RETURN_MINUS_1_IF_CLOSED(self);
+
+       /* the setter is called with NULL for "del S.max_dead" */
+       if (max_dead == NULL) {
+               PyErr_SetString(PyExc_TypeError, "Cannot delete the max_dead attribute");
+               return -1;
+       }
+
+       /* returning -1 without an exception set would make the interpreter
+        * raise a SystemError, so report the type problem explicitly */
+       if (!PyInt_Check(max_dead)) {
+               PyErr_SetString(PyExc_TypeError, "Expected integer for max_dead");
+               return -1;
+       }
+
+       tdb_set_max_dead(self->ctx, PyInt_AsLong(max_dead));
+       return 0;
+}
+
+/* Getter for S.map_size: the value of tdb_map_size() as a Python int. */
+static PyObject *obj_get_map_size(PyTdbObject *self, void *closure)
+{
+       long map_size;
+
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       map_size = tdb_map_size(self->ctx);
+       return PyInt_FromLong(map_size);
+}
+
+/* Getter for S.freelist_size: the value of tdb_freelist_size() as a
+ * Python int. */
+static PyObject *obj_get_freelist_size(PyTdbObject *self, void *closure)
+{
+       long freelist_size;
+
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       freelist_size = tdb_freelist_size(self->ctx);
+       return PyInt_FromLong(freelist_size);
+}
+
+/* Getter for S.flags: the value of tdb_get_flags() as a Python int. */
+static PyObject *obj_get_flags(PyTdbObject *self, void *closure)
+{
+       long flag_bits;
+
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       flag_bits = tdb_get_flags(self->ctx);
+       return PyInt_FromLong(flag_bits);
+}
+
+/* Getter for S.filename: the value of tdb_name() as a Python string. */
+static PyObject *obj_get_filename(PyTdbObject *self, void *closure)
+{
+       const char *filename;
+
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       filename = tdb_name(self->ctx);
+       return PyString_FromString(filename);
+}
+
+/* Getter for S.seqnum: the value of tdb_get_seqnum() as a Python int. */
+static PyObject *obj_get_seqnum(PyTdbObject *self, void *closure)
+{
+       long seqnum;
+
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       seqnum = tdb_get_seqnum(self->ctx);
+       return PyInt_FromLong(seqnum);
+}
+
+
+/* Attributes of the Tdb type; each entry is { name, getter, setter, doc }. */
+static PyGetSetDef tdb_object_getsetters[] = {
+       { (char *)"hash_size", (getter)obj_get_hash_size, NULL, NULL },
+       { (char *)"map_size", (getter)obj_get_map_size, NULL, NULL },
+       { (char *)"freelist_size", (getter)obj_get_freelist_size, NULL, NULL },
+       { (char *)"flags", (getter)obj_get_flags, NULL, NULL },
+       /* max_dead has a setter but no getter */
+       { (char *)"max_dead", NULL, (setter)obj_set_max_dead, NULL },
+       { (char *)"filename", (getter)obj_get_filename, NULL, (char *)"The filename of this TDB file."},
+       { (char *)"seqnum", (getter)obj_get_seqnum, NULL, NULL },
+       { NULL }
+};
+
+/* repr(S): "Tdb(<internal>)" for databases carrying TDB_INTERNAL,
+ * "Tdb('<name>')" otherwise.  Raises RuntimeError on a closed database. */
+static PyObject *tdb_object_repr(PyTdbObject *self)
+{
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       if ((tdb_get_flags(self->ctx) & TDB_INTERNAL) == 0) {
+               return PyString_FromFormat("Tdb('%s')", tdb_name(self->ctx));
+       }
+
+       /* internal databases are not printed with a file name */
+       return PyString_FromString("Tdb(<internal>)");
+}
+
+/* tp_dealloc: close the database unless close() already did, then free
+ * the object through its type's tp_free. */
+static void tdb_object_dealloc(PyTdbObject *self)
+{
+       if (self->closed == false) {
+               tdb_close(self->ctx);
+       }
+
+       self->ob_type->tp_free(self);
+}
+
+/* S[key]: return the record stored under key.  Raises TypeError for a
+ * non-string key, KeyError when tdb_fetch() yields no data and
+ * RuntimeError on a closed database. */
+static PyObject *obj_getitem(PyTdbObject *self, PyObject *key)
+{
+       TDB_DATA lookup, found;
+
+       PyErr_TDB_RAISE_IF_CLOSED(self);
+
+       if (!PyString_Check(key)) {
+               PyErr_SetString(PyExc_TypeError, "Expected string as key");
+               return NULL;
+       }
+
+       lookup.dptr = (unsigned char *)PyString_AsString(key);
+       lookup.dsize = PyString_Size(key);
+
+       found = tdb_fetch(self->ctx, lookup);
+       if (found.dptr != NULL) {
+               return PyString_FromTDB_DATA(found);
+       }
+
+       PyErr_SetString(PyExc_KeyError, "No such TDB entry");
+       return NULL;
+}
+
+/* S[key] = value and del S[key].  Returns 0 on success and -1 with an
+ * exception set otherwise: TypeError for non-string key or value,
+ * RuntimeError for a closed database or a tdb failure. */
+static int obj_setitem(PyTdbObject *self, PyObject *key, PyObject *value)
+{
+       TDB_DATA tkey;
+       int rc;
+
+       PyErr_TDB_RAISE_RETURN_MINUS_1_IF_CLOSED(self);
+
+       if (!PyString_Check(key)) {
+               PyErr_SetString(PyExc_TypeError, "Expected string as key");
+               return -1;
+       }
+       tkey = PyString_AsTDB_DATA(key);
+
+       if (value != NULL) {
+               TDB_DATA tval;
+
+               if (!PyString_Check(value)) {
+                       PyErr_SetString(PyExc_TypeError, "Expected string as value");
+                       return -1;
+               }
+               tval = PyString_AsTDB_DATA(value);
+               rc = tdb_store(self->ctx, tkey, tval, TDB_REPLACE);
+       } else {
+               /* a NULL value is handled as deletion of the item */
+               rc = tdb_delete(self->ctx, tkey);
+       }
+
+       if (rc != 0) {
+               PyErr_SetTDBError(self->ctx);
+               return -1;
+       }
+
+       return 0;
+}
+
+/* Mapping protocol of the Tdb type: S[key], S[key] = value, del S[key]. */
+static PyMappingMethods tdb_object_mapping = {
+       .mp_subscript = (binaryfunc)obj_getitem,
+       .mp_ass_subscript = (objobjargproc)obj_setitem,
+};
+/* The tdb.Tdb type: methods, attributes, mapping access and key iteration. */
+static PyTypeObject PyTdb = {
+       .tp_name = "tdb.Tdb",
+       .tp_basicsize = sizeof(PyTdbObject),
+       .tp_methods = tdb_object_methods,
+       .tp_getset = tdb_object_getsetters,
+       /* constructing a Tdb opens the database */
+       .tp_new = py_tdb_open,
+       .tp_doc = "A TDB file",
+       .tp_repr = (reprfunc)tdb_object_repr,
+       .tp_dealloc = (destructor)tdb_object_dealloc,
+       .tp_as_mapping = &tdb_object_mapping,
+       .tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE|Py_TPFLAGS_HAVE_ITER,
+       /* iter(S) iterates over the keys, like S.iterkeys() */
+       .tp_iter = (getiterfunc)tdb_object_iter,
+};
+
+/* Module-level functions.  "open" shares its implementation with the
+ * Tdb constructor. */
+static PyMethodDef tdb_methods[] = {
+       { "open", (PyCFunction)py_tdb_open, METH_VARARGS|METH_KEYWORDS, "open(name, hash_size=0, tdb_flags=TDB_DEFAULT, flags=O_RDWR, mode=0600)\n"
+               "Open a TDB file." },
+       { NULL }
+};
+
+/* Module initialisation: prepares the two types, creates the "tdb"
+ * module and publishes the constants and the Tdb type.  Returns early,
+ * without further reporting, when a type or the module cannot be set up. */
+void inittdb(void);
+void inittdb(void)
+{
+       PyObject *m;
+
+       if (PyType_Ready(&PyTdb) < 0)
+               return;
+
+       if (PyType_Ready(&PyTdbIterator) < 0)
+               return;
+
+       m = Py_InitModule3("tdb", tdb_methods,
+               "simple key-value database that supports multiple writers.");
+       if (m == NULL)
+               return;
+
+       /* flag values for the store() method */
+       PyModule_AddObject(m, "REPLACE", PyInt_FromLong(TDB_REPLACE));
+       PyModule_AddObject(m, "INSERT", PyInt_FromLong(TDB_INSERT));
+       PyModule_AddObject(m, "MODIFY", PyInt_FromLong(TDB_MODIFY));
+
+       /* tdb flags as accepted by open() / add_flags() / remove_flags() */
+       PyModule_AddObject(m, "DEFAULT", PyInt_FromLong(TDB_DEFAULT));
+       PyModule_AddObject(m, "CLEAR_IF_FIRST", PyInt_FromLong(TDB_CLEAR_IF_FIRST));
+       PyModule_AddObject(m, "INTERNAL", PyInt_FromLong(TDB_INTERNAL));
+       PyModule_AddObject(m, "NOLOCK", PyInt_FromLong(TDB_NOLOCK));
+       PyModule_AddObject(m, "NOMMAP", PyInt_FromLong(TDB_NOMMAP));
+       PyModule_AddObject(m, "CONVERT", PyInt_FromLong(TDB_CONVERT));
+       PyModule_AddObject(m, "BIGENDIAN", PyInt_FromLong(TDB_BIGENDIAN));
+       PyModule_AddObject(m, "NOSYNC", PyInt_FromLong(TDB_NOSYNC));
+       PyModule_AddObject(m, "SEQNUM", PyInt_FromLong(TDB_SEQNUM));
+       PyModule_AddObject(m, "VOLATILE", PyInt_FromLong(TDB_VOLATILE));
+       PyModule_AddObject(m, "ALLOW_NESTING", PyInt_FromLong(TDB_ALLOW_NESTING));
+       PyModule_AddObject(m, "DISALLOW_NESTING", PyInt_FromLong(TDB_DISALLOW_NESTING));
+       PyModule_AddObject(m, "INCOMPATIBLE_HASH", PyInt_FromLong(TDB_INCOMPATIBLE_HASH));
+
+       PyModule_AddObject(m, "__docformat__", PyString_FromString("restructuredText"));
+
+       PyModule_AddObject(m, "__version__", PyString_FromString(PACKAGE_VERSION));
+
+       /* the reference taken here is handed over to the module */
+       Py_INCREF(&PyTdb);
+       PyModule_AddObject(m, "Tdb", (PyObject *)&PyTdb);
+
+       /* NOTE(review): the iterator type gets an extra reference but is
+        * not added to the module - confirm this is intended */
+       Py_INCREF(&PyTdbIterator);
+}
diff --git a/ctdb/lib/tdb/python/tdbdump.py b/ctdb/lib/tdb/python/tdbdump.py
new file mode 100644 (file)
index 0000000..01859eb
--- /dev/null
@@ -0,0 +1,12 @@
+#!/usr/bin/env python
+# Trivial reimplementation of tdbdump in Python
+
+import tdb, sys
+
+if len(sys.argv) < 2:
+    print "Usage: tdbdump.py <tdb-file>"
+    sys.exit(1)
+
+db = tdb.Tdb(sys.argv[1])
+for (k, v) in db.iteritems():
+    print "{\nkey(%d) = %r\ndata(%d) = %r\n}" % (len(k), k, len(v), v)
diff --git a/ctdb/lib/tdb/python/tests/simple.py b/ctdb/lib/tdb/python/tests/simple.py
new file mode 100644 (file)
index 0000000..7e295a8
--- /dev/null
@@ -0,0 +1,204 @@
+#!/usr/bin/env python
+# Some simple tests for the Python bindings for TDB
+# Note that this tests the interface of the Python bindings
+# It does not test tdb itself.
+#
+# Copyright (C) 2007-2008 Jelmer Vernooij <jelmer@samba.org>
+# Published under the GNU LGPLv3 or later
+
+import tdb
+from unittest import TestCase
+import os, tempfile
+
+
+class OpenTdbTests(TestCase):
+
+    def test_nonexistent_read(self):
+        self.assertRaises(IOError, tdb.Tdb, "/some/nonexistent/file", 0,
+                tdb.DEFAULT, os.O_RDWR)
+
+class CloseTdbTests(TestCase):
+
+    def test_double_close(self):
+        # No hash size in tdb2.
+        if tdb.__version__.startswith("2"):
+            self.tdb = tdb.Tdb(tempfile.mkstemp()[1], tdb.DEFAULT,
+                               os.O_CREAT|os.O_RDWR)
+        else:
+            self.tdb = tdb.Tdb(tempfile.mkstemp()[1], 0, tdb.DEFAULT,
+                               os.O_CREAT|os.O_RDWR)
+        self.assertNotEqual(None, self.tdb)
+
+        # ensure that double close does not crash python
+        self.tdb.close()
+        self.tdb.close()
+
+        # Check that further operations do not crash python
+        self.assertRaises(RuntimeError, lambda: self.tdb.transaction_start())
+
+        self.assertRaises(RuntimeError, lambda: self.tdb["bar"])
+
+
+class InternalTdbTests(TestCase):
+
+    def test_repr(self):
+        self.tdb = tdb.Tdb()
+
+        # repr used to crash on internal db
+        self.assertEquals(repr(self.tdb), "Tdb(<internal>)")
+
+
+class SimpleTdbTests(TestCase):
+
+    def setUp(self):
+        super(SimpleTdbTests, self).setUp()
+        if tdb.__version__.startswith("2"):
+            self.tdb = tdb.Tdb(tempfile.mkstemp()[1], tdb.DEFAULT,
+                               os.O_CREAT|os.O_RDWR)
+        else:
+            self.tdb = tdb.Tdb(tempfile.mkstemp()[1], 0, tdb.DEFAULT,
+                               os.O_CREAT|os.O_RDWR)
+        self.assertNotEqual(None, self.tdb)
+
+    def tearDown(self):
+        del self.tdb
+
+    def test_repr(self):
+        self.assertTrue(repr(self.tdb).startswith("Tdb('"))
+
+    def test_lockall(self):
+        self.tdb.lock_all()
+
+    def test_max_dead(self):
+        if not tdb.__version__.startswith("2"):
+            self.tdb.max_dead = 20
+
+    def test_unlockall(self):
+        self.tdb.lock_all()
+        self.tdb.unlock_all()
+
+    def test_lockall_read(self):
+        self.tdb.read_lock_all()
+        self.tdb.read_unlock_all()
+
+    def test_reopen(self):
+        if not tdb.__version__.startswith("2"):
+            self.tdb.reopen()
+
+    def test_store(self):
+        self.tdb.store("bar", "bla")
+        self.assertEquals("bla", self.tdb.get("bar"))
+
+    def test_getitem(self):
+        self.tdb["bar"] = "foo"
+        if not tdb.__version__.startswith("2"):
+            self.tdb.reopen()
+        self.assertEquals("foo", self.tdb["bar"])
+
+    def test_delete(self):
+        self.tdb["bar"] = "foo"
+        del self.tdb["bar"]
+        self.assertRaises(KeyError, lambda: self.tdb["bar"])
+    
+    def test_contains(self):
+        self.tdb["bla"] = "bloe"
+        self.assertTrue("bla" in self.tdb)
+
+    def test_keyerror(self):
+        self.assertRaises(KeyError, lambda: self.tdb["bla"])
+
+    def test_hash_size(self):
+        if not tdb.__version__.startswith("2"):
+            self.tdb.hash_size
+
+    def test_map_size(self):
+        if not tdb.__version__.startswith("2"):
+            self.tdb.map_size
+
+    def test_freelist_size(self):
+        if not tdb.__version__.startswith("2"):
+            self.tdb.freelist_size
+
+    def test_name(self):
+        self.tdb.filename
+
+    def test_iterator(self):
+        self.tdb["bla"] = "1"
+        self.tdb["brainslug"] = "2"
+        l = list(self.tdb)
+        l.sort()
+        self.assertEquals(["bla", "brainslug"], l)
+
+    def test_transaction_cancel(self):
+        self.tdb["bloe"] = "2"
+        self.tdb.transaction_start()
+        self.tdb["bloe"] = "1"
+        self.tdb.transaction_cancel()
+        self.assertEquals("2", self.tdb["bloe"])
+
+    def test_transaction_commit(self):
+        self.tdb["bloe"] = "2"
+        self.tdb.transaction_start()
+        self.tdb["bloe"] = "1"
+        self.tdb.transaction_commit()
+        self.assertEquals("1", self.tdb["bloe"])
+
+    def test_transaction_prepare_commit(self):
+        self.tdb["bloe"] = "2"
+        self.tdb.transaction_start()
+        self.tdb["bloe"] = "1"
+        self.tdb.transaction_prepare_commit()
+        self.tdb.transaction_commit()
+        self.assertEquals("1", self.tdb["bloe"])
+
+    def test_iterkeys(self):
+        self.tdb["bloe"] = "2"
+        self.tdb["bla"] = "25"
+        i = self.tdb.iterkeys()
+        self.assertEquals(set(["bloe", "bla"]), set([i.next(), i.next()]))
+
+    def test_clear(self):
+        self.tdb["bloe"] = "2"
+        self.tdb["bla"] = "25"
+        self.assertEquals(2, len(list(self.tdb)))
+        self.tdb.clear()
+        self.assertEquals(0, len(list(self.tdb)))
+
+    def test_repack(self):
+        if not tdb.__version__.startswith("2"):
+            self.tdb["foo"] = "abc"
+            self.tdb["bar"] = "def"
+            del self.tdb["foo"]
+            self.tdb.repack()
+
+    def test_seqnum(self):
+        if not tdb.__version__.startswith("2"):
+            self.tdb.enable_seqnum()
+            seq1 = self.tdb.seqnum
+            self.tdb.increment_seqnum_nonblock()
+            seq2 = self.tdb.seqnum
+            self.assertEquals(seq2-seq1, 1)
+
+    def test_len(self):
+        self.assertEquals(0, len(list(self.tdb)))
+        self.tdb["entry"] = "value"
+        self.assertEquals(1, len(list(self.tdb)))
+
+    def test_add_flags(self):
+        if tdb.__version__.startswith("2"):
+            self.tdb.add_flag(tdb.NOMMAP)
+            self.tdb.remove_flag(tdb.NOMMAP)
+        else:
+            self.tdb.add_flags(tdb.NOMMAP)
+            self.tdb.remove_flags(tdb.NOMMAP)
+
+
+class VersionTests(TestCase):
+
+    def test_present(self):
+        self.assertTrue(isinstance(tdb.__version__, str))
+
+
+if __name__ == '__main__':
+    import unittest
+    unittest.TestProgram()
diff --git a/ctdb/lib/tdb/tdb.pc.in b/ctdb/lib/tdb/tdb.pc.in
new file mode 100644 (file)
index 0000000..b78419e
--- /dev/null
@@ -0,0 +1,11 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: tdb
+Description: A trivial database
+Version: @PACKAGE_VERSION@
+Libs: @LIB_RPATH@ -L${libdir} -ltdb
+Cflags: -I${includedir} 
+URL: http://tdb.samba.org/
diff --git a/ctdb/lib/tdb/test/external-agent.c b/ctdb/lib/tdb/test/external-agent.c
new file mode 100644 (file)
index 0000000..8140e70
--- /dev/null
@@ -0,0 +1,198 @@
+#include "external-agent.h"
+#include "lock-tracking.h"
+#include "logging.h"
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <string.h>
+#include <errno.h>
+#include "../common/tdb_private.h"
+#include "tap-interface.h"
+#include <stdio.h>
+#include <stdarg.h>
+
+static struct tdb_context *tdb;
+
+/*
+ * Execute one operation in the agent process against the file-scope
+ * 'tdb' handle and report how it went.
+ *
+ * op:   the operation to perform.
+ * name: tdb file name for OPEN / OPEN_WITH_CLEAR_IF_FIRST; for FETCH and
+ *       STORE it is used as the record key (STORE also writes it as the
+ *       record data, and FETCH only succeeds if the data equals the key).
+ *
+ * Returns WOULD_HAVE_BLOCKED whenever locking_would_block was raised
+ * during the operation, regardless of the operation's own result.
+ */
+static enum agent_return do_operation(enum operation op, const char *name)
+{
+       TDB_DATA k;
+       enum agent_return ret;
+       TDB_DATA data;
+
+       /* Everything except the two open operations needs an open tdb. */
+       if (op != OPEN && op != OPEN_WITH_CLEAR_IF_FIRST && !tdb) {
+               diag("external: No tdb open!");
+               return OTHER_FAILURE;
+       }
+
+       /* The key is the name without its terminating NUL. */
+       k.dptr = (void *)name;
+       k.dsize = strlen(name);
+
+       /* Reset so the check after the switch only sees this operation. */
+       locking_would_block = 0;
+       switch (op) {
+       case OPEN:
+               if (tdb) {
+                       diag("Already have tdb %s open", tdb_name(tdb));
+                       return OTHER_FAILURE;
+               }
+               tdb = tdb_open_ex(name, 0, TDB_DEFAULT, O_RDWR, 0,
+                                 &taplogctx, NULL);
+               if (!tdb) {
+                       /* A lock that would have blocked is reported via
+                        * the return value instead of a diagnostic. */
+                       if (!locking_would_block)
+                               diag("Opening tdb gave %s", strerror(errno));
+                       ret = OTHER_FAILURE;
+               } else
+                       ret = SUCCESS;
+               break;
+       case OPEN_WITH_CLEAR_IF_FIRST:
+               if (tdb)
+                       return OTHER_FAILURE;
+               tdb = tdb_open_ex(name, 0, TDB_CLEAR_IF_FIRST, O_RDWR, 0,
+                                 &taplogctx, NULL);
+               ret = tdb ? SUCCESS : OTHER_FAILURE;
+               break;
+       case TRANSACTION_START:
+               ret = tdb_transaction_start(tdb) == 0 ? SUCCESS : OTHER_FAILURE;
+               break;
+       case FETCH:
+               data = tdb_fetch(tdb, k);
+               if (data.dptr == NULL) {
+                       /* FAILED means "no such record"; anything else is
+                        * an unexpected error. */
+                       if (tdb_error(tdb) == TDB_ERR_NOEXIST)
+                               ret = FAILED;
+                       else
+                               ret = OTHER_FAILURE;
+               } else if (data.dsize != k.dsize
+                          || memcmp(data.dptr, k.dptr, k.dsize) != 0) {
+                       /* Record exists but does not hold its own key. */
+                       ret = OTHER_FAILURE;
+               } else {
+                       ret = SUCCESS;
+               }
+               free(data.dptr);
+               break;
+       case STORE:
+               /* The key doubles as the stored data. */
+               ret = tdb_store(tdb, k, k, 0) == 0 ? SUCCESS : OTHER_FAILURE;
+               break;
+       case TRANSACTION_COMMIT:
+               ret = tdb_transaction_commit(tdb)==0 ? SUCCESS : OTHER_FAILURE;
+               break;
+       case CHECK:
+               ret = tdb_check(tdb, NULL, NULL) == 0 ? SUCCESS : OTHER_FAILURE;
+               break;
+       case NEEDS_RECOVERY:
+               ret = tdb_needs_recovery(tdb) ? SUCCESS : FAILED;
+               break;
+       case CLOSE:
+               ret = tdb_close(tdb) == 0 ? SUCCESS : OTHER_FAILURE;
+               /* The handle is dropped whether or not the close succeeded. */
+               tdb = NULL;
+               break;
+       default:
+               ret = OTHER_FAILURE;
+       }
+
+       /* A would-block lock failure takes precedence over any other result. */
+       if (locking_would_block)
+               ret = WOULD_HAVE_BLOCKED;
+
+       return ret;
+}
+
+/* Handle returned to the parent by prepare_external_agent(). */
+struct agent {
+       /* cmdfd: write end of the command pipe (requests to the agent).
+        * responsefd: read end of the response pipe (results from it). */
+       int cmdfd, responsefd;
+};
+
+/* Do this before doing any tdb stuff.  Return handle, or NULL.
+ *
+ * Forks a helper process connected to the caller by two pipes.  The
+ * parent gets back a handle holding its pipe ends, or NULL if the handle
+ * cannot be allocated.  The child never returns: it serves one request
+ * per read() until the command pipe hits EOF or a read fails, then exits.
+ * A pipe() or fork() failure terminates the calling process via exit(1). */
+struct agent *prepare_external_agent(void)
+{
+       int pid, ret;
+       int command[2], response[2];
+       /* One spare byte so a request can always be NUL-terminated. */
+       char name[1+PATH_MAX+1];
+
+       if (pipe(command) != 0 || pipe(response) != 0) {
+               fprintf(stderr, "pipe failed: %s\n", strerror(errno));
+               exit(1);
+       }
+
+       pid = fork();
+       if (pid < 0) {
+               fprintf(stderr, "fork failed: %s\n", strerror(errno));
+               exit(1);
+       }
+
+       if (pid != 0) {
+               /* Parent: keep the write end of 'command' and the read
+                * end of 'response'. */
+               struct agent *agent = malloc(sizeof(*agent));
+
+               close(command[0]);
+               close(response[1]);
+               if (!agent) {
+                       /* Closing our end of the command pipe makes the
+                        * child's read() return 0, so it exits. */
+                       close(command[1]);
+                       close(response[0]);
+                       return NULL;
+               }
+               agent->cmdfd = command[1];
+               agent->responsefd = response[0];
+               return agent;
+       }
+
+       /* Child: keep the opposite ends. */
+       close(command[1]);
+       close(response[0]);
+
+       /* We want to fail, not block. */
+       nonblocking_locks = true;
+       log_prefix = "external: ";
+       while ((ret = read(command[0], name, sizeof(name) - 1)) > 0) {
+               enum agent_return result;
+
+               /* Byte 0 is the operation, the rest is its name.  Do not
+                * rely on the sender having included the terminator. */
+               name[ret] = '\0';
+               result = do_operation(name[0], name+1);
+               if (write(response[1], &result, sizeof(result))
+                   != sizeof(result))
+                       abort();
+       }
+       exit(0);
+}
+
+/* Ask the external agent to try to do an operation.
+ *
+ * Sends one request (a single operation byte followed by the
+ * NUL-terminated name; a NULL name is sent as "") and waits for the
+ * agent's reply.
+ *
+ * Returns the agent's result, AGENT_DIED if the request could not be
+ * written in full or no complete reply could be read, or OTHER_FAILURE
+ * if the request buffer could not be allocated. */
+enum agent_return external_agent_operation(struct agent *agent,
+                                          enum operation op,
+                                          const char *name)
+{
+       enum agent_return res;
+       size_t len;
+       char *string;
+
+       if (!name)
+               name = "";
+       /* Operation byte + name + terminating NUL. */
+       len = 1 + strlen(name) + 1;
+       string = malloc(len);
+       if (!string)
+               return OTHER_FAILURE;
+
+       string[0] = op;
+       strcpy(string+1, name);
+
+       if (write(agent->cmdfd, string, len) != (ssize_t)len
+           || read(agent->responsefd, &res, sizeof(res))
+              != (ssize_t)sizeof(res))
+               res = AGENT_DIED;
+
+       free(string);
+       return res;
+}
+
+/* Map an agent result code to its enumerator name; any value that is not
+ * a known result yields "**INVALID**". */
+const char *agent_return_name(enum agent_return ret)
+{
+       switch (ret) {
+       case SUCCESS: return "SUCCESS";
+       case WOULD_HAVE_BLOCKED: return "WOULD_HAVE_BLOCKED";
+       case AGENT_DIED: return "AGENT_DIED";
+       case FAILED: return "FAILED";
+       case OTHER_FAILURE: return "OTHER_FAILURE";
+       }
+       return "**INVALID**";
+}
+
+/* Map an operation code to its enumerator name; any value that is not a
+ * known operation yields "**INVALID**". */
+const char *operation_name(enum operation op)
+{
+       static const char *const names[] = {
+               [OPEN] = "OPEN",
+               [OPEN_WITH_CLEAR_IF_FIRST] = "OPEN_WITH_CLEAR_IF_FIRST",
+               [TRANSACTION_START] = "TRANSACTION_START",
+               [FETCH] = "FETCH",
+               [STORE] = "STORE",
+               [TRANSACTION_COMMIT] = "TRANSACTION_COMMIT",
+               [CHECK] = "CHECK",
+               [NEEDS_RECOVERY] = "NEEDS_RECOVERY",
+               [CLOSE] = "CLOSE",
+       };
+       /* Unsigned so that a negative code also lands out of range. */
+       unsigned int idx = (unsigned int)op;
+
+       if (idx < sizeof(names) / sizeof(names[0]) && names[idx])
+               return names[idx];
+       return "**INVALID**";
+}
diff --git a/ctdb/lib/tdb/test/external-agent.h b/ctdb/lib/tdb/test/external-agent.h
new file mode 100644 (file)
index 0000000..dffdca9
--- /dev/null
@@ -0,0 +1,41 @@
+#ifndef TDB_TEST_EXTERNAL_AGENT_H
+#define TDB_TEST_EXTERNAL_AGENT_H
+
+/* For locking tests, we need a different process to try things at
+ * various times.
+ *
+ * Each value is sent to the agent as the first byte of a request. */
+enum operation {
+       /* Open the named tdb read-write with default flags. */
+       OPEN,
+       /* Open the named tdb read-write with TDB_CLEAR_IF_FIRST. */
+       OPEN_WITH_CLEAR_IF_FIRST,
+       /* Start a transaction on the open tdb. */
+       TRANSACTION_START,
+       /* Fetch the record keyed by name; its data must equal the key. */
+       FETCH,
+       /* Store a record whose key and data are both the name. */
+       STORE,
+       /* Commit the current transaction. */
+       TRANSACTION_COMMIT,
+       /* Run tdb_check() on the open tdb. */
+       CHECK,
+       /* Ask whether the open tdb needs recovery. */
+       NEEDS_RECOVERY,
+       /* Close the open tdb. */
+       CLOSE,
+};
+
+/* Do this before doing any tdb stuff.  Return handle, or NULL. */
+struct agent *prepare_external_agent(void);
+
+/* Outcome of an operation requested from the external agent. */
+enum agent_return {
+       /* The operation worked (for NEEDS_RECOVERY: recovery is needed). */
+       SUCCESS,
+       /* A lock was refused that would otherwise have been waited for. */
+       WOULD_HAVE_BLOCKED,
+       /* The request could not be sent or no full reply came back. */
+       AGENT_DIED,
+       FAILED, /* For fetch, or NEEDS_RECOVERY */
+       /* Any other error, including an unknown operation code. */
+       OTHER_FAILURE,
+};
+
+/* Ask the external agent to try to do an operation.
+ * name == tdb name for OPEN/OPEN_WITH_CLEAR_IF_FIRST,
+ * record name for FETCH/STORE (store stores name as data too)
+ */
+enum agent_return external_agent_operation(struct agent *handle,
+                                          enum operation op,
+                                          const char *name);
+
+/* Mapping enum -> string. */
+const char *agent_return_name(enum agent_return ret);
+const char *operation_name(enum operation op);
+
+#endif /* TDB_TEST_EXTERNAL_AGENT_H */
diff --git a/ctdb/lib/tdb/test/jenkins-be-hash.tdb b/ctdb/lib/tdb/test/jenkins-be-hash.tdb
new file mode 100644 (file)
index 0000000..b652840
Binary files /dev/null and b/ctdb/lib/tdb/test/jenkins-be-hash.tdb differ
diff --git a/ctdb/lib/tdb/test/jenkins-le-hash.tdb b/ctdb/lib/tdb/test/jenkins-le-hash.tdb
new file mode 100644 (file)
index 0000000..007e0a3
Binary files /dev/null and b/ctdb/lib/tdb/test/jenkins-le-hash.tdb differ
diff --git a/ctdb/lib/tdb/test/lock-tracking.c b/ctdb/lib/tdb/test/lock-tracking.c
new file mode 100644 (file)
index 0000000..90a07f8
--- /dev/null
@@ -0,0 +1,146 @@
+/* We save the locks so we can reacquire them. */
+#include "../common/tdb_private.h"
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include "tap-interface.h"
+#include "lock-tracking.h"
+
+/* One tracked byte-range lock, kept in a singly linked list. */
+struct testlock {
+       struct testlock *next;
+       /* Start offset and length as passed to fcntl; len == 0 is treated
+        * as "extends to the end" by the overlap checks. */
+       unsigned int off;
+       unsigned int len;
+       /* The l_type the lock was taken with (e.g. F_RDLCK or F_WRLCK). */
+       int type;
+};
+/* Head of the list of locks currently believed to be held. */
+static struct testlock *testlocks;
+/* Count of inconsistencies (unknown unlocks, overlapping locks) seen. */
+int locking_errors = 0;
+/* When true, inconsistencies are neither reported nor counted. */
+bool suppress_lockcheck = false;
+/* When true, F_SETLKW requests are issued as F_SETLK instead. */
+bool nonblocking_locks;
+/* Count of locks that failed only because they were made non-blocking. */
+int locking_would_block = 0;
+/* Optional hook invoked after each successful unlock. */
+void (*unlock_callback)(int fd);
+
+/*
+ * fcntl() replacement that records every byte-range lock taken or
+ * released through it, so tests can detect unmatched unlocks and
+ * overlapping locks.
+ *
+ * Commands other than F_SETLK / F_SETLKW are passed straight through.
+ * When nonblocking_locks is set, F_SETLKW is issued as F_SETLK and a
+ * resulting EAGAIN/EACCES failure is counted in locking_would_block.
+ *
+ * Returns whatever the underlying fcntl() returned.
+ */
+int fcntl_with_lockcheck(int fd, int cmd, ... /* arg */ )
+{
+       va_list ap;
+       int ret, arg3;
+       struct flock *fl;
+       bool may_block = false;
+
+       if (cmd != F_SETLK && cmd != F_SETLKW) {
+               /* This may be totally bogus, but we don't know in general. */
+               va_start(ap, cmd);
+               arg3 = va_arg(ap, int);
+               va_end(ap);
+
+               return fcntl(fd, cmd, arg3);
+       }
+
+       va_start(ap, cmd);
+       fl = va_arg(ap, struct flock *);
+       va_end(ap);
+
+       /* Turn a blocking request into a non-blocking one, remembering
+        * that a failure may only mean "would have had to wait". */
+       if (cmd == F_SETLKW && nonblocking_locks) {
+               cmd = F_SETLK;
+               may_block = true;
+       }
+       ret = fcntl(fd, cmd, fl);
+
+       /* Detect when we failed, but might have been OK if we waited. */
+       if (may_block && ret == -1 && (errno == EAGAIN || errno == EACCES)) {
+               locking_would_block++;
+       }
+
+       if (fl->l_type == F_UNLCK) {
+               struct testlock **l;
+               struct testlock *old = NULL;
+
+               /* Look for a tracked lock with exactly this offset and
+                * length; forget it only if the unlock really succeeded. */
+               for (l = &testlocks; *l; l = &(*l)->next) {
+                       if ((*l)->off == fl->l_start
+                           && (*l)->len == fl->l_len) {
+                               if (ret == 0) {
+                                       old = *l;
+                                       *l = (*l)->next;
+                                       free(old);
+                               }
+                               break;
+                       }
+               }
+               /* 'old' is only used as a flag from here on.  It is also
+                * unset when a matching lock was found but the unlock
+                * itself failed, so that case is reported as well. */
+               if (!old && !suppress_lockcheck) {
+                       diag("Unknown unlock %u@%u - %i",
+                            (int)fl->l_len, (int)fl->l_start, ret);
+                       locking_errors++;
+               }
+       } else {
+               struct testlock *new, *i;
+               /* A zero length means "to the end": use the maximum value
+                * as the end of the range. */
+               unsigned int fl_end = fl->l_start + fl->l_len;
+               if (fl->l_len == 0)
+                       fl_end = (unsigned int)-1;
+
+               /* Check for overlaps: we shouldn't do this. */
+               for (i = testlocks; i; i = i->next) {
+                       unsigned int i_end = i->off + i->len;
+                       if (i->len == 0)
+                               i_end = (unsigned int)-1;
+
+                       /* Leaving the loop with i != NULL means the new
+                        * range's start or end falls inside lock i. */
+                       if (fl->l_start >= i->off && fl->l_start < i_end)
+                               break;
+                       if (fl_end >= i->off && fl_end < i_end)
+                               break;
+
+                       /* tdb_allrecord_lock does this, handle adjacent: */
+                       if (fl->l_start == i_end && fl->l_type == i->type) {
+                               /* Extend the existing entry rather than
+                                * adding a new one. */
+                               if (ret == 0) {
+                                       i->len = fl->l_len
+                                               ? i->len + fl->l_len
+                                               : 0;
+                               }
+                               goto done;
+                       }
+               }
+               if (i) {
+                       /* Special case: upgrade of allrecord lock. */
+                       if (i->type == F_RDLCK && fl->l_type == F_WRLCK
+                           && i->off == FREELIST_TOP
+                           && fl->l_start == FREELIST_TOP
+                           && i->len == 0
+                           && fl->l_len == 0) {
+                               if (ret == 0)
+                                       i->type = F_WRLCK;
+                               goto done;
+                       }
+                       if (!suppress_lockcheck) {
+                               diag("%s testlock %u@%u overlaps %u@%u",
+                                    fl->l_type == F_WRLCK ? "write" : "read",
+                                    (int)fl->l_len, (int)fl->l_start,
+                                    i->len, (int)i->off);
+                               locking_errors++;
+                       }
+               }
+
+               /* Track the lock only if it was actually granted; an
+                * overlapping lock is still recorded after being reported.
+                * NOTE(review): the malloc() result is not checked. */
+               if (ret == 0) {
+                       new = malloc(sizeof *new);
+                       new->off = fl->l_start;
+                       new->len = fl->l_len;
+                       new->type = fl->l_type;
+                       new->next = testlocks;
+                       testlocks = new;
+               }
+       }
+done:
+       /* Give the test a hook right after a successful unlock. */
+       if (ret == 0 && fl->l_type == F_UNLCK && unlock_callback)
+               unlock_callback(fd);
+       return ret;
+}
+
+/* Free every tracked lock record and report how many there were. */
+unsigned int forget_locking(void)
+{
+       unsigned int dropped;
+
+       for (dropped = 0; testlocks != NULL; dropped++) {
+               struct testlock *head = testlocks;
+
+               testlocks = head->next;
+               free(head);
+       }
+       return dropped;
+}
diff --git a/ctdb/lib/tdb/test/lock-tracking.h b/ctdb/lib/tdb/test/lock-tracking.h
new file mode 100644 (file)
index 0000000..f2c9c44
--- /dev/null
@@ -0,0 +1,25 @@
+#ifndef LOCK_TRACKING_H
+#define LOCK_TRACKING_H
+#include <stdbool.h>
+
+/* Set this if you want a callback after fcntl unlock. */
+extern void (*unlock_callback)(int fd);
+
+/* Replacement fcntl. */
+int fcntl_with_lockcheck(int fd, int cmd, ... /* arg */ );
+
+/* Discard locking info: returns number of locks outstanding. */
+unsigned int forget_locking(void);
+
+/* Number of errors in locking. */
+extern int locking_errors;
+
+/* Suppress lock checking. */
+extern bool suppress_lockcheck;
+
+/* Make all locks non-blocking. */
+extern bool nonblocking_locks;
+
+/* Number of times we failed a lock because we made it non-blocking. */
+extern int locking_would_block;
+#endif /* LOCK_TRACKING_H */
diff --git a/ctdb/lib/tdb/test/logging.c b/ctdb/lib/tdb/test/logging.c
new file mode 100644 (file)
index 0000000..dfab486
--- /dev/null
@@ -0,0 +1,33 @@
+#include "logging.h"
+#include "tap-interface.h"
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+bool suppress_logging = false;
+const char *log_prefix = "";
+
+/* Turn log messages into tap diag messages.
+ *
+ * Formats the message into a fixed-size buffer (over-long messages are
+ * truncated), drops one trailing newline because diag() adds its own,
+ * and emits it prefixed with log_prefix.  Does nothing while
+ * suppress_logging is set.  The tdb and level arguments are unused. */
+static void taplog(struct tdb_context *tdb,
+                  enum tdb_debug_level level,
+                  const char *fmt, ...)
+{
+       va_list ap;
+       char line[200];
+       size_t len;
+
+       if (suppress_logging)
+               return;
+
+       va_start(ap, fmt);
+       /* Bounded formatting: never write past the end of line[]. */
+       if (vsnprintf(line, sizeof(line), fmt, ap) < 0)
+               line[0] = '\0';
+       va_end(ap);
+
+       /* Strip trailing \n: diag adds it. */
+       len = strlen(line);
+       if (len > 0 && line[len-1] == '\n')
+               len--;
+       diag("%s%.*s", log_prefix, (int)len, line);
+}
+
+struct tdb_logging_context taplogctx = { taplog, NULL };
diff --git a/ctdb/lib/tdb/test/logging.h b/ctdb/lib/tdb/test/logging.h
new file mode 100644 (file)
index 0000000..89e77b2
--- /dev/null
@@ -0,0 +1,11 @@
+#ifndef TDB_TEST_LOGGING_H
+#define TDB_TEST_LOGGING_H
+#include "replace.h"
+#include "../include/tdb.h"
+#include <stdbool.h>
+
+extern bool suppress_logging;
+extern const char *log_prefix;
+extern struct tdb_logging_context taplogctx;
+
+#endif /* TDB_TEST_LOGGING_H */
diff --git a/ctdb/lib/tdb/test/old-nohash-be.tdb b/ctdb/lib/tdb/test/old-nohash-be.tdb
new file mode 100644 (file)
index 0000000..1c49116
Binary files /dev/null and b/ctdb/lib/tdb/test/old-nohash-be.tdb differ
diff --git a/ctdb/lib/tdb/test/old-nohash-le.tdb b/ctdb/lib/tdb/test/old-nohash-le.tdb
new file mode 100644 (file)
index 0000000..0655072
Binary files /dev/null and b/ctdb/lib/tdb/test/old-nohash-le.tdb differ
diff --git a/ctdb/lib/tdb/test/run-3G-file.c b/ctdb/lib/tdb/test/run-3G-file.c
new file mode 100644 (file)
index 0000000..3ee9de1
--- /dev/null
@@ -0,0 +1,144 @@
+#include "../common/tdb_private.h"
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "tap-interface.h"
+#include <stdlib.h>
+#include "logging.h"
+
+/*
+ * File-expansion hook that grows the file to size+addition bytes without
+ * writing the intervening data: ftruncate() is tried first and, if that
+ * fails, a single zero byte is written at the new last offset.
+ *
+ * Returns 0 on success.  Returns -1 with tdb->ecode set to TDB_ERR_RDONLY
+ * if the tdb is read-only or in a read traverse, or -1 after logging if
+ * the file could not be extended.
+ */
+static int tdb_expand_file_sparse(struct tdb_context *tdb,
+                                 tdb_off_t size,
+                                 tdb_off_t addition)
+{
+       if (tdb->read_only || tdb->traverse_read) {
+               tdb->ecode = TDB_ERR_RDONLY;
+               return -1;
+       }
+
+       if (ftruncate(tdb->fd, size+addition) == -1) {
+               char b = 0;
+               ssize_t written = pwrite(tdb->fd,  &b, 1, (size+addition) - 1);
+               if (written == 0) {
+                       /* try once more, potentially revealing errno */
+                       written = pwrite(tdb->fd,  &b, 1, (size+addition) - 1);
+               }
+               if (written == 0) {
+                       /* again - give up, guessing errno */
+                       errno = ENOSPC;
+               }
+               if (written != 1) {
+                       /* Print the size as unsigned: this test grows the
+                        * file past 2G, which %d would show as negative. */
+                       TDB_LOG((tdb, TDB_DEBUG_FATAL, "expand_file to %llu failed (%s)\n",
+                                (unsigned long long)(size+addition),
+                                strerror(errno)));
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+/* I/O method table that main() installs on the test database.  It names
+ * tdb_read, tdb_write, tdb_next_hash_chain and tdb_oob, plus the sparse
+ * expansion routine defined above in the final slot (presumably the
+ * expand-file hook; confirm against struct tdb_methods). */
+static const struct tdb_methods large_io_methods = {
+       tdb_read,
+       tdb_write,
+       tdb_next_hash_chain,
+       tdb_oob,
+       tdb_expand_file_sparse
+};
+
+/* Traverse callback: checks that the visited record has key "hi" and
+ * data equal to the TDB_DATA that _data points at.  Runs four ok1()
+ * checks per record and returns 0.
+ * NOTE: _data is dereferenced unconditionally, so passing NULL is only
+ * safe when the traverse visits no records. */
+static int test_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data,
+                        void *_data)
+{
+       TDB_DATA *expect = _data;
+       ok1(key.dsize == strlen("hi"));
+       ok1(memcmp(key.dptr, "hi", strlen("hi")) == 0);
+       ok1(data.dsize == expect->dsize);
+       ok1(memcmp(data.dptr, expect->dptr, data.dsize) == 0);
+       return 0;
+}
+
+/* Test that store, fetch, traverse, delete and transactions still work
+ * on a database whose file has been expanded sparsely by 1500000000
+ * bytes, with the record landing above the 2G offset. */
+int main(int argc, char *argv[])
+{
+       struct tdb_context *tdb;
+       TDB_DATA key, orig_data, data;
+       uint32_t hash;
+       tdb_off_t rec_ptr;
+       struct tdb_record rec;
+       int ret;
+
+       plan_tests(24);
+       /* NOTE(review): the database is named "run-36-file.tdb" although
+        * this source file is run-3G-file.c - confirm that is intended. */
+       tdb = tdb_open_ex("run-36-file.tdb", 1024, TDB_CLEAR_IF_FIRST,
+                         O_CREAT|O_TRUNC|O_RDWR, 0600, &taplogctx, NULL);
+
+       ok1(tdb);
+       /* Swap in the method table whose expand hook creates a sparse file.
+        * NOTE(review): tdb is dereferenced here even if the open failed. */
+       tdb->methods = &large_io_methods;
+
+       key.dsize = strlen("hi");
+       key.dptr = (void *)"hi";
+       orig_data.dsize = strlen("world");
+       orig_data.dptr = (void *)"world";
+
+       /* Enlarge the file (internally multiplies by 2). */
+       ret = tdb_expand(tdb, 1500000000);
+#ifdef HAVE_INCOHERENT_MMAP
+       /* This can fail due to mmap failure on 32 bit systems. */
+       if (ret == -1) {
+               /* These should now fail. */
+               ok1(tdb_store(tdb, key, orig_data, TDB_INSERT) == -1);
+               data = tdb_fetch(tdb, key);
+               ok1(data.dptr == NULL);
+               ok1(tdb_traverse(tdb, test_traverse, &orig_data) == -1);
+               ok1(tdb_delete(tdb, key) == -1);
+               ok1(tdb_traverse(tdb, test_traverse, NULL) == -1);
+               /* Skip the rest... */
+               /* Six checks have run so far; pad with passing results up
+                * to the 24 announced by plan_tests(). */
+               for (ret = 0; ret < 24 - 6; ret++)
+                       ok1(1);
+               tdb_close(tdb);
+               return exit_status();
+       }
+#endif
+       ok1(ret == 0);
+
+       /* Put an entry in, and check it. */
+       ok1(tdb_store(tdb, key, orig_data, TDB_INSERT) == 0);
+
+       data = tdb_fetch(tdb, key);
+       ok1(data.dsize == strlen("world"));
+       ok1(memcmp(data.dptr, "world", strlen("world")) == 0);
+       free(data.dptr);
+
+       /* That currently fills at the end, make sure that's true. */
+       hash = tdb->hash_fn(&key);
+       rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec);
+       ok1(rec_ptr);
+       ok1(rec_ptr > 2U*1024*1024*1024);
+       /* Release the chain lock taken by tdb_find_lock_hash(). */
+       tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
+
+       /* Traverse must work. */
+       ok1(tdb_traverse(tdb, test_traverse, &orig_data) == 1);
+
+       /* Delete should work. */
+       ok1(tdb_delete(tdb, key) == 0);
+
+       /* The database is empty now, so the callback is expected not to
+        * run and the NULL private pointer is never dereferenced. */
+       ok1(tdb_traverse(tdb, test_traverse, NULL) == 0);
+
+       /* Transactions should work. */
+       ok1(tdb_transaction_start(tdb) == 0);
+       ok1(tdb_store(tdb, key, orig_data, TDB_INSERT) == 0);
+
+       data = tdb_fetch(tdb, key);
+       ok1(data.dsize == strlen("world"));
+       ok1(memcmp(data.dptr, "world", strlen("world")) == 0);
+       free(data.dptr);
+       ok1(tdb_transaction_commit(tdb) == 0);
+
+       ok1(tdb_traverse(tdb, test_traverse, &orig_data) == 1);
+       tdb_close(tdb);
+
+       return exit_status();
+}
diff --git a/ctdb/lib/tdb/test/run-bad-tdb-header.c b/ctdb/lib/tdb/test/run-bad-tdb-header.c
new file mode 100644 (file)
index 0000000..b00fb89
--- /dev/null
@@ -0,0 +1,58 @@
+#include "../common/tdb_private.h"
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "tap-interface.h"
+#include <stdlib.h>
+#include "logging.h"
+
+/* Test how tdb_open_ex() treats a file whose header is not a valid tdb
+ * header: garbage contents, then a valid header with a bumped version. */
+int main(int argc, char *argv[])
+{
+       struct tdb_context *tdb;
+       struct tdb_header hdr;
+       int fd;
+
+       plan_tests(11);
+       /* Can open fine if complete crap, as long as O_CREAT. */
+       fd = open("run-bad-tdb-header.tdb", O_RDWR|O_CREAT|O_TRUNC, 0600);
+       ok1(fd >= 0);
+       ok1(write(fd, "hello world", 11) == 11);
+       close(fd);
+       /* Without O_CREAT the garbage file must be rejected. */
+       tdb = tdb_open_ex("run-bad-tdb-header.tdb", 1024, 0, O_RDWR, 0,
+                         &taplogctx, NULL);
+       ok1(!tdb);
+       tdb = tdb_open_ex("run-bad-tdb-header.tdb", 1024, 0, O_CREAT|O_RDWR,
+                         0600, &taplogctx, NULL);
+       ok1(tdb);
+       tdb_close(tdb);
+
+       /* Now, with wrong version it should *not* overwrite. */
+       fd = open("run-bad-tdb-header.tdb", O_RDWR);
+       ok1(fd >= 0);
+       ok1(read(fd, &hdr, sizeof(hdr)) == sizeof(hdr));
+       ok1(hdr.version == TDB_VERSION);
+       /* Corrupt only the version field and write the header back. */
+       hdr.version++;
+       lseek(fd, 0, SEEK_SET);
+       ok1(write(fd, &hdr, sizeof(hdr)) == sizeof(hdr));
+       close(fd);
+
+       tdb = tdb_open_ex("run-bad-tdb-header.tdb", 1024, 0, O_RDWR|O_CREAT,
+                         0600, &taplogctx, NULL);
+       /* errno is checked first, before any later call can change it. */
+       ok1(errno == EIO);
+       ok1(!tdb);
+
+       /* With truncate, will be fine. */
+       tdb = tdb_open_ex("run-bad-tdb-header.tdb", 1024, 0,
+                         O_RDWR|O_CREAT|O_TRUNC, 0600, &taplogctx, NULL);
+       ok1(tdb);
+       tdb_close(tdb);
+
+       return exit_status();
+}
diff --git a/ctdb/lib/tdb/test/run-check.c b/ctdb/lib/tdb/test/run-check.c
new file mode 100644 (file)
index 0000000..05f7aec
--- /dev/null
@@ -0,0 +1,64 @@
+#include "../common/tdb_private.h"
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "tap-interface.h"
+#include <stdlib.h>
+#include "logging.h"
+
+/* Test tdb_check(): it must pass on a fresh database, after a store and
+ * after a reopen, fail with TDB_ERR_CORRUPT on test/tdb.corrupt, and pass
+ * on the two old-nohash sample files. */
+int main(int argc, char *argv[])
+{
+       struct tdb_context *tdb;
+       TDB_DATA key, data;
+
+       plan_tests(13);
+       tdb = tdb_open_ex("run-check.tdb", 1, TDB_CLEAR_IF_FIRST,
+                         O_CREAT|O_TRUNC|O_RDWR, 0600, &taplogctx, NULL);
+
+       ok1(tdb);
+       ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+       key.dsize = strlen("hi");
+       key.dptr = (void *)"hi";
+       data.dsize = strlen("world");
+       data.dptr = (void *)"world";
+
+       ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+       ok1(tdb_check(tdb, NULL, NULL) == 0);
+       tdb_close(tdb);
+
+       /* Reopen the file just written and check it again. */
+       tdb = tdb_open_ex("run-check.tdb", 1024, 0, O_RDWR, 0,
+                         &taplogctx, NULL);
+       ok1(tdb);
+       ok1(tdb_check(tdb, NULL, NULL) == 0);
+       tdb_close(tdb);
+
+       /* A corrupt file must open but fail the check.  The test/ paths
+        * are relative to the current working directory. */
+       tdb = tdb_open_ex("test/tdb.corrupt", 1024, 0, O_RDWR, 0,
+                         &taplogctx, NULL);
+       ok1(tdb);
+       ok1(tdb_check(tdb, NULL, NULL) == -1);
+       ok1(tdb_error(tdb) == TDB_ERR_CORRUPT);
+       tdb_close(tdb);
+
+       /* Big and little endian should work! */
+       tdb = tdb_open_ex("test/old-nohash-le.tdb", 1024, 0, O_RDWR, 0,
+                         &taplogctx, NULL);
+       ok1(tdb);
+       ok1(tdb_check(tdb, NULL, NULL) == 0);
+       tdb_close(tdb);
+
+       tdb = tdb_open_ex("test/old-nohash-be.tdb", 1024, 0, O_RDWR, 0,
+                         &taplogctx, NULL);
+       ok1(tdb);
+       ok1(tdb_check(tdb, NULL, NULL) == 0);
+       tdb_close(tdb);
+
+       return exit_status();
+}
diff --git a/ctdb/lib/tdb/test/run-corrupt.c b/ctdb/lib/tdb/test/run-corrupt.c
new file mode 100644 (file)
index 0000000..1a3c769
--- /dev/null
@@ -0,0 +1,131 @@
+#include "../common/tdb_private.h"
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "tap-interface.h"
+#include <stdlib.h>
+#include "logging.h"
+
+/* Per-record callback for tdb_check(): accepts a record only if its key
+ * is a leading part of "hello" and its data is exactly "world".  On
+ * acceptance the key and data sizes are added to the two counters that
+ * 'private' points at and 0 is returned; otherwise -1. */
+static int check(TDB_DATA key, TDB_DATA data, void *private)
+{
+       unsigned int *totals = private;
+       const size_t max_key = strlen("hello");
+       const size_t want_data = strlen("world");
+
+       if (key.dsize > max_key
+           || memcmp(key.dptr, "hello", key.dsize) != 0)
+               return -1;
+
+       if (data.dsize != want_data
+           || memcmp(data.dptr, "world", data.dsize) != 0)
+               return -1;
+
+       totals[0] += key.dsize;
+       totals[1] += data.dsize;
+       return 0;
+}
+
+/* Invert a single bit of the database, addressed by absolute bit number.
+ * Uses the memory map when there is one, otherwise reads, flips and
+ * rewrites the byte through the file descriptor; an I/O failure there
+ * ends the program with exit(1). */
+static void tdb_flip_bit(struct tdb_context *tdb, unsigned int bit)
+{
+       unsigned int byte_off = bit / CHAR_BIT;
+       unsigned char flip = (1 << (bit % CHAR_BIT));
+       unsigned char byte;
+
+       if (tdb->map_ptr) {
+               ((unsigned char *)tdb->map_ptr)[byte_off] ^= flip;
+               return;
+       }
+
+       if (pread(tdb->fd, &byte, 1, byte_off) != 1) {
+               fprintf(stderr, "pread: %s\n", strerror(errno));
+               exit(1);
+       }
+       byte ^= flip;
+       if (pwrite(tdb->fd, &byte, 1, byte_off) != 1) {
+               fprintf(stderr, "pwrite: %s\n", strerror(errno));
+               exit(1);
+       }
+}
+
+/* Fill the database with five records, then flip every bit of the file
+ * in turn and count how many flips tdb_check() (together with the check()
+ * callback and its size totals) notices.  The count must equal the number
+ * of bits in the bytes computed below as verifiable.  Runs two tests. */
+static void check_test(struct tdb_context *tdb)
+{
+       TDB_DATA key, data;
+       unsigned int i, verifiable, corrupt, sizes[2], dsize, ksize;
+
+       ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+       key.dptr = (void *)"hello";
+       data.dsize = strlen("world");
+       data.dptr = (void *)"world";
+
+       /* Key and data size respectively. */
+       dsize = ksize = 0;
+
+       /* 5 keys in hash size 2 means we'll have multichains. */
+       /* The keys are the first 1..5 bytes of "hello"; the data is
+        * always "world". */
+       for (key.dsize = 1; key.dsize <= 5; key.dsize++) {
+               ksize += key.dsize;
+               dsize += data.dsize;
+               if (tdb_store(tdb, key, data, TDB_INSERT) != 0)
+                       abort();
+       }
+
+       /* This is how many bytes we expect to be verifiable. */
+       /* From the file header. */
+       verifiable = strlen(TDB_MAGIC_FOOD) + 1
+               + 2 * sizeof(uint32_t) + 2 * sizeof(tdb_off_t)
+               + 2 * sizeof(uint32_t);
+       /* From the free list chain and hash chains. */
+       verifiable += 3 * sizeof(tdb_off_t);
+       /* From the record headers & tailer */
+       verifiable += 5 * (sizeof(struct tdb_record) + sizeof(uint32_t));
+       /* The free block: we ignore datalen, keylen, full_hash. */
+       verifiable += sizeof(struct tdb_record) - 3*sizeof(uint32_t) +
+               sizeof(uint32_t);
+       /* Our check function verifies the key and data. */
+       verifiable += ksize + dsize;
+
+       /* Flip one bit at a time, make sure it detects verifiable bytes. */
+       for (i = 0, corrupt = 0; i < tdb->map_size * CHAR_BIT; i++) {
+               tdb_flip_bit(tdb, i);
+               memset(sizes, 0, sizeof(sizes));
+               /* A flip counts as detected if the check fails outright
+                * or the callback saw different totals than were stored. */
+               if (tdb_check(tdb, check, sizes) != 0)
+                       corrupt++;
+               else if (sizes[0] != ksize || sizes[1] != dsize)
+                       corrupt++;
+               /* Flip the bit back so each iteration tests one flip only. */
+               tdb_flip_bit(tdb, i);
+       }
+       ok(corrupt == verifiable * CHAR_BIT, "corrupt %u should be %u",
+          corrupt, verifiable * CHAR_BIT);
+}
+
+/* Run check_test() twice on a freshly created database: once opened
+ * normally and once with TDB_NOMMAP.  Aborts if either open fails. */
+int main(int argc, char *argv[])
+{
+       /* The first pass should use mmap; the second should not. */
+       static const int open_flags[] = {
+               TDB_CLEAR_IF_FIRST,
+               TDB_CLEAR_IF_FIRST|TDB_NOMMAP,
+       };
+       unsigned int pass;
+
+       plan_tests(4);
+       for (pass = 0; pass < sizeof(open_flags)/sizeof(open_flags[0]);
+            pass++) {
+               struct tdb_context *tdb;
+
+               tdb = tdb_open_ex("run-corrupt.tdb", 2, open_flags[pass],
+                                 O_CREAT|O_TRUNC|O_RDWR, 0600, &taplogctx,
+                                 NULL);
+               if (!tdb)
+                       abort();
+               check_test(tdb);
+               tdb_close(tdb);
+       }
+
+       return exit_status();
+}
diff --git a/ctdb/lib/tdb/test/run-die-during-transaction.c b/ctdb/lib/tdb/test/run-die-during-transaction.c
new file mode 100644 (file)
index 0000000..6e3a70d
--- /dev/null
@@ -0,0 +1,231 @@
+#include "../common/tdb_private.h"
+#include "lock-tracking.h"
+static ssize_t pwrite_check(int fd, const void *buf, size_t count, off_t offset);
+static ssize_t write_check(int fd, const void *buf, size_t count);
+static int ftruncate_check(int fd, off_t length);
+
+#define pwrite pwrite_check
+#define write write_check
+#define fcntl fcntl_with_lockcheck
+#define ftruncate ftruncate_check
+
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "tap-interface.h"
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdarg.h>
+#include <setjmp.h>
+#include "external-agent.h"
+#include "logging.h"
+
+#undef write
+#undef pwrite
+#undef fcntl
+#undef ftruncate
+
+static bool in_transaction;
+static int target, current;
+static jmp_buf jmpbuf;
+#define TEST_DBNAME "run-die-during-transaction.tdb"
+#define KEY_STRING "helloworld"
+
+/*
+ * Death-point hook: called around every intercepted write/truncate and
+ * registered as the unlock callback.  fd is unused.
+ */
+static void maybe_die(int fd)
+{
+       /* Hooks that fire outside the transaction are not counted. */
+       if (!in_transaction)
+               return;
+
+       /* Count this hook; on reaching the target, jump back to setjmp(). */
+       if (current++ == target)
+               longjmp(jmpbuf, 1);
+}
+
+/*
+ * Stand-in for pwrite() inside the tdb sources: offers a death point
+ * before the write and, if the write was complete, another one after it.
+ */
+static ssize_t pwrite_check(int fd,
+                           const void *buf, size_t count, off_t offset)
+{
+       ssize_t written;
+
+       maybe_die(fd);
+
+       written = pwrite(fd, buf, count, offset);
+
+       /* Short or failed writes are handed straight back to the caller. */
+       if (written == count)
+               maybe_die(fd);
+
+       return written;
+}
+
+/*
+ * Stand-in for write() inside the tdb sources: offers a death point
+ * before the write and, if the write was complete, another one after it.
+ */
+static ssize_t write_check(int fd, const void *buf, size_t count)
+{
+       ssize_t written;
+
+       maybe_die(fd);
+
+       written = write(fd, buf, count);
+
+       /* Short or failed writes are handed straight back to the caller. */
+       if (written == count)
+               maybe_die(fd);
+
+       return written;
+}
+
+/*
+ * Stand-in for ftruncate() inside the tdb sources: offers a death point
+ * on each side of the real call, whatever its result.
+ */
+static int ftruncate_check(int fd, off_t length)
+{
+       int rc;
+
+       maybe_die(fd);
+       rc = ftruncate(fd, length);
+       maybe_die(fd);
+
+       return rc;
+}
+
+/*
+ * Simulate dying at every hook point of a transaction and check that an
+ * external agent can still perform `op` on the database afterwards.
+ *
+ * Each round recreates the database, stores KEY_STRING, lets the agent open
+ * and fetch it, then runs a transaction.  While in_transaction is set, every
+ * call to maybe_die() bumps `current`; when it reaches `target` we longjmp()
+ * back to the setjmp() below.  The recovery branch then asks the agent
+ * whether recovery is needed, runs `op`, verifies recovery is no longer
+ * needed, runs CHECK and CLOSE, and retries with target + 1.  The loop ends
+ * when the transaction commits without reaching the target.
+ *
+ * Returns true once the transaction completes and the agent closes cleanly,
+ * false on any unexpected result.  Emits three ok1() results on success.
+ */
+static bool test_death(enum operation op, struct agent *agent)
+{
+       struct tdb_context *tdb = NULL;
+       TDB_DATA key;
+       enum agent_return ret;
+       int needed_recovery = 0;
+
+       current = target = 0;
+reset:
+       unlink(TEST_DBNAME);
+       tdb = tdb_open_ex(TEST_DBNAME, 1024, TDB_NOMMAP,
+                         O_CREAT|O_TRUNC|O_RDWR, 0600, &taplogctx, NULL);
+       /* Everything below dereferences tdb; fail the test instead of crashing. */
+       if (!tdb) {
+               diag("Failed to open %s", TEST_DBNAME);
+               return false;
+       }
+
+       if (setjmp(jmpbuf) != 0) {
+               /* We're partway through.  Simulate our death. */
+               close(tdb->fd);
+               forget_locking();
+               in_transaction = false;
+
+               /* SUCCESS here means the agent saw that recovery was needed;
+                * FAILED is also acceptable at this point. */
+               ret = external_agent_operation(agent, NEEDS_RECOVERY, "");
+               if (ret == SUCCESS)
+                       needed_recovery++;
+               else if (ret != FAILED) {
+                       diag("Step %u agent NEEDS_RECOVERY = %s", current,
+                            agent_return_name(ret));
+                       return false;
+               }
+
+               ret = external_agent_operation(agent, op, KEY_STRING);
+               if (ret != SUCCESS) {
+                       diag("Step %u op %s failed = %s", current,
+                            operation_name(op),
+                            agent_return_name(ret));
+                       return false;
+               }
+
+               /* After the agent's operation, recovery must no longer be
+                * pending. */
+               ret = external_agent_operation(agent, NEEDS_RECOVERY, "");
+               if (ret != FAILED) {
+                       diag("Still needs recovery after step %u = %s",
+                            current, agent_return_name(ret));
+                       return false;
+               }
+
+               ret = external_agent_operation(agent, CHECK, "");
+               if (ret != SUCCESS) {
+                       diag("Step %u check failed = %s", current,
+                            agent_return_name(ret));
+                       return false;
+               }
+
+               ret = external_agent_operation(agent, CLOSE, "");
+               if (ret != SUCCESS) {
+                       diag("Step %u close failed = %s", current,
+                            agent_return_name(ret));
+                       return false;
+               }
+
+               /* Suppress logging as this tries to use closed fd. */
+               suppress_logging = true;
+               suppress_lockcheck = true;
+               tdb_close(tdb);
+               suppress_logging = false;
+               suppress_lockcheck = false;
+               /* Next round dies one hook later. */
+               target++;
+               current = 0;
+               goto reset;
+       }
+
+       /* Put key for agent to fetch. */
+       key.dsize = strlen(KEY_STRING);
+       key.dptr = (void *)KEY_STRING;
+       if (tdb_store(tdb, key, key, TDB_INSERT) != 0)
+               return false;
+
+       /* This is the key we insert in transaction. */
+       key.dsize--;
+
+       ret = external_agent_operation(agent, OPEN, TEST_DBNAME);
+       if (ret != SUCCESS) {
+               fprintf(stderr, "Agent failed to open: %s\n",
+                       agent_return_name(ret));
+               exit(1);
+       }
+
+       ret = external_agent_operation(agent, FETCH, KEY_STRING);
+       if (ret != SUCCESS) {
+               fprintf(stderr, "Agent failed find key: %s\n",
+                       agent_return_name(ret));
+               exit(1);
+       }
+
+       /* From here on every maybe_die() call is a potential death point. */
+       in_transaction = true;
+       if (tdb_transaction_start(tdb) != 0)
+               return false;
+
+       if (tdb_store(tdb, key, key, TDB_INSERT) != 0)
+               return false;
+
+       if (tdb_transaction_commit(tdb) != 0)
+               return false;
+
+       in_transaction = false;
+
+       /* We made it! */
+       diag("Completed %u runs", current);
+       tdb_close(tdb);
+       ret = external_agent_operation(agent, CLOSE, "");
+       if (ret != SUCCESS) {
+               diag("Step %u close failed = %s", current,
+                    agent_return_name(ret));
+               return false;
+       }
+
+#ifdef HAVE_INCOHERENT_MMAP
+       /* This means we always mmap, which makes this test a noop. */
+       ok1(1);
+#else
+       /* At least one simulated death must have left recovery pending. */
+       ok1(needed_recovery);
+#endif
+       ok1(locking_errors == 0);
+       ok1(forget_locking() == 0);
+       locking_errors = 0;
+       return true;
+}
+
+int main(int argc, char *argv[])
+{
+       /* Operations the agent attempts after each simulated death. */
+       enum operation ops[] = { FETCH, STORE, TRANSACTION_START };
+       const size_t num_ops = sizeof(ops)/sizeof(ops[0]);
+       struct agent *agent;
+       int i = 0;
+
+       plan_tests(12);
+
+       /* Unlocking is treated as one more place where we may die. */
+       unlock_callback = maybe_die;
+
+       agent = prepare_external_agent();
+
+       while (i < num_ops) {
+               diag("Testing %s after death", operation_name(ops[i]));
+               ok1(test_death(ops[i], agent));
+               i++;
+       }
+
+       return exit_status();
+}
diff --git a/ctdb/lib/tdb/test/run-endian.c b/ctdb/lib/tdb/test/run-endian.c
new file mode 100644 (file)
index 0000000..b19ffd3
--- /dev/null
@@ -0,0 +1,63 @@
+#include "../common/tdb_private.h"
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "tap-interface.h"
+#include <stdlib.h>
+#include "logging.h"
+
+/*
+ * Store/fetch round trip on a database created with TDB_CONVERT, followed
+ * by a reopen without that flag to confirm the record is still readable.
+ * Emits 13 test results.
+ */
+int main(int argc, char *argv[])
+{
+       struct tdb_context *tdb;
+       TDB_DATA key, data;
+
+       plan_tests(13);
+       tdb = tdb_open_ex("run-endian.tdb", 1024,
+                         TDB_CLEAR_IF_FIRST|TDB_CONVERT,
+                         O_CREAT|O_TRUNC|O_RDWR, 0600, &taplogctx, NULL);
+
+       ok1(tdb);
+       key.dsize = strlen("hi");
+       key.dptr = (void *)"hi";
+       data.dsize = strlen("world");
+       data.dptr = (void *)"world";
+
+       /* MODIFY must fail while the key is absent, INSERT must fail once
+        * it is present. */
+       ok1(tdb_store(tdb, key, data, TDB_MODIFY) < 0);
+       ok1(tdb_error(tdb) == TDB_ERR_NOEXIST);
+       ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+       ok1(tdb_store(tdb, key, data, TDB_INSERT) < 0);
+       ok1(tdb_error(tdb) == TDB_ERR_EXISTS);
+       ok1(tdb_store(tdb, key, data, TDB_MODIFY) == 0);
+
+       data = tdb_fetch(tdb, key);
+       ok1(data.dsize == strlen("world"));
+       ok1(memcmp(data.dptr, "world", strlen("world")) == 0);
+       free(data.dptr);
+
+       /* One byte longer (takes in the literal's NUL): a different key
+        * that was never stored. */
+       key.dsize++;
+       data = tdb_fetch(tdb, key);
+       ok1(data.dptr == NULL);
+       tdb_close(tdb);
+
+       /* Reopen: should read it */
+       tdb = tdb_open_ex("run-endian.tdb", 1024, 0, O_RDWR, 0,
+                         &taplogctx, NULL);
+       ok1(tdb);
+
+       key.dsize = strlen("hi");
+       key.dptr = (void *)"hi";
+       data = tdb_fetch(tdb, key);
+       ok1(data.dsize == strlen("world"));
+       ok1(memcmp(data.dptr, "world", strlen("world")) == 0);
+       free(data.dptr);
+       tdb_close(tdb);
+
+       return exit_status();
+}
diff --git a/ctdb/lib/tdb/test/run-incompatible.c b/ctdb/lib/tdb/test/run-incompatible.c
new file mode 100644 (file)
index 0000000..628927c
--- /dev/null
@@ -0,0 +1,185 @@
+#include "../common/tdb_private.h"
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "tap-interface.h"
+#include <stdlib.h>
+
+/* Deliberately trivial hash: the key's length is its hash value. */
+static unsigned int tdb_dumb_hash(TDB_DATA *key)
+{
+       unsigned int hash = key->dsize;
+
+       return hash;
+}
+
+/*
+ * Logging callback that prints nothing: it only counts the messages whose
+ * format string mentions "hash", using the counter registered as the
+ * logging private pointer.
+ */
+static void log_fn(struct tdb_context *tdb, enum tdb_debug_level level, const char *fmt, ...)
+{
+       unsigned int *hash_msgs = tdb_get_logging_private(tdb);
+
+       if (strstr(fmt, "hash") != NULL) {
+               *hash_msgs += 1;
+       }
+}
+
+/*
+ * Read the raw on-disk header of `fname` and return its rwlocks field
+ * exactly as stored (no byte-order conversion is applied here).
+ *
+ * Returns (unsigned int)-1 if the file cannot be opened or is shorter
+ * than a full header.
+ */
+static unsigned int hdr_rwlocks(const char *fname)
+{
+       struct tdb_header hdr;
+       ssize_t len;
+       int fd;
+
+       fd = open(fname, O_RDONLY);
+       if (fd == -1)
+               return -1;
+
+       len = read(fd, &hdr, sizeof(hdr));
+       /* Close before judging the read so the error path cannot leak fd. */
+       close(fd);
+       if (len != sizeof(hdr))
+               return -1;
+
+       return hdr.rwlocks;
+}
+
+/*
+ * Exercise TDB_INCOMPATIBLE_HASH: which hash/flag combinations can create,
+ * open and reject a database, and whether the header's rwlocks field gets
+ * marked.  log_count is incremented by log_fn() for every log message
+ * whose format mentions "hash", so "log_count == 1" asserts that exactly
+ * one such message was logged.
+ *
+ * The whole sequence (38 results) runs twice: once with flags == 0 and
+ * once with flags == TDB_CONVERT.
+ */
+int main(int argc, char *argv[])
+{
+       struct tdb_context *tdb;
+       unsigned int log_count, flags;
+       TDB_DATA d, r;
+       struct tdb_logging_context log_ctx = { log_fn, &log_count };
+
+       plan_tests(38 * 2);
+
+       for (flags = 0; flags <= TDB_CONVERT; flags += TDB_CONVERT) {
+               unsigned int rwmagic = TDB_HASH_RWLOCK_MAGIC;
+
+               /* hdr_rwlocks() returns the raw on-disk value, so convert
+                * the expected magic the same way the file was written. */
+               if (flags & TDB_CONVERT)
+                       tdb_convert(&rwmagic, sizeof(rwmagic));
+
+               /* Create an old-style hash. */
+               log_count = 0;
+               tdb = tdb_open_ex("run-incompatible.tdb", 0, flags,
+                                 O_CREAT|O_RDWR|O_TRUNC, 0600, &log_ctx,
+                                 NULL);
+               ok1(tdb);
+               ok1(log_count == 0);
+               d.dptr = (void *)"Hello";
+               d.dsize = 5;
+               ok1(tdb_store(tdb, d, d, TDB_INSERT) == 0);
+               tdb_close(tdb);
+
+               /* Should not have marked rwlocks field. */
+               ok1(hdr_rwlocks("run-incompatible.tdb") == 0);
+
+               /* We can still open any old-style with incompat flag. */
+               log_count = 0;
+               tdb = tdb_open_ex("run-incompatible.tdb", 0,
+                                 TDB_INCOMPATIBLE_HASH,
+                                 O_RDWR, 0600, &log_ctx, NULL);
+               ok1(tdb);
+               ok1(log_count == 0);
+               r = tdb_fetch(tdb, d);
+               ok1(r.dsize == 5);
+               free(r.dptr);
+               ok1(tdb_check(tdb, NULL, NULL) == 0);
+               tdb_close(tdb);
+
+               /* Pre-built fixture files, opened read-only with the jenkins
+                * hash; these two opens do not depend on `flags`. */
+               log_count = 0;
+               tdb = tdb_open_ex("test/jenkins-le-hash.tdb", 0, 0, O_RDONLY,
+                                 0, &log_ctx, tdb_jenkins_hash);
+               ok1(tdb);
+               ok1(log_count == 0);
+               ok1(tdb_check(tdb, NULL, NULL) == 0);
+               tdb_close(tdb);
+
+               log_count = 0;
+               tdb = tdb_open_ex("test/jenkins-be-hash.tdb", 0, 0, O_RDONLY,
+                                 0, &log_ctx, tdb_jenkins_hash);
+               ok1(tdb);
+               ok1(log_count == 0);
+               ok1(tdb_check(tdb, NULL, NULL) == 0);
+               tdb_close(tdb);
+
+               /* OK, now create with incompatible flag, default hash. */
+               log_count = 0;
+               tdb = tdb_open_ex("run-incompatible.tdb", 0,
+                                 flags|TDB_INCOMPATIBLE_HASH,
+                                 O_CREAT|O_RDWR|O_TRUNC, 0600, &log_ctx,
+                                 NULL);
+               ok1(tdb);
+               ok1(log_count == 0);
+               d.dptr = (void *)"Hello";
+               d.dsize = 5;
+               ok1(tdb_store(tdb, d, d, TDB_INSERT) == 0);
+               tdb_close(tdb);
+
+               /* Should have marked rwlocks field. */
+               ok1(hdr_rwlocks("run-incompatible.tdb") == rwmagic);
+
+               /* Cannot open with old hash. */
+               log_count = 0;
+               tdb = tdb_open_ex("run-incompatible.tdb", 0, 0,
+                                 O_RDWR, 0600, &log_ctx, tdb_old_hash);
+               ok1(!tdb);
+               ok1(log_count == 1);
+
+               /* Can open with jenkins hash. */
+               log_count = 0;
+               tdb = tdb_open_ex("run-incompatible.tdb", 0, 0,
+                                 O_RDWR, 0600, &log_ctx, tdb_jenkins_hash);
+               ok1(tdb);
+               ok1(log_count == 0);
+               r = tdb_fetch(tdb, d);
+               ok1(r.dsize == 5);
+               free(r.dptr);
+               ok1(tdb_check(tdb, NULL, NULL) == 0);
+               tdb_close(tdb);
+
+               /* Can open by letting it figure it out itself. */
+               log_count = 0;
+               tdb = tdb_open_ex("run-incompatible.tdb", 0, 0,
+                                 O_RDWR, 0600, &log_ctx, NULL);
+               ok1(tdb);
+               ok1(log_count == 0);
+               r = tdb_fetch(tdb, d);
+               ok1(r.dsize == 5);
+               free(r.dptr);
+               ok1(tdb_check(tdb, NULL, NULL) == 0);
+               tdb_close(tdb);
+
+               /* We can also use incompatible hash with other hashes. */
+               log_count = 0;
+               tdb = tdb_open_ex("run-incompatible.tdb", 0,
+                                 flags|TDB_INCOMPATIBLE_HASH,
+                                 O_CREAT|O_RDWR|O_TRUNC, 0600, &log_ctx,
+                                 tdb_dumb_hash);
+               ok1(tdb);
+               ok1(log_count == 0);
+               d.dptr = (void *)"Hello";
+               d.dsize = 5;
+               ok1(tdb_store(tdb, d, d, TDB_INSERT) == 0);
+               tdb_close(tdb);
+
+               /* Should have marked rwlocks field. */
+               ok1(hdr_rwlocks("run-incompatible.tdb") == rwmagic);
+
+               /* It should not open if we don't specify. */
+               log_count = 0;
+               tdb = tdb_open_ex("run-incompatible.tdb", 0, 0, O_RDWR, 0,
+                                 &log_ctx, NULL);
+               ok1(!tdb);
+               ok1(log_count == 1);
+
+               /* Should reopen with correct hash. */
+               log_count = 0;
+               tdb = tdb_open_ex("run-incompatible.tdb", 0, 0, O_RDWR, 0,
+                                 &log_ctx, tdb_dumb_hash);
+               ok1(tdb);
+               ok1(log_count == 0);
+               r = tdb_fetch(tdb, d);
+               ok1(r.dsize == 5);
+               free(r.dptr);
+               ok1(tdb_check(tdb, NULL, NULL) == 0);
+               tdb_close(tdb);
+       }
+
+       return exit_status();
+}
diff --git a/ctdb/lib/tdb/test/run-nested-transactions.c b/ctdb/lib/tdb/test/run-nested-transactions.c
new file mode 100644 (file)
index 0000000..8c84bca
--- /dev/null
@@ -0,0 +1,78 @@
+#include "../common/tdb_private.h"
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "tap-interface.h"
+#include <stdlib.h>
+#include <stdbool.h>
+#include "logging.h"
+
+/*
+ * Check nested-transaction behaviour in two configurations: with
+ * TDB_DISALLOW_NESTING a second tdb_transaction_start() must fail with
+ * TDB_ERR_NESTING; with TDB_DEFAULT nesting is accepted and the outer
+ * transaction decides the final outcome.  Emits 27 test results.
+ */
+int main(int argc, char *argv[])
+{
+       struct tdb_context *tdb;
+       TDB_DATA key, data;
+
+       plan_tests(27);
+       key.dsize = strlen("hi");
+       key.dptr = (void *)"hi";
+
+       tdb = tdb_open_ex("run-nested-transactions.tdb",
+                         1024, TDB_CLEAR_IF_FIRST|TDB_DISALLOW_NESTING,
+                         O_CREAT|O_TRUNC|O_RDWR, 0600, &taplogctx, NULL);
+       ok1(tdb);
+
+       /* Nesting disallowed. */
+       ok1(tdb_transaction_start(tdb) == 0);
+       data.dptr = (void *)"world";
+       data.dsize = strlen("world");
+       ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+       data = tdb_fetch(tdb, key);
+       ok1(data.dsize == strlen("world"));
+       ok1(memcmp(data.dptr, "world", strlen("world")) == 0);
+       free(data.dptr);
+       ok1(tdb_transaction_start(tdb) != 0);
+       ok1(tdb_error(tdb) == TDB_ERR_NESTING);
+
+       /* The rejected nested start must not have disturbed the outer
+        * transaction: the record is still visible, before and after commit. */
+       data = tdb_fetch(tdb, key);
+       ok1(data.dsize == strlen("world"));
+       ok1(memcmp(data.dptr, "world", strlen("world")) == 0);
+       free(data.dptr);
+       ok1(tdb_transaction_commit(tdb) == 0);
+       data = tdb_fetch(tdb, key);
+       ok1(data.dsize == strlen("world"));
+       ok1(memcmp(data.dptr, "world", strlen("world")) == 0);
+       free(data.dptr);
+       tdb_close(tdb);
+
+       /* Nesting allowed by default */
+       tdb = tdb_open_ex("run-nested-transactions.tdb",
+                         1024, TDB_DEFAULT, O_RDWR, 0, &taplogctx, NULL);
+       ok1(tdb);
+
+       /* Inner commit followed by outer cancel: the delete is undone. */
+       ok1(tdb_transaction_start(tdb) == 0);
+       ok1(tdb_transaction_start(tdb) == 0);
+       ok1(tdb_delete(tdb, key) == 0);
+       ok1(tdb_transaction_commit(tdb) == 0);
+       ok1(!tdb_exists(tdb, key));
+       ok1(tdb_transaction_cancel(tdb) == 0);
+       /* Surprise! Kills inner "committed" transaction. */
+       ok1(tdb_exists(tdb, key));
+
+       /* Inner commit followed by outer commit: the delete sticks. */
+       ok1(tdb_transaction_start(tdb) == 0);
+       ok1(tdb_transaction_start(tdb) == 0);
+       ok1(tdb_delete(tdb, key) == 0);
+       ok1(tdb_transaction_commit(tdb) == 0);
+       ok1(!tdb_exists(tdb, key));
+       ok1(tdb_transaction_commit(tdb) == 0);
+       ok1(!tdb_exists(tdb, key));
+       tdb_close(tdb);
+
+       return exit_status();
+}
diff --git a/ctdb/lib/tdb/test/run-nested-traverse.c b/ctdb/lib/tdb/test/run-nested-traverse.c
new file mode 100644 (file)
index 0000000..37d57c0
--- /dev/null
@@ -0,0 +1,87 @@
+#include "../common/tdb_private.h"
+#include "lock-tracking.h"
+#define fcntl fcntl_with_lockcheck
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "tap-interface.h"
+#undef fcntl
+#include <stdlib.h>
+#include <stdbool.h>
+#include "external-agent.h"
+#include "logging.h"
+
+static struct agent *agent;
+
+/* True when `key` is exactly the two bytes "hi" (no terminator). */
+static bool correct_key(TDB_DATA key)
+{
+       /* Check the length first so memcmp() only sees a matching size. */
+       if (key.dsize != strlen("hi"))
+               return false;
+
+       return memcmp(key.dptr, "hi", key.dsize) == 0;
+}
+
+/* True when `data` is exactly the five bytes "world" (no terminator). */
+static bool correct_data(TDB_DATA data)
+{
+       /* Check the length first so memcmp() only sees a matching size. */
+       if (data.dsize != strlen("world"))
+               return false;
+
+       return memcmp(data.dptr, "world", data.dsize) == 0;
+}
+
+/*
+ * Inner traverse callback, started from traverse1(): verifies that the
+ * record handed to it is the expected "hi" -> "world" pair.  Always returns
+ * 0; `p` is unused.
+ */
+static int traverse2(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data,
+                    void *p)
+{
+       ok1(correct_key(key));
+       ok1(correct_data(data));
+       return 0;
+}
+
+/*
+ * Outer traverse callback: verifies the record, then checks that the
+ * external agent cannot start a transaction while this traverse is running,
+ * both before and after a nested tdb_traverse() on the same database.
+ * Always returns 0; `p` is unused.
+ */
+static int traverse1(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data,
+                    void *p)
+{
+       ok1(correct_key(key));
+       ok1(correct_data(data));
+       ok1(external_agent_operation(agent, TRANSACTION_START, tdb_name(tdb))
+           == WOULD_HAVE_BLOCKED);
+       /* Nested traverse; its return value is deliberately not checked. */
+       tdb_traverse(tdb, traverse2, NULL);
+
+       /* That should *not* release the transaction lock! */
+       ok1(external_agent_operation(agent, TRANSACTION_START, tdb_name(tdb))
+           == WOULD_HAVE_BLOCKED);
+       return 0;
+}
+
+/*
+ * Verify that a traverse nested inside another traverse does not let an
+ * external agent start a transaction.  Emits 17 results: 5 here plus 6 per
+ * traverse1() invocation (4 of its own, 2 from traverse2()), once for
+ * tdb_traverse() and once for tdb_traverse_read().
+ */
+int main(int argc, char *argv[])
+{
+       struct tdb_context *tdb;
+       TDB_DATA key, data;
+
+       plan_tests(17);
+       agent = prepare_external_agent();
+
+       tdb = tdb_open_ex("run-nested-traverse.tdb", 1024, TDB_CLEAR_IF_FIRST,
+                         O_CREAT|O_TRUNC|O_RDWR, 0600, &taplogctx, NULL);
+       ok1(tdb);
+
+       /* Baseline: with no traverse running, the agent can start and commit
+        * a transaction. */
+       ok1(external_agent_operation(agent, OPEN, tdb_name(tdb)) == SUCCESS);
+       ok1(external_agent_operation(agent, TRANSACTION_START, tdb_name(tdb))
+           == SUCCESS);
+       ok1(external_agent_operation(agent, TRANSACTION_COMMIT, tdb_name(tdb))
+           == SUCCESS);
+
+       key.dsize = strlen("hi");
+       key.dptr = (void *)"hi";
+       data.dptr = (void *)"world";
+       data.dsize = strlen("world");
+
+       /* A single record, so each traverse calls traverse1() exactly once. */
+       ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+       tdb_traverse(tdb, traverse1, NULL);
+       tdb_traverse_read(tdb, traverse1, NULL);
+       tdb_close(tdb);
+
+       return exit_status();
+}
diff --git a/ctdb/lib/tdb/test/run-no-lock-during-traverse.c b/ctdb/lib/tdb/test/run-no-lock-during-traverse.c
new file mode 100644 (file)
index 0000000..0a72282
--- /dev/null
@@ -0,0 +1,113 @@
+#include "../common/tdb_private.h"
+#include "lock-tracking.h"
+
+#define fcntl fcntl_with_lockcheck
+
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "tap-interface.h"
+#include <stdlib.h>
+#include "logging.h"
+
+#undef fcntl
+
+#define NUM_ENTRIES 10
+
+/*
+ * Store NUM_ENTRIES records whose keys are the raw bytes of the unsigned
+ * ints 0..NUM_ENTRIES-1, each with the value "world".  Returns false as
+ * soon as one store fails, true otherwise.
+ */
+static bool prepare_entries(struct tdb_context *tdb)
+{
+       unsigned int idx = 0;
+       TDB_DATA k, v;
+
+       /* The key aliases idx, so advancing idx changes the key bytes. */
+       k.dptr = (void *)&idx;
+       k.dsize = sizeof(idx);
+       v.dptr = (void *)"world";
+       v.dsize = strlen("world");
+
+       while (idx < NUM_ENTRIES) {
+               if (tdb_store(tdb, k, v, 0) != 0)
+                       return false;
+               idx++;
+       }
+       return true;
+}
+
+/*
+ * Delete the NUM_ENTRIES records written by prepare_entries(), emitting
+ * one test result per deletion.
+ */
+static void delete_entries(struct tdb_context *tdb)
+{
+       unsigned int idx = 0;
+       TDB_DATA key;
+
+       /* The key aliases idx, so advancing idx changes the key bytes. */
+       key.dptr = (void *)&idx;
+       key.dsize = sizeof(idx);
+
+       while (idx < NUM_ENTRIES) {
+               ok1(tdb_delete(tdb, key) == 0);
+               idx++;
+       }
+}
+
+/*
+ * Traverse callback that deletes the record *after* the one being visited
+ * (index + 1, wrapping at NUM_ENTRIES).  Failed deletions are counted in
+ * the int that private_data points to; the callback itself always
+ * returns 0.
+ *
+ * We don't know how many times this will run.
+ */
+static int delete_other(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data,
+                       void *private_data)
+{
+       unsigned int i;
+
+       /* Keys are built from an unsigned int in prepare_entries(); copy
+        * sizeof(i) bytes rather than a hard-coded 4. */
+       memcpy(&i, key.dptr, sizeof(i));
+       i = (i + 1) % NUM_ENTRIES;
+       /* Repoint the by-value key at the neighbour's index; key.dsize is
+        * reused unchanged. */
+       key.dptr = (void *)&i;
+       if (tdb_delete(tdb, key) != 0)
+               (*(int *)private_data)++;
+       return 0;
+}
+
+/*
+ * Traverse callback that deletes the record currently being visited and
+ * emits one test result for it.  Always returns 0; `data` and
+ * `private_data` are unused.
+ */
+static int delete_self(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data,
+                       void *private_data)
+{
+       ok1(tdb_delete(tdb, key) == 0);
+       return 0;
+}
+
+/*
+ * While holding tdb_lockall(), delete records in three ways (neighbour
+ * deletion from a traverse, self deletion from a traverse, plain loop) and
+ * check after every step that the lock tracker recorded no errors.
+ * Emits 41 results: 21 here, 10 from delete_self() and 10 from
+ * delete_entries().
+ */
+int main(int argc, char *argv[])
+{
+       struct tdb_context *tdb;
+       int errors = 0;
+
+       plan_tests(41);
+       tdb = tdb_open_ex("run-no-lock-during-traverse.tdb",
+                         1024, TDB_CLEAR_IF_FIRST, O_CREAT|O_TRUNC|O_RDWR,
+                         0600, &taplogctx, NULL);
+
+       ok1(tdb);
+       /* Round 1: each visited record deletes its neighbour. */
+       ok1(prepare_entries(tdb));
+       ok1(locking_errors == 0);
+       ok1(tdb_lockall(tdb) == 0);
+       ok1(locking_errors == 0);
+       tdb_traverse(tdb, delete_other, &errors);
+       ok1(errors == 0);
+       ok1(locking_errors == 0);
+       ok1(tdb_unlockall(tdb) == 0);
+
+       /* Round 2: each visited record deletes itself. */
+       ok1(prepare_entries(tdb));
+       ok1(locking_errors == 0);
+       ok1(tdb_lockall(tdb) == 0);
+       ok1(locking_errors == 0);
+       tdb_traverse(tdb, delete_self, NULL);
+       ok1(locking_errors == 0);
+       ok1(tdb_unlockall(tdb) == 0);
+
+       /* Round 3: delete without any traverse. */
+       ok1(prepare_entries(tdb));
+       ok1(locking_errors == 0);
+       ok1(tdb_lockall(tdb) == 0);
+       ok1(locking_errors == 0);
+       delete_entries(tdb);
+       ok1(locking_errors == 0);
+       ok1(tdb_unlockall(tdb) == 0);
+
+       ok1(tdb_close(tdb) == 0);
+
+       return exit_status();
+}
diff --git a/ctdb/lib/tdb/test/run-oldhash.c b/ctdb/lib/tdb/test/run-oldhash.c
new file mode 100644 (file)
index 0000000..535336c
--- /dev/null
@@ -0,0 +1,49 @@
+#include "../common/tdb_private.h"
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "tap-interface.h"
+#include <stdlib.h>
+#include "logging.h"
+
+/*
+ * Open the two pre-built "old-nohash" fixture databases (le and be
+ * variants), first with no hash function given and then with
+ * tdb_jenkins_hash, and run tdb_check() on each.  Emits 8 test results.
+ */
+int main(int argc, char *argv[])
+{
+       struct tdb_context *tdb;
+
+       plan_tests(8);
+
+       /* Old format (with zeroes in the hash magic fields) should
+        * open with any hash (since we don't know what hash they used). */
+       tdb = tdb_open_ex("test/old-nohash-le.tdb", 0, 0, O_RDWR, 0,
+                         &taplogctx, NULL);
+       ok1(tdb);
+       ok1(tdb_check(tdb, NULL, NULL) == 0);
+       tdb_close(tdb);
+
+       tdb = tdb_open_ex("test/old-nohash-be.tdb", 0, 0, O_RDWR, 0,
+                         &taplogctx, NULL);
+       ok1(tdb);
+       ok1(tdb_check(tdb, NULL, NULL) == 0);
+       tdb_close(tdb);
+
+       /* Same two files again, this time naming the jenkins hash. */
+       tdb = tdb_open_ex("test/old-nohash-le.tdb", 0, 0, O_RDWR, 0,
+                         &taplogctx, tdb_jenkins_hash);
+       ok1(tdb);
+       ok1(tdb_check(tdb, NULL, NULL) == 0);
+       tdb_close(tdb);
+
+       tdb = tdb_open_ex("test/old-nohash-be.tdb", 0, 0, O_RDWR, 0,
+                         &taplogctx, tdb_jenkins_hash);
+       ok1(tdb);
+       ok1(tdb_check(tdb, NULL, NULL) == 0);
+       tdb_close(tdb);
+
+       return exit_status();
+}
diff --git a/ctdb/lib/tdb/test/run-open-during-transaction.c b/ctdb/lib/tdb/test/run-open-during-transaction.c
new file mode 100644 (file)
index 0000000..a825e62
--- /dev/null
@@ -0,0 +1,181 @@
+#include "../common/tdb_private.h"
+#include "lock-tracking.h"
+
+static ssize_t pwrite_check(int fd, const void *buf, size_t count, off_t offset);
+static ssize_t write_check(int fd, const void *buf, size_t count);
+static int ftruncate_check(int fd, off_t length);
+
+#define pwrite pwrite_check
+#define write write_check
+#define fcntl fcntl_with_lockcheck
+#define ftruncate ftruncate_check
+
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "tap-interface.h"
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdarg.h>
+#include "external-agent.h"
+#include "logging.h"
+
+static struct agent *agent;
+static bool opened;
+static int errors = 0;
+static bool clear_if_first;
+#define TEST_DBNAME "run-open-during-transaction.tdb"
+
+#undef write
+#undef pwrite
+#undef fcntl
+#undef ftruncate
+
+/* Byte-wise comparison of the first `len` bytes of two buffers. */
+static bool is_same(const char *snapshot, const char *latest, off_t len)
+{
+       unsigned pos = 0;
+
+       while (pos < len) {
+               /* The first differing byte settles it. */
+               if (snapshot[pos] != latest[pos])
+                       return false;
+               pos++;
+       }
+       return true;
+}
+
+/*
+ * Check that the file behind `fd` still holds exactly `snapshot_len` bytes
+ * and that they equal `snapshot`.
+ */
+static bool compare_file(int fd, const char *snapshot, off_t snapshot_len)
+{
+       bool unchanged = false;
+       char *current_bytes;
+
+       /* Ask for one byte too many: a file that has grown returns more
+        * than snapshot_len and is rejected by the length test. */
+       current_bytes = malloc(snapshot_len+1);
+       if (pread(fd, current_bytes, snapshot_len+1, 0) == snapshot_len)
+               unchanged = is_same(snapshot, current_bytes, snapshot_len);
+
+       free(current_bytes);
+       return unchanged;
+}
+
+/*
+ * Snapshot the file behind `fd`, have the external agent open the database
+ * (with or without CLEAR_IF_FIRST, following clear_if_first), and verify
+ * that the file contents did not change as a result.
+ *
+ * The agent's open may either succeed (it is then asked to close again) or
+ * report WOULD_HAVE_BLOCKED; anything else counts as an error.  Problems
+ * are reported with diag() and counted in the file-level `errors`.
+ */
+static void check_file_intact(int fd)
+{
+       enum agent_return ret;
+       struct stat st;
+       char *contents;
+
+       /* Without a valid size there is nothing to snapshot. */
+       if (fstat(fd, &st) != 0) {
+               diag("Stat fail");
+               errors++;
+               return;
+       }
+       contents = malloc(st.st_size);
+       if (pread(fd, contents, st.st_size, 0) != st.st_size) {
+               diag("Read fail");
+               errors++;
+               /* Do not leak the snapshot buffer on the error path. */
+               free(contents);
+               return;
+       }
+
+       /* Ask agent to open file. */
+       ret = external_agent_operation(agent, clear_if_first ?
+                                      OPEN_WITH_CLEAR_IF_FIRST :
+                                      OPEN,
+                                      TEST_DBNAME);
+
+       /* It's OK to open it, but it must not have changed! */
+       if (!compare_file(fd, contents, st.st_size)) {
+               diag("Agent changed file after opening %s",
+                    agent_return_name(ret));
+               errors++;
+       }
+
+       if (ret == SUCCESS) {
+               ret = external_agent_operation(agent, CLOSE, NULL);
+               if (ret != SUCCESS) {
+                       diag("Agent failed to close tdb: %s",
+                            agent_return_name(ret));
+                       errors++;
+               }
+       } else if (ret != WOULD_HAVE_BLOCKED) {
+               diag("Agent opening file gave %s",
+                    agent_return_name(ret));
+               errors++;
+       }
+
+       free(contents);
+}
+
+/* Unlock callback: re-check the file whenever the test has armed `opened`. */
+static void after_unlock(int fd)
+{
+       if (!opened)
+               return;
+
+       check_file_intact(fd);
+}
+
+/*
+ * Stand-in for pwrite() inside the tdb sources: when the test is armed,
+ * verify the file is intact before performing the real write.
+ */
+static ssize_t pwrite_check(int fd,
+                           const void *buf, size_t count, off_t offset)
+{
+       ssize_t written;
+
+       if (opened) {
+               check_file_intact(fd);
+       }
+
+       written = pwrite(fd, buf, count, offset);
+       return written;
+}
+
+/*
+ * Stand-in for write() inside the tdb sources: when the test is armed,
+ * verify the file is intact before performing the real write.
+ */
+static ssize_t write_check(int fd, const void *buf, size_t count)
+{
+       ssize_t written;
+
+       if (opened) {
+               check_file_intact(fd);
+       }
+
+       written = write(fd, buf, count);
+       return written;
+}
+
+/*
+ * Stand-in for ftruncate() inside the tdb sources: when the test is armed,
+ * verify the file is intact before resizing it.
+ */
+static int ftruncate_check(int fd, off_t length)
+{
+       int rc;
+
+       if (opened) {
+               check_file_intact(fd);
+       }
+
+       rc = ftruncate(fd, length);
+       return rc;
+}
+
+/*
+ * For each of four flag combinations (default / CLEAR_IF_FIRST, with and
+ * without NOMMAP), run one transaction while the write, truncate and unlock
+ * hooks let an external agent try to open the database, and check that the
+ * file is never altered by that.  Emits 20 results (5 per combination).
+ */
+int main(int argc, char *argv[])
+{
+       const int flags[] = { TDB_DEFAULT,
+                             TDB_CLEAR_IF_FIRST,
+                             TDB_NOMMAP,
+                             TDB_CLEAR_IF_FIRST | TDB_NOMMAP };
+       int i;
+       struct tdb_context *tdb;
+       TDB_DATA key, data;
+
+       plan_tests(20);
+       agent = prepare_external_agent();
+
+       unlock_callback = after_unlock;
+       for (i = 0; i < sizeof(flags)/sizeof(flags[0]); i++) {
+               /* Tells check_file_intact() which open operation the agent
+                * should use. */
+               clear_if_first = (flags[i] & TDB_CLEAR_IF_FIRST);
+               diag("Test with %s and %s\n",
+                    clear_if_first ? "CLEAR" : "DEFAULT",
+                    (flags[i] & TDB_NOMMAP) ? "no mmap" : "mmap");
+               unlink(TEST_DBNAME);
+               tdb = tdb_open_ex(TEST_DBNAME, 1024, flags[i],
+                                 O_CREAT|O_TRUNC|O_RDWR, 0600,
+                                 &taplogctx, NULL);
+               ok1(tdb);
+
+               /* Arm the hooks only after the database has been created. */
+               opened = true;
+               ok1(tdb_transaction_start(tdb) == 0);
+               key.dsize = strlen("hi");
+               key.dptr = (void *)"hi";
+               data.dptr = (void *)"world";
+               data.dsize = strlen("world");
+
+               ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+               ok1(tdb_transaction_commit(tdb) == 0);
+               /* errors is never reset, so a failure in one combination
+                * also fails this check in all later ones. */
+               ok(!errors, "We had %u open errors", errors);
+
+               /* Disarm before closing. */
+               opened = false;
+               tdb_close(tdb);
+       }
+
+       return exit_status();
+}
diff --git a/ctdb/lib/tdb/test/run-readonly-check.c b/ctdb/lib/tdb/test/run-readonly-check.c
new file mode 100644 (file)
index 0000000..fdd9507
--- /dev/null
@@ -0,0 +1,52 @@
+/* We should be able to tdb_check an O_RDONLY tdb, and we were previously allowed
+ * to tdb_check() inside a transaction (though that's paranoia!). */
+#include "../common/tdb_private.h"
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "tap-interface.h"
+#include <stdlib.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+       struct tdb_context *tdb;
+       TDB_DATA key, data;
+
+       plan_tests(11);         /* 6 checks on the read-write handle, 5 on the read-only one */
+       tdb = tdb_open_ex("run-readonly-check.tdb", 1024,
+                         TDB_DEFAULT,
+                         O_CREAT|O_TRUNC|O_RDWR, 0600, &taplogctx, NULL);
+
+       ok1(tdb);
+       key.dsize = strlen("hi");
+       key.dptr = (void *)"hi";
+       data.dsize = strlen("world");
+       data.dptr = (void *)"world";
+
+       ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+       ok1(tdb_check(tdb, NULL, NULL) == 0);
+
+       /* We are also allowed to do a check inside a transaction. */
+       ok1(tdb_transaction_start(tdb) == 0);
+       ok1(tdb_check(tdb, NULL, NULL) == 0);
+       ok1(tdb_close(tdb) == 0);       /* closed without committing the transaction */
+
+       tdb = tdb_open_ex("run-readonly-check.tdb", 1024,
+                         TDB_DEFAULT, O_RDONLY, 0, &taplogctx, NULL);
+
+       ok1(tdb);
+       ok1(tdb_store(tdb, key, data, TDB_MODIFY) == -1);       /* O_RDONLY handle: store must fail */
+       ok1(tdb_error(tdb) == TDB_ERR_RDONLY);
+       ok1(tdb_check(tdb, NULL, NULL) == 0);
+       ok1(tdb_close(tdb) == 0);
+
+       return exit_status();
+}
diff --git a/ctdb/lib/tdb/test/run-rescue-find_entry.c b/ctdb/lib/tdb/test/run-rescue-find_entry.c
new file mode 100644 (file)
index 0000000..25f4f1c
--- /dev/null
@@ -0,0 +1,50 @@
+#include "../common/tdb_private.h"
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "../common/rescue.c"
+#include "tap-interface.h"
+#include <stdlib.h>
+#include "logging.h"
+
+#define NUM 20
+
+/* Binary searches are deceptively simple: easy to screw up! */
+int main(int argc, char *argv[])
+{
+       unsigned int i, j, n;
+       struct found f[NUM+1];
+       struct found_table table;
+
+       /* Set up array for searching. */
+       for (i = 0; i < NUM+1; i++) {
+               f[i].head = i * 3;      /* ascending heads: 0, 3, 6, ... */
+       }
+       table.arr = f;
+
+       for (i = 0; i < NUM; i++) {
+               table.num = i;          /* table.num takes every value 0 .. NUM-1 */
+               for (j = 0; j < (i + 2) * 3; j++) {     /* j runs past the largest head in use */
+                       n = find_entry(&table, j);
+                       ok1(n <= i);
+
+                       /* If we were searching for something too large... */
+                       if (j > i*3)
+                               ok1(n == i);
+                       else {
+                               /* It must give us something after j */
+                               ok1(f[n].head >= j);
+                               ok1(n == 0 || f[n-1].head < j);
+                       }
+               }
+       }
+
+       return exit_status();
+}
diff --git a/ctdb/lib/tdb/test/run-rescue.c b/ctdb/lib/tdb/test/run-rescue.c
new file mode 100644 (file)
index 0000000..a26c493
--- /dev/null
@@ -0,0 +1,126 @@
+#include "../common/tdb_private.h"
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "../common/rescue.c"
+#include "tap-interface.h"
+#include <stdlib.h>
+#include "logging.h"
+
+struct walk_data {
+       TDB_DATA key;           /* key the callbacks compare against */
+       TDB_DATA data;          /* value the callbacks compare against */
+       bool fail;              /* set by walk() on a mismatch, and by main() */
+       unsigned count;         /* incremented once per callback invocation */
+};
+
+static inline bool tdb_deq(TDB_DATA a, TDB_DATA b)
+{      /* equal iff same length and same bytes */
+       return !(a.dsize != b.dsize || memcmp(a.dptr, b.dptr, a.dsize) != 0);
+}
+
+static inline TDB_DATA tdb_mkdata(const void *p, size_t len)
+{
+       TDB_DATA blob;          /* refers to p directly; nothing is copied */
+       blob.dsize = len;
+       blob.dptr = (void *)p;
+       return blob;
+}
+
+static void walk(TDB_DATA key, TDB_DATA data, void *_wd)
+{
+       struct walk_data *expect = _wd;
+       bool key_ok, data_ok;
+
+       /* Compare the record handed to us with the expected key and value. */
+       key_ok = tdb_deq(key, expect->key);
+       data_ok = tdb_deq(data, expect->data);
+       if (!(key_ok && data_ok)) {
+               expect->fail = true;
+       }
+       expect->count += 1;
+}
+
+static void count_records(TDB_DATA key, TDB_DATA data, void *_wd)
+{
+       struct walk_data *expect = _wd;
+       /* A record that differs from the expected one is printed, not failed. */
+       if (!(tdb_deq(key, expect->key) && tdb_deq(data, expect->data)))
+               diag("%.*s::%.*s\n",
+                    (int)key.dsize, key.dptr, (int)data.dsize, data.dptr);
+       expect->count += 1;
+}
+
+static void log_fn(struct tdb_context *tdb, enum tdb_debug_level level, const char *fmt, ...)
+{
+       unsigned int *n_logged = tdb_get_logging_private(tdb);
+       *n_logged += 1;         /* every message counts, whatever its level or text */
+}
+
+/* tdb_rescue() against an intact, a byte-corrupted and a known-corrupt tdb. */
+int main(int argc, char *argv[])
+{
+       struct tdb_context *tdb;
+       struct walk_data wd;
+       unsigned int i, size, log_count = 0;
+       struct tdb_logging_context log_ctx = { log_fn, &log_count };
+
+       plan_tests(10);
+       tdb = tdb_open_ex("run-rescue.tdb", 1, TDB_CLEAR_IF_FIRST,
+                         O_CREAT|O_TRUNC|O_RDWR, 0600, &log_ctx, NULL);
+       ok1(tdb);       /* everything below dereferences tdb */
+
+       wd.key.dsize = strlen("hi");
+       wd.key.dptr = (void *)"hi";
+       wd.data.dsize = strlen("world");
+       wd.data.dptr = (void *)"world";
+       wd.count = 0;
+       wd.fail = false;
+
+       ok1(tdb_store(tdb, wd.key, wd.data, TDB_INSERT) == 0);
+
+       ok1(tdb_rescue(tdb, walk, &wd) == 0);
+       ok1(!wd.fail);
+       ok1(wd.count == 1);
+
+       /* Corrupt the database, walk should either get it or not. */
+       size = tdb->map_size;
+       for (i = sizeof(struct tdb_header); i < size; i++) {
+               char c;
+               if (tdb->methods->tdb_read(tdb, i, &c, 1, false) != 0)
+                       fail("Reading offset %u", i);
+               if (tdb->methods->tdb_write(tdb, i, "X", 1) != 0)
+                       fail("Writing X at offset %u", i);
+
+               wd.count = 0;
+               /* Rescue must succeed and report 0 or 1 records. */
+               if (tdb_rescue(tdb, count_records, &wd) != 0
+                   || wd.count > 1) {
+                       wd.fail = true;
+                       break;
+               }
+               if (tdb->methods->tdb_write(tdb, i, &c, 1) != 0)
+                       fail("Restoring offset %u", i);
+       }
+       ok1(log_count == 0);
+       ok1(!wd.fail);
+       tdb_close(tdb);
+
+       /* Now try our known-corrupt db. */
+       tdb = tdb_open_ex("test/tdb.corrupt", 1024, 0, O_RDWR, 0,
+                         &taplogctx, NULL);
+       ok1(tdb);       /* a missing fixture must fail here, not crash in tdb_rescue() */
+       wd.count = 0;
+       ok1(tdb_rescue(tdb, count_records, &wd) == 0);
+       ok1(wd.count == 1627);
+       tdb_close(tdb);
+
+       return exit_status();
+}
diff --git a/ctdb/lib/tdb/test/run-rwlock-check.c b/ctdb/lib/tdb/test/run-rwlock-check.c
new file mode 100644 (file)
index 0000000..8b8072d
--- /dev/null
@@ -0,0 +1,45 @@
+#include "../common/tdb_private.h"
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "tap-interface.h"
+#include <stdlib.h>
+
+static void log_fn(struct tdb_context *tdb, enum tdb_debug_level level, const char *fmt, ...)
+{
+       unsigned int *hits = tdb_get_logging_private(tdb);
+       /* Count only messages whose format string mentions "spinlocks". */
+       if (strstr(fmt, "spinlocks") != NULL) { *hits += 1; }
+}
+
+/* The code should barf on TDBs created with rwlocks. */
+int main(int argc, char *argv[])
+{
+       struct tdb_context *tdb;
+       unsigned int log_count;         /* incremented by log_fn() through log_ctx */
+       struct tdb_logging_context log_ctx = { log_fn, &log_count };
+
+       plan_tests(4);                  /* 2 fixture files x 2 checks */
+
+       /* We should fail to open rwlock-using tdbs of either endian. */
+       log_count = 0;
+       tdb = tdb_open_ex("test/rwlock-le.tdb", 0, 0, O_RDWR, 0,
+                         &log_ctx, NULL);
+       ok1(!tdb);
+       ok1(log_count == 1);            /* exactly one "spinlocks" message logged */
+
+       log_count = 0;
+       tdb = tdb_open_ex("test/rwlock-be.tdb", 0, 0, O_RDWR, 0,
+                         &log_ctx, NULL);
+       ok1(!tdb);
+       ok1(log_count == 1);
+
+       return exit_status();
+}
diff --git a/ctdb/lib/tdb/test/run-summary.c b/ctdb/lib/tdb/test/run-summary.c
new file mode 100644 (file)
index 0000000..2231284
--- /dev/null
@@ -0,0 +1,64 @@
+#include "../common/tdb_private.h"
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "../common/summary.c"
+#include "tap-interface.h"
+#include <stdlib.h>
+
+int main(int argc, char *argv[])
+{
+       unsigned int i, j;
+       struct tdb_context *tdb;
+       int flags[] = { TDB_INTERNAL, TDB_DEFAULT, TDB_NOMMAP,
+                       TDB_INTERNAL|TDB_CONVERT, TDB_CONVERT,
+                       TDB_NOMMAP|TDB_CONVERT };
+       TDB_DATA key = { (unsigned char *)&j, sizeof(j) };      /* key bytes are the loop counter j */
+       TDB_DATA data = { (unsigned char *)&j, sizeof(j) };     /* value aliases j; its length varies below */
+       char *summary;
+
+       plan_tests(sizeof(flags) / sizeof(flags[0]) * 14);      /* per flag set: 1 open + 13 summary checks */
+       for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++) {
+               tdb = tdb_open("run-summary.tdb", 131, flags[i],
+                              O_RDWR|O_CREAT|O_TRUNC, 0600);
+               ok1(tdb);
+               if (!tdb)
+                       continue;
+
+               /* Put some stuff in there. */
+               for (j = 0; j < 500; j++) {
+                       /* Make sure padding varies so we get some graphs! */
+                       data.dsize = j % (sizeof(j) + 1);       /* 0 .. sizeof(j) bytes */
+                       if (tdb_store(tdb, key, data, TDB_REPLACE) != 0)
+                               fail("Storing in tdb");
+               }
+
+               summary = tdb_summary(tdb);
+               diag("%s", summary);
+               ok1(strstr(summary, "Size of file/data: "));
+               ok1(strstr(summary, "Number of records: 500\n"));
+               ok1(strstr(summary, "Smallest/average/largest keys: 4/4/4\n"));       /* holds only where sizeof(j) == 4 */
+               ok1(strstr(summary, "Smallest/average/largest data: 0/2/4\n"));
+               ok1(strstr(summary, "Smallest/average/largest padding: "));
+               ok1(strstr(summary, "Number of dead records: 0\n"));
+               ok1(strstr(summary, "Number of free records: 1\n"));
+               ok1(strstr(summary, "Smallest/average/largest free records: "));
+               ok1(strstr(summary, "Number of hash chains: 131\n"));   /* 131 is the hash size passed to tdb_open() */
+               ok1(strstr(summary, "Smallest/average/largest hash chains: "));
+               ok1(strstr(summary, "Number of uncoalesced records: 0\n"));
+               ok1(strstr(summary, "Smallest/average/largest uncoalesced runs: 0/0/0\n"));
+               ok1(strstr(summary, "Percentage keys/data/padding/free/dead/rechdrs&tailers/hashes: "));
+
+               free(summary);
+               tdb_close(tdb);
+       }
+
+       return exit_status();
+}
diff --git a/ctdb/lib/tdb/test/run-transaction-expand.c b/ctdb/lib/tdb/test/run-transaction-expand.c
new file mode 100644 (file)
index 0000000..1271d92
--- /dev/null
@@ -0,0 +1,119 @@
+#include "../common/tdb_private.h"
+
+/* Stand-ins for the sync calls: cheap, but every call is still counted. */
+static unsigned int sync_counts = 0;
+static inline int fake_fsync(int fd)
+{
+       sync_counts += 1;       /* count the call instead of syncing */
+       return 0;               /* always report success */
+}
+#define fsync fake_fsync
+
+#ifdef MS_SYNC
+static inline int fake_msync(void *addr, size_t length, int flags)
+{
+       sync_counts += 1;       /* count the call instead of syncing */
+       return 0;               /* always report success */
+}
+#define msync fake_msync
+#endif
+
+#ifdef HAVE_FDATASYNC
+static inline int fake_fdatasync(int fd)
+{
+       sync_counts += 1;       /* count the call instead of syncing */
+       return 0;               /* always report success */
+}
+#define fdatasync fake_fdatasync
+#endif
+
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "tap-interface.h"
+#include <stdlib.h>
+#include "logging.h"
+
+static void write_record(struct tdb_context *tdb, size_t extra_len,
+                        TDB_DATA *data)
+{
+       TDB_DATA hi_key;
+       /* Grow the value by extra_len, then rewrite "hi" in one transaction. */
+       hi_key.dptr = (void *)"hi";
+       hi_key.dsize = strlen("hi");
+       data->dsize = data->dsize + extra_len;
+       tdb_transaction_start(tdb);     /* return values are not checked here */
+       tdb_store(tdb, hi_key, *data, TDB_REPLACE);
+       tdb_transaction_commit(tdb);
+}
+
+int main(int argc, char *argv[])
+{
+       struct tdb_context *tdb;
+       size_t i;
+       TDB_DATA data;
+       struct tdb_record rec;
+       tdb_off_t off;
+
+       /* Do *not* suppress sync for this test; we do it ourselves. */
+       unsetenv("TDB_NO_FSYNC");
+
+       plan_tests(5);
+       tdb = tdb_open_ex("run-transaction-expand.tdb",
+                         1024, TDB_CLEAR_IF_FIRST,
+                         O_CREAT|O_TRUNC|O_RDWR, 0600, &taplogctx, NULL);
+       ok1(tdb);
+
+       data.dsize = 0;
+       data.dptr = calloc(1000, getpagesize());        /* 1000 pages: the largest value written below */
+
+       /* Simulate a slowly growing record. */
+       for (i = 0; i < 1000; i++)
+               write_record(tdb, getpagesize(), &data);        /* adds one page to data.dsize per call */
+
+       tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &off);
+       tdb_read(tdb, off, &rec, sizeof(rec), DOCONV());
+       diag("TDB size = %zu, recovery = %llu-%llu",
+            (size_t)tdb->map_size, (unsigned long long)off, (unsigned long long)(off + sizeof(rec) + rec.rec_len));
+
+       /* We should only be about 5 times larger than largest record. */
+       ok1(tdb->map_size < 6 * i * getpagesize());     /* i == 1000 after the loop */
+       tdb_close(tdb);
+
+       tdb = tdb_open_ex("run-transaction-expand.tdb",
+                         1024, TDB_CLEAR_IF_FIRST,
+                         O_CREAT|O_TRUNC|O_RDWR, 0600, &taplogctx, NULL);
+       ok1(tdb);
+
+       data.dsize = 0;         /* same buffer, start growing from empty again */
+
+       /* Simulate a slowly growing record, repacking to keep
+        * recovery area at end. */
+       for (i = 0; i < 1000; i++) {
+               write_record(tdb, getpagesize(), &data);
+               if (i % 10 == 0)
+                       tdb_repack(tdb);
+       }
+
+       tdb_ofs_read(tdb, TDB_RECOVERY_HEAD, &off);
+       tdb_read(tdb, off, &rec, sizeof(rec), DOCONV());
+       diag("TDB size = %zu, recovery = %llu-%llu",
+            (size_t)tdb->map_size, (unsigned long long)off, (unsigned long long)(off + sizeof(rec) + rec.rec_len));
+
+       /* We should only be about 4 times larger than largest record. */
+       ok1(tdb->map_size < 5 * i * getpagesize());     /* i == 1000 after the loop */
+
+       /* We should have synchronized multiple times. */
+       ok1(sync_counts);       /* non-zero once any fake_*sync() has run */
+       tdb_close(tdb);
+       free(data.dptr);
+
+       return exit_status();
+}
diff --git a/ctdb/lib/tdb/test/run-traverse-in-transaction.c b/ctdb/lib/tdb/test/run-traverse-in-transaction.c
new file mode 100644 (file)
index 0000000..bcdc354
--- /dev/null
@@ -0,0 +1,86 @@
+#include "lock-tracking.h"
+#include "../common/tdb_private.h"
+#define fcntl fcntl_with_lockcheck      /* redirect fcntl() in the tdb sources included below */
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "tap-interface.h"
+#undef fcntl    /* end the redirection: undefine the macro that was actually defined */
+#include <stdlib.h>
+#include <stdbool.h>
+#include "external-agent.h"
+#include "logging.h"
+
+static struct agent *agent;
+
+static bool correct_key(TDB_DATA key)
+{
+       if (key.dsize != strlen("hi")) return false;    /* wrong length */
+       return memcmp(key.dptr, "hi", key.dsize) == 0;
+}
+
+static bool correct_data(TDB_DATA data)
+{
+       if (data.dsize != strlen("world")) return false;        /* wrong length */
+       return memcmp(data.dptr, "world", data.dsize) == 0;
+}
+
+static int traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data,
+                    void *p)
+{
+       ok1(correct_key(key));          /* two checks for every record visited */
+       ok1(correct_data(data));
+       return 0;
+}
+
+int main(int argc, char *argv[])
+{
+       struct tdb_context *tdb;
+       TDB_DATA key, data;
+
+       plan_tests(13);         /* 9 checks here + 2 from traverse() in each of the 2 traversals */
+       agent = prepare_external_agent();
+
+       tdb = tdb_open_ex("run-traverse-in-transaction.tdb",
+                         1024, TDB_CLEAR_IF_FIRST, O_CREAT|O_TRUNC|O_RDWR,
+                         0600, &taplogctx, NULL);
+       ok1(tdb);
+
+       key.dsize = strlen("hi");
+       key.dptr = (void *)"hi";
+       data.dptr = (void *)"world";
+       data.dsize = strlen("world");
+
+       ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);        /* the only record stored */
+
+       ok1(external_agent_operation(agent, OPEN, tdb_name(tdb)) == SUCCESS);
+
+       ok1(tdb_transaction_start(tdb) == 0);
+       ok1(external_agent_operation(agent, TRANSACTION_START, tdb_name(tdb))
+           == WOULD_HAVE_BLOCKED);
+       tdb_traverse(tdb, traverse, NULL);
+
+       /* That should *not* release the transaction lock! */
+       ok1(external_agent_operation(agent, TRANSACTION_START, tdb_name(tdb))
+           == WOULD_HAVE_BLOCKED);
+       tdb_traverse_read(tdb, traverse, NULL);
+
+       /* That should *not* release the transaction lock! */
+       ok1(external_agent_operation(agent, TRANSACTION_START, tdb_name(tdb))
+           == WOULD_HAVE_BLOCKED);
+       ok1(tdb_transaction_commit(tdb) == 0);
+       /* Now we should be fine. */
+       ok1(external_agent_operation(agent, TRANSACTION_START, tdb_name(tdb))
+           == SUCCESS);
+
+       tdb_close(tdb);
+
+       return exit_status();
+}
diff --git a/ctdb/lib/tdb/test/run-wronghash-fail.c b/ctdb/lib/tdb/test/run-wronghash-fail.c
new file mode 100644 (file)
index 0000000..74bbc30
--- /dev/null
@@ -0,0 +1,120 @@
+#include "../common/tdb_private.h"
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "tap-interface.h"
+#include <stdlib.h>
+
+static void log_fn(struct tdb_context *tdb, enum tdb_debug_level level, const char *fmt, ...)
+{
+       unsigned int *hits = tdb_get_logging_private(tdb);
+       /* Count only messages whose format string mentions "hash". */
+       if (strstr(fmt, "hash") != NULL) { *hits += 1; }
+}
+
+int main(int argc, char *argv[])
+{
+       struct tdb_context *tdb;
+       unsigned int log_count;         /* "hash" messages counted by log_fn() */
+       TDB_DATA d;
+       struct tdb_logging_context log_ctx = { log_fn, &log_count };
+
+       plan_tests(28);
+
+       /* Create with default hash. */
+       log_count = 0;
+       tdb = tdb_open_ex("run-wronghash-fail.tdb", 0, 0,
+                         O_CREAT|O_RDWR|O_TRUNC, 0600, &log_ctx, NULL);
+       ok1(tdb);
+       ok1(log_count == 0);
+       d.dptr = (void *)"Hello";
+       d.dsize = 5;                    /* "Hello" without its terminating NUL */
+       ok1(tdb_store(tdb, d, d, TDB_INSERT) == 0);     /* same bytes used as key and value */
+       tdb_close(tdb);
+
+       /* Fail to open with different hash. */
+       tdb = tdb_open_ex("run-wronghash-fail.tdb", 0, 0, O_RDWR, 0,
+                         &log_ctx, tdb_jenkins_hash);
+       ok1(!tdb);
+       ok1(log_count == 1);
+
+       /* Create with different hash. */
+       log_count = 0;
+       tdb = tdb_open_ex("run-wronghash-fail.tdb", 0, 0,
+                         O_CREAT|O_RDWR|O_TRUNC,
+                         0600, &log_ctx, tdb_jenkins_hash);
+       ok1(tdb);
+       ok1(log_count == 0);
+       tdb_close(tdb);
+
+       /* Endian should be no problem. */
+       log_count = 0;
+       tdb = tdb_open_ex("test/jenkins-le-hash.tdb", 0, 0, O_RDWR, 0,
+                         &log_ctx, tdb_old_hash);
+       ok1(!tdb);
+       ok1(log_count == 1);
+
+       log_count = 0;
+       tdb = tdb_open_ex("test/jenkins-be-hash.tdb", 0, 0, O_RDWR, 0,
+                         &log_ctx, tdb_old_hash);
+       ok1(!tdb);
+       ok1(log_count == 1);
+
+       log_count = 0;
+       /* Fail to open with old default hash. */
+       tdb = tdb_open_ex("run-wronghash-fail.tdb", 0, 0, O_RDWR, 0,
+                         &log_ctx, tdb_old_hash);
+       ok1(!tdb);
+       ok1(log_count == 1);
+
+       log_count = 0;
+       tdb = tdb_open_ex("test/jenkins-le-hash.tdb", 0, 0, O_RDONLY,
+                         0, &log_ctx, tdb_jenkins_hash);
+       ok1(tdb);
+       ok1(log_count == 0);
+       ok1(tdb_check(tdb, NULL, NULL) == 0);
+       tdb_close(tdb);
+
+       log_count = 0;
+       tdb = tdb_open_ex("test/jenkins-be-hash.tdb", 0, 0, O_RDONLY,
+                         0, &log_ctx, tdb_jenkins_hash);
+       ok1(tdb);
+       ok1(log_count == 0);
+       ok1(tdb_check(tdb, NULL, NULL) == 0);
+       tdb_close(tdb);
+
+       /* It should open with jenkins hash if we don't specify. */
+       log_count = 0;
+       tdb = tdb_open_ex("test/jenkins-le-hash.tdb", 0, 0, O_RDWR, 0,
+                         &log_ctx, NULL);
+       ok1(tdb);
+       ok1(log_count == 0);
+       ok1(tdb_check(tdb, NULL, NULL) == 0);
+       tdb_close(tdb);
+
+       log_count = 0;
+       tdb = tdb_open_ex("test/jenkins-be-hash.tdb", 0, 0, O_RDWR, 0,
+                         &log_ctx, NULL);
+       ok1(tdb);
+       ok1(log_count == 0);
+       ok1(tdb_check(tdb, NULL, NULL) == 0);
+       tdb_close(tdb);
+
+       log_count = 0;
+       tdb = tdb_open_ex("run-wronghash-fail.tdb", 0, 0, O_RDONLY,
+                         0, &log_ctx, NULL);   /* file was last created with tdb_jenkins_hash above */
+       ok1(tdb);
+       ok1(log_count == 0);
+       ok1(tdb_check(tdb, NULL, NULL) == 0);
+       tdb_close(tdb);
+
+
+       return exit_status();
+}
diff --git a/ctdb/lib/tdb/test/run-zero-append.c b/ctdb/lib/tdb/test/run-zero-append.c
new file mode 100644 (file)
index 0000000..36bf699
--- /dev/null
@@ -0,0 +1,40 @@
+#include "../common/tdb_private.h"
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "tap-interface.h"
+#include <stdlib.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+       struct tdb_context *tdb;
+       TDB_DATA key, data;
+
+       plan_tests(4);
+       tdb = tdb_open_ex(NULL, 1024, TDB_INTERNAL, O_CREAT|O_TRUNC|O_RDWR,
+                         0600, &taplogctx, NULL);
+       ok1(tdb);
+
+       /* Tickle bug on appending zero length buffer to zero length buffer. */
+       key.dsize = strlen("hi");
+       key.dptr = (void *)"hi";
+       data.dptr = (void *)"world";
+       data.dsize = 0;         /* dptr is set, but the length appended is zero */
+
+       ok1(tdb_append(tdb, key, data) == 0);   /* first append: key does not exist yet */
+       ok1(tdb_append(tdb, key, data) == 0);   /* second append: zero bytes onto zero bytes */
+       data = tdb_fetch(tdb, key);
+       ok1(data.dsize == 0);
+       tdb_close(tdb);
+       free(data.dptr);        /* releases the tdb_fetch() result, not the "world" literal */
+
+       return exit_status();
+}
diff --git a/ctdb/lib/tdb/test/run.c b/ctdb/lib/tdb/test/run.c
new file mode 100644 (file)
index 0000000..f49e850
--- /dev/null
@@ -0,0 +1,49 @@
+#include "../common/tdb_private.h"
+#include "../common/io.c"
+#include "../common/tdb.c"
+#include "../common/lock.c"
+#include "../common/freelist.c"
+#include "../common/traverse.c"
+#include "../common/transaction.c"
+#include "../common/error.c"
+#include "../common/open.c"
+#include "../common/check.c"
+#include "../common/hash.c"
+#include "tap-interface.h"
+#include <stdlib.h>
+#include "logging.h"
+
+int main(int argc, char *argv[])
+{
+       struct tdb_context *tdb;
+       TDB_DATA key, data;
+
+       plan_tests(10);
+       tdb = tdb_open_ex("run.tdb", 1024, TDB_CLEAR_IF_FIRST,
+                         O_CREAT|O_TRUNC|O_RDWR, 0600, &taplogctx, NULL);
+
+       ok1(tdb);
+       key.dsize = strlen("hi");
+       key.dptr = (void *)"hi";
+       data.dsize = strlen("world");
+       data.dptr = (void *)"world";
+
+       ok1(tdb_store(tdb, key, data, TDB_MODIFY) < 0);         /* nothing stored yet */
+       ok1(tdb_error(tdb) == TDB_ERR_NOEXIST);
+       ok1(tdb_store(tdb, key, data, TDB_INSERT) == 0);
+       ok1(tdb_store(tdb, key, data, TDB_INSERT) < 0);         /* second insert of the same key */
+       ok1(tdb_error(tdb) == TDB_ERR_EXISTS);
+       ok1(tdb_store(tdb, key, data, TDB_MODIFY) == 0);
+
+       data = tdb_fetch(tdb, key);
+       ok1(data.dsize == strlen("world"));
+       ok1(memcmp(data.dptr, "world", strlen("world")) == 0);
+       free(data.dptr);
+
+       key.dsize++;            /* key is now "hi" plus the literal's NUL: never stored */
+       data = tdb_fetch(tdb, key);
+       ok1(data.dptr == NULL);
+       tdb_close(tdb);
+
+       return exit_status();
+}
diff --git a/ctdb/lib/tdb/test/rwlock-be.tdb b/ctdb/lib/tdb/test/rwlock-be.tdb
new file mode 100644 (file)
index 0000000..45b5f09
Binary files /dev/null and b/ctdb/lib/tdb/test/rwlock-be.tdb differ
diff --git a/ctdb/lib/tdb/test/rwlock-le.tdb b/ctdb/lib/tdb/test/rwlock-le.tdb
new file mode 100644 (file)
index 0000000..45b5f09
Binary files /dev/null and b/ctdb/lib/tdb/test/rwlock-le.tdb differ
diff --git a/ctdb/lib/tdb/test/tap-interface.h b/ctdb/lib/tdb/test/tap-interface.h
new file mode 100644 (file)
index 0000000..d9ed6e8
--- /dev/null
@@ -0,0 +1,39 @@
+/* 
+   Unix SMB/CIFS implementation.
+   Simplistic implementation of tap interface.
+
+   Copyright (C) Rusty Russell 2012
+   
+     ** NOTE! The following LGPL license applies to the talloc
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+   
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include <stdio.h>
+#include <stdlib.h>    /* exit(), used by ok() and fail() below */
+#ifndef __location__
+#define __TAP_STRING_LINE1__(s)    #s
+#define __TAP_STRING_LINE2__(s)   __TAP_STRING_LINE1__(s)
+#define __TAP_STRING_LINE3__  __TAP_STRING_LINE2__(__LINE__)
+#define __location__ __FILE__ ":" __TAP_STRING_LINE3__
+#endif
+
+#define plan_tests(num)        /* no-op: the planned count is not verified */
+#define ok(e, ...) do { if (e) { (void)printf("."); } else { fprintf(stderr, __VA_ARGS__); exit(1); } } while(0)
+#define ok1(e) ok((e), "%s:%s", __location__, #e)      /* failure text: file:line:expression */
+#define pass(...) printf(".")
+#define fail(...) do { fprintf(stderr, __VA_ARGS__); exit(1); } while(0)
+#define diag printf    /* plain printf: stdout, no newline appended */
+#define exit_status() 0        /* any failed check has already called exit(1) */
diff --git a/ctdb/lib/tdb/test/tap-to-subunit.h b/ctdb/lib/tdb/test/tap-to-subunit.h
new file mode 100644 (file)
index 0000000..a5cf74f
--- /dev/null
@@ -0,0 +1,155 @@
+#ifndef TAP_TO_SUBUNIT_H
+#define TAP_TO_SUBUNIT_H
+/*
+ * tap-style wrapper for subunit.
+ *
+ * Copyright (c) 2011 Rusty Russell
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include "replace.h"
+
+/**
+ * plan_tests - announce the number of tests you plan to run
+ * @tests: the number of tests
+ *
+ * This should be the first call in your test program: it allows tracing
+ * of failures which mean that not all tests are run.
+ *
+ * If you don't know how many tests will actually be run, assume all of them
+ * and use skip() if you don't actually run some tests.
+ *
+ * Example:
+ *     plan_tests(13);
+ */
+void plan_tests(unsigned int tests);
+
+/**
+ * ok1 - Simple conditional test
+ * @e: the expression which we expect to be true.
+ *
+ * This is the simplest kind of test: if the expression is true, the
+ * test passes.  The name of the test which is printed will simply be
+ * file name, line number, and the expression itself.
+ *
+ * Example:
+ *     ok1(somefunc() == 1);
+ */
+# define ok1(e) ((e) ?                                                 \
+                _gen_result(1, __func__, __FILE__, __LINE__, "%s", #e) : \
+                _gen_result(0, __func__, __FILE__, __LINE__, "%s", #e))
+
+/**
+ * ok - Conditional test with a name
+ * @e: the expression which we expect to be true.
+ * @...: the printf-style name of the test.
+ *
+ * If the expression is true, the test passes.  The name of the test will be
+ * the filename, line number, and the printf-style string.  This can be clearer
+ * than simply the expression itself.
+ *
+ * Example:
+ *     ok1(somefunc() == 1);
+ *     ok(somefunc() == 0, "Second somefunc() should fail");
+ */
+# define ok(e, ...) ((e) ?                                             \
+                    _gen_result(1, __func__, __FILE__, __LINE__,       \
+                                __VA_ARGS__) :                         \
+                    _gen_result(0, __func__, __FILE__, __LINE__,       \
+                                __VA_ARGS__))
+
+/**
+ * pass - Note that a test passed
+ * @...: the printf-style name of the test.
+ *
+ * For complicated code paths, it can be easiest to simply call pass() in one
+ * branch and fail() in another.
+ *
+ * Example:
+ *     int x = somefunc();
+ *     if (x > 0)
+ *             pass("somefunc() returned a valid value");
+ *     else
+ *             fail("somefunc() returned an invalid value");
+ */
+# define pass(...) ok(1, __VA_ARGS__)
+
+/**
+ * fail - Note that a test failed
+ * @...: the printf-style name of the test.
+ *
+ * For complicated code paths, it can be easiest to simply call pass() in one
+ * branch and fail() in another.
+ */
+# define fail(...) ok(0, __VA_ARGS__)
+
+unsigned int _gen_result(int, const char *, const char *, unsigned int,
+   const char *, ...) PRINTF_ATTRIBUTE(5, 6);  /* called by ok()/ok1() as (1 or 0, __func__, __FILE__, __LINE__, fmt, ...) */
+
+/**
+ * diag - print a diagnostic message (use instead of printf/fprintf)
+ * @fmt: the format of the printf-style message
+ *
+ * diag ensures that the output will not be considered to be a test
+ * result by the TAP test harness.  It will append '\n' for you.
+ *
+ * Example:
+ *     diag("Now running complex tests");
+ */
+void diag(const char *fmt, ...) PRINTF_ATTRIBUTE(1, 2);
+
+/**
+ * skip - note that one or more tests are being skipped
+ * @n: number of tests you're skipping.
+ * @fmt: the format of the reason you're skipping the tests.
+ *
+ * Sometimes tests cannot be run because the test system lacks some feature:
+ * you should explicitly document that you're skipping tests using skip().
+ *
+ * From the Test::More documentation:
+ *   If it's something the user might not be able to do, use SKIP.  This
+ *   includes optional modules that aren't installed, running under an OS that
+ *   doesn't have some feature (like fork() or symlinks), or maybe you need an
+ *   Internet connection and one isn't available.
+ *
+ * Example:
+ *     #ifdef HAVE_SOME_FEATURE
+ *     ok1(somefunc());
+ *     #else
+ *     skip(1, "Don't have SOME_FEATURE");
+ *     #endif
+ */
+void skip(unsigned int n, const char *fmt, ...) PRINTF_ATTRIBUTE(2, 3);
+
+/**
+ * exit_status - the value that main should return.
+ *
+ * For maximum compatibility your test program should return a particular exit
+ * code (ie. 0 if all tests were run, and every test which was expected to
+ * succeed succeeded).
+ *
+ * Example:
+ *     exit(exit_status());
+ */
+int exit_status(void);
+#endif /* TAP_TO_SUBUNIT_H */
diff --git a/ctdb/lib/tdb/test/tdb.corrupt b/ctdb/lib/tdb/test/tdb.corrupt
new file mode 100644 (file)
index 0000000..83d6677
Binary files /dev/null and b/ctdb/lib/tdb/test/tdb.corrupt differ
diff --git a/ctdb/lib/tdb/tools/tdbbackup.c b/ctdb/lib/tdb/tools/tdbbackup.c
new file mode 100644 (file)
index 0000000..11ecaa0
--- /dev/null
@@ -0,0 +1,342 @@
+/* 
+   Unix SMB/CIFS implementation.
+   low level tdb backup and restore utility
+   Copyright (C) Andrew Tridgell              2002
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+
+  This program is meant for backup/restore of tdb databases. Typical usage would be:
+     tdbbackup *.tdb
+  when Samba shuts down cleanly, which will make a backup of all the local databases
+  to *.bak files. Then on Samba startup you would use:
+     tdbbackup -v *.tdb
+  and this will check the databases for corruption and if corruption is detected then
+  the backup will be restored.
+
+  You may also like to do a backup on a regular basis while Samba is
+  running, perhaps using cron.
+
+  The reason this program is needed is to cope with power failures
+  while Samba is running. A power failure could lead to database
+  corruption and Samba will then not start correctly.
+
+  Note that many of the databases in Samba are transient and thus
+  don't need to be backed up, so you can optimise the above a little
+  by only running the backup on the critical databases.
+
+ */
+
+#include "replace.h"
+#include "system/locale.h"
+#include "system/time.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "tdb.h"
+
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+
+static int failed;
+
+static struct tdb_logging_context log_ctx;
+
+#ifdef PRINTF_ATTRIBUTE
+static void tdb_log(struct tdb_context *tdb, enum tdb_debug_level level, const char *format, ...) PRINTF_ATTRIBUTE(3,4);
+#endif
+/*
+  tdb logging callback: print the formatted message to stdout and
+  flush it straight away. The tdb handle and the debug level are
+  not looked at.
+*/
+static void tdb_log(struct tdb_context *tdb, enum tdb_debug_level level, const char *format, ...)
+{
+       va_list args;
+
+       va_start(args, format);
+       vfprintf(stdout, format, args);
+       va_end(args);
+
+       fflush(stdout);
+}
+
+/*
+  return a newly allocated string holding name followed by suffix.
+  The caller owns the result and must free() it. Exits the program
+  with status 1 if memory cannot be allocated.
+*/
+static char *add_suffix(const char *name, const char *suffix)
+{
+       char *ret;
+       /* size_t rather than int: that is what strlen() returns and
+          what malloc()/snprintf() expect, so nothing gets truncated */
+       size_t len = strlen(name) + strlen(suffix) + 1;
+
+       ret = (char *)malloc(len);
+       if (!ret) {
+               fprintf(stderr,"Out of memory!\n");
+               exit(1);
+       }
+       snprintf(ret, len, "%s%s", name, suffix);
+       return ret;
+}
+
+/*
+  traverse callback used while copying: insert one record into the
+  destination tdb handed in through state. On failure the global
+  'failed' flag is set and non-zero is returned.
+*/
+static int copy_fn(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
+{
+       TDB_CONTEXT *dest = (TDB_CONTEXT *)state;
+
+       if (tdb_store(dest, key, dbuf, TDB_INSERT) == 0) {
+               return 0;
+       }
+
+       fprintf(stderr,"Failed to insert into %s\n", tdb_name(dest));
+       failed = 1;
+       return 1;
+}
+
+
+/*
+  traverse callback that does nothing and always returns 0; it is
+  passed to tdb_traverse() when only the traversal result is wanted
+*/
+static int test_fn(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
+{
+       return 0;
+}
+
+/*
+  carefully backup a tdb, validating the contents and
+  only doing the backup if it's OK
+  this function is also used for restore
+
+  The copy is built in "<new_name>.tmp", re-opened read-only and
+  traversed to confirm the record count, and only then renamed to
+  new_name. A hash_size of 0 means "use the hash size of the old tdb".
+
+  returns 0 on success, 1 on failure
+*/
+static int backup_tdb(const char *old_name, const char *new_name, int hash_size)
+{
+       TDB_CONTEXT *tdb;
+       TDB_CONTEXT *tdb_new;
+       char *tmp_name;
+       struct stat st;
+       int count1, count2;
+
+       tmp_name = add_suffix(new_name, ".tmp");
+
+       /* stat the old tdb to find its permissions */
+       if (stat(old_name, &st) != 0) {
+               perror(old_name);
+               free(tmp_name);
+               return 1;
+       }
+
+       /* open the old tdb */
+       tdb = tdb_open_ex(old_name, 0, 0, 
+                         O_RDWR, 0, &log_ctx, NULL);
+       if (!tdb) {
+               printf("Failed to open %s\n", old_name);
+               free(tmp_name);
+               return 1;
+       }
+
+       /* create the new tdb */
+       unlink(tmp_name);
+       tdb_new = tdb_open_ex(tmp_name, 
+                             hash_size ? hash_size : tdb_hash_size(tdb), 
+                             TDB_DEFAULT, 
+                             O_RDWR|O_CREAT|O_EXCL, st.st_mode & 0777, 
+                             &log_ctx, NULL);
+       if (!tdb_new) {
+               /* report first, while errno still belongs to the open */
+               perror(tmp_name);
+               /* don't leak the handle on the old tdb */
+               tdb_close(tdb);
+               free(tmp_name);
+               return 1;
+       }
+
+       if (tdb_transaction_start(tdb) != 0) {
+               printf("Failed to start transaction on old tdb\n");
+               tdb_close(tdb);
+               tdb_close(tdb_new);
+               unlink(tmp_name);
+               free(tmp_name);
+               return 1;
+       }
+
+       /* lock the backup tdb so that nobody else can change it */
+       if (tdb_lockall(tdb_new) != 0) {
+               printf("Failed to lock backup tdb\n");
+               tdb_close(tdb);
+               tdb_close(tdb_new);
+               unlink(tmp_name);
+               free(tmp_name);
+               return 1;
+       }
+
+       /* copy_fn() sets this when an insert fails */
+       failed = 0;
+
+       /* traverse and copy */
+       count1 = tdb_traverse(tdb, copy_fn, (void *)tdb_new);
+       if (count1 < 0 || failed) {
+               fprintf(stderr,"failed to copy %s\n", old_name);
+               tdb_close(tdb);
+               tdb_close(tdb_new);
+               unlink(tmp_name);
+               free(tmp_name);
+               return 1;
+       }
+
+       /* close the old tdb */
+       tdb_close(tdb);
+
+       /* copy done, unlock the backup tdb */
+       tdb_unlockall(tdb_new);
+
+#ifdef HAVE_FDATASYNC
+       if (fdatasync(tdb_fd(tdb_new)) != 0) {
+#else
+       if (fsync(tdb_fd(tdb_new)) != 0) {
+#endif
+               /* not fatal */
+               fprintf(stderr, "failed to fsync backup file\n");
+       }
+
+       /* close the new tdb and re-open read-only */
+       tdb_close(tdb_new);
+       tdb_new = tdb_open_ex(tmp_name, 
+                             0,
+                             TDB_DEFAULT, 
+                             O_RDONLY, 0,
+                             &log_ctx, NULL);
+       if (!tdb_new) {
+               fprintf(stderr,"failed to reopen %s\n", tmp_name);
+               /* report before unlink() gets a chance to overwrite errno */
+               perror(tmp_name);
+               unlink(tmp_name);
+               free(tmp_name);
+               return 1;
+       }
+       
+       /* traverse the new tdb to confirm */
+       count2 = tdb_traverse(tdb_new, test_fn, NULL);
+       if (count2 != count1) {
+               fprintf(stderr,"failed to copy %s\n", old_name);
+               tdb_close(tdb_new);
+               unlink(tmp_name);
+               free(tmp_name);
+               return 1;
+       }
+
+       /* close the new tdb and rename it to .bak */
+       tdb_close(tdb_new);
+       if (rename(tmp_name, new_name) != 0) {
+               perror(new_name);
+               free(tmp_name);
+               return 1;
+       }
+
+       free(tmp_name);
+
+       return 0;
+}
+
+/*
+  check that fname can be opened and traversed; if it cannot, restore
+  it from bak_name. Returns 0 when the tdb is fine, otherwise whatever
+  the restore returns.
+*/
+static int verify_tdb(const char *fname, const char *bak_name)
+{
+       TDB_CONTEXT *tdb;
+       int nrecords;
+
+       tdb = tdb_open_ex(fname, 0, 0, 
+                         O_RDONLY, 0, &log_ctx, NULL);
+
+       if (tdb == NULL) {
+               /* could not even open it: treat like a failed traverse */
+               nrecords = -1;
+       } else {
+               nrecords = tdb_traverse(tdb, test_fn, NULL);
+               tdb_close(tdb);
+       }
+
+       if (nrecords >= 0) {
+               printf("%s : %d records\n", fname, nrecords);
+               return 0;
+       }
+
+       /* a negative count means an error: copy the backup over fname */
+       printf("restoring %s\n", fname);
+       return backup_tdb(bak_name, fname, 0);
+}
+
+/*
+  return 1 if fname1 has a later modification time than fname2.
+  If fname1 cannot be stat()ed the answer is 0; if only fname2
+  cannot be stat()ed the answer is 1.
+*/
+static int file_newer(const char *fname1, const char *fname2)
+{
+       struct stat s1;
+       struct stat s2;
+
+       if (stat(fname1, &s1) != 0) {
+               return 0;
+       }
+
+       if (stat(fname2, &s2) != 0) {
+               return 1;
+       }
+
+       return (s1.st_mtime > s2.st_mtime) ? 1 : 0;
+}
+
+/* print the command line help for tdbbackup on stdout */
+static void usage(void)
+{
+       fputs("Usage: tdbbackup [options] <fname...>\n\n", stdout);
+       fputs("   -h            this help message\n", stdout);
+       fputs("   -s suffix     set the backup suffix\n", stdout);
+       fputs("   -v            verify mode (restore if corrupt)\n", stdout);
+       fputs("   -n hashsize   set the new hash size for the backup\n", stdout);
+}
+               
+
+/*
+  tdbbackup entry point.
+
+  For each file named on the command line either back it up to
+  <file><suffix> (default mode, only when the file is newer than the
+  backup) or, with -v, verify it and restore from the backup if it is
+  corrupt. Returns 0 if every file was handled successfully, 1 otherwise.
+*/
+ int main(int argc, char *argv[])
+{
+       int i;
+       int ret = 0;
+       int c;
+       int verify = 0;
+       int hashsize = 0;
+       const char *suffix = ".bak";
+
+       log_ctx.log_fn = tdb_log;
+
+       while ((c = getopt(argc, argv, "vhs:n:")) != -1) {
+               switch (c) {
+               case 'h':
+                       usage();
+                       exit(0);
+               case 'v':
+                       verify = 1;
+                       break;
+               case 's':
+                       suffix = optarg;
+                       break;
+               case 'n':
+                       hashsize = atoi(optarg);
+                       break;
+               default:
+                       /* unknown option or missing argument: don't
+                          carry on with a half-understood command line */
+                       usage();
+                       exit(1);
+               }
+       }
+
+       argc -= optind;
+       argv += optind;
+
+       if (argc < 1) {
+               usage();
+               exit(1);
+       }
+
+       for (i=0; i<argc; i++) {
+               const char *fname = argv[i];
+               char *bak_name;
+
+               bak_name = add_suffix(fname, suffix);
+
+               if (verify) {
+                       if (verify_tdb(fname, bak_name) != 0) {
+                               ret = 1;
+                       }
+               } else {
+                       /* skip the backup when the .bak is already up to date */
+                       if (file_newer(fname, bak_name) &&
+                           backup_tdb(fname, bak_name, hashsize) != 0) {
+                               ret = 1;
+                       }
+               }
+
+               free(bak_name);
+       }
+
+       return ret;
+}
diff --git a/ctdb/lib/tdb/tools/tdbdump.c b/ctdb/lib/tdb/tools/tdbdump.c
new file mode 100644 (file)
index 0000000..bcd395f
--- /dev/null
@@ -0,0 +1,171 @@
+/* 
+   Unix SMB/CIFS implementation.
+   simple tdb dump util
+   Copyright (C) Andrew Tridgell              2001
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/locale.h"
+#include "system/time.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "tdb.h"
+
+/*
+  write a data blob to stdout. Printable characters are written as
+  they are, except '"' and '\'; those and everything unprintable
+  are written as a backslash followed by two hex digits.
+*/
+static void print_data(TDB_DATA d)
+{
+       const unsigned char *cur = (unsigned char *)d.dptr;
+       int remaining;
+
+       for (remaining = d.dsize; remaining != 0; remaining--, cur++) {
+               int plain = isprint(*cur) && !strchr("\"\\", *cur);
+
+               if (plain) {
+                       fputc(*cur, stdout);
+               } else {
+                       printf("\\%02X", *cur);
+               }
+       }
+}
+
+/*
+  print one record as a "{ key(N) = "..." data(N) = "..." }" stanza
+  on stdout. Always returns 0.
+*/
+static int traverse_fn(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
+{
+       fputs("{\n", stdout);
+
+       printf("key(%d) = \"", (int)key.dsize);
+       print_data(key);
+       fputs("\"\n", stdout);
+
+       printf("data(%d) = \"", (int)dbuf.dsize);
+       print_data(dbuf);
+       fputs("\"\n", stdout);
+
+       fputs("}\n", stdout);
+       return 0;
+}
+
+/*
+  tdb logging callback that writes to stderr.
+
+  Messages are prefixed with "tdb(<name>): " and a severity tag.
+  TDB_DEBUG_TRACE messages are dropped; any level that is not
+  ERROR, WARNING or TRACE is reported as FATAL.
+*/
+static void log_stderr(struct tdb_context *tdb, enum tdb_debug_level level,
+                      const char *fmt, ...)
+{
+       va_list ap;
+       const char *name = tdb_name(tdb);
+       const char *prefix = "";
+
+       if (!name)
+               name = "unnamed";
+
+       switch (level) {
+       case TDB_DEBUG_ERROR:
+               prefix = "ERROR: ";
+               break;
+       case TDB_DEBUG_WARNING:
+               prefix = "WARNING: ";
+               break;
+       case TDB_DEBUG_TRACE:
+               return;
+
+       default:
+       case TDB_DEBUG_FATAL:
+               prefix = "FATAL: ";
+               break;
+       }
+
+       va_start(ap, fmt);
+       fprintf(stderr, "tdb(%s): %s", name, prefix);
+       vfprintf(stderr, fmt, ap);
+       va_end(ap);
+}
+
+/*
+  callback handed to tdb_rescue(): dump the record via traverse_fn().
+  When keyname is non-NULL only the record whose key is byte-for-byte
+  equal to that string (without its terminating NUL) is dumped.
+*/
+static void emergency_walk(TDB_DATA key, TDB_DATA dbuf, void *keyname)
+{
+       if (keyname != NULL) {
+               const char *wanted = keyname;
+               int same = key.dsize == strlen(wanted) &&
+                          memcmp(key.dptr, wanted, key.dsize) == 0;
+
+               if (!same) {
+                       return;
+               }
+       }
+
+       traverse_fn(NULL, key, dbuf, NULL);
+}
+
+/*
+  dump a tdb to stdout.
+
+  fname     - database to open (read-only)
+  keyname   - if non-NULL, only the record with this key is printed
+  emergency - use tdb_rescue() instead of a normal traverse/fetch
+
+  returns 0 on success, 1 on failure (suitable as an exit status)
+*/
+static int dump_tdb(const char *fname, const char *keyname, bool emergency)
+{
+       TDB_CONTEXT *tdb;
+       TDB_DATA key, value;
+       struct tdb_logging_context logfn = { log_stderr };
+       int ret = 0;
+       
+       tdb = tdb_open_ex(fname, 0, 0, O_RDONLY, 0, &logfn, NULL);
+       if (!tdb) {
+               printf("Failed to open %s\n", fname);
+               return 1;
+       }
+
+       if (emergency) {
+               /* tdb_rescue() is documented to return 0 on success, so
+                  anything else becomes exit status 1 */
+               if (tdb_rescue(tdb, emergency_walk,
+                              discard_const_p(char, keyname)) != 0) {
+                       ret = 1;
+               }
+       } else if (!keyname) {
+               ret = tdb_traverse(tdb, traverse_fn, NULL) == -1 ? 1 : 0;
+       } else {
+               key.dptr = discard_const_p(uint8_t, keyname);
+               key.dsize = strlen(keyname);
+               value = tdb_fetch(tdb, key);
+               if (!value.dptr) {
+                       ret = 1;
+               } else {
+                       print_data(value);
+                       free(value.dptr);
+               }
+       }
+
+       /* every path that opened the tdb must also close it */
+       tdb_close(tdb);
+
+       return ret;
+}
+
+/* print the command line help for tdbdump on stdout */
+static void usage( void)
+{
+       fputs("Usage: tdbdump [options] <filename>\n\n", stdout);
+       fputs("   -h          this help message\n", stdout);
+       fputs("   -k keyname  dumps value of keyname\n", stdout);
+       fputs("   -e          emergency dump, for corrupt databases\n", stdout);
+}
+
+/*
+  tdbdump entry point: parse the options and dump the named tdb.
+  Returns the result of dump_tdb(); exits with 1 on usage errors.
+*/
+ int main(int argc, char *argv[])
+{
+       char *fname, *keyname=NULL;
+       bool emergency = false;
+       int c;
+
+       if (argc < 2) {
+               printf("Usage: tdbdump <fname>\n");
+               exit(1);
+       }
+
+       while ((c = getopt( argc, argv, "hk:e")) != -1) {
+               switch (c) {
+               case 'h':
+                       usage();
+                       exit( 0);
+               case 'k':
+                       keyname = optarg;
+                       break;
+               case 'e':
+                       emergency = true;
+                       break;
+               default:
+                       usage();
+                       exit( 1);
+               }
+       }
+
+       /* options but no filename (e.g. "tdbdump -e") would leave
+          argv[optind] NULL, so reject it here */
+       if (optind >= argc) {
+               usage();
+               exit( 1);
+       }
+
+       fname = argv[optind];
+
+       return dump_tdb(fname, keyname, emergency);
+}
diff --git a/ctdb/lib/tdb/tools/tdbrestore.c b/ctdb/lib/tdb/tools/tdbrestore.c
new file mode 100644 (file)
index 0000000..f65b36f
--- /dev/null
@@ -0,0 +1,222 @@
+/*
+   tdbrestore -- construct a tdb from tdbdump output.
+   Copyright (C) Volker Lendecke               2010
+   Copyright (C) Simon McVittie                        2005
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <assert.h>
+#include "replace.h"
+#include "system/locale.h"
+#include "system/time.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "tdb.h"
+
+/*
+  parse the head of a 'key(N) = "' or 'data(N) = "' line.
+
+  Everything up to and including the first '(' is skipped, then the
+  text up to the opening '"' is read and N is parsed from it. The
+  stream is left positioned just after the opening quote.
+
+  returns N (>= 0) on success, -1 on EOF, on a malformed or
+  over-long header, or on a negative length
+*/
+static int read_linehead(FILE *f)
+{
+       size_t i;
+       int c;
+       int num_bytes;
+       char prefix[128];
+
+       while (1) {
+               c = getc(f);
+               if (c == EOF) {
+                       return -1;
+               }
+               if (c == '(') {
+                       break;
+               }
+       }
+       for (i=0; i<sizeof(prefix); i++) {
+               c = getc(f);
+               if (c == EOF) {
+                       return -1;
+               }
+               prefix[i] = c;
+               if (c == '"') {
+                       break;
+               }
+       }
+       if (i == sizeof(prefix)) {
+               /* no opening quote within the buffer */
+               return -1;
+       }
+       /* overwrite the quote so prefix is a proper string */
+       prefix[i] = '\0';
+
+       if (sscanf(prefix, "%d) = ", &num_bytes) != 1) {
+               return -1;
+       }
+       if (num_bytes < 0) {
+               /* the caller uses the result as a size: a negative
+                  length can only come from corrupt input */
+               return -1;
+       }
+       return num_bytes;
+}
+
+/*
+  read one hex digit from f.
+  returns its value (0-15), or -1 after printing a message to stderr
+  on EOF, on a '"' or on any other non-hex character
+*/
+static int read_hex(FILE *f) {
+       int c;
+       /* read from the stream the caller is parsing, not from stdin */
+       c = getc(f);
+       if (c == EOF) {
+               fprintf(stderr, "Unexpected EOF in data\n");
+               return -1;
+       } else if (c == '"') {
+               fprintf(stderr, "Unexpected \\\" sequence\n");
+               return -1;
+       } else if ('0' <= c && c <= '9')  {
+               return c - '0';
+       } else if ('A' <= c && c <= 'F')  {
+               return c - 'A' + 10;
+       } else if ('a' <= c && c <= 'f')  {
+               return c - 'a' + 10;
+       } else {
+               fprintf(stderr, "Invalid hex: %c\n", c);
+               return -1;
+       }
+}
+
+/*
+  read size bytes of tdbdump-escaped data from f into a freshly
+  malloc()ed buffer stored in d. A backslash introduces two hex
+  digits that encode one byte; every other character stands for
+  itself.
+
+  returns 0 on success (also when a '"' is met before size bytes
+  were read), -1 on allocation failure, EOF or a bad hex escape.
+  d->dptr may be set even on failure; the caller frees it.
+*/
+static int read_data(FILE *f, TDB_DATA *d, size_t size) {
+       int c, low, high;
+       size_t i;
+
+       d->dptr = (unsigned char *)malloc(size);
+       if (d->dptr == NULL) {
+               return -1;
+       }
+       d->dsize = size;
+
+       for (i=0; i<size; i++) {
+               c = getc(f);
+               if (c == EOF) {
+                       fprintf(stderr, "Unexpected EOF in data\n");
+                       /* -1, not 1: the caller only tests for -1 */
+                       return -1;
+               } else if (c == '"') {
+                       return 0;
+               } else if (c == '\\') {
+                       high = read_hex(f);
+                       if (high < 0) {
+                               return -1;
+                       }
+                       high = high << 4;
+                       assert(high == (high & 0xf0));
+                       low = read_hex(f);
+                       if (low < 0) {
+                               return -1;
+                       }
+                       assert(low == (low & 0x0f));
+                       d->dptr[i] = (low|high);
+               } else {
+                       d->dptr[i] = c;
+               }
+       }
+       return 0;
+}
+
+/*
+  read one line from f and check that it is exactly s.
+  returns 0 on a match and -1 otherwise. When no line could be read
+  at all and eof is non-NULL, *eof is set to 1.
+*/
+static int swallow(FILE *f, const char *s, int *eof)
+{
+       char buf[128];
+       char *got = fgets(buf, sizeof(buf), f);
+
+       if (got != NULL) {
+               return (strcmp(buf, s) == 0) ? 0 : -1;
+       }
+
+       if (eof != NULL) {
+               *eof = 1;
+       }
+       return -1;
+}
+
+/*
+  read one "{ key(..) = ".." data(..) = ".." }" stanza from f and
+  insert it into tdb with TDB_INSERT.
+
+  returns 0 on success and -1 on any failure. *eof is set by the
+  first swallow() when no further line can be read, which lets the
+  caller tell a clean end of input from a parse error.
+*/
+static int read_rec(FILE *f, TDB_CONTEXT *tdb, int *eof)
+{
+       TDB_DATA key, data;
+       int len;
+       int status = -1;
+
+       /* so the free() calls at the end are safe on every path */
+       key.dptr = NULL;
+       data.dptr = NULL;
+
+       if (swallow(f, "{\n", eof) == -1) {
+               goto done;
+       }
+
+       /* key line */
+       len = read_linehead(f);
+       if (len == -1
+           || read_data(f, &key, len) == -1
+           || swallow(f, "\"\n", NULL) == -1) {
+               goto done;
+       }
+
+       /* data line followed by the closing brace */
+       len = read_linehead(f);
+       if (len == -1
+           || read_data(f, &data, len) == -1
+           || swallow(f, "\"\n", NULL) == -1
+           || swallow(f, "}\n", NULL) == -1) {
+               goto done;
+       }
+
+       if (tdb_store(tdb, key, data, TDB_INSERT) != 0) {
+               fprintf(stderr, "TDB error: %s\n", tdb_errorstr(tdb));
+               goto done;
+       }
+
+       status = 0;
+done:
+       free(key.dptr);
+       free(data.dptr);
+       return status;
+}
+
+/*
+  create a new tdb called fname (it must not exist yet) and fill it
+  with the records read from stdin in tdbdump format.
+
+  returns 0 on success, 1 on failure
+*/
+static int restore_tdb(const char *fname)
+{
+       TDB_CONTEXT *tdb;
+
+       tdb = tdb_open(fname, 0, 0, O_RDWR|O_CREAT|O_EXCL, 0666);
+       if (!tdb) {
+               perror("tdb_open");
+               fprintf(stderr, "Failed to open %s\n", fname);
+               return 1;
+       }
+
+       while (1) {
+               int eof = 0;
+               if (read_rec(stdin, tdb, &eof) == -1) {
+                       if (eof) {
+                               /* clean end of input */
+                               break;
+                       }
+                       /* parse or store error: don't leak the handle */
+                       tdb_close(tdb);
+                       return 1;
+               }
+       }
+       if (tdb_close(tdb)) {
+               fprintf(stderr, "Error closing tdb\n");
+               return 1;
+       }
+       return 0;
+}
+
+/*
+  tdbrestore entry point: argv[1] names the tdb to create, the dump
+  is read from stdin. Prints a usage line and exits with 1 when no
+  database name is given.
+*/
+int main(int argc, char *argv[])
+{
+       if (argc >= 2) {
+               return restore_tdb(argv[1]);
+       }
+
+       printf("Usage: %s dbname < tdbdump_output\n", argv[0]);
+       exit(1);
+}
diff --git a/ctdb/lib/tdb/tools/tdbtest.c b/ctdb/lib/tdb/tools/tdbtest.c
new file mode 100644 (file)
index 0000000..44c78ef
--- /dev/null
@@ -0,0 +1,290 @@
+/* a test program for tdb - the trivial database */
+
+#include "replace.h"
+#include "tdb.h"
+#include "system/filesys.h"
+#include "system/time.h"
+
+#include <gdbm.h>
+
+
+#define DELETE_PROB 7
+#define STORE_PROB 5
+
+static struct tdb_context *db;
+static GDBM_FILE gdbm;
+
+struct timeval tp1,tp2;
+
+/* record the current time in the global tp1 as the start of a timed run */
+static void _start_timer(void)
+{
+       gettimeofday(&tp1,NULL);
+}
+
+/*
+  record the current time in the global tp2 and return the number of
+  seconds elapsed since tp1 was set by _start_timer()
+*/
+static double _end_timer(void)
+{
+       double secs;
+       double usecs;
+
+       gettimeofday(&tp2,NULL);
+
+       secs = tp2.tv_sec - tp1.tv_sec;
+       usecs = (tp2.tv_usec - tp1.tv_usec)*1.0e-6;
+
+       return secs + usecs;
+}
+
+/*
+  report 'why' through perror() (so the current errno text is
+  appended) and terminate the program with exit status 1
+*/
+static void fatal(const char *why)
+{
+       perror(why);
+       exit(1);
+}
+
+#ifdef PRINTF_ATTRIBUTE
+static void tdb_log(struct tdb_context *tdb, int level, const char *format, ...) PRINTF_ATTRIBUTE(3,4);
+#endif
+/*
+  printf-style logger: write the formatted message to stdout and
+  flush it. The tdb handle and the level are not looked at.
+*/
+static void tdb_log(struct tdb_context *tdb, int level, const char *format, ...)
+{
+       va_list args;
+
+       va_start(args, format);
+       vfprintf(stdout, format, args);
+       va_end(args);
+
+       fflush(stdout);
+}
+
+/*
+  check that the tdb (db) and the gdbm database hold the same records.
+
+  First every key of the tdb is looked up in gdbm, then every key of
+  gdbm is looked up in the tdb; in both directions the value sizes and
+  contents must match. Any difference ends the program via fatal().
+*/
+static void compare_db(void)
+{
+       TDB_DATA d, key, nextkey;
+       datum gd, gkey, gnextkey;
+
+       /* pass 1: everything in the tdb must be in gdbm */
+       key = tdb_firstkey(db);
+       while (key.dptr) {
+               d = tdb_fetch(db, key);
+               gkey.dptr = key.dptr;
+               gkey.dsize = key.dsize;
+
+               gd = gdbm_fetch(gdbm, gkey);
+
+               if (!gd.dptr) fatal("key not in gdbm");
+               if (gd.dsize != d.dsize) fatal("data sizes differ");
+               if (memcmp(gd.dptr, d.dptr, d.dsize)) {
+                       fatal("data differs");
+               }
+
+               /* fetch the successor before the current key is freed */
+               nextkey = tdb_nextkey(db, key);
+               free(key.dptr);
+               free(d.dptr);
+               free(gd.dptr);
+               key = nextkey;
+       }
+
+       /* pass 2: everything in gdbm must be in the tdb */
+       gkey = gdbm_firstkey(gdbm);
+       while (gkey.dptr) {
+               gd = gdbm_fetch(gdbm, gkey);
+               key.dptr = gkey.dptr;
+               key.dsize = gkey.dsize;
+
+               d = tdb_fetch(db, key);
+
+               if (!d.dptr) fatal("key not in db");
+               if (d.dsize != gd.dsize) fatal("data sizes differ");
+               if (memcmp(d.dptr, gd.dptr, gd.dsize)) {
+                       fatal("data differs");
+               }
+
+               /* fetch the successor before the current key is freed */
+               gnextkey = gdbm_nextkey(gdbm, gkey);
+               free(gkey.dptr);
+               free(gd.dptr);
+               free(d.dptr);
+               gkey = gnextkey;
+       }
+}
+
+/*
+  return a malloc()ed, NUL-terminated string of len random lower-case
+  letters taken from rand(). The caller must free() it. The program
+  is terminated through fatal() if the allocation fails.
+*/
+static char *randbuf(int len)
+{
+       char *buf;
+       int i;
+       buf = (char *)malloc(len+1);
+       if (buf == NULL) {
+               /* writing through a NULL pointer below would crash */
+               fatal("malloc failed");
+       }
+
+       for (i=0;i<len;i++) {
+               buf[i] = 'a' + (rand() % 26);
+       }
+       buf[i] = 0;
+       return buf;
+}
+
+/*
+  perform one random operation on the tdb.
+
+  A random key (1-4 letters) and value (1-100 letters) are generated;
+  both sizes include the terminating NUL. Then, driven by rand():
+  one time in DELETE_PROB the key is deleted, otherwise one time in
+  STORE_PROB the record is stored with TDB_REPLACE, otherwise the key
+  is fetched and the result thrown away.
+
+  The sequence of rand() calls here is the same as in addrec_gdbm().
+*/
+static void addrec_db(void)
+{
+       int klen, dlen;
+       char *k, *d;
+       TDB_DATA key, data;
+
+       klen = 1 + (rand() % 4);
+       dlen = 1 + (rand() % 100);
+
+       k = randbuf(klen);
+       d = randbuf(dlen);
+
+       key.dptr = k;
+       key.dsize = klen+1;
+
+       data.dptr = d;
+       data.dsize = dlen+1;
+
+       if (rand() % DELETE_PROB == 0) {
+               tdb_delete(db, key);
+       } else if (rand() % STORE_PROB == 0) {
+               if (tdb_store(db, key, data, TDB_REPLACE) != 0) {
+                       fatal("tdb_store failed");
+               }
+       } else {
+               /* data now refers to the fetched copy, d is still freed below */
+               data = tdb_fetch(db, key);
+               if (data.dptr) free(data.dptr);
+       }
+
+       free(k);
+       free(d);
+}
+
+/*
+  perform one random operation on the gdbm database.
+
+  A random key (1-4 letters) and value (1-100 letters) are generated;
+  both sizes include the terminating NUL. Then, driven by rand():
+  one time in DELETE_PROB the key is deleted, otherwise one time in
+  STORE_PROB the record is stored with GDBM_REPLACE, otherwise the key
+  is fetched and the result thrown away.
+
+  The sequence of rand() calls here is the same as in addrec_db().
+*/
+static void addrec_gdbm(void)
+{
+       int klen, dlen;
+       char *k, *d;
+       datum key, data;
+
+       klen = 1 + (rand() % 4);
+       dlen = 1 + (rand() % 100);
+
+       k = randbuf(klen);
+       d = randbuf(dlen);
+
+       key.dptr = k;
+       key.dsize = klen+1;
+
+       data.dptr = d;
+       data.dsize = dlen+1;
+
+       if (rand() % DELETE_PROB == 0) {
+               gdbm_delete(gdbm, key);
+       } else if (rand() % STORE_PROB == 0) {
+               if (gdbm_store(gdbm, key, data, GDBM_REPLACE) != 0) {
+                       fatal("gdbm_store failed");
+               }
+       } else {
+               /* data now refers to the fetched copy, d is still freed below */
+               data = gdbm_fetch(gdbm, key);
+               if (data.dptr) free(data.dptr);
+       }
+
+       free(k);
+       free(d);
+}
+
+/*
+  traverse callback that deletes the record it is called for and
+  returns 0. The printf in the #if 0 block is compiled out.
+*/
+static int traverse_fn(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
+{
+#if 0
+       printf("[%s] [%s]\n", key.dptr, dbuf.dptr);
+#endif
+       tdb_delete(tdb, key);
+       return 0;
+}
+
+/*
+  store five small records under the keys "0".."4" (two bytes each,
+  including the NUL) and then delete them again in the order
+  0, 4, 2, 1, 3. A failing store ends the program via fatal().
+*/
+static void merge_test(void)
+{
+       static const int delete_order[] = { 0, 4, 2, 1, 3 };
+       char keys[5][2];
+       char tdata[] = "test";
+       TDB_DATA key, data;
+       int i;
+
+       for (i = 0; i < 5; i++) {
+               snprintf(keys[i],2, "%d", i);
+
+               key.dptr = keys[i];
+               key.dsize = 2;
+               data.dptr = tdata;
+               data.dsize = 4;
+
+               if (tdb_store(db, key, data, TDB_REPLACE) != 0) {
+                       fatal("tdb_store failed");
+               }
+       }
+
+       /* key.dsize is still 2 from the loop above */
+       for (i = 0; i < 5; i++) {
+               key.dptr = keys[delete_order[i]];
+               tdb_delete(db, key);
+       }
+}
+
+/*
+  build the path of a test data file. If the environment variable
+  TEST_DATA_PREFIX is set the result is "<prefix>/<filename>",
+  otherwise a copy of filename. The result is heap allocated (the
+  caller frees it) and is NULL if asprintf() reports failure.
+*/
+static char *test_path(const char *filename)
+{
+       const char *dir = getenv("TEST_DATA_PREFIX");
+       char *full = NULL;
+
+       if (dir == NULL) {
+               return strdup(filename);
+       }
+
+       if (asprintf(&full, "%s/%s", dir, filename) == -1) {
+               return NULL;
+       }
+       return full;
+}
+
+/*
+  tdbtest entry point.
+
+  Runs the same rand()-driven sequence of operations (same seed)
+  against a gdbm database and a tdb, printing the operations per
+  second of each, validates the tdb freelist, compares the two
+  databases and finally traverses the tdb twice with a callback that
+  deletes every record.
+*/
+ int main(int argc, const char *argv[])
+{
+       int i, seed=0;
+       int loops = 10000;
+       int num_entries;
+       /* a plain heap string: test_path() returns a char * */
+       char *test_gdbm;
+       char *test_tdb;
+
+       test_gdbm = test_path("test.gdbm");
+       test_tdb = test_path("test.tdb");
+       if (!test_gdbm || !test_tdb) {
+               fatal("out of memory building test paths");
+       }
+
+       unlink(test_gdbm);
+
+       db = tdb_open(test_tdb, 0, TDB_CLEAR_IF_FIRST,
+                     O_RDWR | O_CREAT | O_TRUNC, 0600);
+       gdbm = gdbm_open(test_gdbm, 512, GDBM_WRITER|GDBM_NEWDB|GDBM_FAST, 
+                        0600, NULL);
+
+       if (!db || !gdbm) {
+               fatal("db open failed");
+       }
+
+#if 1
+       srand(seed);
+       _start_timer();
+       for (i=0;i<loops;i++) addrec_gdbm();
+       printf("gdbm got %.2f ops/sec\n", i/_end_timer());
+#endif
+
+       merge_test();
+
+       /* same seed again so the tdb sees the same operations */
+       srand(seed);
+       _start_timer();
+       for (i=0;i<loops;i++) addrec_db();
+       printf("tdb got %.2f ops/sec\n", i/_end_timer());
+
+       if (tdb_validate_freelist(db, &num_entries) == -1) {
+               printf("tdb freelist is corrupt\n");
+       } else {
+               printf("tdb freelist is good (%d entries)\n", num_entries);
+       }
+
+       compare_db();
+
+       printf("traversed %d records\n", tdb_traverse(db, traverse_fn, NULL));
+       printf("traversed %d records\n", tdb_traverse(db, traverse_fn, NULL));
+
+       tdb_close(db);
+       gdbm_close(gdbm);
+
+       free(test_gdbm);
+       free(test_tdb);
+
+       return 0;
+}
diff --git a/ctdb/lib/tdb/tools/tdbtool.c b/ctdb/lib/tdb/tools/tdbtool.c
new file mode 100644 (file)
index 0000000..dc5747f
--- /dev/null
@@ -0,0 +1,785 @@
+/* 
+   Unix SMB/CIFS implementation.
+   Samba database functions
+   Copyright (C) Andrew Tridgell              1999-2000
+   Copyright (C) Paul `Rusty' Russell             2000
+   Copyright (C) Jeremy Allison                           2000
+   Copyright (C) Andrew Esh                        2001
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/locale.h"
+#include "system/time.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "tdb.h"
+
+/* executes the command currently named by cmdname; defined further down */
+static int do_command(void);
+/* name of the command being processed (may carry trailing arguments) */
+const char *cmdname;
+/* up to two command arguments and their lengths in bytes */
+char *arg1, *arg2;
+size_t arg1len, arg2len;
+/* non-zero while a first/next iteration is active; gates the next command */
+int bIterate = 0;
+/* NOTE(review): line and cmdline are not referenced in the code visible here */
+char *line;
+/* key of the record last shown by first/next */
+TDB_DATA iterate_kbuf;
+char cmdline[1024];
+/* when non-zero, create/open pass TDB_NOMMAP; flipped by toggle_mmap() */
+static int disable_mmap;
+
+/*
+ * Identifiers for the commands understood by tdbtool.  do_command() maps
+ * the typed command name to one of these through cmd_table and falls
+ * back to CMD_HELP when nothing matches.
+ */
+enum commands {
+       CMD_CREATE_TDB,
+       CMD_OPEN_TDB,
+       CMD_TRANSACTION_START,
+       CMD_TRANSACTION_COMMIT,
+       CMD_TRANSACTION_CANCEL,
+       CMD_ERASE,
+       CMD_DUMP,
+       CMD_INSERT,
+       CMD_MOVE,
+       CMD_STORE,
+       CMD_SHOW,
+       CMD_KEYS,
+       CMD_HEXKEYS,
+       CMD_DELETE,
+       CMD_LIST_HASH_FREE,
+       CMD_LIST_FREE,
+       CMD_INFO,
+       CMD_MMAP,
+       CMD_SPEED,
+       CMD_FIRST,
+       CMD_NEXT,
+       CMD_SYSTEM,
+       CMD_CHECK,
+       CMD_REPACK,
+       CMD_QUIT,
+       CMD_HELP
+};
+
+/* one row of cmd_table: a command name and the command it selects */
+typedef struct {
+       const char *name;
+       enum commands cmd;
+} COMMAND_TABLE;
+
+/*
+ * Command name lookup table.  do_command() walks it from the top and
+ * selects the first row whose name is a prefix of the typed command, so
+ * any text following the name (such as arguments) is ignored for the
+ * match.  The row with a NULL name terminates the table.
+ */
+COMMAND_TABLE cmd_table[] = {
+       {"create",      CMD_CREATE_TDB},
+       {"open",        CMD_OPEN_TDB},
+       {"transaction_start",   CMD_TRANSACTION_START},
+       {"transaction_commit",  CMD_TRANSACTION_COMMIT},
+       {"transaction_cancel",  CMD_TRANSACTION_CANCEL},
+       {"erase",       CMD_ERASE},
+       {"dump",        CMD_DUMP},
+       {"insert",      CMD_INSERT},
+       {"move",        CMD_MOVE},
+       {"store",       CMD_STORE},
+       {"show",        CMD_SHOW},
+       {"keys",        CMD_KEYS},
+       {"hexkeys",     CMD_HEXKEYS},
+       {"delete",      CMD_DELETE},
+       {"list",        CMD_LIST_HASH_FREE},
+       {"free",        CMD_LIST_FREE},
+       {"info",        CMD_INFO},
+       {"speed",       CMD_SPEED},
+       {"mmap",        CMD_MMAP},
+       {"first",       CMD_FIRST},
+       {"1",           CMD_FIRST},
+       {"next",        CMD_NEXT},
+       {"n",           CMD_NEXT},
+       {"check",       CMD_CHECK},
+       {"quit",        CMD_QUIT},
+       {"q",           CMD_QUIT},
+       {"!",           CMD_SYSTEM},
+       {"repack",      CMD_REPACK},
+       {NULL,          CMD_HELP}
+};
+
+/* tp1 holds the start and tp2 the end of the interval being measured */
+struct timeval tp1,tp2;
+
+/* remember the current time as the start of a measurement */
+static void _start_timer(void)
+{
+       struct timeval *begin = &tp1;
+
+       gettimeofday(begin, NULL);
+}
+
+/* return the seconds elapsed since the last _start_timer() call */
+static double _end_timer(void)
+{
+       double whole_secs;
+       double micro_secs;
+
+       gettimeofday(&tp2, NULL);
+
+       whole_secs = tp2.tv_sec - tp1.tv_sec;
+       micro_secs = tp2.tv_usec - tp1.tv_usec;
+
+       return whole_secs + micro_secs * 1.0e-6;
+}
+
+#ifdef PRINTF_ATTRIBUTE
+static void tdb_log(struct tdb_context *tdb, enum tdb_debug_level level, const char *format, ...) PRINTF_ATTRIBUTE(3,4);
+#endif
+/*
+ * Logging callback installed in create_tdb()/open_tdb(): prints the
+ * formatted message to stderr.  The tdb and level arguments are unused.
+ */
+static void tdb_log(struct tdb_context *tdb, enum tdb_debug_level level, const char *format, ...)
+{
+       va_list args;
+
+       va_start(args, format);
+       (void)vfprintf(stderr, format, args);
+       va_end(args);
+}
+
+/* a tdb tool for manipulating a tdb database */
+
+/* the currently open database, or NULL when none is open */
+static TDB_CONTEXT *tdb;
+
+/* record printers, usable as tdb_traverse() callbacks; defined below */
+static int print_rec(TDB_CONTEXT *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state);
+static int print_key(TDB_CONTEXT *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state);
+static int print_hexkey(TDB_CONTEXT *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state);
+
+/*
+ * Print len bytes of buf to stdout, replacing every non-printable byte
+ * with '.'.  A single trailing NUL byte is not printed.
+ * Nothing is printed for a NULL buffer or a length <= 0.
+ */
+static void print_asc(const char *buf,int len)
+{
+       int i;
+
+       /* an empty buffer has no last byte to inspect */
+       if (buf == NULL || len <= 0)
+               return;
+
+       /* We're probably printing ASCII strings so don't try to display
+          the trailing NULL character. */
+
+       if (buf[len - 1] == 0)
+               len--;
+
+       for (i=0;i<len;i++) {
+               /* isprint() is only defined for unsigned char values */
+               unsigned char c = (unsigned char)buf[i];
+
+               printf("%c", isprint(c) ? c : '.');
+       }
+}
+
+/*
+ * Hex-dump len bytes of buf to stdout, 16 bytes per row.  Each row
+ * starts with its offset in hex ("[000] "), then the bytes in hex, then
+ * the same bytes rendered by print_asc() in two groups of up to 8.
+ * Does nothing when len <= 0.
+ */
+static void print_data(const char *buf,int len)
+{
+       int i=0;
+       if (len<=0) return;
+       printf("[%03X] ",i);
+       for (i=0;i<len;) {
+               printf("%02X ",(int)((unsigned char)buf[i]));
+               i++;
+               /* extra gap after every 8 hex bytes */
+               if (i%8 == 0) printf(" ");
+               if (i%16 == 0) {      
+                       /* row complete: append its text rendering */
+                       print_asc(&buf[i-16],8); printf(" ");
+                       print_asc(&buf[i-8],8); printf("\n");
+                       if (i<len) printf("[%03X] ",i);
+               }
+       }
+       if (i%16) {
+               /* last row is partial: pad the hex area, then print the text */
+               int n;
+               
+               n = 16 - (i%16);
+               printf(" ");
+               if (n>8) printf(" ");
+               while (n--) printf("   ");
+               
+               /* first group: at most 8 bytes of the partial row */
+               n = i%16;
+               if (n > 8) n = 8;
+               print_asc(&buf[i-(i%16)],n); printf(" ");
+               /* second group: whatever is left beyond the first 8 */
+               n = (i%16) - n;
+               if (n>0) print_asc(&buf[i-n],n); 
+               printf("\n");    
+       }
+}
+
+/* print the command summary to stdout */
+static void help(void)
+{
+       static const char usage_text[] =
+"\n"
+"tdbtool: \n"
+"  create    dbname     : create a database\n"
+"  open      dbname     : open an existing database\n"
+"  transaction_start    : start a transaction\n"
+"  transaction_commit   : commit a transaction\n"
+"  transaction_cancel   : cancel a transaction\n"
+"  erase                : erase the database\n"
+"  dump                 : dump the database as strings\n"
+"  keys                 : dump the database keys as strings\n"
+"  hexkeys              : dump the database keys as hex values\n"
+"  info                 : print summary info about the database\n"
+"  insert    key  data  : insert a record\n"
+"  move      key  file  : move a record to a destination tdb\n"
+"  store     key  data  : store a record (replace)\n"
+"  show      key        : show a record by key\n"
+"  delete    key        : delete a record by key\n"
+"  list                 : print the database hash table and freelist\n"
+"  free                 : print the database freelist\n"
+"  check                : check the integrity of an opened database\n"
+"  repack               : repack the database\n"
+"  speed                : perform speed tests on the database\n"
+"  ! command            : execute system command\n"
+"  1 | first            : print the first record\n"
+"  n | next             : print the next record\n"
+"  q | quit             : terminate\n"
+"  \\n                   : repeat 'next' command\n"
+"\n";
+
+       fputs(usage_text, stdout);
+}
+
+/* report a problem to the user: the message plus a newline on stdout */
+static void terror(const char *why)
+{
+       fputs(why, stdout);
+       putchar('\n');
+}
+
+/*
+ * Create (or truncate) the database tdbname and make it the current
+ * database, closing any database that was open before.  On failure a
+ * message is printed and no database is open afterwards.
+ * A missing name is rejected and leaves the current database untouched.
+ */
+static void create_tdb(const char *tdbname)
+{
+       struct tdb_logging_context log_ctx;
+
+       /* "create" typed without an argument arrives here as NULL */
+       if (tdbname == NULL) {
+               terror("need database name");
+               return;
+       }
+
+       /* do not hand uninitialised members to tdb_open_ex() */
+       memset(&log_ctx, 0, sizeof(log_ctx));
+       log_ctx.log_fn = tdb_log;
+
+       if (tdb) tdb_close(tdb);
+       tdb = tdb_open_ex(tdbname, 0, TDB_CLEAR_IF_FIRST | (disable_mmap?TDB_NOMMAP:0),
+                         O_RDWR | O_CREAT | O_TRUNC, 0600, &log_ctx, NULL);
+       if (!tdb) {
+               printf("Could not create %s: %s\n", tdbname, strerror(errno));
+       }
+}
+
+/*
+ * Open the existing database tdbname read/write and make it the current
+ * database, closing any database that was open before.  On failure a
+ * message is printed and no database is open afterwards.
+ * A missing name is rejected and leaves the current database untouched.
+ */
+static void open_tdb(const char *tdbname)
+{
+       struct tdb_logging_context log_ctx;
+
+       /* "open" typed without an argument arrives here as NULL */
+       if (tdbname == NULL) {
+               terror("need database name");
+               return;
+       }
+
+       /* do not hand uninitialised members to tdb_open_ex() */
+       memset(&log_ctx, 0, sizeof(log_ctx));
+       log_ctx.log_fn = tdb_log;
+
+       if (tdb) tdb_close(tdb);
+       tdb = tdb_open_ex(tdbname, 0, disable_mmap?TDB_NOMMAP:0, O_RDWR, 0600,
+                         &log_ctx, NULL);
+       if (!tdb) {
+               printf("Could not open %s: %s\n", tdbname, strerror(errno));
+       }
+}
+
+/*
+ * Add a record to the current database using TDB_INSERT.
+ * A key is required; no check is made on the data.
+ */
+static void insert_tdb(char *keyname, size_t keylen, char* data, size_t datalen)
+{
+       TDB_DATA rec_key, rec_val;
+
+       if (keyname == NULL || keylen == 0) {
+               terror("need key");
+               return;
+       }
+
+       rec_key.dsize = keylen;
+       rec_key.dptr = (unsigned char *)keyname;
+       rec_val.dsize = datalen;
+       rec_val.dptr = (unsigned char *)data;
+
+       if (tdb_store(tdb, rec_key, rec_val, TDB_INSERT) == 0) {
+               return;
+       }
+       terror("insert failed");
+}
+
+/*
+ * Store a record in the current database using TDB_REPLACE.
+ * Both key and data are required; the record is printed before the
+ * store is attempted.
+ */
+static void store_tdb(char *keyname, size_t keylen, char* data, size_t datalen)
+{
+       TDB_DATA rec_key, rec_val;
+
+       if (keyname == NULL || keylen == 0) {
+               terror("need key");
+               return;
+       }
+       if (data == NULL || datalen == 0) {
+               terror("need data");
+               return;
+       }
+
+       rec_key.dsize = keylen;
+       rec_key.dptr = (unsigned char *)keyname;
+       rec_val.dsize = datalen;
+       rec_val.dptr = (unsigned char *)data;
+
+       printf("Storing key:\n");
+       print_rec(tdb, rec_key, rec_val, NULL);
+
+       if (tdb_store(tdb, rec_key, rec_val, TDB_REPLACE) == 0) {
+               return;
+       }
+       terror("store failed");
+}
+
+/* fetch the record stored under the given key and print it */
+static void show_tdb(char *keyname, size_t keylen)
+{
+       TDB_DATA rec_key, rec_val;
+
+       if (keyname == NULL || keylen == 0) {
+               terror("need key");
+               return;
+       }
+
+       rec_key.dsize = keylen;
+       rec_key.dptr = (unsigned char *)keyname;
+
+       rec_val = tdb_fetch(tdb, rec_key);
+       if (rec_val.dptr != NULL) {
+               print_rec(tdb, rec_key, rec_val, NULL);
+               /* the fetched buffer belongs to us */
+               free(rec_val.dptr);
+       } else {
+               terror("fetch failed");
+       }
+}
+
+/* remove the record stored under the given key from the current database */
+static void delete_tdb(char *keyname, size_t keylen)
+{
+       TDB_DATA rec_key;
+
+       if (keyname == NULL || keylen == 0) {
+               terror("need key");
+               return;
+       }
+
+       rec_key.dsize = keylen;
+       rec_key.dptr = (unsigned char *)keyname;
+
+       if (tdb_delete(tdb, rec_key) == 0) {
+               return;
+       }
+       terror("delete failed");
+}
+
+/*
+ * Fetch the record stored under the given key in the current database,
+ * print it, and store it (TDB_REPLACE) in the existing database tdbname.
+ * The record is not removed from the current database.
+ */
+static void move_rec(char *keyname, size_t keylen, char* tdbname)
+{
+       TDB_DATA key, dbuf;
+       TDB_CONTEXT *dst_tdb;
+
+       if ((keyname == NULL) || (keylen == 0)) {
+               terror("need key");
+               return;
+       }
+
+       if ( !tdbname ) {
+               terror("need destination tdb name");
+               return;
+       }
+
+       key.dptr = (unsigned char *)keyname;
+       key.dsize = keylen;
+
+       dbuf = tdb_fetch(tdb, key);
+       if (!dbuf.dptr) {
+               terror("fetch failed");
+               return;
+       }
+
+       print_rec(tdb, key, dbuf, NULL);
+
+       dst_tdb = tdb_open(tdbname, 0, 0, O_RDWR, 0600);
+       if ( !dst_tdb ) {
+               terror("unable to open destination tdb");
+               /* the fetched buffer is ours, release it on every exit path */
+               free(dbuf.dptr);
+               return;
+       }
+
+       if (tdb_store( dst_tdb, key, dbuf, TDB_REPLACE ) != 0) {
+               terror("failed to move record");
+       }
+       else
+               printf("record moved\n");
+
+       tdb_close( dst_tdb );
+       free(dbuf.dptr);
+
+       return;
+}
+
+/*
+ * Print one record: the key as text and the data as a hex dump, each
+ * preceded by its size.  Always returns 0; the_tdb and state are unused.
+ */
+static int print_rec(TDB_CONTEXT *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
+{
+       const char *key_bytes = (const char *)key.dptr;
+       const char *data_bytes = (const char *)dbuf.dptr;
+
+       printf("\nkey %d bytes\n", (int)key.dsize);
+       print_asc(key_bytes, key.dsize);
+
+       printf("\ndata %d bytes\n", (int)dbuf.dsize);
+       print_data(data_bytes, dbuf.dsize);
+
+       return 0;
+}
+
+/*
+ * Print the size of a key and the key itself as text on one line.
+ * Always returns 0; the_tdb, dbuf and state are unused.
+ */
+static int print_key(TDB_CONTEXT *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
+{
+       const char *key_bytes = (const char *)key.dptr;
+
+       printf("key %d bytes: ", (int)key.dsize);
+       print_asc(key_bytes, key.dsize);
+       printf("\n");
+
+       return 0;
+}
+
+/*
+ * Print the size of a key followed by a hex dump of the key.
+ * Always returns 0; the_tdb, dbuf and state are unused.
+ */
+static int print_hexkey(TDB_CONTEXT *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
+{
+       const char *key_bytes = (const char *)key.dptr;
+
+       printf("key %d bytes\n", (int)key.dsize);
+       print_data(key_bytes, key.dsize);
+       printf("\n");
+
+       return 0;
+}
+
+/* running sum of the data sizes seen by traverse_fn() */
+static int total_bytes;
+
+/*
+ * Traverse callback used by the speed test: adds the size of the
+ * record's data to total_bytes and lets the traversal continue.
+ */
+static int traverse_fn(TDB_CONTEXT *the_tdb, TDB_DATA key, TDB_DATA dbuf, void *state)
+{
+       total_bytes = (int)(total_bytes + dbuf.dsize);
+       return 0;
+}
+
+/* print the summary text of the current database, or the tdb error */
+static void info_tdb(void)
+{
+       char *text = tdb_summary(tdb);
+
+       if (text == NULL) {
+               printf("Error = %s\n", tdb_errorstr(tdb));
+               return;
+       }
+
+       printf("%s", text);
+       free(text);
+}
+
+/*
+ * Run four timed benchmarks against the current database (store, fetch,
+ * store inside a transaction, full traverse) and print the operations
+ * per second achieved by each.
+ *
+ * tlimit is the duration of each benchmark in seconds as a decimal
+ * string; NULL or a value that converts to 0 selects 5 seconds.
+ */
+static void speed_tdb(const char *tlimit)
+{
+       const char *str = "store test", *str2 = "transaction test";
+       unsigned timelimit = tlimit?atoi(tlimit):0;
+       double t;
+       int ops;
+       if (timelimit == 0) timelimit = 5;
+
+       ops = 0;
+       printf("Testing store speed for %u seconds\n", timelimit);
+       _start_timer();
+       do {
+               long int r = random();
+               TDB_DATA key, dbuf;
+               key.dptr = discard_const_p(uint8_t, str);
+               key.dsize = strlen((char *)key.dptr);
+               dbuf.dptr = (uint8_t *) &r;
+               dbuf.dsize = sizeof(r);
+               tdb_store(tdb, key, dbuf, TDB_REPLACE);
+               t = _end_timer();
+               ops++;
+       } while (t < timelimit);
+       printf("%10.3f ops/sec\n", ops/t);
+
+       ops = 0;
+       printf("Testing fetch speed for %u seconds\n", timelimit);
+       _start_timer();
+       do {
+               TDB_DATA key, dbuf;
+               key.dptr = discard_const_p(uint8_t, str);
+               key.dsize = strlen((char *)key.dptr);
+               dbuf = tdb_fetch(tdb, key);
+               /* release each fetched copy, otherwise the loop leaks one
+                  buffer per iteration; free(NULL) is harmless */
+               free(dbuf.dptr);
+               t = _end_timer();
+               ops++;
+       } while (t < timelimit);
+       printf("%10.3f ops/sec\n", ops/t);
+
+       ops = 0;
+       printf("Testing transaction speed for %u seconds\n", timelimit);
+       _start_timer();
+       do {
+               long int r = random();
+               TDB_DATA key, dbuf;
+               key.dptr = discard_const_p(uint8_t, str2);
+               key.dsize = strlen((char *)key.dptr);
+               dbuf.dptr = (uint8_t *) &r;
+               dbuf.dsize = sizeof(r);
+               tdb_transaction_start(tdb);
+               tdb_store(tdb, key, dbuf, TDB_REPLACE);
+               tdb_transaction_commit(tdb);
+               t = _end_timer();
+               ops++;
+       } while (t < timelimit);
+       printf("%10.3f ops/sec\n", ops/t);
+
+       ops = 0;
+       printf("Testing traverse speed for %u seconds\n", timelimit);
+       _start_timer();
+       do {
+               tdb_traverse(tdb, traverse_fn, NULL);
+               t = _end_timer();
+               ops++;
+       } while (t < timelimit);
+       printf("%10.3f ops/sec\n", ops/t);
+}
+
+/* flip the disable_mmap setting and report the new state */
+static void toggle_mmap(void)
+{
+       const char *state_msg;
+
+       disable_mmap = !disable_mmap;
+       state_msg = disable_mmap ? "mmap is disabled\n" : "mmap is enabled\n";
+       fputs(state_msg, stdout);
+}
+
+/*
+ * Print prompt and read one line from stdin into a static buffer.
+ * Returns the line with its newline removed, or NULL when nothing could
+ * be read or the text read contains no newline.
+ */
+static char *tdb_getline(const char *prompt)
+{
+       static char thisline[1024];
+       char *eol;
+
+       fputs(prompt, stdout);
+       thisline[0] = 0;
+
+       if (fgets(thisline, sizeof(thisline)-1, stdin) == NULL) {
+               return NULL;
+       }
+
+       eol = strchr(thisline, '\n');
+       if (eol == NULL) {
+               return NULL;
+       }
+
+       *eol = 0;
+       return thisline;
+}
+
+/* traverse callback for "erase": delete the visited record */
+static int do_delete_fn(TDB_CONTEXT *the_tdb, TDB_DATA key, TDB_DATA dbuf,
+                     void *state)
+{
+       int rc = tdb_delete(the_tdb, key);
+
+       return rc;
+}
+
+/*
+ * Start an iteration: store the first key of the_tdb in *pkey, then
+ * fetch and print the record stored under it.
+ */
+static void first_record(TDB_CONTEXT *the_tdb, TDB_DATA *pkey)
+{
+       TDB_DATA dbuf;
+       *pkey = tdb_firstkey(the_tdb);
+
+       dbuf = tdb_fetch(the_tdb, *pkey);
+       if (!dbuf.dptr) {
+               terror("fetch failed");
+               return;
+       }
+
+       print_rec(the_tdb, *pkey, dbuf, NULL);
+       /* the fetched buffer is ours to release, as show_tdb() does */
+       free(dbuf.dptr);
+}
+
+/*
+ * Continue an iteration: replace *pkey with the key that follows it in
+ * the_tdb, then fetch and print the record stored under the new key.
+ * *pkey must hold the key produced by the previous first/next step.
+ */
+static void next_record(TDB_CONTEXT *the_tdb, TDB_DATA *pkey)
+{
+       TDB_DATA dbuf;
+       TDB_DATA prev = *pkey;
+
+       *pkey = tdb_nextkey(the_tdb, prev);
+       /* the previous key was allocated by tdb_firstkey()/tdb_nextkey()
+          and is no longer needed once its successor is known */
+       free(prev.dptr);
+
+       dbuf = tdb_fetch(the_tdb, *pkey);
+       if (!dbuf.dptr) {
+               terror("fetch failed");
+               return;
+       }
+
+       print_rec(the_tdb, *pkey, dbuf, NULL);
+       /* the fetched buffer is ours to release, as show_tdb() does */
+       free(dbuf.dptr);
+}
+
+/* tdb_check() callback: bump the counter that private_data points to */
+static int count(TDB_DATA key, TDB_DATA data, void *private_data)
+{
+       unsigned int *counter = (unsigned int *)private_data;
+
+       *counter += 1;
+       return 0;
+}
+
+/*
+ * Run tdb_check() on the_tdb and report the outcome, including the
+ * number of records counted, on stdout.
+ */
+static void check_db(TDB_CONTEXT *the_tdb)
+{
+       int tdbcount = 0;
+
+       if (the_tdb == NULL) {
+               printf("Error: No database opened!\n");
+               return;
+       }
+
+       if (tdb_check(the_tdb, count, &tdbcount) == -1) {
+               printf("Integrity check for the opened database failed.\n");
+               return;
+       }
+
+       printf("Database integrity is OK and has %d records.\n",
+              tdbcount);
+}
+
+/*
+ * Look up the command named by the global cmdname and run it with the
+ * global arguments arg1/arg2 and their lengths.  An empty command name
+ * selects "next"; an unknown one selects "help".
+ * Returns 1 when the tool should terminate (quit), 0 otherwise.
+ */
+static int do_command(void)
+{
+       enum commands mycmd = CMD_HELP;
+       COMMAND_TABLE *entry;
+
+       if (cmdname && strlen(cmdname) == 0) {
+               mycmd = CMD_NEXT;
+       } else {
+               /* the first table name that is a prefix of cmdname wins */
+               for (entry = cmd_table; entry->name != NULL; entry++) {
+                       if (strncmp(entry->name, cmdname,
+                                   strlen(entry->name)) == 0) {
+                               mycmd = entry->cmd;
+                               break;
+                       }
+               }
+       }
+
+       /* commands that work without an open database */
+       switch (mycmd) {
+       case CMD_CREATE_TDB:
+               bIterate = 0;
+               create_tdb(arg1);
+               return 0;
+       case CMD_OPEN_TDB:
+               bIterate = 0;
+               open_tdb(arg1);
+               return 0;
+       case CMD_SYSTEM:
+               /* Shell command */
+               if (system(arg1) == -1) {
+                       terror("system() call failed\n");
+               }
+               return 0;
+       case CMD_QUIT:
+               return 1;
+       default:
+               break;
+       }
+
+       /* all the rest require an open database */
+       if (!tdb) {
+               bIterate = 0;
+               terror("database not open");
+               help();
+               return 0;
+       }
+
+       switch (mycmd) {
+       case CMD_TRANSACTION_START:
+               bIterate = 0;
+               tdb_transaction_start(tdb);
+               break;
+       case CMD_TRANSACTION_COMMIT:
+               bIterate = 0;
+               tdb_transaction_commit(tdb);
+               break;
+       case CMD_REPACK:
+               bIterate = 0;
+               tdb_repack(tdb);
+               break;
+       case CMD_TRANSACTION_CANCEL:
+               bIterate = 0;
+               tdb_transaction_cancel(tdb);
+               break;
+       case CMD_ERASE:
+               bIterate = 0;
+               tdb_traverse(tdb, do_delete_fn, NULL);
+               break;
+       case CMD_DUMP:
+               bIterate = 0;
+               tdb_traverse(tdb, print_rec, NULL);
+               break;
+       case CMD_INSERT:
+               bIterate = 0;
+               insert_tdb(arg1, arg1len, arg2, arg2len);
+               break;
+       case CMD_MOVE:
+               bIterate = 0;
+               move_rec(arg1, arg1len, arg2);
+               break;
+       case CMD_STORE:
+               bIterate = 0;
+               store_tdb(arg1, arg1len, arg2, arg2len);
+               break;
+       case CMD_SHOW:
+               bIterate = 0;
+               show_tdb(arg1, arg1len);
+               break;
+       case CMD_KEYS:
+               tdb_traverse(tdb, print_key, NULL);
+               break;
+       case CMD_HEXKEYS:
+               tdb_traverse(tdb, print_hexkey, NULL);
+               break;
+       case CMD_DELETE:
+               bIterate = 0;
+               delete_tdb(arg1, arg1len);
+               break;
+       case CMD_LIST_HASH_FREE:
+               tdb_dump_all(tdb);
+               break;
+       case CMD_LIST_FREE:
+               tdb_printfreelist(tdb);
+               break;
+       case CMD_INFO:
+               info_tdb();
+               break;
+       case CMD_SPEED:
+               speed_tdb(arg1);
+               break;
+       case CMD_MMAP:
+               toggle_mmap();
+               break;
+       case CMD_FIRST:
+               bIterate = 1;
+               first_record(tdb, &iterate_kbuf);
+               break;
+       case CMD_NEXT:
+               /* only meaningful once "first" has started an iteration */
+               if (bIterate)
+                       next_record(tdb, &iterate_kbuf);
+               break;
+       case CMD_CHECK:
+               check_db(tdb);
+               break;
+       case CMD_HELP:
+               help();
+               break;
+       case CMD_CREATE_TDB:
+       case CMD_OPEN_TDB:
+       case CMD_SYSTEM:
+       case CMD_QUIT:
+               /*
+                * already dealt with above.  cases included here to avoid
+                * compiler warnings.
+                */
+               break;
+       }
+
+       return 0;
+}
+
+/*
+ * Decode backslash escapes in instring, in place.
+ *
+ * "\" followed by one or two hex digits becomes the byte with that
+ * value; "\" followed by any other character becomes that character.
+ * A backslash that is the last character of the string has nothing to
+ * escape and is kept as a literal backslash.
+ *
+ * The decoded length is stored in *sizep (the result may contain NUL
+ * bytes) and a terminating NUL is written after the decoded bytes so
+ * the result can still be used as a C string.  Returns instring.
+ */
+static char *tdb_convert_string(char *instring, size_t *sizep)
+{
+       static const char hexdigits[] = "0123456789abcdefABCDEF";
+       size_t length = 0;
+       char *outp, *inp;
+       char temp[3];
+
+       outp = inp = instring;
+
+       while (*inp) {
+               /* only treat '\' as an escape when something follows it;
+                  otherwise the scan would step over the terminator */
+               if (*inp == '\\' && inp[1] != '\0') {
+                       inp++;
+                       if (strchr(hexdigits,(int)*inp)) {
+                               temp[0] = *inp++;
+                               temp[1] = '\0';
+                               if (*inp && strchr(hexdigits,(int)*inp)) {
+                                       temp[1] = *inp++;
+                                       temp[2] = '\0';
+                               }
+                               *outp++ = (char)strtol((const char *)temp,NULL,16);
+                       } else {
+                               *outp++ = *inp++;
+                       }
+               } else {
+                       *outp++ = *inp++;
+               }
+               length++;
+       }
+       /* outp never runs ahead of inp, so this stays inside the buffer */
+       *outp = '\0';
+       *sizep = length;
+       return instring;
+}
+
+/*
+ * Entry point of tdbtool.
+ *
+ * When argv[1] is present it is opened as the database first.  With at
+ * most one argument the tool then reads commands interactively from
+ * stdin until end of input or "quit".  With more arguments a single
+ * command is run: argv[2] is the command, argv[3] and argv[4] are its
+ * optional first and second argument.
+ */
+int main(int argc, char *argv[])
+{
+       cmdname = "";
+       arg1 = NULL;
+       arg1len = 0;
+       arg2 = NULL;
+       arg2len = 0;
+
+       if (argv[1]) {
+               /* run an implicit "open <argv[1]>" before anything else */
+               cmdname = "open";
+               arg1 = argv[1];
+               do_command();
+               cmdname =  "";
+               arg1 = NULL;
+       }
+
+       switch (argc) {
+       case 1:
+       case 2:
+               /* Interactive mode */
+               while ((cmdname = tdb_getline("tdb> "))) {
+                       arg2 = arg1 = NULL;
+                       /* cmdname keeps the whole line; arg1 starts after
+                          the first space */
+                       if ((arg1 = strchr((const char *)cmdname,' ')) != NULL) {
+                               arg1++;
+                               arg2 = arg1;
+                               /* end arg1 at the next unescaped space;
+                                  arg2 is left pointing just past it, or
+                                  at the end of the line if there is none */
+                               while (*arg2) {
+                                       if (*arg2 == ' ') {
+                                               *arg2++ = '\0';
+                                               break;
+                                       }
+                                       /* "\ " is an escaped space: skip it */
+                                       if ((*arg2++ == '\\') && (*arg2 == ' ')) {
+                                               arg2++;
+                                       }
+                               }
+                       }
+                       if (arg1) arg1 = tdb_convert_string(arg1,&arg1len);
+                       if (arg2) arg2 = tdb_convert_string(arg2,&arg2len);
+                       if (do_command()) break;
+               }
+               break;
+       /* the cases below fall through on purpose: each extra argument
+          fills in one more global before the command is run */
+       case 5:
+               arg2 = tdb_convert_string(argv[4],&arg2len);
+       case 4:
+               arg1 = tdb_convert_string(argv[3],&arg1len);
+       case 3:
+               cmdname = argv[2];
+       default:
+               do_command();
+               break;
+       }
+
+       if (tdb) tdb_close(tdb);
+
+       return 0;
+}
diff --git a/ctdb/lib/tdb/tools/tdbtorture.c b/ctdb/lib/tdb/tools/tdbtorture.c
new file mode 100644 (file)
index 0000000..64c5043
--- /dev/null
@@ -0,0 +1,453 @@
+/* this tests tdb by doing lots of ops from several simultaneous
+   writers - that stresses the locking code. 
+*/
+
+#include "replace.h"
+#include "system/time.h"
+#include "system/wait.h"
+#include "system/filesys.h"
+#include "tdb.h"
+
+#ifdef HAVE_GETOPT_H
+#include <getopt.h>
+#endif
+
+
+/*
+ * Tunables for the random workload.  A *_PROB value N is used as
+ * "random() % N == 0", i.e. the action is taken roughly once in N
+ * tries; defining one as 0 compiles the corresponding action out.
+ * KEYLEN and DATALEN bound the random key and data lengths chosen in
+ * addrec_db() (1..KEYLEN and 1..DATALEN characters).
+ */
+#define REOPEN_PROB 30
+#define DELETE_PROB 8
+#define STORE_PROB 4
+#define APPEND_PROB 6
+#define TRANSACTION_PROB 10
+#define TRANSACTION_PREPARE_PROB 2
+#define LOCKSTORE_PROB 5
+#define TRAVERSE_PROB 20
+#define TRAVERSE_READ_PROB 20
+#define CULL_PROB 100
+#define KEYLEN 3
+#define DATALEN 100
+
+/* database handle opened by run_child() */
+static struct tdb_context *db;
+/* raised when addrec_db() starts a transaction, lowered on commit/cancel */
+static int in_transaction;
+/* bumped by fatal() and by tdb_log() for every non-trace message */
+static int error_count;
+/* set by the -t option: start a transaction whenever none is active */
+static int always_transaction = 0;
+/* hash size passed to tdb_open_ex(); set by the -H option */
+static int hash_size = 2;
+/* loop counter of run_child(); reported by send_count_and_suicide() */
+static int loopnum;
+/* descriptor that send_count_and_suicide() writes loopnum to */
+static int count_pipe;
+/* logging context handed to tdb_open_ex() */
+static struct tdb_logging_context log_ctx;
+
+#ifdef PRINTF_ATTRIBUTE
+static void tdb_log(struct tdb_context *tdb, enum tdb_debug_level level, const char *format, ...) PRINTF_ATTRIBUTE(3,4);
+#endif
+/*
+ * Logging callback for the torture test: counts every message that is
+ * not at trace level as an error, then prints the formatted message to
+ * stdout and flushes it.
+ */
+static void tdb_log(struct tdb_context *tdb, enum tdb_debug_level level, const char *format, ...)
+{
+       va_list ap;
+
+       /* trace level messages do not indicate an error */
+       if (level != TDB_DEBUG_TRACE) {
+               error_count++;
+       }
+
+       va_start(ap, format);
+       vfprintf(stdout, format, ap);
+       va_end(ap);
+       fflush(stdout);
+/* disabled debugging aid: would start gdb in an xterm on this process */
+#if 0
+       if (level != TDB_DEBUG_TRACE) {
+               char *ptr;
+               signal(SIGUSR1, SIG_IGN);
+               asprintf(&ptr,"xterm -e gdb /proc/%d/exe %d", getpid(), getpid());
+               system(ptr);
+               free(ptr);
+       }
+#endif 
+}
+
+/*
+ * Report a failed operation via perror() and count it as an error.
+ * Despite the name this function returns to its caller.
+ */
+static void fatal(const char *why)
+{
+       perror(why);
+       error_count += 1;
+}
+
+/*
+ * Return a newly allocated, NUL-terminated string of len random
+ * lowercase letters drawn with rand().  The caller frees the result.
+ * The process exits with status 1 if the allocation fails.
+ */
+static char *randbuf(int len)
+{
+       char *buf;
+       int i;
+
+       buf = (char *)malloc(len+1);
+       if (buf == NULL) {
+               /* nothing sensible can be tested without memory */
+               perror("randbuf: malloc failed");
+               exit(1);
+       }
+
+       for (i=0;i<len;i++) {
+               buf[i] = 'a' + (rand() % 26);
+       }
+       buf[i] = 0;
+       return buf;
+}
+
+/*
+ * Traverse callback: deletes the visited record roughly once in
+ * CULL_PROB calls (never when CULL_PROB is 0) and always lets the
+ * traversal continue.
+ */
+static int cull_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf,
+                        void *state)
+{
+#if CULL_PROB
+       long roll = random();
+
+       if ((roll % CULL_PROB) != 0) {
+               return 0;
+       }
+       tdb_delete(tdb, key);
+#endif
+       return 0;
+}
+
+/*
+ * Perform one randomly chosen operation on db using a freshly generated
+ * random key and data buffer.  The candidate operations are tried in
+ * the order below; the first one whose random draw hits is executed and
+ * the rest are skipped.  If none hits, the key is simply fetched.
+ * Failures of checked operations are reported through fatal().
+ */
+static void addrec_db(void)
+{
+       int klen, dlen;
+       char *k, *d;
+       TDB_DATA key, data;
+
+       klen = 1 + (rand() % KEYLEN);
+       dlen = 1 + (rand() % DATALEN);
+
+       k = randbuf(klen);
+       d = randbuf(dlen);
+
+       /* sizes include the terminating NUL written by randbuf() */
+       key.dptr = (unsigned char *)k;
+       key.dsize = klen+1;
+
+       data.dptr = (unsigned char *)d;
+       data.dsize = dlen+1;
+
+#if REOPEN_PROB
+       /* reopening is only attempted outside a transaction */
+       if (in_transaction == 0 && random() % REOPEN_PROB == 0) {
+               tdb_reopen_all(0);
+               goto next;
+       }
+#endif
+
+#if TRANSACTION_PROB
+       /* possibly start a transaction (always, if -t was given) */
+       if (in_transaction == 0 &&
+           (always_transaction || random() % TRANSACTION_PROB == 0)) {
+               if (tdb_transaction_start(db) != 0) {
+                       fatal("tdb_transaction_start failed");
+               }
+               in_transaction++;
+               goto next;
+       }
+       /* possibly commit, sometimes with an explicit prepare step first */
+       if (in_transaction && random() % TRANSACTION_PROB == 0) {
+               if (random() % TRANSACTION_PREPARE_PROB == 0) {
+                       if (tdb_transaction_prepare_commit(db) != 0) {
+                               fatal("tdb_transaction_prepare_commit failed");
+                       }
+               }
+               if (tdb_transaction_commit(db) != 0) {
+                       fatal("tdb_transaction_commit failed");
+               }
+               in_transaction--;
+               goto next;
+       }
+       /* possibly cancel the running transaction */
+       if (in_transaction && random() % TRANSACTION_PROB == 0) {
+               if (tdb_transaction_cancel(db) != 0) {
+                       fatal("tdb_transaction_cancel failed");
+               }
+               in_transaction--;
+               goto next;
+       }
+#endif
+
+#if DELETE_PROB
+       /* result deliberately ignored */
+       if (random() % DELETE_PROB == 0) {
+               tdb_delete(db, key);
+               goto next;
+       }
+#endif
+
+#if STORE_PROB
+       if (random() % STORE_PROB == 0) {
+               if (tdb_store(db, key, data, TDB_REPLACE) != 0) {
+                       fatal("tdb_store failed");
+               }
+               goto next;
+       }
+#endif
+
+#if APPEND_PROB
+       if (random() % APPEND_PROB == 0) {
+               if (tdb_append(db, key, data) != 0) {
+                       fatal("tdb_append failed");
+               }
+               goto next;
+       }
+#endif
+
+#if LOCKSTORE_PROB
+       /* fetch and re-store the record while holding its chain lock */
+       if (random() % LOCKSTORE_PROB == 0) {
+               tdb_chainlock(db, key);
+               /* data now refers to the fetched copy; the random buffer
+                  is still reachable through d and freed at next: */
+               data = tdb_fetch(db, key);
+               if (tdb_store(db, key, data, TDB_REPLACE) != 0) {
+                       fatal("tdb_store failed");
+               }
+               if (data.dptr) free(data.dptr);
+               tdb_chainunlock(db, key);
+               goto next;
+       } 
+#endif
+
+#if TRAVERSE_PROB
+       /* write traverse that randomly deletes records */
+       if (random() % TRAVERSE_PROB == 0) {
+               tdb_traverse(db, cull_traverse, NULL);
+               goto next;
+       }
+#endif
+
+#if TRAVERSE_READ_PROB
+       /* read-only traverse without a callback */
+       if (random() % TRAVERSE_READ_PROB == 0) {
+               tdb_traverse_read(db, NULL, NULL);
+               goto next;
+       }
+#endif
+
+       /* nothing else was selected: plain fetch */
+       data = tdb_fetch(db, key);
+       if (data.dptr) free(data.dptr);
+
+next:
+       free(k);
+       free(d);
+}
+
+/*
+ * Traverse callback used at the end of a run: deletes every visited
+ * record (the result of the delete is ignored) and continues.
+ */
+static int traverse_fn(struct tdb_context *tdb, TDB_DATA key, TDB_DATA dbuf,
+                       void *state)
+{
+       (void)tdb_delete(tdb, key);
+
+       return 0;
+}
+
+/* print the command line synopsis to stdout and exit with status 0 */
+static void usage(void)
+{
+       fputs("Usage: tdbtorture [-t] [-k] [-n NUM_PROCS] [-l NUM_LOOPS] [-s SEED] [-H HASH_SIZE]\n",
+             stdout);
+       exit(0);
+}
+
+/*
+ * Signal handler installed for SIGUSR1 by run_child(): writes the
+ * current loop number to count_pipe, then sends SIGUSR2 to this very
+ * process.
+ */
+static void send_count_and_suicide(int sig)
+{
+       pid_t self;
+
+       /* This ensures our successor can continue where we left off. */
+       write(count_pipe, &loopnum, sizeof(loopnum));
+
+       /* This gives a unique signature. */
+       self = getpid();
+       kill(self, SIGUSR2);
+}
+
+/*
+ * Body of one torture worker.
+ *
+ * Opens filename, seeds both random generators with seed + i, then
+ * calls addrec_db() for loop numbers start .. num_loops-1, stopping at
+ * the first error.  If no error occurred the database is traversed once
+ * read-only and emptied with two deleting traversals (inside one
+ * transaction when always_transaction is set).
+ *
+ * Returns the number of errors seen, capped at 100.
+ */
+static int run_child(const char *filename, int i, int seed, unsigned num_loops, unsigned start)
+{
+       db = tdb_open_ex(filename, hash_size, TDB_DEFAULT,
+                        O_RDWR | O_CREAT, 0600, &log_ctx, NULL);
+       if (!db) {
+               fatal("db open failed");
+               /* fatal() returns; without a database there is nothing
+                  to test and nothing that could be closed */
+               return (error_count < 100 ? error_count : 100);
+       }
+
+       srand(seed + i);
+       srandom(seed + i);
+
+       /* Set global, then we're ready to handle being killed. */
+       loopnum = start;
+       signal(SIGUSR1, send_count_and_suicide);
+
+       for (;loopnum<num_loops && error_count == 0;loopnum++) {
+               addrec_db();
+       }
+
+       if (error_count == 0) {
+               tdb_traverse_read(db, NULL, NULL);
+               if (always_transaction) {
+                       /* drop whatever transaction the loop left open */
+                       while (in_transaction) {
+                               tdb_transaction_cancel(db);
+                               in_transaction--;
+                       }
+                       if (tdb_transaction_start(db) != 0)
+                               fatal("tdb_transaction_start failed");
+               }
+               tdb_traverse(db, traverse_fn, NULL);
+               tdb_traverse(db, traverse_fn, NULL);
+               if (always_transaction) {
+                       if (tdb_transaction_commit(db) != 0)
+                               fatal("tdb_transaction_commit failed");
+               }
+       }
+
+       tdb_close(db);
+
+       return (error_count < 100 ? error_count : 100);
+}
+
+/*
+ * Build the path of a test file.  When the environment variable
+ * TEST_DATA_PREFIX is set the result is "<prefix>/<filename>",
+ * otherwise it is a copy of filename.  The result is heap-allocated;
+ * NULL is returned when it cannot be allocated.
+ */
+static char *test_path(const char *filename)
+{
+       const char *prefix = getenv("TEST_DATA_PREFIX");
+       char *path = NULL;
+
+       if (prefix == NULL) {
+               return strdup(filename);
+       }
+
+       if (asprintf(&path, "%s/%s", prefix, filename) == -1) {
+               return NULL;
+       }
+
+       return path;
+}
+
+/*
+ * tdbtorture entry point.
+ *
+ * Options: -n NUM_PROCS, -l NUM_LOOPS, -s SEED, -H HASH_SIZE, -t (run
+ * everything inside transactions), -k (randomly send SIGUSR1 to workers
+ * and restart them); anything else prints the usage text.
+ *
+ * With one process and no -k the worker runs in this process.  Otherwise
+ * num_procs workers are forked and reaped here.  A worker that died from
+ * SIGUSR1/SIGUSR2 is restarted in the same slot; done[slot] holds the loop
+ * count it reported through the pipe.  Any other abnormal termination or
+ * non-zero exit status counts as an error, and once an error has been
+ * seen the remaining workers are sent SIGTERM.
+ *
+ * When no error occurred the database is reopened and verified with
+ * tdb_check().  Returns the number of errors counted.
+ */
+int main(int argc, char * const *argv)
+{
+       int i, seed = -1;
+       int num_loops = 5000;
+       int num_procs = 3;
+       int c, pfds[2];
+       extern char *optarg;
+       pid_t *pids;
+       int kill_random = 0;
+       int *done;
+       char *test_tdb;
+
+       log_ctx.log_fn = tdb_log;
+
+       while ((c = getopt(argc, argv, "n:l:s:H:thk")) != -1) {
+               switch (c) {
+               case 'n':
+                       num_procs = strtol(optarg, NULL, 0);
+                       break;
+               case 'l':
+                       num_loops = strtol(optarg, NULL, 0);
+                       break;
+               case 'H':
+                       hash_size = strtol(optarg, NULL, 0);
+                       break;
+               case 's':
+                       seed = strtol(optarg, NULL, 0);
+                       break;
+               case 't':
+                       always_transaction = 1;
+                       break;
+               case 'k':
+                       kill_random = 1;
+                       break;
+               default:
+                       usage();
+               }
+       }
+
+       test_tdb = test_path("torture.tdb");
+       if (test_tdb == NULL) {
+               /* test_path() returns NULL when it cannot allocate. */
+               perror("Allocating test path");
+               exit(1);
+       }
+
+       unlink(test_tdb);
+
+       if (seed == -1) {
+               seed = (getpid() + time(NULL)) & 0x7FFFFFFF;
+       }
+
+       if (num_procs == 1 && !kill_random) {
+               /* Don't fork for this case, makes debugging easier. */
+               error_count = run_child(test_tdb, 0, seed, num_loops, 0);
+               goto done;
+       }
+
+       pids = (pid_t *)calloc(num_procs, sizeof(pid_t));
+       done = (int *)calloc(num_procs, sizeof(int));
+       if (num_procs > 0 && (pids == NULL || done == NULL)) {
+               perror("Allocating child tables");
+               exit(1);
+       }
+
+       if (pipe(pfds) != 0) {
+               perror("Creating pipe");
+               exit(1);
+       }
+       count_pipe = pfds[1];
+
+       for (i=0;i<num_procs;i++) {
+               pids[i] = fork();
+               if (pids[i] == -1) {
+                       /*
+                        * Never keep -1 in pids[]: kill(-1, ...) would
+                        * signal every process we are allowed to signal.
+                        * Stop the workers started so far and give up.
+                        */
+                       perror("fork");
+                       while (i-- > 0) {
+                               kill(pids[i], SIGTERM);
+                       }
+                       exit(1);
+               }
+               if (pids[i] == 0) {
+                       close(pfds[0]);
+                       if (i == 0) {
+                               printf("Testing with %d processes, %d loops, %d hash_size, seed=%d%s\n",
+                                      num_procs, num_loops, hash_size, seed, always_transaction ? " (all within transactions)" : "");
+                       }
+                       exit(run_child(test_tdb, i, seed, num_loops, 0));
+               }
+       }
+
+       while (num_procs) {
+               int status, j;
+               pid_t pid;
+
+               if (error_count != 0) {
+                       /* try and stop the test on any failure */
+                       for (j=0;j<num_procs;j++) {
+                               if (pids[j] > 0) {
+                                       kill(pids[j], SIGTERM);
+                               }
+                       }
+               }
+
+               pid = waitpid(-1, &status, kill_random ? WNOHANG : 0);
+               if (pid == 0) {
+                       struct timeval tv;
+
+                       /* Sleep for 1/10 second. */
+                       tv.tv_sec = 0;
+                       tv.tv_usec = 100000;
+                       select(0, NULL, NULL, NULL, &tv);
+
+                       /* Kill someone. */
+                       kill(pids[random() % num_procs], SIGUSR1);
+                       continue;
+               }
+
+               if (pid == -1) {
+                       perror("failed to wait for child\n");
+                       exit(1);
+               }
+
+               for (j=0;j<num_procs;j++) {
+                       if (pids[j] == pid) break;
+               }
+               if (j == num_procs) {
+                       printf("unknown child %d exited!?\n", (int)pid);
+                       exit(1);
+               }
+               if (WIFSIGNALED(status)) {
+                       if (WTERMSIG(status) == SIGUSR2
+                           || WTERMSIG(status) == SIGUSR1) {
+                               /* SIGUSR2 means they wrote to pipe. */
+                               if (WTERMSIG(status) == SIGUSR2) {
+                                       read(pfds[0], &done[j],
+                                            sizeof(done[j]));
+                               }
+                               /*
+                                * Flush first so the new child does not
+                                * inherit, and later re-emit, our pending
+                                * buffered output.
+                                */
+                               fflush(stdout);
+                               pids[j] = fork();
+                               if (pids[j] == 0)
+                                       exit(run_child(test_tdb, j, seed,
+                                                      num_loops, done[j]));
+                               if (pids[j] != -1) {
+                                       printf("Restarting child %i for %u-%u\n",
+                                              j, (unsigned)done[j],
+                                              (unsigned)num_loops);
+                                       continue;
+                               }
+                               /* Could not restart: drop the slot below. */
+                               perror("fork");
+                               error_count++;
+                       } else {
+                               printf("child %d exited with signal %d\n",
+                                      (int)pid, WTERMSIG(status));
+                               error_count++;
+                       }
+               } else {
+                       if (WEXITSTATUS(status) != 0) {
+                               printf("child %d exited with status %d\n",
+                                      (int)pid, WEXITSTATUS(status));
+                               error_count++;
+                       }
+               }
+               /*
+                * Remove slot j.  done[] is indexed like pids[], so both
+                * arrays must be compacted together or later restarts
+                * would pick up another child's loop count.
+                */
+               memmove(&pids[j], &pids[j+1],
+                       (num_procs - j - 1)*sizeof(pids[0]));
+               memmove(&done[j], &done[j+1],
+                       (num_procs - j - 1)*sizeof(done[0]));
+               num_procs--;
+       }
+
+       free(pids);
+       free(done);
+
+done:
+       if (error_count == 0) {
+               db = tdb_open_ex(test_tdb, hash_size, TDB_DEFAULT,
+                                O_RDWR, 0, &log_ctx, NULL);
+               if (!db) {
+                       fatal("db open failed");
+               }
+               if (tdb_check(db, NULL, NULL) == -1) {
+                       printf("db check failed\n");
+                       exit(1);
+               }
+               tdb_close(db);
+               printf("OK\n");
+       }
+
+       free(test_tdb);
+       return error_count;
+}
diff --git a/ctdb/lib/tdb/web/index.html b/ctdb/lib/tdb/web/index.html
new file mode 100644 (file)
index 0000000..99e8a2f
--- /dev/null
@@ -0,0 +1,48 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
+<HTML>
+<HEAD>
+<TITLE>tdb</TITLE>
+</HEAD>
+<BODY BGCOLOR="#ffffff" TEXT="#000000" VLINK="#292555" LINK="#292555" ALINK="#cc0033">
+
+<h1>tdb</h1>
+
+TDB is a Trivial Database. In concept, it is very much like GDBM, and BSD's DB 
+except that it allows multiple simultaneous writers and uses locking 
+internally to keep writers from trampling on each other. TDB is also extremely 
+small.
+
+<h2>Download</h2>
+You can download the latest releases of tdb from the <a
+href="http://samba.org/ftp/tdb">tdb directory</a> on the samba public
+source archive.
+
+
+<h2>Discussion and bug reports</h2>
+
+tdb does not currently have its own mailing list or bug tracking
+system. For now, please use the <a
+href="https://lists.samba.org/mailman/listinfo/samba-technical">samba-technical</a>
+mailing list, and the <a href="http://bugzilla.samba.org/">Samba
+bugzilla</a> bug tracking system.
+
+<h2>Development Versions</h2>
+
+You can download the latest code either via git or rsync.<br>
+<br>
+To fetch via git see the following guide:<br>
+<a href="http://wiki.samba.org/index.php/Using_Git_for_Samba_Development">Using Git for Samba Development</a><br>
+Once you have cloned the tree switch to the master branch and cd into the source/lib/tdb directory.<br>
+<br>
+To fetch via rsync use these commands:
+
+<pre>
+  rsync -Pavz samba.org::ftp/unpacked/standalone_projects/lib/tdb .
+  rsync -Pavz samba.org::ftp/unpacked/standalone_projects/lib/replace .
+</pre>
+
+and build in tdb. It will find the replace library in the directory
+above automatically.
+
+</BODY>
+</HTML>
diff --git a/ctdb/lib/tdb/wscript b/ctdb/lib/tdb/wscript
new file mode 100644 (file)
index 0000000..9d309a0
--- /dev/null
@@ -0,0 +1,224 @@
+#!/usr/bin/env python
+
+APPNAME = 'tdb'
+VERSION = '1.2.11'
+
+blddir = 'bin'
+
+import sys, os
+
+# find the buildtools directory
+# Starting from '.', keep prepending '../' until a 'buildtools' directory
+# is found or the candidate path has five '/'-separated components (i.e.
+# four levels up), then put its wafsamba subdirectory first on sys.path.
+srcdir = '.'
+while not os.path.exists(srcdir+'/buildtools') and len(srcdir.split('/')) < 5:
+    srcdir = '../' + srcdir
+sys.path.insert(0, srcdir + '/buildtools/wafsamba')
+
+import wafsamba, samba_dist, Options, Logs
+
+samba_dist.DIST_DIRS('lib/tdb:. lib/replace:lib/replace buildtools:buildtools')
+
+def set_options(opt):
+    # Option-registration hook: sets the bundling defaults for 'replace' and
+    # 'tdb', then recurses into lib/replace.
+    opt.BUILTIN_DEFAULT('replace')
+    opt.PRIVATE_EXTENSION_DEFAULT('tdb', noextension='tdb')
+    opt.RECURSE('lib/replace')
+    # --disable-python is only offered when opt.IN_LAUNCH_DIR() is true;
+    # configure() reads it with getattr() and a False default otherwise.
+    if opt.IN_LAUNCH_DIR():
+        opt.add_option('--disable-python',
+                       help=("disable the pytdb module"),
+                       action="store_true", dest='disable_python', default=False)
+
+
+def configure(conf):
+    # Configuration hook: decides whether tdb and pytdb are built from this
+    # tree or taken from the system, and whether python support is enabled.
+    conf.RECURSE('lib/replace')
+
+    conf.env.standalone_tdb = conf.IN_LAUNCH_DIR()
+    conf.env.building_tdb = True
+
+    # Only a non-standalone build may fall back to a system tdb; when one
+    # is accepted, building_tdb is cleared and a system pytdb is probed too.
+    if not conf.env.standalone_tdb:
+        if conf.CHECK_BUNDLED_SYSTEM_PKG('tdb', minversion=VERSION,
+                                     implied_deps='replace'):
+            conf.define('USING_SYSTEM_TDB', 1)
+            conf.env.building_tdb = False
+            if conf.CHECK_BUNDLED_SYSTEM_PYTHON('pytdb', 'tdb', minversion=VERSION):
+                conf.define('USING_SYSTEM_PYTDB', 1)
+
+    # The option only exists when set_options() registered it, hence the
+    # getattr() with a False default.
+    conf.env.disable_python = getattr(Options.options, 'disable_python', False)
+
+    conf.CHECK_XSLTPROC_MANPAGES()
+
+    if not conf.env.disable_python:
+        # also disable if we don't have the python libs installed
+        conf.find_program('python', var='PYTHON')
+        conf.check_tool('python')
+        conf.check_python_version((2,4,2))
+        conf.SAMBA_CHECK_PYTHON_HEADERS(mandatory=False)
+        if not conf.env.HAVE_PYTHON_H:
+            Logs.warn('Disabling pytdb as python devel libs not found')
+            conf.env.disable_python = True
+
+    conf.SAMBA_CONFIG_H()
+
+    conf.SAMBA_CHECK_UNDEFINED_SYMBOL_FLAGS()
+
+def build(bld):
+    bld.RECURSE('lib/replace')
+
+    COMMON_SRC = bld.SUBDIR('common',
+                            '''check.c error.c tdb.c traverse.c
+                            freelistcheck.c lock.c dump.c freelist.c
+                            io.c open.c transaction.c hash.c summary.c rescue.c''')
+
+    if bld.env.standalone_tdb:
+        bld.env.PKGCONFIGDIR = '${LIBDIR}/pkgconfig'
+        private_library = False
+    else:
+        private_library = True
+
+    if not bld.CONFIG_SET('USING_SYSTEM_TDB'):
+        bld.SAMBA_LIBRARY('tdb',
+                          COMMON_SRC,
+                          deps='replace',
+                          includes='include',
+                          abi_directory='ABI',
+                          abi_match='tdb_*',
+                          hide_symbols=True,
+                          vnum=VERSION,
+                          public_headers='include/tdb.h',
+                          public_headers_install=not private_library,
+                          pc_files='tdb.pc',
+                          private_library=private_library)
+
+        bld.SAMBA_BINARY('tdbtorture',
+                         'tools/tdbtorture.c',
+                         'tdb',
+                         install=False)
+
+        bld.SAMBA_BINARY('tdbrestore',
+                         'tools/tdbrestore.c',
+                         'tdb', manpages='manpages/tdbrestore.8')
+
+        bld.SAMBA_BINARY('tdbdump',
+                         'tools/tdbdump.c',
+                         'tdb', manpages='manpages/tdbdump.8')
+
+        bld.SAMBA_BINARY('tdbbackup',
+                         'tools/tdbbackup.c',
+                         'tdb',
+                         manpages='manpages/tdbbackup.8')
+
+        bld.SAMBA_BINARY('tdbtool',
+                         'tools/tdbtool.c',
+                         'tdb', manpages='manpages/tdbtool.8')
+
+        # FIXME: This hardcoded list is stupid, stupid, stupid.
+        bld.SAMBA_SUBSYSTEM('tdb-test-helpers',
+                            'test/external-agent.c test/lock-tracking.c test/logging.c',
+                            'replace',
+                            includes='include')
+
+        bld.SAMBA_BINARY('tdb1-run-3G-file', 'test/run-3G-file.c',
+                         'replace tdb-test-helpers', includes='include', install=False)
+        bld.SAMBA_BINARY('tdb1-run-bad-tdb-header', 'test/run-bad-tdb-header.c',
+                         'replace tdb-test-helpers', includes='include', install=False)
+        bld.SAMBA_BINARY('tdb1-run', 'test/run.c',
+                         'replace tdb-test-helpers', includes='include', install=False)
+        bld.SAMBA_BINARY('tdb1-run-check', 'test/run-check.c',
+                         'replace tdb-test-helpers', includes='include', install=False)
+        bld.SAMBA_BINARY('tdb1-run-corrupt', 'test/run-corrupt.c',
+                         'replace tdb-test-helpers', includes='include', install=False)
+        bld.SAMBA_BINARY('tdb1-run-die-during-transaction', 'test/run-die-during-transaction.c',
+                         'replace tdb-test-helpers', includes='include', install=False)
+        bld.SAMBA_BINARY('tdb1-run-endian', 'test/run-endian.c',
+                         'replace tdb-test-helpers', includes='include', install=False)
+        bld.SAMBA_BINARY('tdb1-run-incompatible', 'test/run-incompatible.c',
+                         'replace tdb-test-helpers', includes='include', install=False)
+        bld.SAMBA_BINARY('tdb1-run-nested-transactions', 'test/run-nested-transactions.c',
+                         'replace tdb-test-helpers', includes='include', install=False)
+        bld.SAMBA_BINARY('tdb1-run-nested-traverse', 'test/run-nested-traverse.c',
+                         'replace tdb-test-helpers', includes='include', install=False)
+        bld.SAMBA_BINARY('tdb1-run-no-lock-during-traverse', 'test/run-no-lock-during-traverse.c',
+                         'replace tdb-test-helpers', includes='include', install=False)
+        bld.SAMBA_BINARY('tdb1-run-oldhash', 'test/run-oldhash.c',
+                         'replace tdb-test-helpers', includes='include', install=False)
+        bld.SAMBA_BINARY('tdb1-run-open-during-transaction', 'test/run-open-during-transaction.c',
+                         'replace tdb-test-helpers', includes='include', install=False)
+        bld.SAMBA_BINARY('tdb1-run-readonly-check', 'test/run-readonly-check.c',
+                         'replace tdb-test-helpers', includes='include', install=False)
+        bld.SAMBA_BINARY('tdb1-run-rescue', 'test/run-rescue.c',
+                         'replace tdb-test-helpers', includes='include', install=False)
+        bld.SAMBA_BINARY('tdb1-run-rescue-find_entry', 'test/run-rescue-find_entry.c',
+                         'replace tdb-test-helpers', includes='include', install=False)
+        bld.SAMBA_BINARY('tdb1-run-rwlock-check', 'test/run-rwlock-check.c',
+                         'replace tdb-test-helpers', includes='include', install=False)
+        bld.SAMBA_BINARY('tdb1-run-summary', 'test/run-summary.c',
+                         'replace tdb-test-helpers', includes='include', install=False)
+        bld.SAMBA_BINARY('tdb1-run-transaction-expand', 'test/run-transaction-expand.c',
+                         'replace tdb-test-helpers', includes='include', install=False)
+        bld.SAMBA_BINARY('tdb1-run-traverse-in-transaction', 'test/run-traverse-in-transaction.c',
+                         'replace tdb-test-helpers', includes='include', install=False)
+        bld.SAMBA_BINARY('tdb1-run-wronghash-fail', 'test/run-wronghash-fail.c',
+                         'replace tdb-test-helpers', includes='include', install=False)
+        bld.SAMBA_BINARY('tdb1-run-zero-append', 'test/run-zero-append.c',
+                         'replace tdb-test-helpers', includes='include', install=False)
+
+    if not bld.CONFIG_SET('USING_SYSTEM_PYTDB'):
+        bld.SAMBA_PYTHON('pytdb',
+                         'pytdb.c',
+                         deps='tdb',
+                         enabled=not bld.env.disable_python,
+                         realname='tdb.so',
+                         cflags='-DPACKAGE_VERSION=\"%s\"' % VERSION)
+
+def testonly(ctx):
+    '''run tdb testsuite'''
+    import Utils, samba_utils, shutil
+    ecode = 0
+
+    test_prefix = "%s/st" % (Utils.g_module.blddir)
+    shutil.rmtree(test_prefix, ignore_errors=True)
+    os.makedirs(test_prefix)
+    os.environ['TEST_DATA_PREFIX'] = test_prefix
+
+    env = samba_utils.LOAD_ENVIRONMENT()
+    # FIXME: This is horrible :(
+    if env.building_tdb:
+        # Create scratch directory for tests.
+        testdir = os.path.join(test_prefix, 'tdb-tests')
+        samba_utils.mkdir_p(testdir)
+        # Symlink back to source dir so it can find tests in test/
+        link = os.path.join(testdir, 'test')
+        if not os.path.exists(link):
+            os.symlink(os.path.abspath(os.path.join(env.cwd, 'test')), link)
+
+        for f in 'tdb1-run-3G-file', 'tdb1-run-bad-tdb-header', 'tdb1-run', 'tdb1-run-check', 'tdb1-run-corrupt', 'tdb1-run-die-during-transaction', 'tdb1-run-endian', 'tdb1-run-incompatible', 'tdb1-run-nested-transactions', 'tdb1-run-nested-traverse', 'tdb1-run-no-lock-during-traverse', 'tdb1-run-oldhash', 'tdb1-run-open-during-transaction', 'tdb1-run-readonly-check', 'tdb1-run-rescue', 'tdb1-run-rescue-find_entry', 'tdb1-run-rwlock-check', 'tdb1-run-summary', 'tdb1-run-transaction-expand', 'tdb1-run-traverse-in-transaction', 'tdb1-run-wronghash-fail', 'tdb1-run-zero-append':
+            cmd = "cd " + testdir + " && " + os.path.abspath(os.path.join(Utils.g_module.blddir, f)) + " > test-output 2>&1"
+            print("..." + f)
+            ret = samba_utils.RUN_COMMAND(cmd)
+            if ret != 0:
+                print("%s failed:" % f)
+                samba_utils.RUN_COMMAND("cat " + os.path.join(testdir, 'test-output'))
+                ecode = ret
+                break
+
+    if ecode == 0:
+        cmd = os.path.join(Utils.g_module.blddir, 'tdbtorture')
+        ret = samba_utils.RUN_COMMAND(cmd)
+        print("testsuite returned %d" % ret)
+        if ret != 0:
+            ecode = ret
+    sys.exit(ecode)
+
+# WAF doesn't build the unit tests for this, maybe because they don't link with tdb?
+# This forces it
+def test(ctx):
+    # Queue the 'build' command followed by 'testonly' on Scripting.commands,
+    # so the binaries are built before the testsuite runs.
+    import Scripting
+    Scripting.commands.append('build')
+    Scripting.commands.append('testonly')
+
+def dist():
+    '''makes a tarball for distribution'''
+    # All of the work is delegated to samba_dist.dist().
+    samba_dist.dist()
+
+def reconfigure(ctx):
+    '''reconfigure if config scripts have changed'''
+    # Imported here rather than at module level; the check itself is done
+    # by samba_utils.reconfigure().
+    import samba_utils
+    samba_utils.reconfigure(ctx)
diff --git a/ctdb/lib/tevent/ABI/tevent-0.9.10.sigs b/ctdb/lib/tevent/ABI/tevent-0.9.10.sigs
new file mode 100644 (file)
index 0000000..9adaba5
--- /dev/null
@@ -0,0 +1,73 @@
+_tevent_add_fd: struct tevent_fd *(struct tevent_context *, TALLOC_CTX *, int, uint16_t, tevent_fd_handler_t, void *, const char *, const char *)
+_tevent_add_signal: struct tevent_signal *(struct tevent_context *, TALLOC_CTX *, int, int, tevent_signal_handler_t, void *, const char *, const char *)
+_tevent_add_timer: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+_tevent_create_immediate: struct tevent_immediate *(TALLOC_CTX *, const char *)
+_tevent_loop_once: int (struct tevent_context *, const char *)
+_tevent_loop_until: int (struct tevent_context *, bool (*)(void *), void *, const char *)
+_tevent_loop_wait: int (struct tevent_context *, const char *)
+_tevent_queue_create: struct tevent_queue *(TALLOC_CTX *, const char *, const char *)
+_tevent_req_callback_data: void *(struct tevent_req *)
+_tevent_req_cancel: bool (struct tevent_req *, const char *)
+_tevent_req_create: struct tevent_req *(TALLOC_CTX *, void *, size_t, const char *, const char *)
+_tevent_req_data: void *(struct tevent_req *)
+_tevent_req_done: void (struct tevent_req *, const char *)
+_tevent_req_error: bool (struct tevent_req *, uint64_t, const char *)
+_tevent_req_nomem: bool (const void *, struct tevent_req *, const char *)
+_tevent_req_notify_callback: void (struct tevent_req *, const char *)
+_tevent_schedule_immediate: void (struct tevent_immediate *, struct tevent_context *, tevent_immediate_handler_t, void *, const char *, const char *)
+tevent_backend_list: const char **(TALLOC_CTX *)
+tevent_cleanup_pending_signal_handlers: void (struct tevent_signal *)
+tevent_common_add_fd: struct tevent_fd *(struct tevent_context *, TALLOC_CTX *, int, uint16_t, tevent_fd_handler_t, void *, const char *, const char *)
+tevent_common_add_signal: struct tevent_signal *(struct tevent_context *, TALLOC_CTX *, int, int, tevent_signal_handler_t, void *, const char *, const char *)
+tevent_common_add_timer: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+tevent_common_check_signal: int (struct tevent_context *)
+tevent_common_context_destructor: int (struct tevent_context *)
+tevent_common_fd_destructor: int (struct tevent_fd *)
+tevent_common_fd_get_flags: uint16_t (struct tevent_fd *)
+tevent_common_fd_set_close_fn: void (struct tevent_fd *, tevent_fd_close_fn_t)
+tevent_common_fd_set_flags: void (struct tevent_fd *, uint16_t)
+tevent_common_loop_immediate: bool (struct tevent_context *)
+tevent_common_loop_timer_delay: struct timeval (struct tevent_context *)
+tevent_common_loop_wait: int (struct tevent_context *, const char *)
+tevent_common_schedule_immediate: void (struct tevent_immediate *, struct tevent_context *, tevent_immediate_handler_t, void *, const char *, const char *)
+tevent_context_init: struct tevent_context *(TALLOC_CTX *)
+tevent_context_init_byname: struct tevent_context *(TALLOC_CTX *, const char *)
+tevent_debug: void (struct tevent_context *, enum tevent_debug_level, const char *, ...)
+tevent_fd_get_flags: uint16_t (struct tevent_fd *)
+tevent_fd_set_auto_close: void (struct tevent_fd *)
+tevent_fd_set_close_fn: void (struct tevent_fd *, tevent_fd_close_fn_t)
+tevent_fd_set_flags: void (struct tevent_fd *, uint16_t)
+tevent_loop_allow_nesting: void (struct tevent_context *)
+tevent_loop_set_nesting_hook: void (struct tevent_context *, tevent_nesting_hook, void *)
+tevent_queue_add: bool (struct tevent_queue *, struct tevent_context *, struct tevent_req *, tevent_queue_trigger_fn_t, void *)
+tevent_queue_length: size_t (struct tevent_queue *)
+tevent_queue_start: void (struct tevent_queue *)
+tevent_queue_stop: void (struct tevent_queue *)
+tevent_re_initialise: int (struct tevent_context *)
+tevent_register_backend: bool (const char *, const struct tevent_ops *)
+tevent_req_default_print: char *(struct tevent_req *, TALLOC_CTX *)
+tevent_req_is_error: bool (struct tevent_req *, enum tevent_req_state *, uint64_t *)
+tevent_req_is_in_progress: bool (struct tevent_req *)
+tevent_req_poll: bool (struct tevent_req *, struct tevent_context *)
+tevent_req_post: struct tevent_req *(struct tevent_req *, struct tevent_context *)
+tevent_req_print: char *(TALLOC_CTX *, struct tevent_req *)
+tevent_req_received: void (struct tevent_req *)
+tevent_req_set_callback: void (struct tevent_req *, tevent_req_fn, void *)
+tevent_req_set_cancel_fn: void (struct tevent_req *, tevent_req_cancel_fn)
+tevent_req_set_endtime: bool (struct tevent_req *, struct tevent_context *, struct timeval)
+tevent_req_set_print_fn: void (struct tevent_req *, tevent_req_print_fn)
+tevent_set_abort_fn: void (void (*)(const char *))
+tevent_set_debug: int (struct tevent_context *, void (*)(void *, enum tevent_debug_level, const char *, va_list), void *)
+tevent_set_debug_stderr: int (struct tevent_context *)
+tevent_set_default_backend: void (const char *)
+tevent_signal_support: bool (struct tevent_context *)
+tevent_timeval_add: struct timeval (const struct timeval *, uint32_t, uint32_t)
+tevent_timeval_compare: int (const struct timeval *, const struct timeval *)
+tevent_timeval_current: struct timeval (void)
+tevent_timeval_current_ofs: struct timeval (uint32_t, uint32_t)
+tevent_timeval_is_zero: bool (const struct timeval *)
+tevent_timeval_set: struct timeval (uint32_t, uint32_t)
+tevent_timeval_until: struct timeval (const struct timeval *, const struct timeval *)
+tevent_timeval_zero: struct timeval (void)
+tevent_wakeup_recv: bool (struct tevent_req *)
+tevent_wakeup_send: struct tevent_req *(TALLOC_CTX *, struct tevent_context *, struct timeval)
diff --git a/ctdb/lib/tevent/ABI/tevent-0.9.11.sigs b/ctdb/lib/tevent/ABI/tevent-0.9.11.sigs
new file mode 100644 (file)
index 0000000..9adaba5
--- /dev/null
@@ -0,0 +1,73 @@
+_tevent_add_fd: struct tevent_fd *(struct tevent_context *, TALLOC_CTX *, int, uint16_t, tevent_fd_handler_t, void *, const char *, const char *)
+_tevent_add_signal: struct tevent_signal *(struct tevent_context *, TALLOC_CTX *, int, int, tevent_signal_handler_t, void *, const char *, const char *)
+_tevent_add_timer: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+_tevent_create_immediate: struct tevent_immediate *(TALLOC_CTX *, const char *)
+_tevent_loop_once: int (struct tevent_context *, const char *)
+_tevent_loop_until: int (struct tevent_context *, bool (*)(void *), void *, const char *)
+_tevent_loop_wait: int (struct tevent_context *, const char *)
+_tevent_queue_create: struct tevent_queue *(TALLOC_CTX *, const char *, const char *)
+_tevent_req_callback_data: void *(struct tevent_req *)
+_tevent_req_cancel: bool (struct tevent_req *, const char *)
+_tevent_req_create: struct tevent_req *(TALLOC_CTX *, void *, size_t, const char *, const char *)
+_tevent_req_data: void *(struct tevent_req *)
+_tevent_req_done: void (struct tevent_req *, const char *)
+_tevent_req_error: bool (struct tevent_req *, uint64_t, const char *)
+_tevent_req_nomem: bool (const void *, struct tevent_req *, const char *)
+_tevent_req_notify_callback: void (struct tevent_req *, const char *)
+_tevent_schedule_immediate: void (struct tevent_immediate *, struct tevent_context *, tevent_immediate_handler_t, void *, const char *, const char *)
+tevent_backend_list: const char **(TALLOC_CTX *)
+tevent_cleanup_pending_signal_handlers: void (struct tevent_signal *)
+tevent_common_add_fd: struct tevent_fd *(struct tevent_context *, TALLOC_CTX *, int, uint16_t, tevent_fd_handler_t, void *, const char *, const char *)
+tevent_common_add_signal: struct tevent_signal *(struct tevent_context *, TALLOC_CTX *, int, int, tevent_signal_handler_t, void *, const char *, const char *)
+tevent_common_add_timer: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+tevent_common_check_signal: int (struct tevent_context *)
+tevent_common_context_destructor: int (struct tevent_context *)
+tevent_common_fd_destructor: int (struct tevent_fd *)
+tevent_common_fd_get_flags: uint16_t (struct tevent_fd *)
+tevent_common_fd_set_close_fn: void (struct tevent_fd *, tevent_fd_close_fn_t)
+tevent_common_fd_set_flags: void (struct tevent_fd *, uint16_t)
+tevent_common_loop_immediate: bool (struct tevent_context *)
+tevent_common_loop_timer_delay: struct timeval (struct tevent_context *)
+tevent_common_loop_wait: int (struct tevent_context *, const char *)
+tevent_common_schedule_immediate: void (struct tevent_immediate *, struct tevent_context *, tevent_immediate_handler_t, void *, const char *, const char *)
+tevent_context_init: struct tevent_context *(TALLOC_CTX *)
+tevent_context_init_byname: struct tevent_context *(TALLOC_CTX *, const char *)
+tevent_debug: void (struct tevent_context *, enum tevent_debug_level, const char *, ...)
+tevent_fd_get_flags: uint16_t (struct tevent_fd *)
+tevent_fd_set_auto_close: void (struct tevent_fd *)
+tevent_fd_set_close_fn: void (struct tevent_fd *, tevent_fd_close_fn_t)
+tevent_fd_set_flags: void (struct tevent_fd *, uint16_t)
+tevent_loop_allow_nesting: void (struct tevent_context *)
+tevent_loop_set_nesting_hook: void (struct tevent_context *, tevent_nesting_hook, void *)
+tevent_queue_add: bool (struct tevent_queue *, struct tevent_context *, struct tevent_req *, tevent_queue_trigger_fn_t, void *)
+tevent_queue_length: size_t (struct tevent_queue *)
+tevent_queue_start: void (struct tevent_queue *)
+tevent_queue_stop: void (struct tevent_queue *)
+tevent_re_initialise: int (struct tevent_context *)
+tevent_register_backend: bool (const char *, const struct tevent_ops *)
+tevent_req_default_print: char *(struct tevent_req *, TALLOC_CTX *)
+tevent_req_is_error: bool (struct tevent_req *, enum tevent_req_state *, uint64_t *)
+tevent_req_is_in_progress: bool (struct tevent_req *)
+tevent_req_poll: bool (struct tevent_req *, struct tevent_context *)
+tevent_req_post: struct tevent_req *(struct tevent_req *, struct tevent_context *)
+tevent_req_print: char *(TALLOC_CTX *, struct tevent_req *)
+tevent_req_received: void (struct tevent_req *)
+tevent_req_set_callback: void (struct tevent_req *, tevent_req_fn, void *)
+tevent_req_set_cancel_fn: void (struct tevent_req *, tevent_req_cancel_fn)
+tevent_req_set_endtime: bool (struct tevent_req *, struct tevent_context *, struct timeval)
+tevent_req_set_print_fn: void (struct tevent_req *, tevent_req_print_fn)
+tevent_set_abort_fn: void (void (*)(const char *))
+tevent_set_debug: int (struct tevent_context *, void (*)(void *, enum tevent_debug_level, const char *, va_list), void *)
+tevent_set_debug_stderr: int (struct tevent_context *)
+tevent_set_default_backend: void (const char *)
+tevent_signal_support: bool (struct tevent_context *)
+tevent_timeval_add: struct timeval (const struct timeval *, uint32_t, uint32_t)
+tevent_timeval_compare: int (const struct timeval *, const struct timeval *)
+tevent_timeval_current: struct timeval (void)
+tevent_timeval_current_ofs: struct timeval (uint32_t, uint32_t)
+tevent_timeval_is_zero: bool (const struct timeval *)
+tevent_timeval_set: struct timeval (uint32_t, uint32_t)
+tevent_timeval_until: struct timeval (const struct timeval *, const struct timeval *)
+tevent_timeval_zero: struct timeval (void)
+tevent_wakeup_recv: bool (struct tevent_req *)
+tevent_wakeup_send: struct tevent_req *(TALLOC_CTX *, struct tevent_context *, struct timeval)
diff --git a/ctdb/lib/tevent/ABI/tevent-0.9.12.sigs b/ctdb/lib/tevent/ABI/tevent-0.9.12.sigs
new file mode 100644 (file)
index 0000000..df9b08d
--- /dev/null
@@ -0,0 +1,74 @@
+_tevent_add_fd: struct tevent_fd *(struct tevent_context *, TALLOC_CTX *, int, uint16_t, tevent_fd_handler_t, void *, const char *, const char *)
+_tevent_add_signal: struct tevent_signal *(struct tevent_context *, TALLOC_CTX *, int, int, tevent_signal_handler_t, void *, const char *, const char *)
+_tevent_add_timer: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+_tevent_create_immediate: struct tevent_immediate *(TALLOC_CTX *, const char *)
+_tevent_loop_once: int (struct tevent_context *, const char *)
+_tevent_loop_until: int (struct tevent_context *, bool (*)(void *), void *, const char *)
+_tevent_loop_wait: int (struct tevent_context *, const char *)
+_tevent_queue_create: struct tevent_queue *(TALLOC_CTX *, const char *, const char *)
+_tevent_req_callback_data: void *(struct tevent_req *)
+_tevent_req_cancel: bool (struct tevent_req *, const char *)
+_tevent_req_create: struct tevent_req *(TALLOC_CTX *, void *, size_t, const char *, const char *)
+_tevent_req_data: void *(struct tevent_req *)
+_tevent_req_done: void (struct tevent_req *, const char *)
+_tevent_req_error: bool (struct tevent_req *, uint64_t, const char *)
+_tevent_req_nomem: bool (const void *, struct tevent_req *, const char *)
+_tevent_req_notify_callback: void (struct tevent_req *, const char *)
+_tevent_req_oom: void (struct tevent_req *, const char *)
+_tevent_schedule_immediate: void (struct tevent_immediate *, struct tevent_context *, tevent_immediate_handler_t, void *, const char *, const char *)
+tevent_backend_list: const char **(TALLOC_CTX *)
+tevent_cleanup_pending_signal_handlers: void (struct tevent_signal *)
+tevent_common_add_fd: struct tevent_fd *(struct tevent_context *, TALLOC_CTX *, int, uint16_t, tevent_fd_handler_t, void *, const char *, const char *)
+tevent_common_add_signal: struct tevent_signal *(struct tevent_context *, TALLOC_CTX *, int, int, tevent_signal_handler_t, void *, const char *, const char *)
+tevent_common_add_timer: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+tevent_common_check_signal: int (struct tevent_context *)
+tevent_common_context_destructor: int (struct tevent_context *)
+tevent_common_fd_destructor: int (struct tevent_fd *)
+tevent_common_fd_get_flags: uint16_t (struct tevent_fd *)
+tevent_common_fd_set_close_fn: void (struct tevent_fd *, tevent_fd_close_fn_t)
+tevent_common_fd_set_flags: void (struct tevent_fd *, uint16_t)
+tevent_common_loop_immediate: bool (struct tevent_context *)
+tevent_common_loop_timer_delay: struct timeval (struct tevent_context *)
+tevent_common_loop_wait: int (struct tevent_context *, const char *)
+tevent_common_schedule_immediate: void (struct tevent_immediate *, struct tevent_context *, tevent_immediate_handler_t, void *, const char *, const char *)
+tevent_context_init: struct tevent_context *(TALLOC_CTX *)
+tevent_context_init_byname: struct tevent_context *(TALLOC_CTX *, const char *)
+tevent_debug: void (struct tevent_context *, enum tevent_debug_level, const char *, ...)
+tevent_fd_get_flags: uint16_t (struct tevent_fd *)
+tevent_fd_set_auto_close: void (struct tevent_fd *)
+tevent_fd_set_close_fn: void (struct tevent_fd *, tevent_fd_close_fn_t)
+tevent_fd_set_flags: void (struct tevent_fd *, uint16_t)
+tevent_loop_allow_nesting: void (struct tevent_context *)
+tevent_loop_set_nesting_hook: void (struct tevent_context *, tevent_nesting_hook, void *)
+tevent_queue_add: bool (struct tevent_queue *, struct tevent_context *, struct tevent_req *, tevent_queue_trigger_fn_t, void *)
+tevent_queue_length: size_t (struct tevent_queue *)
+tevent_queue_start: void (struct tevent_queue *)
+tevent_queue_stop: void (struct tevent_queue *)
+tevent_re_initialise: int (struct tevent_context *)
+tevent_register_backend: bool (const char *, const struct tevent_ops *)
+tevent_req_default_print: char *(struct tevent_req *, TALLOC_CTX *)
+tevent_req_is_error: bool (struct tevent_req *, enum tevent_req_state *, uint64_t *)
+tevent_req_is_in_progress: bool (struct tevent_req *)
+tevent_req_poll: bool (struct tevent_req *, struct tevent_context *)
+tevent_req_post: struct tevent_req *(struct tevent_req *, struct tevent_context *)
+tevent_req_print: char *(TALLOC_CTX *, struct tevent_req *)
+tevent_req_received: void (struct tevent_req *)
+tevent_req_set_callback: void (struct tevent_req *, tevent_req_fn, void *)
+tevent_req_set_cancel_fn: void (struct tevent_req *, tevent_req_cancel_fn)
+tevent_req_set_endtime: bool (struct tevent_req *, struct tevent_context *, struct timeval)
+tevent_req_set_print_fn: void (struct tevent_req *, tevent_req_print_fn)
+tevent_set_abort_fn: void (void (*)(const char *))
+tevent_set_debug: int (struct tevent_context *, void (*)(void *, enum tevent_debug_level, const char *, va_list), void *)
+tevent_set_debug_stderr: int (struct tevent_context *)
+tevent_set_default_backend: void (const char *)
+tevent_signal_support: bool (struct tevent_context *)
+tevent_timeval_add: struct timeval (const struct timeval *, uint32_t, uint32_t)
+tevent_timeval_compare: int (const struct timeval *, const struct timeval *)
+tevent_timeval_current: struct timeval (void)
+tevent_timeval_current_ofs: struct timeval (uint32_t, uint32_t)
+tevent_timeval_is_zero: bool (const struct timeval *)
+tevent_timeval_set: struct timeval (uint32_t, uint32_t)
+tevent_timeval_until: struct timeval (const struct timeval *, const struct timeval *)
+tevent_timeval_zero: struct timeval (void)
+tevent_wakeup_recv: bool (struct tevent_req *)
+tevent_wakeup_send: struct tevent_req *(TALLOC_CTX *, struct tevent_context *, struct timeval)
diff --git a/ctdb/lib/tevent/ABI/tevent-0.9.13.sigs b/ctdb/lib/tevent/ABI/tevent-0.9.13.sigs
new file mode 100644 (file)
index 0000000..888ca0e
--- /dev/null
@@ -0,0 +1,75 @@
+_tevent_add_fd: struct tevent_fd *(struct tevent_context *, TALLOC_CTX *, int, uint16_t, tevent_fd_handler_t, void *, const char *, const char *)
+_tevent_add_signal: struct tevent_signal *(struct tevent_context *, TALLOC_CTX *, int, int, tevent_signal_handler_t, void *, const char *, const char *)
+_tevent_add_timer: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+_tevent_create_immediate: struct tevent_immediate *(TALLOC_CTX *, const char *)
+_tevent_loop_once: int (struct tevent_context *, const char *)
+_tevent_loop_until: int (struct tevent_context *, bool (*)(void *), void *, const char *)
+_tevent_loop_wait: int (struct tevent_context *, const char *)
+_tevent_queue_create: struct tevent_queue *(TALLOC_CTX *, const char *, const char *)
+_tevent_req_callback_data: void *(struct tevent_req *)
+_tevent_req_cancel: bool (struct tevent_req *, const char *)
+_tevent_req_create: struct tevent_req *(TALLOC_CTX *, void *, size_t, const char *, const char *)
+_tevent_req_data: void *(struct tevent_req *)
+_tevent_req_done: void (struct tevent_req *, const char *)
+_tevent_req_error: bool (struct tevent_req *, uint64_t, const char *)
+_tevent_req_nomem: bool (const void *, struct tevent_req *, const char *)
+_tevent_req_notify_callback: void (struct tevent_req *, const char *)
+_tevent_req_oom: void (struct tevent_req *, const char *)
+_tevent_schedule_immediate: void (struct tevent_immediate *, struct tevent_context *, tevent_immediate_handler_t, void *, const char *, const char *)
+tevent_backend_list: const char **(TALLOC_CTX *)
+tevent_cleanup_pending_signal_handlers: void (struct tevent_signal *)
+tevent_common_add_fd: struct tevent_fd *(struct tevent_context *, TALLOC_CTX *, int, uint16_t, tevent_fd_handler_t, void *, const char *, const char *)
+tevent_common_add_signal: struct tevent_signal *(struct tevent_context *, TALLOC_CTX *, int, int, tevent_signal_handler_t, void *, const char *, const char *)
+tevent_common_add_timer: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+tevent_common_check_signal: int (struct tevent_context *)
+tevent_common_context_destructor: int (struct tevent_context *)
+tevent_common_fd_destructor: int (struct tevent_fd *)
+tevent_common_fd_get_flags: uint16_t (struct tevent_fd *)
+tevent_common_fd_set_close_fn: void (struct tevent_fd *, tevent_fd_close_fn_t)
+tevent_common_fd_set_flags: void (struct tevent_fd *, uint16_t)
+tevent_common_loop_immediate: bool (struct tevent_context *)
+tevent_common_loop_timer_delay: struct timeval (struct tevent_context *)
+tevent_common_loop_wait: int (struct tevent_context *, const char *)
+tevent_common_schedule_immediate: void (struct tevent_immediate *, struct tevent_context *, tevent_immediate_handler_t, void *, const char *, const char *)
+tevent_context_init: struct tevent_context *(TALLOC_CTX *)
+tevent_context_init_byname: struct tevent_context *(TALLOC_CTX *, const char *)
+tevent_debug: void (struct tevent_context *, enum tevent_debug_level, const char *, ...)
+tevent_fd_get_flags: uint16_t (struct tevent_fd *)
+tevent_fd_set_auto_close: void (struct tevent_fd *)
+tevent_fd_set_close_fn: void (struct tevent_fd *, tevent_fd_close_fn_t)
+tevent_fd_set_flags: void (struct tevent_fd *, uint16_t)
+tevent_loop_allow_nesting: void (struct tevent_context *)
+tevent_loop_set_nesting_hook: void (struct tevent_context *, tevent_nesting_hook, void *)
+tevent_queue_add: bool (struct tevent_queue *, struct tevent_context *, struct tevent_req *, tevent_queue_trigger_fn_t, void *)
+tevent_queue_length: size_t (struct tevent_queue *)
+tevent_queue_start: void (struct tevent_queue *)
+tevent_queue_stop: void (struct tevent_queue *)
+tevent_re_initialise: int (struct tevent_context *)
+tevent_register_backend: bool (const char *, const struct tevent_ops *)
+tevent_req_default_print: char *(struct tevent_req *, TALLOC_CTX *)
+tevent_req_defer_callback: void (struct tevent_req *, struct tevent_context *)
+tevent_req_is_error: bool (struct tevent_req *, enum tevent_req_state *, uint64_t *)
+tevent_req_is_in_progress: bool (struct tevent_req *)
+tevent_req_poll: bool (struct tevent_req *, struct tevent_context *)
+tevent_req_post: struct tevent_req *(struct tevent_req *, struct tevent_context *)
+tevent_req_print: char *(TALLOC_CTX *, struct tevent_req *)
+tevent_req_received: void (struct tevent_req *)
+tevent_req_set_callback: void (struct tevent_req *, tevent_req_fn, void *)
+tevent_req_set_cancel_fn: void (struct tevent_req *, tevent_req_cancel_fn)
+tevent_req_set_endtime: bool (struct tevent_req *, struct tevent_context *, struct timeval)
+tevent_req_set_print_fn: void (struct tevent_req *, tevent_req_print_fn)
+tevent_set_abort_fn: void (void (*)(const char *))
+tevent_set_debug: int (struct tevent_context *, void (*)(void *, enum tevent_debug_level, const char *, va_list), void *)
+tevent_set_debug_stderr: int (struct tevent_context *)
+tevent_set_default_backend: void (const char *)
+tevent_signal_support: bool (struct tevent_context *)
+tevent_timeval_add: struct timeval (const struct timeval *, uint32_t, uint32_t)
+tevent_timeval_compare: int (const struct timeval *, const struct timeval *)
+tevent_timeval_current: struct timeval (void)
+tevent_timeval_current_ofs: struct timeval (uint32_t, uint32_t)
+tevent_timeval_is_zero: bool (const struct timeval *)
+tevent_timeval_set: struct timeval (uint32_t, uint32_t)
+tevent_timeval_until: struct timeval (const struct timeval *, const struct timeval *)
+tevent_timeval_zero: struct timeval (void)
+tevent_wakeup_recv: bool (struct tevent_req *)
+tevent_wakeup_send: struct tevent_req *(TALLOC_CTX *, struct tevent_context *, struct timeval)
diff --git a/ctdb/lib/tevent/ABI/tevent-0.9.14.sigs b/ctdb/lib/tevent/ABI/tevent-0.9.14.sigs
new file mode 100644 (file)
index 0000000..13c461c
--- /dev/null
@@ -0,0 +1,78 @@
+_tevent_add_fd: struct tevent_fd *(struct tevent_context *, TALLOC_CTX *, int, uint16_t, tevent_fd_handler_t, void *, const char *, const char *)
+_tevent_add_signal: struct tevent_signal *(struct tevent_context *, TALLOC_CTX *, int, int, tevent_signal_handler_t, void *, const char *, const char *)
+_tevent_add_timer: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+_tevent_create_immediate: struct tevent_immediate *(TALLOC_CTX *, const char *)
+_tevent_loop_once: int (struct tevent_context *, const char *)
+_tevent_loop_until: int (struct tevent_context *, bool (*)(void *), void *, const char *)
+_tevent_loop_wait: int (struct tevent_context *, const char *)
+_tevent_queue_create: struct tevent_queue *(TALLOC_CTX *, const char *, const char *)
+_tevent_req_callback_data: void *(struct tevent_req *)
+_tevent_req_cancel: bool (struct tevent_req *, const char *)
+_tevent_req_create: struct tevent_req *(TALLOC_CTX *, void *, size_t, const char *, const char *)
+_tevent_req_data: void *(struct tevent_req *)
+_tevent_req_done: void (struct tevent_req *, const char *)
+_tevent_req_error: bool (struct tevent_req *, uint64_t, const char *)
+_tevent_req_nomem: bool (const void *, struct tevent_req *, const char *)
+_tevent_req_notify_callback: void (struct tevent_req *, const char *)
+_tevent_req_oom: void (struct tevent_req *, const char *)
+_tevent_schedule_immediate: void (struct tevent_immediate *, struct tevent_context *, tevent_immediate_handler_t, void *, const char *, const char *)
+tevent_backend_list: const char **(TALLOC_CTX *)
+tevent_cleanup_pending_signal_handlers: void (struct tevent_signal *)
+tevent_common_add_fd: struct tevent_fd *(struct tevent_context *, TALLOC_CTX *, int, uint16_t, tevent_fd_handler_t, void *, const char *, const char *)
+tevent_common_add_signal: struct tevent_signal *(struct tevent_context *, TALLOC_CTX *, int, int, tevent_signal_handler_t, void *, const char *, const char *)
+tevent_common_add_timer: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+tevent_common_check_signal: int (struct tevent_context *)
+tevent_common_context_destructor: int (struct tevent_context *)
+tevent_common_fd_destructor: int (struct tevent_fd *)
+tevent_common_fd_get_flags: uint16_t (struct tevent_fd *)
+tevent_common_fd_set_close_fn: void (struct tevent_fd *, tevent_fd_close_fn_t)
+tevent_common_fd_set_flags: void (struct tevent_fd *, uint16_t)
+tevent_common_loop_immediate: bool (struct tevent_context *)
+tevent_common_loop_timer_delay: struct timeval (struct tevent_context *)
+tevent_common_loop_wait: int (struct tevent_context *, const char *)
+tevent_common_schedule_immediate: void (struct tevent_immediate *, struct tevent_context *, tevent_immediate_handler_t, void *, const char *, const char *)
+tevent_context_init: struct tevent_context *(TALLOC_CTX *)
+tevent_context_init_byname: struct tevent_context *(TALLOC_CTX *, const char *)
+tevent_debug: void (struct tevent_context *, enum tevent_debug_level, const char *, ...)
+tevent_fd_get_flags: uint16_t (struct tevent_fd *)
+tevent_fd_set_auto_close: void (struct tevent_fd *)
+tevent_fd_set_close_fn: void (struct tevent_fd *, tevent_fd_close_fn_t)
+tevent_fd_set_flags: void (struct tevent_fd *, uint16_t)
+tevent_loop_allow_nesting: void (struct tevent_context *)
+tevent_loop_set_nesting_hook: void (struct tevent_context *, tevent_nesting_hook, void *)
+tevent_queue_add: bool (struct tevent_queue *, struct tevent_context *, struct tevent_req *, tevent_queue_trigger_fn_t, void *)
+tevent_queue_add_entry: struct tevent_queue_entry *(struct tevent_queue *, struct tevent_context *, struct tevent_req *, tevent_queue_trigger_fn_t, void *)
+tevent_queue_add_optimize_empty: struct tevent_queue_entry *(struct tevent_queue *, struct tevent_context *, struct tevent_req *, tevent_queue_trigger_fn_t, void *)
+tevent_queue_length: size_t (struct tevent_queue *)
+tevent_queue_running: bool (struct tevent_queue *)
+tevent_queue_start: void (struct tevent_queue *)
+tevent_queue_stop: void (struct tevent_queue *)
+tevent_re_initialise: int (struct tevent_context *)
+tevent_register_backend: bool (const char *, const struct tevent_ops *)
+tevent_req_default_print: char *(struct tevent_req *, TALLOC_CTX *)
+tevent_req_defer_callback: void (struct tevent_req *, struct tevent_context *)
+tevent_req_is_error: bool (struct tevent_req *, enum tevent_req_state *, uint64_t *)
+tevent_req_is_in_progress: bool (struct tevent_req *)
+tevent_req_poll: bool (struct tevent_req *, struct tevent_context *)
+tevent_req_post: struct tevent_req *(struct tevent_req *, struct tevent_context *)
+tevent_req_print: char *(TALLOC_CTX *, struct tevent_req *)
+tevent_req_received: void (struct tevent_req *)
+tevent_req_set_callback: void (struct tevent_req *, tevent_req_fn, void *)
+tevent_req_set_cancel_fn: void (struct tevent_req *, tevent_req_cancel_fn)
+tevent_req_set_endtime: bool (struct tevent_req *, struct tevent_context *, struct timeval)
+tevent_req_set_print_fn: void (struct tevent_req *, tevent_req_print_fn)
+tevent_set_abort_fn: void (void (*)(const char *))
+tevent_set_debug: int (struct tevent_context *, void (*)(void *, enum tevent_debug_level, const char *, va_list), void *)
+tevent_set_debug_stderr: int (struct tevent_context *)
+tevent_set_default_backend: void (const char *)
+tevent_signal_support: bool (struct tevent_context *)
+tevent_timeval_add: struct timeval (const struct timeval *, uint32_t, uint32_t)
+tevent_timeval_compare: int (const struct timeval *, const struct timeval *)
+tevent_timeval_current: struct timeval (void)
+tevent_timeval_current_ofs: struct timeval (uint32_t, uint32_t)
+tevent_timeval_is_zero: bool (const struct timeval *)
+tevent_timeval_set: struct timeval (uint32_t, uint32_t)
+tevent_timeval_until: struct timeval (const struct timeval *, const struct timeval *)
+tevent_timeval_zero: struct timeval (void)
+tevent_wakeup_recv: bool (struct tevent_req *)
+tevent_wakeup_send: struct tevent_req *(TALLOC_CTX *, struct tevent_context *, struct timeval)
diff --git a/ctdb/lib/tevent/ABI/tevent-0.9.15.sigs b/ctdb/lib/tevent/ABI/tevent-0.9.15.sigs
new file mode 100644 (file)
index 0000000..13c461c
--- /dev/null
@@ -0,0 +1,78 @@
+_tevent_add_fd: struct tevent_fd *(struct tevent_context *, TALLOC_CTX *, int, uint16_t, tevent_fd_handler_t, void *, const char *, const char *)
+_tevent_add_signal: struct tevent_signal *(struct tevent_context *, TALLOC_CTX *, int, int, tevent_signal_handler_t, void *, const char *, const char *)
+_tevent_add_timer: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+_tevent_create_immediate: struct tevent_immediate *(TALLOC_CTX *, const char *)
+_tevent_loop_once: int (struct tevent_context *, const char *)
+_tevent_loop_until: int (struct tevent_context *, bool (*)(void *), void *, const char *)
+_tevent_loop_wait: int (struct tevent_context *, const char *)
+_tevent_queue_create: struct tevent_queue *(TALLOC_CTX *, const char *, const char *)
+_tevent_req_callback_data: void *(struct tevent_req *)
+_tevent_req_cancel: bool (struct tevent_req *, const char *)
+_tevent_req_create: struct tevent_req *(TALLOC_CTX *, void *, size_t, const char *, const char *)
+_tevent_req_data: void *(struct tevent_req *)
+_tevent_req_done: void (struct tevent_req *, const char *)
+_tevent_req_error: bool (struct tevent_req *, uint64_t, const char *)
+_tevent_req_nomem: bool (const void *, struct tevent_req *, const char *)
+_tevent_req_notify_callback: void (struct tevent_req *, const char *)
+_tevent_req_oom: void (struct tevent_req *, const char *)
+_tevent_schedule_immediate: void (struct tevent_immediate *, struct tevent_context *, tevent_immediate_handler_t, void *, const char *, const char *)
+tevent_backend_list: const char **(TALLOC_CTX *)
+tevent_cleanup_pending_signal_handlers: void (struct tevent_signal *)
+tevent_common_add_fd: struct tevent_fd *(struct tevent_context *, TALLOC_CTX *, int, uint16_t, tevent_fd_handler_t, void *, const char *, const char *)
+tevent_common_add_signal: struct tevent_signal *(struct tevent_context *, TALLOC_CTX *, int, int, tevent_signal_handler_t, void *, const char *, const char *)
+tevent_common_add_timer: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+tevent_common_check_signal: int (struct tevent_context *)
+tevent_common_context_destructor: int (struct tevent_context *)
+tevent_common_fd_destructor: int (struct tevent_fd *)
+tevent_common_fd_get_flags: uint16_t (struct tevent_fd *)
+tevent_common_fd_set_close_fn: void (struct tevent_fd *, tevent_fd_close_fn_t)
+tevent_common_fd_set_flags: void (struct tevent_fd *, uint16_t)
+tevent_common_loop_immediate: bool (struct tevent_context *)
+tevent_common_loop_timer_delay: struct timeval (struct tevent_context *)
+tevent_common_loop_wait: int (struct tevent_context *, const char *)
+tevent_common_schedule_immediate: void (struct tevent_immediate *, struct tevent_context *, tevent_immediate_handler_t, void *, const char *, const char *)
+tevent_context_init: struct tevent_context *(TALLOC_CTX *)
+tevent_context_init_byname: struct tevent_context *(TALLOC_CTX *, const char *)
+tevent_debug: void (struct tevent_context *, enum tevent_debug_level, const char *, ...)
+tevent_fd_get_flags: uint16_t (struct tevent_fd *)
+tevent_fd_set_auto_close: void (struct tevent_fd *)
+tevent_fd_set_close_fn: void (struct tevent_fd *, tevent_fd_close_fn_t)
+tevent_fd_set_flags: void (struct tevent_fd *, uint16_t)
+tevent_loop_allow_nesting: void (struct tevent_context *)
+tevent_loop_set_nesting_hook: void (struct tevent_context *, tevent_nesting_hook, void *)
+tevent_queue_add: bool (struct tevent_queue *, struct tevent_context *, struct tevent_req *, tevent_queue_trigger_fn_t, void *)
+tevent_queue_add_entry: struct tevent_queue_entry *(struct tevent_queue *, struct tevent_context *, struct tevent_req *, tevent_queue_trigger_fn_t, void *)
+tevent_queue_add_optimize_empty: struct tevent_queue_entry *(struct tevent_queue *, struct tevent_context *, struct tevent_req *, tevent_queue_trigger_fn_t, void *)
+tevent_queue_length: size_t (struct tevent_queue *)
+tevent_queue_running: bool (struct tevent_queue *)
+tevent_queue_start: void (struct tevent_queue *)
+tevent_queue_stop: void (struct tevent_queue *)
+tevent_re_initialise: int (struct tevent_context *)
+tevent_register_backend: bool (const char *, const struct tevent_ops *)
+tevent_req_default_print: char *(struct tevent_req *, TALLOC_CTX *)
+tevent_req_defer_callback: void (struct tevent_req *, struct tevent_context *)
+tevent_req_is_error: bool (struct tevent_req *, enum tevent_req_state *, uint64_t *)
+tevent_req_is_in_progress: bool (struct tevent_req *)
+tevent_req_poll: bool (struct tevent_req *, struct tevent_context *)
+tevent_req_post: struct tevent_req *(struct tevent_req *, struct tevent_context *)
+tevent_req_print: char *(TALLOC_CTX *, struct tevent_req *)
+tevent_req_received: void (struct tevent_req *)
+tevent_req_set_callback: void (struct tevent_req *, tevent_req_fn, void *)
+tevent_req_set_cancel_fn: void (struct tevent_req *, tevent_req_cancel_fn)
+tevent_req_set_endtime: bool (struct tevent_req *, struct tevent_context *, struct timeval)
+tevent_req_set_print_fn: void (struct tevent_req *, tevent_req_print_fn)
+tevent_set_abort_fn: void (void (*)(const char *))
+tevent_set_debug: int (struct tevent_context *, void (*)(void *, enum tevent_debug_level, const char *, va_list), void *)
+tevent_set_debug_stderr: int (struct tevent_context *)
+tevent_set_default_backend: void (const char *)
+tevent_signal_support: bool (struct tevent_context *)
+tevent_timeval_add: struct timeval (const struct timeval *, uint32_t, uint32_t)
+tevent_timeval_compare: int (const struct timeval *, const struct timeval *)
+tevent_timeval_current: struct timeval (void)
+tevent_timeval_current_ofs: struct timeval (uint32_t, uint32_t)
+tevent_timeval_is_zero: bool (const struct timeval *)
+tevent_timeval_set: struct timeval (uint32_t, uint32_t)
+tevent_timeval_until: struct timeval (const struct timeval *, const struct timeval *)
+tevent_timeval_zero: struct timeval (void)
+tevent_wakeup_recv: bool (struct tevent_req *)
+tevent_wakeup_send: struct tevent_req *(TALLOC_CTX *, struct tevent_context *, struct timeval)
diff --git a/ctdb/lib/tevent/ABI/tevent-0.9.16.sigs b/ctdb/lib/tevent/ABI/tevent-0.9.16.sigs
new file mode 100644 (file)
index 0000000..ea7f944
--- /dev/null
@@ -0,0 +1,82 @@
+_tevent_add_fd: struct tevent_fd *(struct tevent_context *, TALLOC_CTX *, int, uint16_t, tevent_fd_handler_t, void *, const char *, const char *)
+_tevent_add_signal: struct tevent_signal *(struct tevent_context *, TALLOC_CTX *, int, int, tevent_signal_handler_t, void *, const char *, const char *)
+_tevent_add_timer: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+_tevent_create_immediate: struct tevent_immediate *(TALLOC_CTX *, const char *)
+_tevent_loop_once: int (struct tevent_context *, const char *)
+_tevent_loop_until: int (struct tevent_context *, bool (*)(void *), void *, const char *)
+_tevent_loop_wait: int (struct tevent_context *, const char *)
+_tevent_queue_create: struct tevent_queue *(TALLOC_CTX *, const char *, const char *)
+_tevent_req_callback_data: void *(struct tevent_req *)
+_tevent_req_cancel: bool (struct tevent_req *, const char *)
+_tevent_req_create: struct tevent_req *(TALLOC_CTX *, void *, size_t, const char *, const char *)
+_tevent_req_data: void *(struct tevent_req *)
+_tevent_req_done: void (struct tevent_req *, const char *)
+_tevent_req_error: bool (struct tevent_req *, uint64_t, const char *)
+_tevent_req_nomem: bool (const void *, struct tevent_req *, const char *)
+_tevent_req_notify_callback: void (struct tevent_req *, const char *)
+_tevent_req_oom: void (struct tevent_req *, const char *)
+_tevent_schedule_immediate: void (struct tevent_immediate *, struct tevent_context *, tevent_immediate_handler_t, void *, const char *, const char *)
+tevent_backend_list: const char **(TALLOC_CTX *)
+tevent_cleanup_pending_signal_handlers: void (struct tevent_signal *)
+tevent_common_add_fd: struct tevent_fd *(struct tevent_context *, TALLOC_CTX *, int, uint16_t, tevent_fd_handler_t, void *, const char *, const char *)
+tevent_common_add_signal: struct tevent_signal *(struct tevent_context *, TALLOC_CTX *, int, int, tevent_signal_handler_t, void *, const char *, const char *)
+tevent_common_add_timer: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+tevent_common_check_signal: int (struct tevent_context *)
+tevent_common_context_destructor: int (struct tevent_context *)
+tevent_common_fd_destructor: int (struct tevent_fd *)
+tevent_common_fd_get_flags: uint16_t (struct tevent_fd *)
+tevent_common_fd_set_close_fn: void (struct tevent_fd *, tevent_fd_close_fn_t)
+tevent_common_fd_set_flags: void (struct tevent_fd *, uint16_t)
+tevent_common_loop_immediate: bool (struct tevent_context *)
+tevent_common_loop_timer_delay: struct timeval (struct tevent_context *)
+tevent_common_loop_wait: int (struct tevent_context *, const char *)
+tevent_common_schedule_immediate: void (struct tevent_immediate *, struct tevent_context *, tevent_immediate_handler_t, void *, const char *, const char *)
+tevent_context_init: struct tevent_context *(TALLOC_CTX *)
+tevent_context_init_byname: struct tevent_context *(TALLOC_CTX *, const char *)
+tevent_context_init_ops: struct tevent_context *(TALLOC_CTX *, const struct tevent_ops *, void *)
+tevent_debug: void (struct tevent_context *, enum tevent_debug_level, const char *, ...)
+tevent_fd_get_flags: uint16_t (struct tevent_fd *)
+tevent_fd_set_auto_close: void (struct tevent_fd *)
+tevent_fd_set_close_fn: void (struct tevent_fd *, tevent_fd_close_fn_t)
+tevent_fd_set_flags: void (struct tevent_fd *, uint16_t)
+tevent_get_trace_callback: void (struct tevent_context *, tevent_trace_callback_t *, void *)
+tevent_loop_allow_nesting: void (struct tevent_context *)
+tevent_loop_set_nesting_hook: void (struct tevent_context *, tevent_nesting_hook, void *)
+tevent_queue_add: bool (struct tevent_queue *, struct tevent_context *, struct tevent_req *, tevent_queue_trigger_fn_t, void *)
+tevent_queue_add_entry: struct tevent_queue_entry *(struct tevent_queue *, struct tevent_context *, struct tevent_req *, tevent_queue_trigger_fn_t, void *)
+tevent_queue_add_optimize_empty: struct tevent_queue_entry *(struct tevent_queue *, struct tevent_context *, struct tevent_req *, tevent_queue_trigger_fn_t, void *)
+tevent_queue_length: size_t (struct tevent_queue *)
+tevent_queue_running: bool (struct tevent_queue *)
+tevent_queue_start: void (struct tevent_queue *)
+tevent_queue_stop: void (struct tevent_queue *)
+tevent_re_initialise: int (struct tevent_context *)
+tevent_register_backend: bool (const char *, const struct tevent_ops *)
+tevent_req_default_print: char *(struct tevent_req *, TALLOC_CTX *)
+tevent_req_defer_callback: void (struct tevent_req *, struct tevent_context *)
+tevent_req_is_error: bool (struct tevent_req *, enum tevent_req_state *, uint64_t *)
+tevent_req_is_in_progress: bool (struct tevent_req *)
+tevent_req_poll: bool (struct tevent_req *, struct tevent_context *)
+tevent_req_post: struct tevent_req *(struct tevent_req *, struct tevent_context *)
+tevent_req_print: char *(TALLOC_CTX *, struct tevent_req *)
+tevent_req_received: void (struct tevent_req *)
+tevent_req_set_callback: void (struct tevent_req *, tevent_req_fn, void *)
+tevent_req_set_cancel_fn: void (struct tevent_req *, tevent_req_cancel_fn)
+tevent_req_set_endtime: bool (struct tevent_req *, struct tevent_context *, struct timeval)
+tevent_req_set_print_fn: void (struct tevent_req *, tevent_req_print_fn)
+tevent_set_abort_fn: void (void (*)(const char *))
+tevent_set_debug: int (struct tevent_context *, void (*)(void *, enum tevent_debug_level, const char *, va_list), void *)
+tevent_set_debug_stderr: int (struct tevent_context *)
+tevent_set_default_backend: void (const char *)
+tevent_set_trace_callback: void (struct tevent_context *, tevent_trace_callback_t, void *)
+tevent_signal_support: bool (struct tevent_context *)
+tevent_timeval_add: struct timeval (const struct timeval *, uint32_t, uint32_t)
+tevent_timeval_compare: int (const struct timeval *, const struct timeval *)
+tevent_timeval_current: struct timeval (void)
+tevent_timeval_current_ofs: struct timeval (uint32_t, uint32_t)
+tevent_timeval_is_zero: bool (const struct timeval *)
+tevent_timeval_set: struct timeval (uint32_t, uint32_t)
+tevent_timeval_until: struct timeval (const struct timeval *, const struct timeval *)
+tevent_timeval_zero: struct timeval (void)
+tevent_trace_point_callback: void (struct tevent_context *, enum tevent_trace_point)
+tevent_wakeup_recv: bool (struct tevent_req *)
+tevent_wakeup_send: struct tevent_req *(TALLOC_CTX *, struct tevent_context *, struct timeval)
diff --git a/ctdb/lib/tevent/ABI/tevent-0.9.17.sigs b/ctdb/lib/tevent/ABI/tevent-0.9.17.sigs
new file mode 100644 (file)
index 0000000..ea7f944
--- /dev/null
@@ -0,0 +1,82 @@
+_tevent_add_fd: struct tevent_fd *(struct tevent_context *, TALLOC_CTX *, int, uint16_t, tevent_fd_handler_t, void *, const char *, const char *)
+_tevent_add_signal: struct tevent_signal *(struct tevent_context *, TALLOC_CTX *, int, int, tevent_signal_handler_t, void *, const char *, const char *)
+_tevent_add_timer: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+_tevent_create_immediate: struct tevent_immediate *(TALLOC_CTX *, const char *)
+_tevent_loop_once: int (struct tevent_context *, const char *)
+_tevent_loop_until: int (struct tevent_context *, bool (*)(void *), void *, const char *)
+_tevent_loop_wait: int (struct tevent_context *, const char *)
+_tevent_queue_create: struct tevent_queue *(TALLOC_CTX *, const char *, const char *)
+_tevent_req_callback_data: void *(struct tevent_req *)
+_tevent_req_cancel: bool (struct tevent_req *, const char *)
+_tevent_req_create: struct tevent_req *(TALLOC_CTX *, void *, size_t, const char *, const char *)
+_tevent_req_data: void *(struct tevent_req *)
+_tevent_req_done: void (struct tevent_req *, const char *)
+_tevent_req_error: bool (struct tevent_req *, uint64_t, const char *)
+_tevent_req_nomem: bool (const void *, struct tevent_req *, const char *)
+_tevent_req_notify_callback: void (struct tevent_req *, const char *)
+_tevent_req_oom: void (struct tevent_req *, const char *)
+_tevent_schedule_immediate: void (struct tevent_immediate *, struct tevent_context *, tevent_immediate_handler_t, void *, const char *, const char *)
+tevent_backend_list: const char **(TALLOC_CTX *)
+tevent_cleanup_pending_signal_handlers: void (struct tevent_signal *)
+tevent_common_add_fd: struct tevent_fd *(struct tevent_context *, TALLOC_CTX *, int, uint16_t, tevent_fd_handler_t, void *, const char *, const char *)
+tevent_common_add_signal: struct tevent_signal *(struct tevent_context *, TALLOC_CTX *, int, int, tevent_signal_handler_t, void *, const char *, const char *)
+tevent_common_add_timer: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+tevent_common_check_signal: int (struct tevent_context *)
+tevent_common_context_destructor: int (struct tevent_context *)
+tevent_common_fd_destructor: int (struct tevent_fd *)
+tevent_common_fd_get_flags: uint16_t (struct tevent_fd *)
+tevent_common_fd_set_close_fn: void (struct tevent_fd *, tevent_fd_close_fn_t)
+tevent_common_fd_set_flags: void (struct tevent_fd *, uint16_t)
+tevent_common_loop_immediate: bool (struct tevent_context *)
+tevent_common_loop_timer_delay: struct timeval (struct tevent_context *)
+tevent_common_loop_wait: int (struct tevent_context *, const char *)
+tevent_common_schedule_immediate: void (struct tevent_immediate *, struct tevent_context *, tevent_immediate_handler_t, void *, const char *, const char *)
+tevent_context_init: struct tevent_context *(TALLOC_CTX *)
+tevent_context_init_byname: struct tevent_context *(TALLOC_CTX *, const char *)
+tevent_context_init_ops: struct tevent_context *(TALLOC_CTX *, const struct tevent_ops *, void *)
+tevent_debug: void (struct tevent_context *, enum tevent_debug_level, const char *, ...)
+tevent_fd_get_flags: uint16_t (struct tevent_fd *)
+tevent_fd_set_auto_close: void (struct tevent_fd *)
+tevent_fd_set_close_fn: void (struct tevent_fd *, tevent_fd_close_fn_t)
+tevent_fd_set_flags: void (struct tevent_fd *, uint16_t)
+tevent_get_trace_callback: void (struct tevent_context *, tevent_trace_callback_t *, void *)
+tevent_loop_allow_nesting: void (struct tevent_context *)
+tevent_loop_set_nesting_hook: void (struct tevent_context *, tevent_nesting_hook, void *)
+tevent_queue_add: bool (struct tevent_queue *, struct tevent_context *, struct tevent_req *, tevent_queue_trigger_fn_t, void *)
+tevent_queue_add_entry: struct tevent_queue_entry *(struct tevent_queue *, struct tevent_context *, struct tevent_req *, tevent_queue_trigger_fn_t, void *)
+tevent_queue_add_optimize_empty: struct tevent_queue_entry *(struct tevent_queue *, struct tevent_context *, struct tevent_req *, tevent_queue_trigger_fn_t, void *)
+tevent_queue_length: size_t (struct tevent_queue *)
+tevent_queue_running: bool (struct tevent_queue *)
+tevent_queue_start: void (struct tevent_queue *)
+tevent_queue_stop: void (struct tevent_queue *)
+tevent_re_initialise: int (struct tevent_context *)
+tevent_register_backend: bool (const char *, const struct tevent_ops *)
+tevent_req_default_print: char *(struct tevent_req *, TALLOC_CTX *)
+tevent_req_defer_callback: void (struct tevent_req *, struct tevent_context *)
+tevent_req_is_error: bool (struct tevent_req *, enum tevent_req_state *, uint64_t *)
+tevent_req_is_in_progress: bool (struct tevent_req *)
+tevent_req_poll: bool (struct tevent_req *, struct tevent_context *)
+tevent_req_post: struct tevent_req *(struct tevent_req *, struct tevent_context *)
+tevent_req_print: char *(TALLOC_CTX *, struct tevent_req *)
+tevent_req_received: void (struct tevent_req *)
+tevent_req_set_callback: void (struct tevent_req *, tevent_req_fn, void *)
+tevent_req_set_cancel_fn: void (struct tevent_req *, tevent_req_cancel_fn)
+tevent_req_set_endtime: bool (struct tevent_req *, struct tevent_context *, struct timeval)
+tevent_req_set_print_fn: void (struct tevent_req *, tevent_req_print_fn)
+tevent_set_abort_fn: void (void (*)(const char *))
+tevent_set_debug: int (struct tevent_context *, void (*)(void *, enum tevent_debug_level, const char *, va_list), void *)
+tevent_set_debug_stderr: int (struct tevent_context *)
+tevent_set_default_backend: void (const char *)
+tevent_set_trace_callback: void (struct tevent_context *, tevent_trace_callback_t, void *)
+tevent_signal_support: bool (struct tevent_context *)
+tevent_timeval_add: struct timeval (const struct timeval *, uint32_t, uint32_t)
+tevent_timeval_compare: int (const struct timeval *, const struct timeval *)
+tevent_timeval_current: struct timeval (void)
+tevent_timeval_current_ofs: struct timeval (uint32_t, uint32_t)
+tevent_timeval_is_zero: bool (const struct timeval *)
+tevent_timeval_set: struct timeval (uint32_t, uint32_t)
+tevent_timeval_until: struct timeval (const struct timeval *, const struct timeval *)
+tevent_timeval_zero: struct timeval (void)
+tevent_trace_point_callback: void (struct tevent_context *, enum tevent_trace_point)
+tevent_wakeup_recv: bool (struct tevent_req *)
+tevent_wakeup_send: struct tevent_req *(TALLOC_CTX *, struct tevent_context *, struct timeval)
diff --git a/ctdb/lib/tevent/ABI/tevent-0.9.18.sigs b/ctdb/lib/tevent/ABI/tevent-0.9.18.sigs
new file mode 100644 (file)
index 0000000..70d20b6
--- /dev/null
@@ -0,0 +1,83 @@
+_tevent_add_fd: struct tevent_fd *(struct tevent_context *, TALLOC_CTX *, int, uint16_t, tevent_fd_handler_t, void *, const char *, const char *)
+_tevent_add_signal: struct tevent_signal *(struct tevent_context *, TALLOC_CTX *, int, int, tevent_signal_handler_t, void *, const char *, const char *)
+_tevent_add_timer: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+_tevent_create_immediate: struct tevent_immediate *(TALLOC_CTX *, const char *)
+_tevent_loop_once: int (struct tevent_context *, const char *)
+_tevent_loop_until: int (struct tevent_context *, bool (*)(void *), void *, const char *)
+_tevent_loop_wait: int (struct tevent_context *, const char *)
+_tevent_queue_create: struct tevent_queue *(TALLOC_CTX *, const char *, const char *)
+_tevent_req_callback_data: void *(struct tevent_req *)
+_tevent_req_cancel: bool (struct tevent_req *, const char *)
+_tevent_req_create: struct tevent_req *(TALLOC_CTX *, void *, size_t, const char *, const char *)
+_tevent_req_data: void *(struct tevent_req *)
+_tevent_req_done: void (struct tevent_req *, const char *)
+_tevent_req_error: bool (struct tevent_req *, uint64_t, const char *)
+_tevent_req_nomem: bool (const void *, struct tevent_req *, const char *)
+_tevent_req_notify_callback: void (struct tevent_req *, const char *)
+_tevent_req_oom: void (struct tevent_req *, const char *)
+_tevent_schedule_immediate: void (struct tevent_immediate *, struct tevent_context *, tevent_immediate_handler_t, void *, const char *, const char *)
+tevent_backend_list: const char **(TALLOC_CTX *)
+tevent_cleanup_pending_signal_handlers: void (struct tevent_signal *)
+tevent_common_add_fd: struct tevent_fd *(struct tevent_context *, TALLOC_CTX *, int, uint16_t, tevent_fd_handler_t, void *, const char *, const char *)
+tevent_common_add_signal: struct tevent_signal *(struct tevent_context *, TALLOC_CTX *, int, int, tevent_signal_handler_t, void *, const char *, const char *)
+tevent_common_add_timer: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+tevent_common_add_timer_v2: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+tevent_common_check_signal: int (struct tevent_context *)
+tevent_common_context_destructor: int (struct tevent_context *)
+tevent_common_fd_destructor: int (struct tevent_fd *)
+tevent_common_fd_get_flags: uint16_t (struct tevent_fd *)
+tevent_common_fd_set_close_fn: void (struct tevent_fd *, tevent_fd_close_fn_t)
+tevent_common_fd_set_flags: void (struct tevent_fd *, uint16_t)
+tevent_common_loop_immediate: bool (struct tevent_context *)
+tevent_common_loop_timer_delay: struct timeval (struct tevent_context *)
+tevent_common_loop_wait: int (struct tevent_context *, const char *)
+tevent_common_schedule_immediate: void (struct tevent_immediate *, struct tevent_context *, tevent_immediate_handler_t, void *, const char *, const char *)
+tevent_context_init: struct tevent_context *(TALLOC_CTX *)
+tevent_context_init_byname: struct tevent_context *(TALLOC_CTX *, const char *)
+tevent_context_init_ops: struct tevent_context *(TALLOC_CTX *, const struct tevent_ops *, void *)
+tevent_debug: void (struct tevent_context *, enum tevent_debug_level, const char *, ...)
+tevent_fd_get_flags: uint16_t (struct tevent_fd *)
+tevent_fd_set_auto_close: void (struct tevent_fd *)
+tevent_fd_set_close_fn: void (struct tevent_fd *, tevent_fd_close_fn_t)
+tevent_fd_set_flags: void (struct tevent_fd *, uint16_t)
+tevent_get_trace_callback: void (struct tevent_context *, tevent_trace_callback_t *, void *)
+tevent_loop_allow_nesting: void (struct tevent_context *)
+tevent_loop_set_nesting_hook: void (struct tevent_context *, tevent_nesting_hook, void *)
+tevent_queue_add: bool (struct tevent_queue *, struct tevent_context *, struct tevent_req *, tevent_queue_trigger_fn_t, void *)
+tevent_queue_add_entry: struct tevent_queue_entry *(struct tevent_queue *, struct tevent_context *, struct tevent_req *, tevent_queue_trigger_fn_t, void *)
+tevent_queue_add_optimize_empty: struct tevent_queue_entry *(struct tevent_queue *, struct tevent_context *, struct tevent_req *, tevent_queue_trigger_fn_t, void *)
+tevent_queue_length: size_t (struct tevent_queue *)
+tevent_queue_running: bool (struct tevent_queue *)
+tevent_queue_start: void (struct tevent_queue *)
+tevent_queue_stop: void (struct tevent_queue *)
+tevent_re_initialise: int (struct tevent_context *)
+tevent_register_backend: bool (const char *, const struct tevent_ops *)
+tevent_req_default_print: char *(struct tevent_req *, TALLOC_CTX *)
+tevent_req_defer_callback: void (struct tevent_req *, struct tevent_context *)
+tevent_req_is_error: bool (struct tevent_req *, enum tevent_req_state *, uint64_t *)
+tevent_req_is_in_progress: bool (struct tevent_req *)
+tevent_req_poll: bool (struct tevent_req *, struct tevent_context *)
+tevent_req_post: struct tevent_req *(struct tevent_req *, struct tevent_context *)
+tevent_req_print: char *(TALLOC_CTX *, struct tevent_req *)
+tevent_req_received: void (struct tevent_req *)
+tevent_req_set_callback: void (struct tevent_req *, tevent_req_fn, void *)
+tevent_req_set_cancel_fn: void (struct tevent_req *, tevent_req_cancel_fn)
+tevent_req_set_endtime: bool (struct tevent_req *, struct tevent_context *, struct timeval)
+tevent_req_set_print_fn: void (struct tevent_req *, tevent_req_print_fn)
+tevent_set_abort_fn: void (void (*)(const char *))
+tevent_set_debug: int (struct tevent_context *, void (*)(void *, enum tevent_debug_level, const char *, va_list), void *)
+tevent_set_debug_stderr: int (struct tevent_context *)
+tevent_set_default_backend: void (const char *)
+tevent_set_trace_callback: void (struct tevent_context *, tevent_trace_callback_t, void *)
+tevent_signal_support: bool (struct tevent_context *)
+tevent_timeval_add: struct timeval (const struct timeval *, uint32_t, uint32_t)
+tevent_timeval_compare: int (const struct timeval *, const struct timeval *)
+tevent_timeval_current: struct timeval (void)
+tevent_timeval_current_ofs: struct timeval (uint32_t, uint32_t)
+tevent_timeval_is_zero: bool (const struct timeval *)
+tevent_timeval_set: struct timeval (uint32_t, uint32_t)
+tevent_timeval_until: struct timeval (const struct timeval *, const struct timeval *)
+tevent_timeval_zero: struct timeval (void)
+tevent_trace_point_callback: void (struct tevent_context *, enum tevent_trace_point)
+tevent_wakeup_recv: bool (struct tevent_req *)
+tevent_wakeup_send: struct tevent_req *(TALLOC_CTX *, struct tevent_context *, struct timeval)
diff --git a/ctdb/lib/tevent/ABI/tevent-0.9.9.sigs b/ctdb/lib/tevent/ABI/tevent-0.9.9.sigs
new file mode 100644 (file)
index 0000000..9adaba5
--- /dev/null
@@ -0,0 +1,73 @@
+_tevent_add_fd: struct tevent_fd *(struct tevent_context *, TALLOC_CTX *, int, uint16_t, tevent_fd_handler_t, void *, const char *, const char *)
+_tevent_add_signal: struct tevent_signal *(struct tevent_context *, TALLOC_CTX *, int, int, tevent_signal_handler_t, void *, const char *, const char *)
+_tevent_add_timer: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+_tevent_create_immediate: struct tevent_immediate *(TALLOC_CTX *, const char *)
+_tevent_loop_once: int (struct tevent_context *, const char *)
+_tevent_loop_until: int (struct tevent_context *, bool (*)(void *), void *, const char *)
+_tevent_loop_wait: int (struct tevent_context *, const char *)
+_tevent_queue_create: struct tevent_queue *(TALLOC_CTX *, const char *, const char *)
+_tevent_req_callback_data: void *(struct tevent_req *)
+_tevent_req_cancel: bool (struct tevent_req *, const char *)
+_tevent_req_create: struct tevent_req *(TALLOC_CTX *, void *, size_t, const char *, const char *)
+_tevent_req_data: void *(struct tevent_req *)
+_tevent_req_done: void (struct tevent_req *, const char *)
+_tevent_req_error: bool (struct tevent_req *, uint64_t, const char *)
+_tevent_req_nomem: bool (const void *, struct tevent_req *, const char *)
+_tevent_req_notify_callback: void (struct tevent_req *, const char *)
+_tevent_schedule_immediate: void (struct tevent_immediate *, struct tevent_context *, tevent_immediate_handler_t, void *, const char *, const char *)
+tevent_backend_list: const char **(TALLOC_CTX *)
+tevent_cleanup_pending_signal_handlers: void (struct tevent_signal *)
+tevent_common_add_fd: struct tevent_fd *(struct tevent_context *, TALLOC_CTX *, int, uint16_t, tevent_fd_handler_t, void *, const char *, const char *)
+tevent_common_add_signal: struct tevent_signal *(struct tevent_context *, TALLOC_CTX *, int, int, tevent_signal_handler_t, void *, const char *, const char *)
+tevent_common_add_timer: struct tevent_timer *(struct tevent_context *, TALLOC_CTX *, struct timeval, tevent_timer_handler_t, void *, const char *, const char *)
+tevent_common_check_signal: int (struct tevent_context *)
+tevent_common_context_destructor: int (struct tevent_context *)
+tevent_common_fd_destructor: int (struct tevent_fd *)
+tevent_common_fd_get_flags: uint16_t (struct tevent_fd *)
+tevent_common_fd_set_close_fn: void (struct tevent_fd *, tevent_fd_close_fn_t)
+tevent_common_fd_set_flags: void (struct tevent_fd *, uint16_t)
+tevent_common_loop_immediate: bool (struct tevent_context *)
+tevent_common_loop_timer_delay: struct timeval (struct tevent_context *)
+tevent_common_loop_wait: int (struct tevent_context *, const char *)
+tevent_common_schedule_immediate: void (struct tevent_immediate *, struct tevent_context *, tevent_immediate_handler_t, void *, const char *, const char *)
+tevent_context_init: struct tevent_context *(TALLOC_CTX *)
+tevent_context_init_byname: struct tevent_context *(TALLOC_CTX *, const char *)
+tevent_debug: void (struct tevent_context *, enum tevent_debug_level, const char *, ...)
+tevent_fd_get_flags: uint16_t (struct tevent_fd *)
+tevent_fd_set_auto_close: void (struct tevent_fd *)
+tevent_fd_set_close_fn: void (struct tevent_fd *, tevent_fd_close_fn_t)
+tevent_fd_set_flags: void (struct tevent_fd *, uint16_t)
+tevent_loop_allow_nesting: void (struct tevent_context *)
+tevent_loop_set_nesting_hook: void (struct tevent_context *, tevent_nesting_hook, void *)
+tevent_queue_add: bool (struct tevent_queue *, struct tevent_context *, struct tevent_req *, tevent_queue_trigger_fn_t, void *)
+tevent_queue_length: size_t (struct tevent_queue *)
+tevent_queue_start: void (struct tevent_queue *)
+tevent_queue_stop: void (struct tevent_queue *)
+tevent_re_initialise: int (struct tevent_context *)
+tevent_register_backend: bool (const char *, const struct tevent_ops *)
+tevent_req_default_print: char *(struct tevent_req *, TALLOC_CTX *)
+tevent_req_is_error: bool (struct tevent_req *, enum tevent_req_state *, uint64_t *)
+tevent_req_is_in_progress: bool (struct tevent_req *)
+tevent_req_poll: bool (struct tevent_req *, struct tevent_context *)
+tevent_req_post: struct tevent_req *(struct tevent_req *, struct tevent_context *)
+tevent_req_print: char *(TALLOC_CTX *, struct tevent_req *)
+tevent_req_received: void (struct tevent_req *)
+tevent_req_set_callback: void (struct tevent_req *, tevent_req_fn, void *)
+tevent_req_set_cancel_fn: void (struct tevent_req *, tevent_req_cancel_fn)
+tevent_req_set_endtime: bool (struct tevent_req *, struct tevent_context *, struct timeval)
+tevent_req_set_print_fn: void (struct tevent_req *, tevent_req_print_fn)
+tevent_set_abort_fn: void (void (*)(const char *))
+tevent_set_debug: int (struct tevent_context *, void (*)(void *, enum tevent_debug_level, const char *, va_list), void *)
+tevent_set_debug_stderr: int (struct tevent_context *)
+tevent_set_default_backend: void (const char *)
+tevent_signal_support: bool (struct tevent_context *)
+tevent_timeval_add: struct timeval (const struct timeval *, uint32_t, uint32_t)
+tevent_timeval_compare: int (const struct timeval *, const struct timeval *)
+tevent_timeval_current: struct timeval (void)
+tevent_timeval_current_ofs: struct timeval (uint32_t, uint32_t)
+tevent_timeval_is_zero: bool (const struct timeval *)
+tevent_timeval_set: struct timeval (uint32_t, uint32_t)
+tevent_timeval_until: struct timeval (const struct timeval *, const struct timeval *)
+tevent_timeval_zero: struct timeval (void)
+tevent_wakeup_recv: bool (struct tevent_req *)
+tevent_wakeup_send: struct tevent_req *(TALLOC_CTX *, struct tevent_context *, struct timeval)
diff --git a/ctdb/lib/tevent/bindings.py b/ctdb/lib/tevent/bindings.py
new file mode 100644 (file)
index 0000000..1060caf
--- /dev/null
@@ -0,0 +1,62 @@
+#!/usr/bin/python
+#
+#   Python integration for tevent - tests
+#
+#   Copyright (C) Jelmer Vernooij 2010
+#
+#     ** NOTE! The following LGPL license applies to the tevent
+#     ** library. This does NOT imply that all of Samba is released
+#     ** under the LGPL
+#
+#   This library is free software; you can redistribute it and/or
+#   modify it under the terms of the GNU Lesser General Public
+#   License as published by the Free Software Foundation; either
+#   version 3 of the License, or (at your option) any later version.
+#
+#   This library is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#   Lesser General Public License for more details.
+#
+#   You should have received a copy of the GNU Lesser General Public
+#   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+import signal
+import _tevent
+from unittest import TestCase
+
+class BackendListTests(TestCase):
+
+    def test_backend_list(self):
+        self.assertTrue(isinstance(_tevent.backend_list(), list))
+
+
+class CreateContextTests(TestCase):
+
+    def test_by_name(self):
+        ctx = _tevent.Context(_tevent.backend_list()[0])
+        self.assertTrue(ctx is not None)
+
+    def test_no_name(self):
+        ctx = _tevent.Context()
+        self.assertTrue(ctx is not None)
+
+
+class ContextTests(TestCase):
+
+    def setUp(self):
+        super(ContextTests, self).setUp()
+        self.ctx = _tevent.Context()
+
+    def test_signal_support(self):
+        self.assertTrue(type(self.ctx.signal_support) is bool)
+
+    def test_reinitialise(self):
+        self.ctx.reinitialise()
+
+    def test_loop_wait(self):
+        self.ctx.loop_wait()
+
+    def test_add_signal(self):
+        sig = self.ctx.add_signal(signal.SIGINT, 0, lambda callback: None)
+        self.assertTrue(isinstance(sig, _tevent.Signal))
diff --git a/ctdb/lib/tevent/doc/mainpage.dox b/ctdb/lib/tevent/doc/mainpage.dox
new file mode 100644 (file)
index 0000000..e2f986e
--- /dev/null
@@ -0,0 +1,42 @@
+/**
+ * @mainpage
+ *
+ * Tevent is an event system based on the talloc memory management library. It
+ * is the core event system used in Samba.
+ *
+ * The low level tevent has support for many event types, including timers,
+ * signals, and the classic file descriptor events.
+ *
+ * Tevent also provides helpers to deal with asynchronous code, in the form of
+ * the tevent_req (tevent request) functions.
+ *
+ * @section tevent_download Download
+ *
+ * You can download the latest releases of tevent from the
+ * <a href="http://samba.org/ftp/tevent" target="_blank">tevent directory</a>
+ * on the samba public source archive.
+ *
+ * @section tevent_bugs Discussion and bug reports
+ *
+ * tevent does not currently have its own mailing list or bug tracking system.
+ * For now, please use the
+ * <a href="https://lists.samba.org/mailman/listinfo/samba-technical" target="_blank">samba-technical</a>
+ * mailing list, and the
+ * <a href="http://bugzilla.samba.org/" target="_blank">Samba bugzilla</a>
+ * bug tracking system.
+ *
+ * @section tevent_devel Development
+ * You can download the latest code either via git or rsync.
+ *
+ * To fetch via git see the following guide:
+ *
+ * <a href="http://wiki.samba.org/index.php/Using_Git_for_Samba_Development" target="_blank">Using Git for Samba Development</a>
+ *
+ * Once you have cloned the tree switch to the master branch and cd into the
+ * lib/tevent directory.
+ *
+ * To fetch via rsync use this command:
+ *
+ *   rsync -Pavz samba.org::ftp/unpacked/standalone_projects/lib/tevent .
+ *
+ */
diff --git a/ctdb/lib/tevent/doc/tutorials.dox b/ctdb/lib/tevent/doc/tutorials.dox
new file mode 100644 (file)
index 0000000..e8beed7
--- /dev/null
@@ -0,0 +1,43 @@
+/**
+ * @page tevent_queue_tutorial The tevent_queue tutorial
+ *
+ * @section tevent_queue_intro Introduction
+ *
+ * A tevent_queue is used to queue up async requests that must be
+ * serialized. For example writing buffers into a socket must be
+ * serialized. Writing a large lump of data into a socket can require
+ * multiple write(2) or send(2) system calls. If more than one async
+ * request is outstanding to write large buffers into a socket, every
+ * request must individually be completed before the next one begins,
+ * even if multiple syscalls are required.
+ *
+ * To do this, every socket gets assigned a tevent_queue struct.
+ *
+ * Creating a serialized async request follows the usual convention to
+ * return a tevent_req structure with an embedded state structure. To
+ * serialize the work the request is about to do, instead of directly
+ * starting or doing that work, tevent_queue_add must be called. When it
+ * is time for the serialized async request to do its work, the trigger
+ * callback function tevent_queue_add was given is called. In the example
+ * of writing to a socket, the trigger is called when the write request
+ * can begin accessing the socket.
+ *
+ * How does this engine work behind the scenes? When the queue is empty,
+ * tevent_queue_add schedules an immediate call to the trigger
+ * callback. The trigger callback starts its work, likely by starting
+ * other async subrequests. While these async subrequests are working,
+ * more requests can accumulate in the queue by tevent_queue_add. While
+ * there is no function to explicitly trigger the next waiter in line, it
+ * still works: When the active request in the queue is done, it will be
+ * destroyed by talloc_free. Talloc_free of a serialized async request
+ * that had been added to a queue will trigger the next request in the
+ * queue via a talloc destructor attached to a child of the serialized
+ * request. This way the queue will be kept busy when an async request
+ * finishes.
+ *
+ * @section tevent_queue_example Example
+ *
+ * @code
+ *      Metze: Please add a code example here.
+ * @endcode
+ */
diff --git a/ctdb/lib/tevent/doxy.config b/ctdb/lib/tevent/doxy.config
new file mode 100644 (file)
index 0000000..0d67ae3
--- /dev/null
@@ -0,0 +1,1535 @@
+# Doxyfile 1.6.1
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project
+#
+# All text after a hash (#) is considered a comment and will be ignored
+# The format is:
+#       TAG = value [value, ...]
+# For lists items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ")
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
+# by quotes) that should identify the project.
+
+PROJECT_NAME           = tevent
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER         = 0.9.8
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = doc
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrilic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF       = "The $name class" \
+                         "The $name widget" \
+                         "The $name file" \
+                         is \
+                         provides \
+                         specifies \
+                         contains \
+                         represents \
+                         a \
+                         an \
+                         the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before file names in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES        = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF      = YES
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE               = 8
+
+# This tag can be used to specify a number of aliases that act
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES                =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C  = YES
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it parses.
+# With this tag you can assign which parser to use for a given extension.
+# Doxygen has a built-in mapping, but you can override or extend it using this tag.
+# The format is ext=language, where ext is a file extension, and language is one of
+# the parsers supported by doxygen: IDL, Java, Javascript, C#, C, C++, D, PHP,
+# Objective-C, Python, Fortran, VHDL. For instance to make doxygen treat
+# .inc files as Fortran files (default is PHP), and .f files as C (default is Fortran),
+# use: inc=Fortran f=C. Note that for custom extensions you also need to set FILE_PATTERNS otherwise the files are not read by doxygen.
+
+EXTENSION_MAPPING      =
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); v.s.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING            = YES
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols
+
+SYMBOL_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC         = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES  = NO
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS     = YES
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES     = YES
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the (brief and detailed) documentation of class members so that constructors and destructors are listed first. If set to NO (the default) the constructors will appear in the respective orders defined by SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or define consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and defines in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES        = YES
+
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES       = NO
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed by
+# doxygen. The layout file controls the global structure of the generated output files
+# in an output format independent way. To create the layout file that represents
+# doxygen's defaults, run doxygen with the -l option. You can optionally specify a
+# file name after the option, if omitted DoxygenLayout.xml will be used as the name
+# of the layout file.
+
+LAYOUT_FILE            =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS               = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR      = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT                  = . doc
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx
+# *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.py *.f90
+
+FILE_PATTERNS          = *.cpp \
+                         *.cc \
+                         *.c \
+                         *.h \
+                         *.hh \
+                         *.hpp \
+                         *.dox
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE              = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# be excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE                =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix filesystem feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       = */.git/*
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH           =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS       =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH             =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+# If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.
+# Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.
+# The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty, INPUT_FILTER
+# is applied to all files.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.
+# Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX     = NO
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML          = YES
+
+# If the HTML_FOOTER_DESCRIPTION tag is set to YES, Doxygen will
+# add generated date, project name and doxygen version to HTML footer.
+
+HTML_FOOTER_DESCRIPTION= NO
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header.
+
+HTML_HEADER            =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER            =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# stylesheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET        =
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS     = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox,
+# Netscape 6.0+, Internet Explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html for more information.
+
+GENERATE_DOCSET        = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP      = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE               =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION           =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI           = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING     =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and QHP_VIRTUAL_FOLDER
+# are set, an additional index file will be generated that can be used as input for
+# Qt's qhelpgenerator to generate a Qt Compressed Help (.qch) of the generated
+# HTML documentation.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE          =
+
+# The QHP_VIRTUAL_FOLDER tag specifies the virtual folder to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to add.
+# For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the custom filter to add. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this project's
+# filter section matches. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION           =
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
+# top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it.
+
+DISABLE_INDEX          = NO
+
+# This tag can be used to set the number of enum values (range [1..20])
+# that doxygen will group on one line in the generated HTML documentation.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+
+GENERATE_TREEVIEW      = NONE
+
+# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories,
+# and Class Hierarchy pages using a tree view instead of an ordered list.
+
+USE_INLINE_TREES       = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH         = 250
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE       = 10
+
+# When the SEARCHENGINE tag is enabled, doxygen will generate a search box for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using HTML help (GENERATE_HTMLHELP) or Qt help (GENERATE_QHP)
+# there is already a search function so this one should typically
+# be disabled.
+
+SEARCHENGINE           = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX         = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, a4wide, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE             = a4wide
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER           =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE        = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES     = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include source code with syntax highlighting in the LaTeX output. Note that which sources are shown also depends on other settings such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE      = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE    =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN           = YES
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION          = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA             =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD                =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader.
+# This is useful
+# if you want to understand what is going on.
+# On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION        = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF     = YES
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the include files
+# in the INCLUDE_PATH (see below) will be searched if a #include is found.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED             = DOXYGEN PRINTF_ATTRIBUTE(x,y)=
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all function-like macros that are alone
+# on a line, have an all uppercase name, and do not end with a semicolon. Such
+# function macros are typically used for boiler-plate code, and will confuse
+# the parser if not removed.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+#
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS        = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option is superseded by the HAVE_DOT option below. This is only a
+# fallback. It is recommended to install and use dot, since it yields more
+# powerful graphs.
+
+CLASS_DIAGRAMS         = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH            =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT               = NO
+
+# By default doxygen will write a font called FreeSans.ttf to the output
+# directory and reference it in all dot files that doxygen generates. This
+# font does not include all possible unicode characters however, so when you need
+# these (or just want a differently looking font) you can specify the font name
+# using DOT_FONTNAME. You need to make sure dot is able to find the font,
+# which can be done by putting it in a standard location or by setting the
+# DOTFONTPATH environment variable or by setting DOT_FONTPATH to the directory
+# containing the font.
+
+DOT_FONTNAME           = FreeSans
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the output directory to look for the
+# FreeSans.ttf font (which doxygen will put there itself). If you specify a
+# different font using DOT_FONTNAME you can set the path where dot
+# can find it using this tag.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct groups dependencies
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK               = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH          = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are png, jpg, or gif
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT       = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS           =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lie further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT        = YES
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP            = YES
diff --git a/ctdb/lib/tevent/libtevent.m4 b/ctdb/lib/tevent/libtevent.m4
new file mode 100644 (file)
index 0000000..0682f5b
--- /dev/null
@@ -0,0 +1,94 @@
+dnl Check to see if we should use the included tevent
+dnl
+dnl INCLUDED_TEVENT starts out as auto and may be overridden with
+dnl --with-included-tevent. Unless it is exactly yes the system tevent
+dnl is probed below and INCLUDED_TEVENT is resolved to yes or no.
+
+INCLUDED_TEVENT=auto
+AC_ARG_WITH(included-tevent,
+    [AC_HELP_STRING([--with-included-tevent], [use bundled tevent library, not from system])],
+    [ INCLUDED_TEVENT=$withval ])
+
+AC_SUBST(TEVENT_LIBS)
+AC_SUBST(TEVENT_CFLAGS)
+
+if test x"$INCLUDED_TEVENT" != x"yes" ; then
+    dnl The system library is accepted only when the header, the
+    dnl tevent_context_init symbol and the TEVENT_TRACE_BEFORE_WAIT
+    dnl declaration are all found. If any probe fails the bundled copy
+    dnl is selected, even when the user asked for no.
+    AC_CHECK_HEADERS(tevent.h)
+    AC_CHECK_LIB(tevent, tevent_context_init, [ TEVENT_LIBS="-ltevent" ])
+    AC_CHECK_DECLS([TEVENT_TRACE_BEFORE_WAIT],,, [[#include <tevent.h>]])
+    if test x"$ac_cv_header_tevent_h" = x"no" -o \
+       x"$ac_cv_lib_tevent_tevent_context_init" = x"no" -o \
+       x"$ac_cv_have_decl_TEVENT_TRACE_BEFORE_WAIT" = x"no" ; then
+        INCLUDED_TEVENT=yes
+        TEVENT_CFLAGS=""
+    else
+        INCLUDED_TEVENT=no
+    fi
+fi
+
+AC_MSG_CHECKING(whether to use included tevent)
+AC_MSG_RESULT($INCLUDED_TEVENT)
+if test x"$INCLUDED_TEVENT" != x"no" ; then
+    dnl find the tevent sources. This is meant to work both for
+    dnl standalone builds, and builds of packages using libtevent
+    dnl The first directory in teventpaths that contains tevent.c wins.
+       teventdir=""
+       teventpaths="$srcdir $srcdir/lib/tevent $srcdir/tevent $srcdir/../tevent"
+       for d in $teventpaths; do
+               if test -f "$d/tevent.c"; then
+                       teventdir="$d"
+            AC_SUBST(teventdir)
+                       break
+               fi
+       done
+       if test x"$teventdir" = "x"; then
+          AC_MSG_ERROR([cannot find tevent source in $teventpaths])
+       fi
+    dnl Object files that make up the bundled library. tevent_epoll.o is
+    dnl appended further down only when epoll is usable.
+    TEVENT_OBJ="tevent.o tevent_debug.o tevent_util.o"
+    TEVENT_OBJ="$TEVENT_OBJ tevent_fd.o tevent_timed.o tevent_immediate.o tevent_signal.o"
+    TEVENT_OBJ="$TEVENT_OBJ tevent_req.o tevent_wakeup.o tevent_queue.o"
+    TEVENT_OBJ="$TEVENT_OBJ tevent_standard.o tevent_select.o tevent_poll.o"
+    AC_SUBST(TEVENT_OBJ)
+
+    dnl Compile against the bundled headers and link nothing external.
+    TEVENT_CFLAGS="-I$teventdir"
+    AC_SUBST(TEVENT_CFLAGS)
+
+    TEVENT_LIBS=""
+    AC_SUBST(TEVENT_LIBS)
+
+    dnl The epoll backend needs both the header and epoll_create.
+    AC_CHECK_HEADERS(sys/epoll.h)
+    AC_CHECK_FUNCS(epoll_create)
+    if test x"$ac_cv_header_sys_epoll_h" = x"yes" -a x"$ac_cv_func_epoll_create" = x"yes"; then
+        TEVENT_OBJ="$TEVENT_OBJ tevent_epoll.o"
+        AC_DEFINE(HAVE_EPOLL, 1, [Whether epoll available])
+    fi
+
+    dnl TEVENT_NUM_SIGNALS becomes the largest of 64, NSIG, _NSIG,
+    dnl SIGRTMAX and twice SIGRTMIN. Values that could not be determined
+    dnl are empty and are skipped by the test -n guards.
+    tevent_num_signals_includes="$ac_includes_default
+    #include <signal.h>
+    "
+    tevent_num_signals=64
+    AC_CHECK_VALUEOF(NSIG, [$tevent_num_signals_includes])
+    v=$ac_cv_valueof_NSIG
+    test -n "$v" && test "$v" -gt "$tevent_num_signals" && {
+            tevent_num_signals=$v
+    }
+    AC_CHECK_VALUEOF(_NSIG, [$tevent_num_signals_includes])
+    v=$ac_cv_valueof__NSIG
+    test -n "$v" && test "$v" -gt "$tevent_num_signals" && {
+            tevent_num_signals=$v
+    }
+    AC_CHECK_VALUEOF(SIGRTMAX, [$tevent_num_signals_includes])
+    v=$ac_cv_valueof_SIGRTMAX
+    test -n "$v" && test "$v" -gt "$tevent_num_signals" && {
+            tevent_num_signals=$v
+    }
+    AC_CHECK_VALUEOF(SIGRTMIN, [$tevent_num_signals_includes])
+    v=$ac_cv_valueof_SIGRTMIN
+    test -n "$v" && {
+            v=`expr $v + $v`
+    }
+    test -n "$v" && test "$v" -gt "$tevent_num_signals" && {
+            tevent_num_signals=$v
+    }
+    AC_DEFINE_UNQUOTED(TEVENT_NUM_SIGNALS, $tevent_num_signals, [Max signal number value])
+
+    dnl Export list is only set when VERSIONSCRIPT is non-empty.
+    if test x"$VERSIONSCRIPT" != "x"; then
+        EXPORTSFILE=tevent.exports
+        AC_SUBST(EXPORTSFILE)
+    fi
+fi
diff --git a/ctdb/lib/tevent/pytevent.c b/ctdb/lib/tevent/pytevent.c
new file mode 100644 (file)
index 0000000..870f5aa
--- /dev/null
@@ -0,0 +1,766 @@
+/*
+   Unix SMB/CIFS implementation.
+   Python bindings for tevent
+
+   Copyright (C) Jelmer Vernooij 2010
+
+     ** NOTE! The following LGPL license applies to the tevent
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <Python.h>
+#include <tevent.h>
+
+void init_tevent(void);
+
+typedef struct { /* Python object backing tevent.Context */
+       PyObject_HEAD
+       struct tevent_context *ev; /* released with talloc_free() in py_tevent_context_dealloc */
+} TeventContext_Object;
+
+typedef struct { /* Python object backing tevent.Queue */
+       PyObject_HEAD
+       struct tevent_queue *queue; /* released with talloc_free() in py_tevent_queue_dealloc */
+} TeventQueue_Object;
+
+typedef struct { /* Python object backing tevent.Request */
+       PyObject_HEAD
+       struct tevent_req *req; /* released with talloc_free() in py_tevent_req_dealloc */
+} TeventReq_Object;
+
+typedef struct { /* Python object backing tevent.Signal */
+       PyObject_HEAD
+       struct tevent_signal *signal; /* released with talloc_free() in py_tevent_signal_dealloc */
+} TeventSignal_Object;
+
+typedef struct { /* object returned by Context.add_timer() */
+       PyObject_HEAD
+       struct tevent_timer *timer; /* set by py_tevent_context_add_timer */
+} TeventTimer_Object;
+
+typedef struct { /* object returned by Context.add_fd() */
+       PyObject_HEAD
+       struct tevent_fd *fd; /* set by py_tevent_context_add_fd */
+} TeventFd_Object;
+
+staticforward PyTypeObject TeventContext_Type;
+staticforward PyTypeObject TeventReq_Type;
+staticforward PyTypeObject TeventQueue_Type;
+staticforward PyTypeObject TeventSignal_Type;
+staticforward PyTypeObject TeventTimer_Type;
+staticforward PyTypeObject TeventFd_Type;
+
+static int py_context_init(struct tevent_context *ev)
+{
+       /* FIXME: not implemented. Placeholder for the .context_init hook of
+        * py_tevent_ops; ev is ignored and 0 is always returned. */
+       return 0;
+}
+
+static struct tevent_fd *py_add_fd(struct tevent_context *ev,
+                                   TALLOC_CTX *mem_ctx,
+                                   int fd, uint16_t flags,
+                                   tevent_fd_handler_t handler,
+                                   void *private_data,
+                                   const char *handler_name,
+                                   const char *location)
+{
+       /* FIXME: not implemented. Placeholder for the .add_fd hook of
+        * py_tevent_ops; all arguments are ignored and NULL is returned. */
+       return NULL;
+}
+
+static void py_set_fd_close_fn(struct tevent_fd *fde,
+                               tevent_fd_close_fn_t close_fn)
+{
+       /* FIXME: not implemented. Placeholder for the .set_fd_close_fn hook
+        * of py_tevent_ops; close_fn is not recorded anywhere. */
+}
+
+static uint16_t py_get_fd_flags(struct tevent_fd *fde)
+{
+       /* FIXME: not implemented. Placeholder for the .get_fd_flags hook of
+        * py_tevent_ops; always reports 0 regardless of fde. */
+       return 0;
+}
+
+static void py_set_fd_flags(struct tevent_fd *fde, uint16_t flags)
+{
+       /* FIXME: not implemented. Placeholder for the .set_fd_flags hook of
+        * py_tevent_ops; flags are discarded. */
+}
+
+/* timed_event functions */
+static struct tevent_timer *py_add_timer(struct tevent_context *ev,
+                                         TALLOC_CTX *mem_ctx,
+                                         struct timeval next_event,
+                                         tevent_timer_handler_t handler,
+                                         void *private_data,
+                                         const char *handler_name,
+                                         const char *location)
+{
+       /* FIXME: not implemented. Placeholder for the .add_timer hook of
+        * py_tevent_ops; all arguments are ignored and NULL is returned. */
+       return NULL;
+}
+
+/* immediate event functions */
+static void py_schedule_immediate(struct tevent_immediate *im,
+                                  struct tevent_context *ev,
+                                  tevent_immediate_handler_t handler,
+                                  void *private_data,
+                                  const char *handler_name,
+                                  const char *location)
+{
+       /* FIXME: not implemented. Placeholder for the .schedule_immediate
+        * hook of py_tevent_ops; nothing is scheduled. */
+}
+
+/* signal functions */
+static struct tevent_signal *py_add_signal(struct tevent_context *ev,
+                                           TALLOC_CTX *mem_ctx,
+                                           int signum, int sa_flags,
+                                           tevent_signal_handler_t handler,
+                                           void *private_data,
+                                           const char *handler_name,
+                                           const char *location)
+{
+       /* FIXME: not implemented. Placeholder for the .add_signal hook of
+        * py_tevent_ops; all arguments are ignored and NULL is returned. */
+       return NULL;
+}
+
+/* loop functions */
+static int py_loop_once(struct tevent_context *ev, const char *location)
+{
+       /* FIXME: not implemented. Placeholder for the .loop_once hook of
+        * py_tevent_ops; processes nothing and returns 0. */
+       return 0;
+}
+
+static int py_loop_wait(struct tevent_context *ev, const char *location)
+{
+       /* FIXME: not implemented. Placeholder for the .loop_wait hook of
+        * py_tevent_ops; returns 0 immediately without waiting. */
+       return 0;
+}
+
+/*
+ * Operations table handed to tevent_register_backend() by
+ * py_register_backend(). Every hook currently points at an unimplemented
+ * placeholder.
+ *
+ * The storage-class specifier is written first ("static const"); placing
+ * it after the qualifier is an obsolescent form that compilers warn about.
+ */
+static const struct tevent_ops py_tevent_ops = {
+       .context_init = py_context_init,
+       .add_fd = py_add_fd,
+       .set_fd_close_fn = py_set_fd_close_fn,
+       .get_fd_flags = py_get_fd_flags,
+       .set_fd_flags = py_set_fd_flags,
+       .add_timer = py_add_timer,
+       .schedule_immediate = py_schedule_immediate,
+       .add_signal = py_add_signal,
+       .loop_wait = py_loop_wait,
+       .loop_once = py_loop_once,
+};
+
+/*
+ * register_backend(backend): register the py_tevent_ops table with tevent
+ * under the name found in backend.name.
+ *
+ * Raises AttributeError when the object has no "name" attribute, TypeError
+ * when the name is not a str, and RuntimeError when tevent refuses the
+ * registration. Returns None on success.
+ */
+static PyObject *py_register_backend(PyObject *self, PyObject *args)
+{
+       PyObject *name, *py_backend;
+
+       if (!PyArg_ParseTuple(args, "O", &py_backend))
+               return NULL;
+
+       /* PyObject_GetAttrString() hands back a new reference that we own */
+       name = PyObject_GetAttrString(py_backend, "name");
+       if (name == NULL) {
+               PyErr_SetNone(PyExc_AttributeError);
+               return NULL;
+       }
+
+       if (!PyString_Check(name)) {
+               PyErr_SetNone(PyExc_TypeError);
+               Py_DECREF(name);
+               return NULL;
+       }
+
+       if (!tevent_register_backend(PyString_AsString(name), &py_tevent_ops)) { /* FIXME: What to do with backend */
+               PyErr_SetNone(PyExc_RuntimeError);
+               Py_DECREF(name);
+               return NULL;
+       }
+
+       /*
+        * The reference to name is deliberately kept on success: the char
+        * buffer passed to tevent_register_backend() is owned by this string
+        * object. NOTE(review): release it here only once it is confirmed
+        * that tevent copies the name instead of storing the pointer.
+        */
+       Py_RETURN_NONE;
+}
+
+/*
+ * S.reinitialise(): call tevent_re_initialise() on the wrapped context.
+ * Returns None when it reports 0, raises RuntimeError otherwise.
+ */
+static PyObject *py_tevent_context_reinitialise(TeventContext_Object *self)
+{
+       if (tevent_re_initialise(self->ev) == 0) {
+               Py_RETURN_NONE;
+       }
+
+       PyErr_SetNone(PyExc_RuntimeError);
+       return NULL;
+}
+
+/* S.stop(): forward to tevent_queue_stop() and return None. */
+static PyObject *py_tevent_queue_stop(TeventQueue_Object *self)
+{
+       struct tevent_queue *q = self->queue;
+
+       tevent_queue_stop(q);
+
+       Py_RETURN_NONE;
+}
+
+/* S.start(): forward to tevent_queue_start() and return None. */
+static PyObject *py_tevent_queue_start(TeventQueue_Object *self)
+{
+       struct tevent_queue *q = self->queue;
+
+       tevent_queue_start(q);
+
+       Py_RETURN_NONE;
+}
+
+/*
+ * Queue trigger installed by py_tevent_queue_add(): private_data is the
+ * Python callable, which is invoked without arguments. The result is
+ * discarded; a failed call leaves its exception pending.
+ */
+static void py_queue_trigger(struct tevent_req *req, void *private_data)
+{
+       PyObject *result;
+
+       result = PyObject_CallFunction((PyObject *)private_data, "");
+       if (result != NULL) {
+               Py_DECREF(result);
+       }
+}
+
+/*
+ * S.add(ctx, req, trigger): append req to the queue; trigger is called
+ * through py_queue_trigger(). Raises RuntimeError("queue add failed") when
+ * tevent_queue_add() reports failure.
+ */
+static PyObject *py_tevent_queue_add(TeventQueue_Object *self, PyObject *args)
+{
+       TeventContext_Object *ctx_obj;
+       TeventReq_Object *req_obj;
+       PyObject *callback;
+
+       if (!PyArg_ParseTuple(args, "O!O!O",
+                             &TeventContext_Type, &ctx_obj,
+                             &TeventReq_Type, &req_obj,
+                             &callback)) {
+               return NULL;
+       }
+
+       /* this reference travels with the queue entry as private data */
+       Py_INCREF(callback);
+
+       if (tevent_queue_add(self->queue, ctx_obj->ev, req_obj->req,
+                            py_queue_trigger, callback)) {
+               Py_RETURN_NONE;
+       }
+
+       PyErr_SetString(PyExc_RuntimeError, "queue add failed");
+       Py_DECREF(callback);
+       return NULL;
+}
+
+/*
+ * Methods of tevent.Queue. The add() docstring lists exactly the three
+ * arguments that py_tevent_queue_add() parses ("O!O!O"); the previously
+ * advertised fourth "baton" argument was never accepted.
+ */
+static PyMethodDef py_tevent_queue_methods[] = {
+       { "stop", (PyCFunction)py_tevent_queue_stop, METH_NOARGS,
+               "S.stop()" },
+       { "start", (PyCFunction)py_tevent_queue_start, METH_NOARGS,
+               "S.start()" },
+       { "add", (PyCFunction)py_tevent_queue_add, METH_VARARGS,
+               "S.add(ctx, req, trigger)" },
+       { NULL },
+};
+
+static PyObject *py_tevent_context_wakeup_send(PyObject *self, PyObject *args)
+{
+       /* FIXME: not implemented. args is not parsed and None is returned,
+        * although the method table advertises "-> req". */
+
+       Py_RETURN_NONE;
+}
+
+/*
+ * S.loop_wait(): run tevent_loop_wait() on the wrapped context.
+ * Returns None when it reports 0, raises RuntimeError otherwise.
+ */
+static PyObject *py_tevent_context_loop_wait(TeventContext_Object *self)
+{
+       int rc = tevent_loop_wait(self->ev);
+
+       if (rc == 0) {
+               Py_RETURN_NONE;
+       }
+
+       PyErr_SetNone(PyExc_RuntimeError);
+       return NULL;
+}
+
+/*
+ * S.loop_once(): run tevent_loop_once() on the wrapped context.
+ * Returns None when it reports 0, raises RuntimeError otherwise.
+ */
+static PyObject *py_tevent_context_loop_once(TeventContext_Object *self)
+{
+       int rc = tevent_loop_once(self->ev);
+
+       if (rc == 0) {
+               Py_RETURN_NONE;
+       }
+
+       PyErr_SetNone(PyExc_RuntimeError);
+       return NULL;
+}
+
+#ifdef TEVENT_DEPRECATED
+/*
+ * Predicate passed to tevent_loop_until(): calls the Python callable and
+ * converts its result with PyObject_IsTrue(). A failed call counts as
+ * "finished" so that the loop stops; the exception stays pending.
+ */
+static bool py_tevent_finished(PyObject *callback)
+{
+       PyObject *result = PyObject_CallFunction(callback, "");
+       bool finished = true;
+
+       if (result != NULL) {
+               finished = PyObject_IsTrue(result);
+               Py_DECREF(result);
+       }
+
+       return finished;
+}
+
+/*
+ * S.loop_until(callback): run the loop until callback() is true.
+ * Raises RuntimeError when tevent_loop_until() fails; otherwise an
+ * exception left pending by the callback is propagated.
+ */
+static PyObject *py_tevent_context_loop_until(TeventContext_Object *self, PyObject *args)
+{
+       PyObject *finished_cb;
+       int rc;
+
+       if (!PyArg_ParseTuple(args, "O", &finished_cb)) {
+               return NULL;
+       }
+
+       rc = tevent_loop_until(self->ev, py_tevent_finished, finished_cb);
+       if (rc != 0) {
+               PyErr_SetNone(PyExc_RuntimeError);
+               return NULL;
+       }
+
+       if (PyErr_Occurred() != NULL) {
+               return NULL;
+       }
+
+       Py_RETURN_NONE;
+}
+#endif
+
+/*
+ * tevent signal callback: private_data is the Python callable registered
+ * through Context.add_signal(); it is called as handler(signum, count).
+ * The result is discarded.
+ */
+static void py_tevent_signal_handler(struct tevent_context *ev,
+                                       struct tevent_signal *se,
+                                       int signum,
+                                       int count,
+                                       void *siginfo,
+                                       void *private_data)
+{
+       PyObject *py_handler = (PyObject *)private_data;
+       PyObject *result;
+
+       result = PyObject_CallFunction(py_handler, "ii", signum, count);
+       if (result != NULL) {
+               Py_DECREF(result);
+       }
+}
+
+/* tp_dealloc of tevent.Signal: free the tevent signal, then the object. */
+static void py_tevent_signal_dealloc(TeventSignal_Object *self)
+{
+       struct tevent_signal *sig = self->signal;
+
+       talloc_free(sig);
+       PyObject_Del(self);
+}
+
+/* Type object for tevent.Signal, created by Context.add_signal(). */
+static PyTypeObject TeventSignal_Type = {
+       .tp_name      = "tevent.Signal",
+       .tp_flags     = Py_TPFLAGS_DEFAULT,
+       .tp_basicsize = sizeof(TeventSignal_Object),
+       .tp_dealloc   = (destructor)py_tevent_signal_dealloc,
+};
+
+/*
+ * S.add_signal(signum, sa_flags, handler) -> signal
+ *
+ * Registers handler(signum, count) for signum on this context and returns
+ * a tevent.Signal wrapping the registration. Raises RuntimeError when
+ * tevent_add_signal() fails and MemoryError when the wrapper cannot be
+ * allocated.
+ */
+static PyObject *py_tevent_context_add_signal(TeventContext_Object *self, PyObject *args)
+{
+       int signum, sa_flags;
+       PyObject *handler;
+       struct tevent_signal *sig;
+       TeventSignal_Object *ret;
+
+       if (!PyArg_ParseTuple(args, "iiO", &signum, &sa_flags, &handler))
+               return NULL;
+
+       /* tevent keeps the callable as private data, so hold a reference */
+       Py_INCREF(handler);
+       sig = tevent_add_signal(self->ev, NULL, signum, sa_flags,
+                               py_tevent_signal_handler, handler);
+       if (sig == NULL) {
+               /* same reporting as add_timer()/add_fd() */
+               PyErr_SetNone(PyExc_RuntimeError);
+               Py_DECREF(handler);
+               return NULL;
+       }
+
+       ret = PyObject_New(TeventSignal_Object, &TeventSignal_Type);
+       if (ret == NULL) {
+               PyErr_NoMemory();
+               talloc_free(sig);
+               /* the registration is gone, nothing will call handler */
+               Py_DECREF(handler);
+               return NULL;
+       }
+
+       ret->signal = sig;
+
+       return (PyObject *)ret;
+}
+
+/*
+ * tevent timer callback: private_data is the Python callable registered
+ * through Context.add_timer(). It receives the timer pointer formatted
+ * with "l"; the result is discarded.
+ * NOTE(review): "l" expects a long, te is a pointer - confirm this is
+ * intended on platforms where the two differ in size.
+ */
+static void py_timer_handler(struct tevent_context *ev,
+                                      struct tevent_timer *te,
+                                      struct timeval current_time,
+                                      void *private_data)
+{
+       PyObject *py_handler = private_data;
+       PyObject *result;
+
+       result = PyObject_CallFunction(py_handler, "l", te);
+       if (result != NULL) {
+               Py_DECREF(result);
+       }
+}
+
+/*
+ * Type object for the value returned by Context.add_timer().
+ *
+ * The file only forward-declares TeventTimer_Type; without a definition
+ * it is zero-initialised, tp_basicsize is then inherited from the base
+ * object type, and PyObject_New() allocates less than
+ * sizeof(TeventTimer_Object), so storing ->timer writes past the object.
+ * No tp_dealloc is installed: the inherited one only releases the object.
+ */
+static PyTypeObject TeventTimer_Type = {
+       .tp_name = "tevent.Timer",
+       .tp_basicsize = sizeof(TeventTimer_Object),
+       .tp_flags = Py_TPFLAGS_DEFAULT,
+};
+
+/*
+ * S.add_timer(next_event, handler) -> timer
+ *
+ * next_event is an integer that becomes tv_sec of the struct timeval
+ * passed to tevent_add_timer(); tv_usec is set to 0. Raises RuntimeError
+ * when tevent_add_timer() fails and MemoryError when the wrapper cannot
+ * be allocated.
+ */
+static PyObject *py_tevent_context_add_timer(TeventContext_Object *self, PyObject *args)
+{
+       TeventTimer_Object *ret;
+       struct timeval next_event;
+       struct tevent_timer *timer;
+       PyObject *handler;
+       long secs;
+
+       /*
+        * "l" stores exactly one long. Parse into a real long instead of
+        * the struct timeval itself, which left tv_usec uninitialised.
+        */
+       if (!PyArg_ParseTuple(args, "lO", &secs, &handler))
+               return NULL;
+
+       next_event.tv_sec = secs;
+       next_event.tv_usec = 0;
+
+       /* tevent keeps the callable as private data, so hold a reference */
+       Py_INCREF(handler);
+       timer = tevent_add_timer(self->ev, NULL, next_event, py_timer_handler,
+                                handler);
+       if (timer == NULL) {
+               PyErr_SetNone(PyExc_RuntimeError);
+               Py_DECREF(handler);
+               return NULL;
+       }
+
+       ret = PyObject_New(TeventTimer_Object, &TeventTimer_Type);
+       if (ret == NULL) {
+               PyErr_NoMemory();
+               talloc_free(timer);
+               Py_DECREF(handler);
+               return NULL;
+       }
+       ret->timer = timer;
+
+       return (PyObject *)ret;
+}
+
+/*
+ * tevent fd callback: private_data is the Python callable registered
+ * through Context.add_fd(); it is called as handler(flags). The result is
+ * discarded.
+ */
+static void py_fd_handler(struct tevent_context *ev,
+                                   struct tevent_fd *fde,
+                                   uint16_t flags,
+                                   void *private_data)
+{
+       PyObject *py_handler = private_data;
+       PyObject *result;
+
+       result = PyObject_CallFunction(py_handler, "i", flags);
+       if (result != NULL) {
+               Py_DECREF(result);
+       }
+}
+
+/*
+ * Type object for the value returned by Context.add_fd().
+ *
+ * The file only forward-declares TeventFd_Type; without a definition it
+ * is zero-initialised, tp_basicsize is then inherited from the base
+ * object type, and PyObject_New() allocates less than
+ * sizeof(TeventFd_Object), so storing ->fd writes past the object.
+ * No tp_dealloc is installed: the inherited one only releases the object.
+ */
+static PyTypeObject TeventFd_Type = {
+       .tp_name = "tevent.Fd",
+       .tp_basicsize = sizeof(TeventFd_Object),
+       .tp_flags = Py_TPFLAGS_DEFAULT,
+};
+
+/*
+ * S.add_fd(fd, flags, handler) -> fd
+ *
+ * Registers handler(flags) for events on file descriptor fd. Raises
+ * RuntimeError when tevent_add_fd() fails and MemoryError when the
+ * wrapper cannot be allocated.
+ */
+static PyObject *py_tevent_context_add_fd(TeventContext_Object *self, PyObject *args)
+{
+       int fd, flags;
+       PyObject *handler;
+       struct tevent_fd *tfd;
+       TeventFd_Object *ret;
+
+       if (!PyArg_ParseTuple(args, "iiO", &fd, &flags, &handler))
+               return NULL;
+
+       /* tevent keeps the callable as private data, so hold a reference */
+       Py_INCREF(handler);
+       tfd = tevent_add_fd(self->ev, NULL, fd, flags, py_fd_handler, handler);
+       if (tfd == NULL) {
+               PyErr_SetNone(PyExc_RuntimeError);
+               Py_DECREF(handler);
+               return NULL;
+       }
+
+       ret = PyObject_New(TeventFd_Object, &TeventFd_Type);
+       if (ret == NULL) {
+               PyErr_NoMemory();
+               talloc_free(tfd);
+               Py_DECREF(handler);
+               return NULL;
+       }
+       ret->fd = tfd;
+
+       return (PyObject *)ret;
+}
+
+#ifdef TEVENT_DEPRECATED
+/* S.allow_nesting(): call tevent_loop_allow_nesting() and return None. */
+static PyObject *py_tevent_context_set_allow_nesting(TeventContext_Object *self)
+{
+       struct tevent_context *ctx = self->ev;
+
+       tevent_loop_allow_nesting(ctx);
+
+       Py_RETURN_NONE;
+}
+#endif
+
+static PyMethodDef py_tevent_context_methods[] = { /* methods of tevent.Context */
+       { "reinitialise", (PyCFunction)py_tevent_context_reinitialise, METH_NOARGS,
+               "S.reinitialise()" },
+       { "wakeup_send", (PyCFunction)py_tevent_context_wakeup_send, 
+               METH_VARARGS, "S.wakeup_send(wakeup_time) -> req" }, /* handler is still a stub returning None */
+       { "loop_wait", (PyCFunction)py_tevent_context_loop_wait,
+               METH_NOARGS, "S.loop_wait()" },
+       { "loop_once", (PyCFunction)py_tevent_context_loop_once,
+               METH_NOARGS, "S.loop_once()" },
+#ifdef TEVENT_DEPRECATED
+       { "loop_until", (PyCFunction)py_tevent_context_loop_until,
+               METH_VARARGS, "S.loop_until(callback)" },
+#endif
+       { "add_signal", (PyCFunction)py_tevent_context_add_signal,
+               METH_VARARGS, "S.add_signal(signum, sa_flags, handler) -> signal" },
+       { "add_timer", (PyCFunction)py_tevent_context_add_timer,
+               METH_VARARGS, "S.add_timer(next_event, handler) -> timer" },
+       { "add_fd", (PyCFunction)py_tevent_context_add_fd, 
+               METH_VARARGS, "S.add_fd(fd, flags, handler) -> fd" },
+#ifdef TEVENT_DEPRECATED
+       { "allow_nesting", (PyCFunction)py_tevent_context_set_allow_nesting, 
+               METH_NOARGS, "Whether to allow nested tevent loops." },
+#endif
+       { NULL }, /* sentinel terminating the table */
+};
+
+static PyObject *py_tevent_req_wakeup_recv(PyObject *self)
+{
+       /* FIXME: not implemented; Request.wakeup_recv() is a no-op that
+        * returns None. */
+       Py_RETURN_NONE;
+}
+
+static PyObject *py_tevent_req_received(PyObject *self)
+{
+       /* FIXME: not implemented; Request.received() is a no-op that
+        * returns None. */
+       Py_RETURN_NONE;
+}
+
+static PyObject *py_tevent_req_is_error(PyObject *self)
+{
+       /* FIXME: not implemented; returns None although the method table
+        * advertises "is_error() -> (error, state)". */
+       Py_RETURN_NONE;
+}
+
+static PyObject *py_tevent_req_poll(PyObject *self)
+{
+       /* FIXME: not implemented; registered as METH_VARARGS "poll(ctx)" but
+        * the argument tuple is not received here and None is returned. */
+       Py_RETURN_NONE;
+}
+
+static PyObject *py_tevent_req_is_in_progress(PyObject *self)
+{
+       /* FIXME: not implemented; getter behind Request.in_progress, which
+        * therefore always evaluates to None. */
+       Py_RETURN_NONE;
+}
+
+static PyGetSetDef py_tevent_req_getsetters[] = { /* read-only attributes of tevent.Request */
+       { "in_progress", (getter)py_tevent_req_is_in_progress, NULL,
+               "Whether the request is in progress" },
+       { NULL } /* sentinel terminating the table */
+};
+
+static PyObject *py_tevent_req_post(PyObject *self, PyObject *args)
+{
+       /* FIXME: not implemented; args is not parsed and None is returned,
+        * although the method table advertises "post(ctx) -> req". */
+       Py_RETURN_NONE;
+}
+
+static PyObject *py_tevent_req_set_error(PyObject *self, PyObject *args)
+{
+       /* FIXME: not implemented; args is not parsed, no error is recorded
+        * and None is returned. */
+       Py_RETURN_NONE;
+}
+
+static PyObject *py_tevent_req_done(PyObject *self)
+{
+       /* FIXME: not implemented; Request.done() is a no-op that returns
+        * None. */
+       Py_RETURN_NONE;
+}
+
+static PyObject *py_tevent_req_notify_callback(PyObject *self)
+{
+       /* FIXME: not implemented; Request.notify_callback() is a no-op that
+        * returns None. */
+       Py_RETURN_NONE;
+}
+
+static PyObject *py_tevent_req_set_endtime(PyObject *self, PyObject *args)
+{
+       /* FIXME: not implemented; args is not parsed, no end time is set
+        * and None is returned. */
+       Py_RETURN_NONE;
+}
+
+/*
+ * cancel(): call tevent_req_cancel() on the wrapped request.
+ * Returns None when it reports success, raises RuntimeError otherwise.
+ */
+static PyObject *py_tevent_req_cancel(TeventReq_Object *self)
+{
+       if (tevent_req_cancel(self->req)) {
+               Py_RETURN_NONE;
+       }
+
+       PyErr_SetNone(PyExc_RuntimeError);
+       return NULL;
+}
+
+static PyMethodDef py_tevent_req_methods[] = { /* methods of tevent.Request; all but cancel are stubs */
+       { "wakeup_recv", (PyCFunction)py_tevent_req_wakeup_recv, METH_NOARGS,
+               "Wakeup received" },
+       { "received", (PyCFunction)py_tevent_req_received, METH_NOARGS,
+               "Receive finished" },
+       { "is_error", (PyCFunction)py_tevent_req_is_error, METH_NOARGS,
+               "is_error() -> (error, state)" },
+       { "poll", (PyCFunction)py_tevent_req_poll, METH_VARARGS,
+               "poll(ctx)" },
+       { "post", (PyCFunction)py_tevent_req_post, METH_VARARGS,
+               "post(ctx) -> req" },
+       { "set_error", (PyCFunction)py_tevent_req_set_error, METH_VARARGS,
+               "set_error(error)" },
+       { "done", (PyCFunction)py_tevent_req_done, METH_NOARGS,
+               "done()" },
+       { "notify_callback", (PyCFunction)py_tevent_req_notify_callback,
+               METH_NOARGS, "notify_callback()" },
+       { "set_endtime", (PyCFunction)py_tevent_req_set_endtime,
+               METH_VARARGS, "set_endtime(ctx, endtime)" },
+       { "cancel", (PyCFunction)py_tevent_req_cancel,
+               METH_NOARGS, "cancel()" },
+       { NULL } /* sentinel terminating the table */
+};
+
+/* tp_dealloc of tevent.Request: free the tevent request, then the object. */
+static void py_tevent_req_dealloc(TeventReq_Object *self)
+{
+       struct tevent_req *req = self->req;
+
+       talloc_free(req);
+       PyObject_DEL(self);
+}
+
+/*
+ * Type object for tevent.Request.
+ *
+ * tp_flags is set to Py_TPFLAGS_DEFAULT like every other type in this
+ * module; it was the only one left at 0, which tells the interpreter
+ * that the optional type slots covered by the default flags are absent.
+ * There is no tp_new yet, so instances cannot be created from Python.
+ */
+static PyTypeObject TeventReq_Type = {
+       .tp_name = "tevent.Request",
+       .tp_basicsize = sizeof(TeventReq_Object),
+       .tp_methods = py_tevent_req_methods,
+       .tp_dealloc = (destructor)py_tevent_req_dealloc,
+       .tp_flags = Py_TPFLAGS_DEFAULT,
+       .tp_getset = py_tevent_req_getsetters,
+       /* FIXME: .tp_new = py_tevent_req_new, */
+};
+
+/* Getter for Queue.length: tevent_queue_length() as a Python int. */
+static PyObject *py_tevent_queue_get_length(TeventQueue_Object *self)
+{
+       struct tevent_queue *q = self->queue;
+
+       return PyInt_FromLong(tevent_queue_length(q));
+}
+
+static PyGetSetDef py_tevent_queue_getsetters[] = { /* read-only attributes of tevent.Queue */
+       { "length", (getter)py_tevent_queue_get_length,
+               NULL, "The number of elements in the queue." },
+       { NULL }, /* sentinel terminating the table */
+};
+
+/* tp_dealloc of tevent.Queue: free the tevent queue, then the object. */
+static void py_tevent_queue_dealloc(TeventQueue_Object *self)
+{
+       struct tevent_queue *q = self->queue;
+
+       talloc_free(q);
+       PyObject_Del(self);
+}
+
+/* Type object for tevent.Queue (no tp_new is installed). */
+static PyTypeObject TeventQueue_Type = {
+       .tp_name      = "tevent.Queue",
+       .tp_flags     = Py_TPFLAGS_DEFAULT,
+       .tp_basicsize = sizeof(TeventQueue_Object),
+       .tp_methods   = py_tevent_queue_methods,
+       .tp_getset    = py_tevent_queue_getsetters,
+       .tp_dealloc   = (destructor)py_tevent_queue_dealloc,
+};
+
+/* Getter for Context.signal_support: tevent_signal_support() as a bool. */
+static PyObject *py_tevent_context_signal_support(PyObject *_self)
+{
+       struct tevent_context *ctx = ((TeventContext_Object *)_self)->ev;
+
+       return PyBool_FromLong(tevent_signal_support(ctx));
+}
+
+static PyGetSetDef py_tevent_context_getsetters[] = { /* read-only attributes of tevent.Context */
+       { "signal_support", (getter)py_tevent_context_signal_support,
+               NULL, "if this platform and tevent context support signal handling" },
+       { NULL } /* sentinel terminating the table */
+};
+
+/* tp_dealloc of tevent.Context: free the tevent context, then the object. */
+static void py_tevent_context_dealloc(TeventContext_Object *self)
+{
+       struct tevent_context *ctx = self->ev;
+
+       talloc_free(ctx);
+       PyObject_Del(self);
+}
+
+/*
+ * tevent.Context(name=None): create an event context. With a name the
+ * context is created by tevent_context_init_byname(), otherwise by
+ * tevent_context_init(). Raises RuntimeError when tevent returns NULL and
+ * MemoryError when the Python object cannot be allocated.
+ */
+static PyObject *py_tevent_context_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
+{
+       const char * const kwnames[] = { "name", NULL };
+       char *backend = NULL;
+       struct tevent_context *ctx;
+       TeventContext_Object *obj;
+
+       if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|s", kwnames, &backend)) {
+               return NULL;
+       }
+
+       if (backend != NULL) {
+               ctx = tevent_context_init_byname(NULL, backend);
+       } else {
+               ctx = tevent_context_init(NULL);
+       }
+       if (ctx == NULL) {
+               PyErr_SetNone(PyExc_RuntimeError);
+               return NULL;
+       }
+
+       obj = PyObject_New(TeventContext_Object, type);
+       if (obj == NULL) {
+               PyErr_NoMemory();
+               talloc_free(ctx);
+               return NULL;
+       }
+       obj->ev = ctx;
+
+       return (PyObject *)obj;
+}
+
+/* Type object for tevent.Context. */
+static PyTypeObject TeventContext_Type = {
+       .tp_name      = "tevent.Context",
+       .tp_flags     = Py_TPFLAGS_DEFAULT,
+       .tp_basicsize = sizeof(TeventContext_Object),
+       .tp_new       = py_tevent_context_new,
+       .tp_dealloc   = (destructor)py_tevent_context_dealloc,
+       .tp_methods   = py_tevent_context_methods,
+       .tp_getset    = py_tevent_context_getsetters,
+};
+
+/*
+ * set_default_backend(backend): pass the given name to
+ * tevent_set_default_backend() and return None.
+ * NOTE(review): the string buffer belongs to the argument object - confirm
+ * that tevent does not keep the pointer beyond this call.
+ */
+static PyObject *py_set_default_backend(PyObject *self, PyObject *args)
+{
+       char *name;
+
+       if (PyArg_ParseTuple(args, "s", &name)) {
+               tevent_set_default_backend(name);
+               Py_RETURN_NONE;
+       }
+
+       return NULL;
+}
+
+/*
+ * backend_list() -> list
+ *
+ * Returns the names reported by tevent_backend_list() as a list of str.
+ * Raises RuntimeError when tevent returns no list; a failure while
+ * building the Python list propagates the exception that caused it.
+ */
+static PyObject *py_backend_list(PyObject *self)
+{
+       PyObject *ret;
+       int i;
+       const char **backends;
+
+       ret = PyList_New(0);
+       if (ret == NULL) {
+               return NULL;
+       }
+
+       backends = tevent_backend_list(NULL);
+       if (backends == NULL) {
+               PyErr_SetNone(PyExc_RuntimeError);
+               Py_DECREF(ret);
+               return NULL;
+       }
+       for (i = 0; backends[i]; i++) {
+               PyObject *item = PyString_FromString(backends[i]);
+               int rc;
+
+               if (item == NULL) {
+                       goto fail;
+               }
+               /*
+                * PyList_Append() takes its own reference, so ours has to
+                * be dropped whether or not the append succeeded.
+                */
+               rc = PyList_Append(ret, item);
+               Py_DECREF(item);
+               if (rc != 0) {
+                       goto fail;
+               }
+       }
+
+       talloc_free(backends);
+
+       return ret;
+
+fail:
+       talloc_free(backends);
+       Py_DECREF(ret);
+       return NULL;
+}
+
+static PyMethodDef tevent_methods[] = { /* module-level functions of _tevent */
+       { "register_backend", (PyCFunction)py_register_backend, METH_VARARGS,
+               "register_backend(backend)" },
+       { "set_default_backend", (PyCFunction)py_set_default_backend, 
+               METH_VARARGS, "set_default_backend(backend)" },
+       { "backend_list", (PyCFunction)py_backend_list, 
+               METH_NOARGS, "backend_list() -> list" },
+       { NULL }, /* sentinel terminating the table */
+};
+
+/*
+ * Module initialisation for "_tevent": readies every type object, creates
+ * the module, publishes the types under their Python names and sets
+ * __version__ to PACKAGE_VERSION. Returns silently if a type cannot be
+ * readied or the module cannot be created.
+ */
+void init_tevent(void)
+{
+       static const struct {
+               const char *attr;
+               PyTypeObject *type;
+       } exported[] = {
+               { "Context", &TeventContext_Type },
+               { "Queue",   &TeventQueue_Type },
+               { "Request", &TeventReq_Type },
+               { "Signal",  &TeventSignal_Type },
+               { "Timer",   &TeventTimer_Type },
+               { "Fd",      &TeventFd_Type },
+       };
+       const size_t num_exported = sizeof(exported) / sizeof(exported[0]);
+       PyObject *module;
+       size_t i;
+
+       /* all types must be ready before the module is created */
+       for (i = 0; i < num_exported; i++) {
+               if (PyType_Ready(exported[i].type) < 0) {
+                       return;
+               }
+       }
+
+       module = Py_InitModule3("_tevent", tevent_methods, "Tevent integration for twisted.");
+       if (module == NULL) {
+               return;
+       }
+
+       /* PyModule_AddObject() takes over one reference per type */
+       for (i = 0; i < num_exported; i++) {
+               Py_INCREF(exported[i].type);
+               PyModule_AddObject(module, exported[i].attr,
+                                  (PyObject *)exported[i].type);
+       }
+
+       PyModule_AddObject(module, "__version__", PyString_FromString(PACKAGE_VERSION));
+}
diff --git a/ctdb/lib/tevent/release-script.sh b/ctdb/lib/tevent/release-script.sh
new file mode 100755 (executable)
index 0000000..077f562
--- /dev/null
@@ -0,0 +1,48 @@
+#!/bin/bash
+#
+# Build tevent-<version>.tar.gz from the git tag tevent-<version with the
+# dots replaced by dashes>.
+#
+# Usage: release-script.sh <version>   (run from the samba base directory)
+#
+# The tag is checked out on a temporary branch, the version in
+# lib/tevent/configure.ac is compared with the requested one, lib/tevent and
+# lib/replace are copied into a staging directory, autogen.sh is run there
+# and the directory is packed. Untracked files below lib/tevent and
+# lib/replace are removed first.
+
+if [ "$1" = "" ]; then
+    echo "Please provide version string, eg: 1.2.0"
+    exit 1
+fi
+
+if [ ! -d "lib/tevent" ]; then
+    echo "Run this script from the samba base directory."
+    exit 1
+fi
+
+git clean -f -x -d lib/tevent
+git clean -f -x -d lib/replace
+
+# Remember the current branch so that it can be restored at the end.
+# "git branch" is used because the dashed "git-branch" form is not on PATH
+# in current git installations.
+curbranch=`git branch | grep "^*" | tr -d "* "`
+
+version=$1
+strver=`echo ${version} | tr "." "-"`
+
+# Checkout the release tag
+git branch -f tevent-release-script-${strver} tevent-${strver}
+if [ ! "$?" = "0" ];  then
+    echo "Unable to checkout tevent-${strver} release"
+    exit 1
+fi
+
+# Stop here if the switch fails, otherwise the tarball would be built from
+# whatever happens to be checked out.
+git checkout tevent-release-script-${strver}
+if [ ! "$?" = "0" ];  then
+    echo "Unable to checkout tevent-${strver} release"
+    exit 1
+fi
+
+# Test configure agrees with us
+confver=`grep "^AC_INIT" lib/tevent/configure.ac | tr -d "AC_INIT(tevent, " | tr -d ")"`
+if [ ! "$confver" = "$version" ]; then
+    echo "Wrong version, requested release for ${version}, found ${confver}"
+    exit 1
+fi
+
+# Now build tarball
+cp -a lib/tevent tevent-${version}
+cp -a lib/replace tevent-${version}/libreplace
+# never run autogen.sh or popd from the wrong directory
+pushd tevent-${version} || exit 1
+./autogen.sh
+popd
+tar cvzf tevent-${version}.tar.gz tevent-${version}
+rm -fr tevent-${version}
+
+#Clean up
+git checkout $curbranch
+git branch -d tevent-release-script-${strver}
diff --git a/ctdb/lib/tevent/testsuite.c b/ctdb/lib/tevent/testsuite.c
new file mode 100644 (file)
index 0000000..8e3f4af
--- /dev/null
@@ -0,0 +1,833 @@
+/* 
+   Unix SMB/CIFS implementation.
+
+   testing of the events subsystem
+
+   Copyright (C) Stefan Metzmacher 2006-2009
+   Copyright (C) Jeremy Allison    2013
+
+     ** NOTE! The following LGPL license applies to the tevent
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "lib/tevent/tevent.h"
+#include "system/filesys.h"
+#include "system/select.h"
+#include "system/network.h"
+#include "torture/torture.h"
+#ifdef HAVE_PTHREAD
+#include <pthread.h>
+#include <assert.h>
+#endif
+
+/*
+ * Number of times fde_handler_read() has run; reset to 0 at the
+ * start of test_event_context().
+ */
+static int fde_count;
+
+/*
+ * Read-side pipe handler: raise SIGUSR1 (only when SA_SIGINFO is
+ * defined) and SIGALRM against our own process, read one byte from
+ * the first fd of the pair passed as private_data and count the call
+ * in fde_count.
+ */
+static void fde_handler_read(struct tevent_context *ev_ctx, struct tevent_fd *f,
+                       uint16_t flags, void *private_data)
+{
+       const pid_t self = getpid();
+       int *pipe_fds = (int *)private_data;
+       char byte;
+
+#ifdef SA_SIGINFO
+       kill(self, SIGUSR1);
+#endif
+       kill(self, SIGALRM);
+
+       read(pipe_fds[0], &byte, 1);
+       fde_count += 1;
+}
+
+/*
+ * Write-side pipe handler: write a single zero byte to the second fd
+ * of the pair passed as private_data.
+ */
+static void fde_handler_write(struct tevent_context *ev_ctx, struct tevent_fd *f,
+                       uint16_t flags, void *private_data)
+{
+       int *pipe_fds = (int *)private_data;
+       char zero = 0;
+
+       write(pipe_fds[1], &zero, 1);
+}
+
+
+/*
+ * Must never be invoked: if it is, log the fact through the torture
+ * context passed as private_data and abort the process.
+ */
+static void fde_handler_read_1(struct tevent_context *ev_ctx, struct tevent_fd *f,
+                       uint16_t flags, void *private_data)
+{
+       struct torture_context *tctx = (struct torture_context *)private_data;
+
+       torture_comment(tctx, "fde_handler_read_1 should never fire !\n");
+       abort();
+}
+
+/*
+ * Must never be invoked: if it is, log the fact through the torture
+ * context passed as private_data and abort the process.
+ */
+static void fde_handler_write_1(struct tevent_context *ev_ctx, struct tevent_fd *f,
+                       uint16_t flags, void *private_data)
+{
+       struct torture_context *tctx = (struct torture_context *)private_data;
+
+       torture_comment(tctx, "fde_handler_write_1 should never fire !\n");
+       abort();
+}
+
+/* Timer callback: set the int flag passed as private_data to 1. */
+static void finished_handler(struct tevent_context *ev_ctx, struct tevent_timer *te,
+                            struct timeval tval, void *private_data)
+{
+       int *done = (int *)private_data;
+
+       *done = 1;
+}
+
+/*
+ * Signal callback: add the reported signal count to the int counter
+ * passed as private_data.
+ */
+static void count_handler(struct tevent_context *ev_ctx, struct tevent_signal *te,
+                         int signum, int count, void *info, void *private_data)
+{
+       int *total = (int *)private_data;
+
+       *total = *total + count;
+}
+
+/*
+ * Exercise fd, timer and signal events together on the backend named
+ * by test_data.
+ *
+ * Both ends of a pipe are registered: the write handler keeps pushing
+ * bytes, the read handler consumes them and raises SIGALRM (plus
+ * SIGUSR1 when SA_SIGINFO is defined) on every call. A two second
+ * timer ends the main loop, after which the signal counters are
+ * compared with the number of read events.
+ *
+ * Returns true on success or when the backend is not available;
+ * failures return false through the torture_* macros.
+ */
+static bool test_event_context(struct torture_context *test,
+                              const void *test_data)
+{
+       struct tevent_context *ev_ctx;
+       int fd[2] = { -1, -1 };
+       const char *backend = (const char *)test_data;
+       int alarm_count=0, info_count=0;
+       struct tevent_fd *fde_read;
+       struct tevent_fd *fde_read_1;
+       struct tevent_fd *fde_write;
+       struct tevent_fd *fde_write_1;
+#ifdef SA_RESTART
+       struct tevent_signal *se1 = NULL;
+#endif
+#ifdef SA_RESETHAND
+       struct tevent_signal *se2 = NULL;
+#endif
+#ifdef SA_SIGINFO
+       struct tevent_signal *se3 = NULL;
+#endif
+       int finished=0;
+       struct timeval t;
+       int ret;
+
+       ev_ctx = tevent_context_init_byname(test, backend);
+       if (ev_ctx == NULL) {
+               torture_comment(test, "event backend '%s' not supported\n", backend);
+               return true;
+       }
+
+       torture_comment(test, "backend '%s' - %s\n",
+                       backend, __FUNCTION__);
+
+       /* reset globals */
+       fde_count = 0;
+
+       /*
+        * create a pipe; without it the fd events below would be
+        * registered on fd -1, so fail early (ev_ctx is a talloc
+        * child of test and goes away with it)
+        */
+       ret = pipe(fd);
+       torture_assert(test, ret == 0, "Could not create pipe");
+
+       /*
+        * fde_write_1 and fde_read_1 watch the "wrong" direction of
+        * each pipe end; their handlers abort if they ever fire
+        */
+       fde_read = tevent_add_fd(ev_ctx, ev_ctx, fd[0], TEVENT_FD_READ,
+                           fde_handler_read, fd);
+       fde_write_1 = tevent_add_fd(ev_ctx, ev_ctx, fd[0], TEVENT_FD_WRITE,
+                           fde_handler_write_1, test);
+
+       fde_write = tevent_add_fd(ev_ctx, ev_ctx, fd[1], TEVENT_FD_WRITE,
+                           fde_handler_write, fd);
+       fde_read_1 = tevent_add_fd(ev_ctx, ev_ctx, fd[1], TEVENT_FD_READ,
+                           fde_handler_read_1, test);
+
+       tevent_fd_set_auto_close(fde_read);
+       tevent_fd_set_auto_close(fde_write);
+
+       /* stop the main loop after two seconds */
+       tevent_add_timer(ev_ctx, ev_ctx, timeval_current_ofs(2,0),
+                        finished_handler, &finished);
+
+#ifdef SA_RESTART
+       se1 = tevent_add_signal(ev_ctx, ev_ctx, SIGALRM, SA_RESTART, count_handler, &alarm_count);
+       torture_assert(test, se1 != NULL, "failed to setup se1");
+#endif
+#ifdef SA_RESETHAND
+       se2 = tevent_add_signal(ev_ctx, ev_ctx, SIGALRM, SA_RESETHAND, count_handler, &alarm_count);
+       torture_assert(test, se2 != NULL, "failed to setup se2");
+#endif
+#ifdef SA_SIGINFO
+       se3 = tevent_add_signal(ev_ctx, ev_ctx, SIGUSR1, SA_SIGINFO, count_handler, &info_count);
+       torture_assert(test, se3 != NULL, "failed to setup se3");
+#endif
+
+       t = timeval_current();
+       while (!finished) {
+               errno = 0;
+               if (tevent_loop_once(ev_ctx) == -1) {
+                       talloc_free(ev_ctx);
+                       torture_fail(test, talloc_asprintf(test, "Failed event loop %s\n", strerror(errno)));
+               }
+       }
+
+       talloc_free(fde_read);
+       talloc_free(fde_write);
+       talloc_free(fde_read_1);
+       talloc_free(fde_write_1);
+
+       /* keep looping until the outstanding SIGALRM events are counted */
+       while (alarm_count < fde_count+1) {
+               if (tevent_loop_once(ev_ctx) == -1) {
+                       break;
+               }
+       }
+
+       torture_comment(test, "Got %.2f pipe events/sec\n", fde_count/timeval_elapsed(&t));
+
+#ifdef SA_RESTART
+       talloc_free(se1);
+#endif
+
+       torture_assert_int_equal(test, alarm_count, 1+fde_count, "alarm count mismatch");
+
+#ifdef SA_RESETHAND
+       /*
+        * we do not call talloc_free(se2)
+        * because it is already gone,
+        * after triggering the event handler.
+        */
+#endif
+
+#ifdef SA_SIGINFO
+       talloc_free(se3);
+       torture_assert_int_equal(test, info_count, fde_count, "info count mismatch");
+#endif
+
+       talloc_free(ev_ctx);
+
+       return true;
+}
+
+/*
+ * State shared between test_event_fd1(), its fd handler and its
+ * timer callback.
+ */
+struct test_event_fd1_state {
+       struct torture_context *tctx;
+       const char *backend;            /* backend name taken from test_data */
+       struct tevent_context *ev;
+       int sock[2];                    /* socketpair: [0] is monitored, [1] is written to */
+       struct tevent_timer *te;        /* most recently added timer */
+       struct tevent_fd *fde0;         /* fd event on sock[0] */
+       struct tevent_fd *fde1;         /* fd event on sock[1], only used to auto close */
+       bool got_write;                 /* fd handler saw TEVENT_FD_WRITE in this round */
+       bool got_read;                  /* fd handler saw TEVENT_FD_READ in this round */
+       bool drain;                     /* set by the timer once loop_count > 2 */
+       bool drain_done;                /* set when read() in drain mode did not return 1 */
+       unsigned loop_count;            /* rounds counted by the timer callback */
+       bool finished;                  /* ends the loop in test_event_fd1() */
+       const char *error;              /* __location__ of a failure, NULL on success */
+};
+
+/*
+ * fd event handler for test_event_fd1.
+ *
+ * Depending on the phase recorded in the state it either drains
+ * state->sock[0] or checks that exactly the expected flag was
+ * reported (first TEVENT_FD_WRITE, then TEVENT_FD_READ). Every
+ * unexpected invocation stores __location__ in state->error and
+ * sets state->finished.
+ */
+static void test_event_fd1_fde_handler(struct tevent_context *ev_ctx,
+                                      struct tevent_fd *fde,
+                                      uint16_t flags,
+                                      void *private_data)
+{
+       struct test_event_fd1_state *state =
+               (struct test_event_fd1_state *)private_data;
+
+       /* all flags were cleared when draining ended: no event may arrive */
+       if (state->drain_done) {
+               state->finished = true;
+               state->error = __location__;
+               return;
+       }
+
+       if (state->drain) {
+               ssize_t ret;
+               uint8_t c = 0;
+
+               if (!(flags & TEVENT_FD_READ)) {
+                       state->finished = true;
+                       state->error = __location__;
+                       return;
+               }
+
+               /* consume one byte per event until read() stops returning 1 */
+               ret = read(state->sock[0], &c, 1);
+               if (ret == 1) {
+                       return;
+               }
+
+               /*
+                * end of test...
+                */
+               tevent_fd_set_flags(fde, 0);
+               state->drain_done = true;
+               return;
+       }
+
+       if (!state->got_write) {
+               uint8_t c = 0;
+
+               /* exact match: only TEVENT_FD_WRITE is expected here */
+               if (flags != TEVENT_FD_WRITE) {
+                       state->finished = true;
+                       state->error = __location__;
+                       return;
+               }
+               state->got_write = true;
+
+               /*
+                * we write to the other socket...
+                */
+               write(state->sock[1], &c, 1);
+               /* ...and switch this fde from waiting for write to waiting for read */
+               TEVENT_FD_NOT_WRITEABLE(fde);
+               TEVENT_FD_READABLE(fde);
+               return;
+       }
+
+       if (!state->got_read) {
+               /* exact match: only TEVENT_FD_READ is expected here */
+               if (flags != TEVENT_FD_READ) {
+                       state->finished = true;
+                       state->error = __location__;
+                       return;
+               }
+               state->got_read = true;
+
+               /* the byte is left unread; the timer callback restarts the round */
+               TEVENT_FD_NOT_READABLE(fde);
+               return;
+       }
+
+       /* both flags were already seen in this round: any further event is an error */
+       state->finished = true;
+       state->error = __location__;
+       return;
+}
+
+/*
+ * Timer callback for test_event_fd1.
+ *
+ * Ends the test once draining is done; otherwise verifies that the
+ * fd handler saw both the write and the read event of the current
+ * round, starts the next round and re-arms itself.
+ */
+static void test_event_fd1_finished(struct tevent_context *ev_ctx,
+                                   struct tevent_timer *te,
+                                   struct timeval tval,
+                                   void *private_data)
+{
+       struct test_event_fd1_state *state =
+               (struct test_event_fd1_state *)private_data;
+
+       /* successful end: state->error is left untouched */
+       if (state->drain_done) {
+               state->finished = true;
+               return;
+       }
+
+       /* the round must have completed before the timer fired */
+       if (!state->got_write) {
+               state->finished = true;
+               state->error = __location__;
+               return;
+       }
+
+       if (!state->got_read) {
+               state->finished = true;
+               state->error = __location__;
+               return;
+       }
+
+       state->loop_count++;
+       if (state->loop_count > 3) {
+               state->finished = true;
+               state->error = __location__;
+               return;
+       }
+
+       /* start the next round, waiting for TEVENT_FD_WRITE only */
+       state->got_write = false;
+       state->got_read = false;
+
+       tevent_fd_set_flags(state->fde0, TEVENT_FD_WRITE);
+
+       if (state->loop_count > 2) {
+               /*
+                * switch to drain mode: fde1 has auto close set (see
+                * test_event_fd1), so freeing it closes sock[1]; fde0
+                * additionally waits for TEVENT_FD_READ
+                */
+               state->drain = true;
+               TALLOC_FREE(state->fde1);
+               TEVENT_FD_READABLE(state->fde0);
+       }
+
+       /* re-arm: timers are one-shot, so schedule the next check */
+       state->te = tevent_add_timer(state->ev, state->ev,
+                                   timeval_current_ofs(0,2000),
+                                   test_event_fd1_finished, state);
+}
+
+/*
+ * Check the fd event flag handling of the backend named by test_data
+ * using a socketpair (see the description inside the function).
+ *
+ * Returns true on success; an unsupported backend is reported via
+ * torture_skip(), failures return false through the torture_* macros.
+ */
+static bool test_event_fd1(struct torture_context *tctx,
+                          const void *test_data)
+{
+       struct test_event_fd1_state state;
+       int ret;
+
+       ZERO_STRUCT(state);
+       state.tctx = tctx;
+       state.backend = (const char *)test_data;
+
+       state.ev = tevent_context_init_byname(tctx, state.backend);
+       if (state.ev == NULL) {
+               torture_skip(tctx, talloc_asprintf(tctx,
+                            "event backend '%s' not supported\n",
+                            state.backend));
+               return true;
+       }
+
+       tevent_set_debug_stderr(state.ev);
+       torture_comment(tctx, "backend '%s' - %s\n",
+                       state.backend, __FUNCTION__);
+
+       /*
+        * This tests the following:
+        *
+        * It monitors the state of state.sock[0]
+        * with tevent_fd, but we never read/write on state.sock[0]
+        * while state.sock[1] is only used to write a few bytes.
+        *
+        * We have a loop:
+        *   - we wait only for TEVENT_FD_WRITE on state.sock[0]
+        *   - we write 1 byte to state.sock[1]
+        *   - we wait only for TEVENT_FD_READ on state.sock[0]
+        *   - we disable events on state.sock[0]
+        *   - the timer event restarts the loop
+        * Then we close state.sock[1]
+        * We have a loop:
+        *   - we wait for TEVENT_FD_READ/WRITE on state.sock[0]
+        *   - we try to read 1 byte
+        *   - if the read gets an error or returns 0
+        *     we disable the event handler
+        *   - the timer finishes the test
+        */
+       state.sock[0] = -1;
+       state.sock[1] = -1;
+       /*
+        * without a socketpair the fd events below would be registered
+        * on fd -1, so fail early (state.ev is a talloc child of tctx)
+        */
+       ret = socketpair(AF_UNIX, SOCK_STREAM, 0, state.sock);
+       torture_assert(tctx, ret == 0, "Could not create socketpair");
+
+       state.te = tevent_add_timer(state.ev, state.ev,
+                                   timeval_current_ofs(0,1000),
+                                   test_event_fd1_finished, &state);
+       state.fde0 = tevent_add_fd(state.ev, state.ev,
+                                  state.sock[0], TEVENT_FD_WRITE,
+                                  test_event_fd1_fde_handler, &state);
+       /* state.fde1 is only used to auto close */
+       state.fde1 = tevent_add_fd(state.ev, state.ev,
+                                  state.sock[1], 0,
+                                  test_event_fd1_fde_handler, &state);
+
+       tevent_fd_set_auto_close(state.fde0);
+       tevent_fd_set_auto_close(state.fde1);
+
+       /* run until a handler or the timer sets state.finished */
+       while (!state.finished) {
+               errno = 0;
+               if (tevent_loop_once(state.ev) == -1) {
+                       talloc_free(state.ev);
+                       torture_fail(tctx, talloc_asprintf(tctx,
+                                    "Failed event loop %s\n",
+                                    strerror(errno)));
+               }
+       }
+
+       talloc_free(state.ev);
+
+       /* state.error holds the __location__ of the failing check, if any */
+       torture_assert(tctx, state.error == NULL, talloc_asprintf(tctx,
+                      "%s", state.error));
+
+       return true;
+}
+
+/*
+ * State for test_event_fd2: one test_event_fd2_sock per end of the
+ * socketpair plus the overall result.
+ */
+struct test_event_fd2_state {
+       struct torture_context *tctx;
+       const char *backend;            /* backend name taken from test_data */
+       struct tevent_context *ev;
+       struct tevent_timer *te;        /* timeout timer that should never fire */
+       struct test_event_fd2_sock {
+               struct test_event_fd2_state *state;     /* back pointer to the owner */
+               int fd;
+               struct tevent_fd *fde;
+               size_t num_written;     /* bytes written to this fd so far */
+               size_t num_read;        /* bytes read from this fd so far */
+               bool got_full;          /* set on the first event without TEVENT_FD_WRITE */
+       } sock0, sock1;
+       bool finished;                  /* ends the loop in test_event_fd2() */
+       const char *error;              /* __location__ of a failure, NULL on success */
+};
+
+/*
+ * fd event handler for both sockets of test_event_fd2.
+ *
+ * private_data is the test_event_fd2_sock the event belongs to. While
+ * TEVENT_FD_WRITE is reported the handler writes one byte per call;
+ * once writing is no longer possible on both sockets it reads one
+ * byte per call and verifies the byte sequence. Every failed check
+ * stores __location__ in state->error and sets state->finished.
+ */
+static void test_event_fd2_sock_handler(struct tevent_context *ev_ctx,
+                                       struct tevent_fd *fde,
+                                       uint16_t flags,
+                                       void *private_data)
+{
+       struct test_event_fd2_sock *cur_sock =
+               (struct test_event_fd2_sock *)private_data;
+       struct test_event_fd2_state *state = cur_sock->state;
+       struct test_event_fd2_sock *oth_sock = NULL;
+       uint8_t v = 0, c;
+       ssize_t ret;
+
+       /* find the peer of the socket this event was delivered for */
+       if (cur_sock == &state->sock0) {
+               oth_sock = &state->sock1;
+       } else {
+               oth_sock = &state->sock0;
+       }
+
+       /*
+        * the peer has only written its initial byte so far: we must be
+        * both readable and writeable
+        */
+       if (oth_sock->num_written == 1) {
+               if (flags != (TEVENT_FD_READ | TEVENT_FD_WRITE)) {
+                       state->finished = true;
+                       state->error = __location__;
+                       return;
+               }
+       }
+
+       /* everything the peer wrote was already read: no event expected */
+       if (cur_sock->num_read == oth_sock->num_written) {
+               state->finished = true;
+               state->error = __location__;
+               return;
+       }
+
+       /* unread data is pending, so TEVENT_FD_READ must be reported */
+       if (!(flags & TEVENT_FD_READ)) {
+               state->finished = true;
+               state->error = __location__;
+               return;
+       }
+
+       if (oth_sock->num_read > 0) {
+               /*
+                * There should be room to write a byte again
+                */
+               if (!(flags & TEVENT_FD_WRITE)) {
+                       state->finished = true;
+                       state->error = __location__;
+                       return;
+               }
+       }
+
+       /* fill phase: write one byte per event while writing is possible */
+       if ((flags & TEVENT_FD_WRITE) && !cur_sock->got_full) {
+               /* the payload is the low 8 bits of the running byte count */
+               v = (uint8_t)cur_sock->num_written;
+               ret = write(cur_sock->fd, &v, 1);
+               if (ret != 1) {
+                       state->finished = true;
+                       state->error = __location__;
+                       return;
+               }
+               cur_sock->num_written++;
+               /* sanity limit in case the socket never reports full */
+               if (cur_sock->num_written > 0x80000000) {
+                       state->finished = true;
+                       state->error = __location__;
+                       return;
+               }
+               return;
+       }
+
+       /* first event without TEVENT_FD_WRITE */
+       if (!cur_sock->got_full) {
+               cur_sock->got_full = true;
+
+               if (!oth_sock->got_full) {
+                       /*
+                        * cur_sock is full,
+                        * lets wait for oth_sock
+                        * to be filled
+                        */
+                       tevent_fd_set_flags(cur_sock->fde, 0);
+                       return;
+               }
+
+               /*
+                * oth_sock waited for cur_sock,
+                * lets restart it
+                */
+               tevent_fd_set_flags(oth_sock->fde,
+                                   TEVENT_FD_READ|TEVENT_FD_WRITE);
+       }
+
+       /* read phase: consume one byte and compare it with the expected value */
+       ret = read(cur_sock->fd, &v, 1);
+       if (ret != 1) {
+               state->finished = true;
+               state->error = __location__;
+               return;
+       }
+       c = (uint8_t)cur_sock->num_read;
+       if (c != v) {
+               state->finished = true;
+               state->error = __location__;
+               return;
+       }
+       cur_sock->num_read++;
+
+       if (cur_sock->num_read < oth_sock->num_written) {
+               /* there is more to read */
+               return;
+       }
+       /*
+        * we read everything, we need to remove TEVENT_FD_WRITE
+        * to avoid spinning
+        */
+       TEVENT_FD_NOT_WRITEABLE(cur_sock->fde);
+
+       if (oth_sock->num_read == cur_sock->num_written) {
+               /*
+                * both directions are finished
+                */
+               state->finished = true;
+       }
+
+       return;
+}
+
+/*
+ * Timeout timer for test_event_fd2: it is not supposed to fire, so
+ * firing at all is recorded as a failure and ends the test loop.
+ */
+static void test_event_fd2_finished(struct tevent_context *ev_ctx,
+                                   struct tevent_timer *te,
+                                   struct timeval tval,
+                                   void *private_data)
+{
+       struct test_event_fd2_state *st = private_data;
+
+       /* this should never be triggered */
+       st->error = __location__;
+       st->finished = true;
+}
+
+/*
+ * Fill both directions of a socketpair through fd events and read
+ * everything back on the backend named by test_data (see the
+ * description inside the function).
+ *
+ * Returns true on success; an unsupported backend is reported via
+ * torture_skip(), failures return false through the torture_* macros.
+ */
+static bool test_event_fd2(struct torture_context *tctx,
+                          const void *test_data)
+{
+       struct test_event_fd2_state state;
+       int sock[2];
+       uint8_t c = 0;
+       int ret;
+       ssize_t nwritten;
+
+       ZERO_STRUCT(state);
+       state.tctx = tctx;
+       state.backend = (const char *)test_data;
+
+       state.ev = tevent_context_init_byname(tctx, state.backend);
+       if (state.ev == NULL) {
+               torture_skip(tctx, talloc_asprintf(tctx,
+                            "event backend '%s' not supported\n",
+                            state.backend));
+               return true;
+       }
+
+       tevent_set_debug_stderr(state.ev);
+       torture_comment(tctx, "backend '%s' - %s\n",
+                       state.backend, __FUNCTION__);
+
+       /*
+        * This tests the following
+        *
+        * - We write 1 byte to each socket
+        * - We wait for TEVENT_FD_READ/WRITE on both sockets
+        * - When we get TEVENT_FD_WRITE we write 1 byte
+        *   until both socket buffers are full, which
+        *   means both sockets only get TEVENT_FD_READ.
+        * - Then we read 1 byte until we have consumed
+        *   all bytes the other end has written.
+        */
+       sock[0] = -1;
+       sock[1] = -1;
+       /*
+        * without a socketpair the fd events below would be registered
+        * on fd -1, so fail early (state.ev is a talloc child of tctx)
+        */
+       ret = socketpair(AF_UNIX, SOCK_STREAM, 0, sock);
+       torture_assert(tctx, ret == 0, "Could not create socketpair");
+
+       /*
+        * the timer should never expire
+        */
+       state.te = tevent_add_timer(state.ev, state.ev,
+                                   timeval_current_ofs(600, 0),
+                                   test_event_fd2_finished, &state);
+       state.sock0.state = &state;
+       state.sock0.fd = sock[0];
+       state.sock0.fde = tevent_add_fd(state.ev, state.ev,
+                                       state.sock0.fd,
+                                       TEVENT_FD_READ | TEVENT_FD_WRITE,
+                                       test_event_fd2_sock_handler,
+                                       &state.sock0);
+       state.sock1.state = &state;
+       state.sock1.fd = sock[1];
+       state.sock1.fde = tevent_add_fd(state.ev, state.ev,
+                                       state.sock1.fd,
+                                       TEVENT_FD_READ | TEVENT_FD_WRITE,
+                                       test_event_fd2_sock_handler,
+                                       &state.sock1);
+
+       tevent_fd_set_auto_close(state.sock0.fde);
+       tevent_fd_set_auto_close(state.sock1.fde);
+
+       /*
+        * prime each direction with one byte; the handler's bookkeeping
+        * relies on num_written matching what was really written
+        */
+       nwritten = write(state.sock0.fd, &c, 1);
+       torture_assert(tctx, nwritten == 1, "Initial write to sock0 failed");
+       state.sock0.num_written++;
+       nwritten = write(state.sock1.fd, &c, 1);
+       torture_assert(tctx, nwritten == 1, "Initial write to sock1 failed");
+       state.sock1.num_written++;
+
+       /* run until a handler or the timer sets state.finished */
+       while (!state.finished) {
+               errno = 0;
+               if (tevent_loop_once(state.ev) == -1) {
+                       talloc_free(state.ev);
+                       torture_fail(tctx, talloc_asprintf(tctx,
+                                    "Failed event loop %s\n",
+                                    strerror(errno)));
+               }
+       }
+
+       talloc_free(state.ev);
+
+       /* state.error holds the __location__ of the failing check, if any */
+       torture_assert(tctx, state.error == NULL, talloc_asprintf(tctx,
+                      "%s", state.error));
+
+       return true;
+}
+
+#ifdef HAVE_PTHREAD
+
+/*
+ * threaded_mutex is taken and released through
+ * test_event_threaded_lock()/test_event_threaded_unlock();
+ * do_shutdown tells test_event_poll_thread() to leave its loop.
+ */
+static pthread_mutex_t threaded_mutex = PTHREAD_MUTEX_INITIALIZER;
+static bool do_shutdown = false;
+
+/* Take threaded_mutex; a locking failure trips the assert. */
+static void test_event_threaded_lock(void)
+{
+       int rc = pthread_mutex_lock(&threaded_mutex);
+
+       assert(rc == 0);
+}
+
+/* Release threaded_mutex; an unlocking failure trips the assert. */
+static void test_event_threaded_unlock(void)
+{
+       int rc = pthread_mutex_unlock(&threaded_mutex);
+
+       assert(rc == 0);
+}
+
+/*
+ * Trace callback: drop threaded_mutex right before the event loop
+ * waits and take it again right after the wait; the loop_once trace
+ * points are ignored.
+ */
+static void test_event_threaded_trace(enum tevent_trace_point point,
+                                     void *private_data)
+{
+       if (point == TEVENT_TRACE_BEFORE_WAIT) {
+               test_event_threaded_unlock();
+       } else if (point == TEVENT_TRACE_AFTER_WAIT) {
+               test_event_threaded_lock();
+       }
+       /*
+        * TEVENT_TRACE_BEFORE_LOOP_ONCE and
+        * TEVENT_TRACE_AFTER_LOOP_ONCE need no action
+        */
+}
+
+/* Timer callback for the threaded test: intentionally does nothing. */
+static void test_event_threaded_timer(struct tevent_context *ev,
+                                     struct tevent_timer *te,
+                                     struct timeval current_time,
+                                     void *private_data)
+{
+       /* nothing to do */
+}
+
+/*
+ * Thread body: run tevent_loop_once() on the context passed as
+ * private_data until do_shutdown is set. threaded_mutex is taken
+ * before the first iteration and released before returning NULL.
+ */
+static void *test_event_poll_thread(void *private_data)
+{
+       struct tevent_context *ev = (struct tevent_context *)private_data;
+
+       test_event_threaded_lock();
+
+       do {
+               int ret = tevent_loop_once(ev);
+
+               assert(ret == 0);
+       } while (!do_shutdown);
+
+       test_event_threaded_unlock();
+       return NULL;
+}
+
+/*
+ * fd handler for the threaded test: on TEVENT_FD_READ read exactly
+ * one byte from the fd that private_data points to, retrying when
+ * read() is interrupted (EINTR).
+ */
+static void test_event_threaded_read_handler(struct tevent_context *ev,
+                                            struct tevent_fd *fde,
+                                            uint16_t flags,
+                                            void *private_data)
+{
+       int *pfd = (int *)private_data;
+       ssize_t nread;
+       char c;
+
+       if (!(flags & TEVENT_FD_READ)) {
+               return;
+       }
+
+       for (;;) {
+               nread = read(*pfd, &c, 1);
+               if ((nread != -1) || (errno != EINTR)) {
+                       break;
+               }
+       }
+
+       assert(nread == 1);
+}
+
+/*
+ * Test the "poll_mt" backend with a second thread running
+ * tevent_loop_once() while this thread adds an fd event to the same
+ * context under threaded_mutex.
+ *
+ * Returns true on success; failures return false through
+ * torture_assert().
+ */
+static bool test_event_context_threaded(struct torture_context *test,
+                                       const void *test_data)
+{
+       struct tevent_context *ev;
+       struct tevent_timer *te;
+       struct tevent_fd *fde;
+       pthread_t poll_thread;
+       int fds[2];
+       int ret;
+       char c = 0;
+
+       /*
+        * do_shutdown is a file-level static: reset it so that the
+        * poll thread does not exit immediately if the test is run
+        * more than once in the same process
+        */
+       do_shutdown = false;
+
+       ev = tevent_context_init_byname(test, "poll_mt");
+       torture_assert(test, ev != NULL, "poll_mt not supported");
+
+       tevent_set_trace_callback(ev, test_event_threaded_trace, NULL);
+
+       te = tevent_add_timer(ev, ev, timeval_current_ofs(5, 0),
+                             test_event_threaded_timer, NULL);
+       torture_assert(test, te != NULL, "Could not add timer");
+
+       /*
+        * create the pipe before the thread, so that a failure here
+        * cannot return with the poll thread still running on ev
+        */
+       ret = pipe(fds);
+       torture_assert(test, ret == 0, "Could not create pipe");
+
+       ret = pthread_create(&poll_thread, NULL, test_event_poll_thread, ev);
+       if (ret != 0) {
+               close(fds[0]);
+               close(fds[1]);
+       }
+       torture_assert(test, ret == 0, "Could not create poll thread");
+
+       /* give the poll thread time to block in its event loop */
+       poll(NULL, 0, 100);
+
+       test_event_threaded_lock();
+
+       fde = tevent_add_fd(ev, ev, fds[0], TEVENT_FD_READ,
+                           test_event_threaded_read_handler, &fds[0]);
+
+       test_event_threaded_unlock();
+
+       /* assert only after unlocking: torture_assert returns on failure */
+       torture_assert(test, fde != NULL, "Could not add fd event");
+
+       poll(NULL, 0, 100);
+
+       write(fds[1], &c, 1);
+
+       poll(NULL, 0, 100);
+
+       test_event_threaded_lock();
+       do_shutdown = true;
+       test_event_threaded_unlock();
+
+       /* wake the poll thread once more so that it notices do_shutdown */
+       write(fds[1], &c, 1);
+
+       ret = pthread_join(poll_thread, NULL);
+       torture_assert(test, ret == 0, "pthread_join failed");
+
+       /* the poll thread is gone: release the context and the pipe */
+       talloc_free(ev);
+       close(fds[0]);
+       close(fds[1]);
+
+       return true;
+}
+
+#endif
+
+/*
+ * Build the "event" torture suite: one sub-suite per backend returned
+ * by tevent_backend_list(), each holding the "context", "fd1" and
+ * "fd2" test cases, plus the "threaded_poll_mt" case when
+ * HAVE_PTHREAD is defined.
+ */
+struct torture_suite *torture_local_event(TALLOC_CTX *mem_ctx)
+{
+       struct torture_suite *suite = torture_suite_create(mem_ctx, "event");
+       const char **backends = tevent_backend_list(suite);
+       const char **cur;
+
+       for (cur = backends; cur != NULL && *cur != NULL; cur++) {
+               const char *name = *cur;
+               struct torture_suite *backend_suite =
+                       torture_suite_create(mem_ctx, name);
+
+               /* every case receives the backend name as its test data */
+               torture_suite_add_simple_tcase_const(backend_suite,
+                                              "context",
+                                              test_event_context,
+                                              (const void *)name);
+               torture_suite_add_simple_tcase_const(backend_suite,
+                                              "fd1",
+                                              test_event_fd1,
+                                              (const void *)name);
+               torture_suite_add_simple_tcase_const(backend_suite,
+                                              "fd2",
+                                              test_event_fd2,
+                                              (const void *)name);
+
+               torture_suite_add_suite(suite, backend_suite);
+       }
+
+#ifdef HAVE_PTHREAD
+       torture_suite_add_simple_tcase_const(suite, "threaded_poll_mt",
+                                            test_event_context_threaded,
+                                            NULL);
+#endif
+
+       return suite;
+}
diff --git a/ctdb/lib/tevent/tevent.c b/ctdb/lib/tevent/tevent.c
new file mode 100644 (file)
index 0000000..be0afd4
--- /dev/null
@@ -0,0 +1,669 @@
+/* 
+   Unix SMB/CIFS implementation.
+   main select loop and event handling
+   Copyright (C) Andrew Tridgell 2003
+   Copyright (C) Stefan Metzmacher 2009
+
+     ** NOTE! The following LGPL license applies to the tevent
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+  PLEASE READ THIS BEFORE MODIFYING!
+
+  This module is a general abstraction for the main select loop and
+  event handling. Do not ever put any localised hacks in here, instead
+  register one of the possible event types and implement that event
+  somewhere else.
+
+  There are 2 types of event handling that are handled in this module:
+
+  1) a file descriptor becoming readable or writeable. This is mostly
+     used for network sockets, but can be used for any type of file
+     descriptor. You may only register one handler for each file
+     descriptor/io combination or you will get unpredictable results
+     (this means that you can have a handler for read events, and a
+     separate handler for write events, but not two handlers that are
+     both handling read events)
+
+  2) a timed event. You can register an event that happens at a
+     specific time.  You can register as many of these as you
+     like. They are single shot - add a new timed event in the event
+     handler to get another event.
+
+  To set up a set of events you first need to create an event_context
+  structure using the function tevent_context_init(). This returns a
+  'struct tevent_context' that you use in all subsequent calls.
+
+  After that you can add/remove events that you are interested in
+  using tevent_add_*() and talloc_free()
+
+  Finally, you call tevent_loop_once() to block waiting for one of the
+  events to occur or tevent_loop_wait() which will loop
+  forever.
+
+*/
+#include "replace.h"
+#include "system/filesys.h"
+#define TEVENT_DEPRECATED 1
+#include "tevent.h"
+#include "tevent_internal.h"
+#include "tevent_util.h"
+
+/*
+  one entry in the list of registered event backends
+*/
+struct tevent_ops_list {
+       /* doubly linked list pointers, maintained via DLIST_ADD() */
+       struct tevent_ops_list *next, *prev;
+       /* backend name; the pointer is stored as given, not copied */
+       const char *name;
+       /* operations table returned when the backend is looked up by name */
+       const struct tevent_ops *ops;
+};
+
+/* list of registered event backends */
+static struct tevent_ops_list *tevent_backends = NULL;
+/* talloc'ed name of the default backend, NULL if none has been set */
+static char *tevent_default_backend = NULL;
+
+/*
+  register an events backend under the given name
+
+  a name that is already registered is left untouched and reported as
+  success; false is only returned if the list entry cannot be allocated
+*/
+bool tevent_register_backend(const char *name, const struct tevent_ops *ops)
+{
+       struct tevent_ops_list *entry = tevent_backends;
+
+       while (entry != NULL) {
+               if (strcmp(entry->name, name) == 0) {
+                       /* already registered, skip it */
+                       return true;
+               }
+               entry = entry->next;
+       }
+
+       entry = talloc(NULL, struct tevent_ops_list);
+       if (entry == NULL) {
+               return false;
+       }
+
+       /* note: the name pointer itself is stored, not a copy */
+       entry->name = name;
+       entry->ops = ops;
+       DLIST_ADD(tevent_backends, entry);
+
+       return true;
+}
+
+/*
+  set the default event backend
+
+  the previously stored default name is freed and replaced by a
+  talloc_strdup() copy of 'backend' (allocated on the NULL context).
+  The default is consulted by tevent_find_ops_byname() when it is
+  called with a NULL name.
+ */
+void tevent_set_default_backend(const char *backend)
+{
+       talloc_free(tevent_default_backend);
+       tevent_default_backend = talloc_strdup(NULL, backend);
+}
+
+/*
+  run the init function of every built-in backend, once per process;
+  later calls return without doing anything
+*/
+static void tevent_backend_init(void)
+{
+       static bool initialised;
+
+       if (!initialised) {
+               /* set the flag first, before any init function runs */
+               initialised = true;
+
+               tevent_select_init();
+               tevent_poll_init();
+               tevent_poll_mt_init();
+#ifdef HAVE_EPOLL
+               tevent_epoll_init();
+#endif
+               tevent_standard_init();
+       }
+}
+
+/*
+  look up the operations table of a backend by name
+
+  a NULL name selects the default backend set with
+  tevent_set_default_backend(), or "standard" if no default was set.
+  Returns NULL if no backend with that name is registered.
+*/
+_PRIVATE_ const struct tevent_ops *tevent_find_ops_byname(const char *name)
+{
+       const struct tevent_ops_list *entry;
+       const char *wanted = name;
+
+       tevent_backend_init();
+
+       if (wanted == NULL) {
+               wanted = (tevent_default_backend != NULL) ?
+                       tevent_default_backend : "standard";
+       }
+
+       entry = tevent_backends;
+       while (entry != NULL && strcmp(entry->name, wanted) != 0) {
+               entry = entry->next;
+       }
+
+       return (entry != NULL) ? entry->ops : NULL;
+}
+
+/*
+  list available backends
+
+  the names of all registered backends are collected with
+  ev_str_list_add() and the resulting list is moved onto mem_ctx
+*/
+const char **tevent_backend_list(TALLOC_CTX *mem_ctx)
+{
+       const char **names = NULL;
+       struct tevent_ops_list *entry;
+
+       tevent_backend_init();
+
+       entry = tevent_backends;
+       while (entry != NULL) {
+               names = ev_str_list_add(names, entry->name);
+               entry = entry->next;
+       }
+
+       /* hand ownership of the collected list to the caller's context */
+       talloc_steal(mem_ctx, names);
+
+       return names;
+}
+
+/*
+  common destructor for an event context
+
+  Detaches every fd, timer, immediate and signal event from the
+  context: each event gets its event_ctx pointer cleared and is
+  unlinked from the context's list. The events themselves are not
+  freed here. The nesting level and nesting hook are reset as well.
+
+  Also called directly by tevent_re_initialise(). Always returns 0.
+*/
+int tevent_common_context_destructor(struct tevent_context *ev)
+{
+       struct tevent_fd *fd, *fn;
+       struct tevent_timer *te, *tn;
+       struct tevent_immediate *ie, *in;
+       struct tevent_signal *se, *sn;
+
+       /* drop the pipe fd event, if any, and close both pipe ends */
+       if (ev->pipe_fde) {
+               talloc_free(ev->pipe_fde);
+               close(ev->pipe_fds[0]);
+               close(ev->pipe_fds[1]);
+               ev->pipe_fde = NULL;
+       }
+
+       /* the next pointer is saved first as DLIST_REMOVE unlinks the entry */
+       for (fd = ev->fd_events; fd; fd = fn) {
+               fn = fd->next;
+               fd->event_ctx = NULL;
+               DLIST_REMOVE(ev->fd_events, fd);
+       }
+
+       /* forget the cached timer pointer before the timer list is emptied */
+       ev->last_zero_timer = NULL;
+       for (te = ev->timer_events; te; te = tn) {
+               tn = te->next;
+               te->event_ctx = NULL;
+               DLIST_REMOVE(ev->timer_events, te);
+       }
+
+       /* immediates additionally lose their cancel function */
+       for (ie = ev->immediate_events; ie; ie = in) {
+               in = ie->next;
+               ie->event_ctx = NULL;
+               ie->cancel_fn = NULL;
+               DLIST_REMOVE(ev->immediate_events, ie);
+       }
+
+       for (se = ev->signal_events; se; se = sn) {
+               sn = se->next;
+               se->event_ctx = NULL;
+               DLIST_REMOVE(ev->signal_events, se);
+               /*
+                * This is important, Otherwise signals
+                * are handled twice in child. eg, SIGHUP.
+                * one added in parent, and another one in
+                * the child. -- BoYang
+                */
+               tevent_cleanup_pending_signal_handlers(se);
+       }
+
+       /* removing nesting hook or we get an abort when nesting is
+        * not allowed. -- SSS
+        * Note that we need to leave the allowed flag at its current
+        * value, otherwise the use in tevent_re_initialise() will
+        * leave the event context with allowed forced to false, which
+        * will break users that expect nesting to be allowed
+        */
+       ev->nesting.level = 0;
+       ev->nesting.hook_fn = NULL;
+       ev->nesting.hook_private = NULL;
+
+       return 0;
+}
+
+/*
+  create an event_context structure for a specific implementation.
+  This must be the first events call, and all subsequent calls pass
+  this event_context as the first element. Event handlers also
+  receive this as their first argument.
+
+  This function is for allowing third-party-applications to hook in gluecode
+  to their own event loop code, so that they can make async usage of our client libs
+
+  Returns NULL if the context cannot be allocated or if the backend's
+  context_init operation reports an error.
+
+  NOTE: use tevent_context_init() inside of samba!
+*/
+struct tevent_context *tevent_context_init_ops(TALLOC_CTX *mem_ctx,
+                                              const struct tevent_ops *ops,
+                                              void *additional_data)
+{
+       struct tevent_context *ev = talloc_zero(mem_ctx, struct tevent_context);
+
+       if (ev == NULL) {
+               return NULL;
+       }
+
+       /* install the destructor before the backend gets to see the context */
+       talloc_set_destructor(ev, tevent_common_context_destructor);
+
+       ev->ops = ops;
+       ev->additional_data = additional_data;
+
+       if (ev->ops->context_init(ev) != 0) {
+               talloc_free(ev);
+               return NULL;
+       }
+
+       return ev;
+}
+
+/*
+  create an event_context structure using the backend with the given
+  name. This must be the first events call, and all subsequent calls
+  pass this event_context as the first element. Event handlers also
+  receive this as their first argument.
+
+  Returns NULL if no such backend is registered or if the context
+  cannot be created.
+*/
+struct tevent_context *tevent_context_init_byname(TALLOC_CTX *mem_ctx,
+                                                 const char *name)
+{
+       const struct tevent_ops *backend_ops = tevent_find_ops_byname(name);
+
+       if (backend_ops != NULL) {
+               return tevent_context_init_ops(mem_ctx, backend_ops, NULL);
+       }
+
+       return NULL;
+}
+
+
+/*
+  create an event_context structure using the default backend. This
+  must be the first events call, and all subsequent calls pass this
+  event_context as the first element. Event handlers also receive
+  this as their first argument.
+
+  This is tevent_context_init_byname() with a NULL name.
+*/
+struct tevent_context *tevent_context_init(TALLOC_CTX *mem_ctx)
+{
+       return tevent_context_init_byname(mem_ctx, NULL);
+}
+
+/*
+  add a fd based event
+  return NULL on failure (memory allocation error)
+
+  this only forwards to the add_fd operation of the context's backend;
+  handler_name and location are supplied by the tevent_add_fd() macro
+*/
+struct tevent_fd *_tevent_add_fd(struct tevent_context *ev,
+                                TALLOC_CTX *mem_ctx,
+                                int fd,
+                                uint16_t flags,
+                                tevent_fd_handler_t handler,
+                                void *private_data,
+                                const char *handler_name,
+                                const char *location)
+{
+       return ev->ops->add_fd(ev, mem_ctx, fd, flags, handler, private_data,
+                              handler_name, location);
+}
+
+/*
+  set a close function on the fd event
+
+  nothing happens if fde is NULL or is no longer attached to an
+  event context
+*/
+void tevent_fd_set_close_fn(struct tevent_fd *fde,
+                           tevent_fd_close_fn_t close_fn)
+{
+       if (fde == NULL || fde->event_ctx == NULL) {
+               return;
+       }
+
+       fde->event_ctx->ops->set_fd_close_fn(fde, close_fn);
+}
+
+/*
+  close function installed by tevent_fd_set_auto_close(): it simply
+  close()s the file descriptor; ev, fde and private_data are unused
+*/
+static void tevent_fd_auto_close_fn(struct tevent_context *ev,
+                                   struct tevent_fd *fde,
+                                   int fd,
+                                   void *private_data)
+{
+       close(fd);
+}
+
+/*
+  install tevent_fd_auto_close_fn() as the close function of the fd
+  event, so that the descriptor is close()d when that function is run
+*/
+void tevent_fd_set_auto_close(struct tevent_fd *fde)
+{
+       tevent_fd_set_close_fn(fde, tevent_fd_auto_close_fn);
+}
+
+/*
+  return the fd event flags
+
+  0 is returned if fde is NULL or is no longer attached to an
+  event context
+*/
+uint16_t tevent_fd_get_flags(struct tevent_fd *fde)
+{
+       if (fde == NULL || fde->event_ctx == NULL) {
+               return 0;
+       }
+
+       return fde->event_ctx->ops->get_fd_flags(fde);
+}
+
+/*
+  set the fd event flags
+
+  nothing happens if fde is NULL or is no longer attached to an
+  event context
+*/
+void tevent_fd_set_flags(struct tevent_fd *fde, uint16_t flags)
+{
+       if (fde == NULL || fde->event_ctx == NULL) {
+               return;
+       }
+
+       fde->event_ctx->ops->set_fd_flags(fde, flags);
+}
+
+/*
+  report whether the backend of this context can handle signal events,
+  i.e. whether it provides an add_signal operation
+*/
+bool tevent_signal_support(struct tevent_context *ev)
+{
+       return ev->ops->add_signal != NULL;
+}
+
+/* abort function installed by the application, NULL if none was set */
+static void (*tevent_abort_fn)(const char *reason);
+
+/*
+  install the function tevent_abort() calls with the abort reason
+  instead of calling abort()
+*/
+void tevent_set_abort_fn(void (*abort_fn)(const char *reason))
+{
+       tevent_abort_fn = abort_fn;
+}
+
+/*
+  log the reason at TEVENT_DEBUG_FATAL level, then call abort() if no
+  abort function has been installed, otherwise call the installed
+  function with the reason
+*/
+static void tevent_abort(struct tevent_context *ev, const char *reason)
+{
+       tevent_debug(ev, TEVENT_DEBUG_FATAL,
+                    "abort: %s\n", reason);
+
+       if (!tevent_abort_fn) {
+               abort();
+       }
+
+       tevent_abort_fn(reason);
+}
+
+/*
+  add a timer event
+  return NULL on failure
+
+  this only forwards to the add_timer operation of the context's
+  backend; handler_name and location are supplied by the
+  tevent_add_timer() macro
+*/
+struct tevent_timer *_tevent_add_timer(struct tevent_context *ev,
+                                      TALLOC_CTX *mem_ctx,
+                                      struct timeval next_event,
+                                      tevent_timer_handler_t handler,
+                                      void *private_data,
+                                      const char *handler_name,
+                                      const char *location)
+{
+       return ev->ops->add_timer(ev, mem_ctx, next_event, handler, private_data,
+                                 handler_name, location);
+}
+
+/*
+  allocate an immediate event
+  return NULL on failure (memory allocation error)
+
+  the new object is not attached to any event context and has no
+  handler; only the creation location is recorded
+*/
+struct tevent_immediate *_tevent_create_immediate(TALLOC_CTX *mem_ctx,
+                                                 const char *location)
+{
+       struct tevent_immediate *imm = talloc(mem_ctx, struct tevent_immediate);
+
+       if (imm == NULL) {
+               return NULL;
+       }
+
+       /* not linked into any context yet */
+       imm->prev = NULL;
+       imm->next = NULL;
+       imm->event_ctx = NULL;
+
+       /* where the object was created */
+       imm->create_location = location;
+
+       /* nothing scheduled so far */
+       imm->handler = NULL;
+       imm->private_data = NULL;
+       imm->handler_name = NULL;
+       imm->schedule_location = NULL;
+
+       /* backend specific state */
+       imm->cancel_fn = NULL;
+       imm->additional_data = NULL;
+
+       return imm;
+}
+
+/*
+  schedule an immediate event
+
+  this only forwards to the schedule_immediate operation of the
+  context's backend; handler_name and location are supplied by the
+  tevent_schedule_immediate() macro
+*/
+void _tevent_schedule_immediate(struct tevent_immediate *im,
+                               struct tevent_context *ev,
+                               tevent_immediate_handler_t handler,
+                               void *private_data,
+                               const char *handler_name,
+                               const char *location)
+{
+       ev->ops->schedule_immediate(im, ev, handler, private_data,
+                                   handler_name, location);
+}
+
+/*
+  add a signal event
+
+  sa_flags are flags to sigaction(2)
+
+  return NULL on failure
+
+  a backend may provide no add_signal operation (this is what
+  tevent_signal_support() tests for); in that case NULL is returned
+  with errno set to ENOSYS instead of calling through a NULL pointer
+*/
+struct tevent_signal *_tevent_add_signal(struct tevent_context *ev,
+                                        TALLOC_CTX *mem_ctx,
+                                        int signum,
+                                        int sa_flags,
+                                        tevent_signal_handler_t handler,
+                                        void *private_data,
+                                        const char *handler_name,
+                                        const char *location)
+{
+       if (ev->ops->add_signal == NULL) {
+               errno = ENOSYS;
+               return NULL;
+       }
+
+       return ev->ops->add_signal(ev, mem_ctx, signum, sa_flags, handler, private_data,
+                                  handler_name, location);
+}
+
+/*
+  allow nested event loops on this context; without this a nested
+  call of _tevent_loop_once() or _tevent_loop_until() triggers
+  tevent_abort_nesting()
+*/
+void tevent_loop_allow_nesting(struct tevent_context *ev)
+{
+       ev->nesting.allowed = true;
+}
+
+/*
+  install the hook that is called when an event loop is entered and
+  left
+
+  setting the same hook with the same private data again is fine;
+  trying to replace an installed hook with a different one is
+  reported through tevent_abort()
+*/
+void tevent_loop_set_nesting_hook(struct tevent_context *ev,
+                                 tevent_nesting_hook hook,
+                                 void *private_data)
+{
+       bool hook_installed = (ev->nesting.hook_fn != NULL);
+       bool same_hook = (ev->nesting.hook_fn == hook) &&
+                        (ev->nesting.hook_private == private_data);
+
+       if (hook_installed && !same_hook) {
+               /* the way the nesting hook code is currently written
+                  we cannot support two different nesting hooks at the
+                  same time. */
+               tevent_abort(ev, "tevent: Violation of nesting hook rules\n");
+       }
+
+       ev->nesting.hook_fn = hook;
+       ev->nesting.hook_private = private_data;
+}
+
+/*
+  report a forbidden nested event loop through tevent_abort(); the
+  reason names the calling location, or falls back to a fixed string
+  if the message cannot be allocated
+*/
+static void tevent_abort_nesting(struct tevent_context *ev, const char *location)
+{
+       const char *msg = talloc_asprintf(NULL,
+                                         "tevent_loop_once() nesting at %s",
+                                         location);
+
+       tevent_abort(ev, (msg != NULL) ? msg : "tevent_loop_once() nesting");
+}
+
+/*
+  do a single event loop using the events defined in ev
+
+  The nesting level of the context is raised for the duration of the
+  call. A nested call on a context that does not allow nesting is
+  reported through tevent_abort_nesting(); if the installed abort
+  function returns, -1 is returned with errno set to ELOOP.
+
+  If a nesting hook is installed it is called before and after the
+  backend's loop_once operation; a non-zero result of the hook is
+  returned to the caller, and a failing first hook call skips the
+  loop_once operation.
+
+  returns the result of the backend's loop_once operation otherwise
+*/
+int _tevent_loop_once(struct tevent_context *ev, const char *location)
+{
+       int ret;
+       void *nesting_stack_ptr = NULL;
+
+       ev->nesting.level++;
+
+       if (ev->nesting.level > 1) {
+               if (!ev->nesting.allowed) {
+                       tevent_abort_nesting(ev, location);
+                       /*
+                        * Only reached if a custom abort function
+                        * returned. Leave via 'done' so the level
+                        * raised above is dropped again, otherwise
+                        * the context stays at a wrong nesting level.
+                        */
+                       errno = ELOOP;
+                       ret = -1;
+                       goto done;
+               }
+       }
+       if (ev->nesting.level > 0) {
+               if (ev->nesting.hook_fn) {
+                       int ret2;
+                       /* 'true' marks the call made before the loop runs */
+                       ret2 = ev->nesting.hook_fn(ev,
+                                                  ev->nesting.hook_private,
+                                                  ev->nesting.level,
+                                                  true,
+                                                  (void *)&nesting_stack_ptr,
+                                                  location);
+                       if (ret2 != 0) {
+                               ret = ret2;
+                               goto done;
+                       }
+               }
+       }
+
+       tevent_trace_point_callback(ev, TEVENT_TRACE_BEFORE_LOOP_ONCE);
+       ret = ev->ops->loop_once(ev, location);
+       tevent_trace_point_callback(ev, TEVENT_TRACE_AFTER_LOOP_ONCE);
+
+       if (ev->nesting.level > 0) {
+               if (ev->nesting.hook_fn) {
+                       int ret2;
+                       /* 'false' marks the call made after the loop ran */
+                       ret2 = ev->nesting.hook_fn(ev,
+                                                  ev->nesting.hook_private,
+                                                  ev->nesting.level,
+                                                  false,
+                                                  (void *)&nesting_stack_ptr,
+                                                  location);
+                       if (ret2 != 0) {
+                               ret = ret2;
+                               goto done;
+                       }
+               }
+       }
+
+done:
+       ev->nesting.level--;
+       return ret;
+}
+
+/*
+  this is a performance optimization for the samba4 nested event loop problems
+
+  Runs the backend's loop_once operation repeatedly until
+  finished(private_data) returns true or loop_once reports an error.
+  The nesting checks and the nesting hook are handled once around
+  the whole loop instead of once per iteration.
+
+  A nested call on a context that does not allow nesting is reported
+  through tevent_abort_nesting(); if the installed abort function
+  returns, -1 is returned with errno set to ELOOP.
+
+  returns 0 if finished() was already true, otherwise the result of
+  the last loop_once call or a non-zero result of the nesting hook
+*/
+int _tevent_loop_until(struct tevent_context *ev,
+                      bool (*finished)(void *private_data),
+                      void *private_data,
+                      const char *location)
+{
+       int ret = 0;
+       void *nesting_stack_ptr = NULL;
+
+       ev->nesting.level++;
+
+       if (ev->nesting.level > 1) {
+               if (!ev->nesting.allowed) {
+                       tevent_abort_nesting(ev, location);
+                       /*
+                        * Only reached if a custom abort function
+                        * returned. Leave via 'done' so the level
+                        * raised above is dropped again, otherwise
+                        * the context stays at a wrong nesting level.
+                        */
+                       errno = ELOOP;
+                       ret = -1;
+                       goto done;
+               }
+       }
+       if (ev->nesting.level > 0) {
+               if (ev->nesting.hook_fn) {
+                       int ret2;
+                       /* 'true' marks the call made before the loop runs */
+                       ret2 = ev->nesting.hook_fn(ev,
+                                                  ev->nesting.hook_private,
+                                                  ev->nesting.level,
+                                                  true,
+                                                  (void *)&nesting_stack_ptr,
+                                                  location);
+                       if (ret2 != 0) {
+                               ret = ret2;
+                               goto done;
+                       }
+               }
+       }
+
+       while (!finished(private_data)) {
+               tevent_trace_point_callback(ev, TEVENT_TRACE_BEFORE_LOOP_ONCE);
+               ret = ev->ops->loop_once(ev, location);
+               tevent_trace_point_callback(ev, TEVENT_TRACE_AFTER_LOOP_ONCE);
+               if (ret != 0) {
+                       break;
+               }
+       }
+
+       if (ev->nesting.level > 0) {
+               if (ev->nesting.hook_fn) {
+                       int ret2;
+                       /* 'false' marks the call made after the loop ran */
+                       ret2 = ev->nesting.hook_fn(ev,
+                                                  ev->nesting.hook_private,
+                                                  ev->nesting.level,
+                                                  false,
+                                                  (void *)&nesting_stack_ptr,
+                                                  location);
+                       if (ret2 != 0) {
+                               ret = ret2;
+                               goto done;
+                       }
+               }
+       }
+
+done:
+       ev->nesting.level--;
+       return ret;
+}
+
+/*
+  run the event loop until no fd, timer, immediate or signal events
+  are left on the context
+
+  returns the non-zero result of _tevent_loop_once() on failure, or 0
+  once all events are gone
+*/
+int tevent_common_loop_wait(struct tevent_context *ev,
+                           const char *location)
+{
+       for (;;) {
+               bool pending = (ev->fd_events != NULL) ||
+                              (ev->timer_events != NULL) ||
+                              (ev->immediate_events != NULL) ||
+                              (ev->signal_events != NULL);
+               int rc;
+
+               if (!pending) {
+                       break;
+               }
+
+               rc = _tevent_loop_once(ev, location);
+               if (rc != 0) {
+                       tevent_debug(ev, TEVENT_DEBUG_FATAL,
+                                    "_tevent_loop_once() failed: %d - %s\n",
+                                    rc, strerror(errno));
+                       return rc;
+               }
+       }
+
+       tevent_debug(ev, TEVENT_DEBUG_WARNING,
+                    "tevent_common_loop_wait() out of events\n");
+       return 0;
+}
+
+/*
+  run the event loop of the context by forwarding to the loop_wait
+  operation of its backend and returning that operation's result
+*/
+int _tevent_loop_wait(struct tevent_context *ev, const char *location)
+{
+       return ev->ops->loop_wait(ev, location);
+}
+
+
+/*
+  re-initialise a tevent context. This leaves you with the same
+  event context, but all events are wiped and the structure is
+  re-initialised. This is most useful after a fork()
+
+  The events are detached by running the common context destructor,
+  then the backend's context_init operation is called again.
+
+  zero is returned on success, non-zero on failure
+*/
+int tevent_re_initialise(struct tevent_context *ev)
+{
+       tevent_common_context_destructor(ev);
+
+       return ev->ops->context_init(ev);
+}
diff --git a/ctdb/lib/tevent/tevent.h b/ctdb/lib/tevent/tevent.h
new file mode 100644 (file)
index 0000000..6b4d371
--- /dev/null
@@ -0,0 +1,1776 @@
+/* 
+   Unix SMB/CIFS implementation.
+
+   generalised event loop handling
+
+   Copyright (C) Andrew Tridgell 2005
+   Copyright (C) Stefan Metzmacher 2005-2009
+   Copyright (C) Volker Lendecke 2008
+
+     ** NOTE! The following LGPL license applies to the tevent
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __TEVENT_H__
+#define __TEVENT_H__
+
+#include <stdint.h>
+#include <talloc.h>
+#include <sys/time.h>
+#include <stdbool.h>
+
+struct tevent_context;
+struct tevent_ops;
+struct tevent_fd;
+struct tevent_timer;
+struct tevent_immediate;
+struct tevent_signal;
+
+/**
+ * @defgroup tevent The tevent API
+ *
+ * The tevent low-level API
+ *
+ * This API provides the public interface to manage events in the tevent
+ * mainloop. Functions are provided for managing low-level events such
+ * as timer events, fd events and signal handling.
+ *
+ * @{
+ */
+
+/* event handler types */
+/**
+ * Called when a file descriptor monitored by tevent has
+ * data to be read or written on it.
+ */
+typedef void (*tevent_fd_handler_t)(struct tevent_context *ev,
+                                   struct tevent_fd *fde,
+                                   uint16_t flags,
+                                   void *private_data);
+
+/**
+ * Called when tevent is ceasing the monitoring of a file descriptor.
+ */
+typedef void (*tevent_fd_close_fn_t)(struct tevent_context *ev,
+                                    struct tevent_fd *fde,
+                                    int fd,
+                                    void *private_data);
+
+/**
+ * Called when a tevent timer has fired.
+ */
+typedef void (*tevent_timer_handler_t)(struct tevent_context *ev,
+                                      struct tevent_timer *te,
+                                      struct timeval current_time,
+                                      void *private_data);
+
+/**
+ * Called when a tevent immediate event is invoked.
+ */
+typedef void (*tevent_immediate_handler_t)(struct tevent_context *ctx,
+                                          struct tevent_immediate *im,
+                                          void *private_data);
+
+/**
+ * Called after tevent detects the specified signal.
+ */
+typedef void (*tevent_signal_handler_t)(struct tevent_context *ev,
+                                       struct tevent_signal *se,
+                                       int signum,
+                                       int count,
+                                       void *siginfo,
+                                       void *private_data);
+
+/**
+ * @brief Create an event_context structure.
+ *
+ * This must be the first events call, and all subsequent calls pass this
+ * event_context as the first element. Event handlers also receive this as
+ * their first argument.
+ *
+ * @param[in]  mem_ctx  The memory context to use.
+ *
+ * @return              An allocated tevent context, NULL on error.
+ *
+ * @see tevent_context_init_byname()
+ */
+struct tevent_context *tevent_context_init(TALLOC_CTX *mem_ctx);
+
+/**
+ * @brief Create an event_context structure and select a specific backend.
+ *
+ * This must be the first events call, and all subsequent calls pass this
+ * event_context as the first element. Event handlers also receive this as
+ * their first argument.
+ *
+ * @param[in]  mem_ctx  The memory context to use.
+ *
+ * @param[in]  name     The name of the backend to use.
+ *
+ * @return              An allocated tevent context, NULL on error.
+ */
+struct tevent_context *tevent_context_init_byname(TALLOC_CTX *mem_ctx, const char *name);
+
+/**
+ * @brief Create a custom event context
+ *
+ * @param[in]  mem_ctx  The memory context to use.
+ * @param[in]  ops      The function pointer table of the backend.
+ * @param[in]  additional_data  The additional/private data to this instance
+ *
+ * @return              An allocated tevent context, NULL on error.
+ *
+ */
+struct tevent_context *tevent_context_init_ops(TALLOC_CTX *mem_ctx,
+                                              const struct tevent_ops *ops,
+                                              void *additional_data);
+
+/**
+ * @brief List available backends.
+ *
+ * @param[in]  mem_ctx  The memory context to use.
+ *
+ * @return              A string vector with a terminating NULL element, NULL
+ *                      on error.
+ */
+const char **tevent_backend_list(TALLOC_CTX *mem_ctx);
+
+/**
+ * @brief Set the default tevent backend.
+ *
+ * @param[in]  backend  The name of the backend to set.
+ */
+void tevent_set_default_backend(const char *backend);
+
+#ifdef DOXYGEN
+/**
+ * @brief Add a file descriptor based event.
+ *
+ * @param[in]  ev       The event context to work on.
+ *
+ * @param[in]  mem_ctx  The talloc memory context to use.
+ *
+ * @param[in]  fd       The file descriptor to base the event on.
+ *
+ * @param[in]  flags    #TEVENT_FD_READ or #TEVENT_FD_WRITE
+ *
+ * @param[in]  handler  The callback handler for the event.
+ *
+ * @param[in]  private_data  The private data passed to the callback handler.
+ *
+ * @return              The file descriptor based event, NULL on error.
+ *
+ * @note To cancel the monitoring of a file descriptor, call talloc_free()
+ * on the object returned by this function.
+ */
+struct tevent_fd *tevent_add_fd(struct tevent_context *ev,
+                               TALLOC_CTX *mem_ctx,
+                               int fd,
+                               uint16_t flags,
+                               tevent_fd_handler_t handler,
+                               void *private_data);
+#else
+/*
+ * Real implementation behind tevent_add_fd(). The macro below passes the
+ * handler expression as a string (#handler) and the calling source
+ * location (__location__) as the two extra arguments.
+ */
+struct tevent_fd *_tevent_add_fd(struct tevent_context *ev,
+                                TALLOC_CTX *mem_ctx,
+                                int fd,
+                                uint16_t flags,
+                                tevent_fd_handler_t handler,
+                                void *private_data,
+                                const char *handler_name,
+                                const char *location);
+#define tevent_add_fd(ev, mem_ctx, fd, flags, handler, private_data) \
+       _tevent_add_fd(ev, mem_ctx, fd, flags, handler, private_data, \
+                      #handler, __location__)
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief Add a timed event
+ *
+ * @param[in]  ev       The event context to work on.
+ *
+ * @param[in]  mem_ctx  The talloc memory context to use.
+ *
+ * @param[in]  next_event  Timeval specifying the absolute time to fire this
+ * event. This is not an offset.
+ *
+ * @param[in]  handler  The callback handler for the event.
+ *
+ * @param[in]  private_data  The private data passed to the callback handler.
+ *
+ * @return The newly-created timer event, or NULL on error.
+ *
+ * @note To cancel a timer event before it fires, call talloc_free() on the
+ * event returned from this function. This event is automatically
+ * talloc_free()-ed after its event handler fires, if it hasn't been freed yet.
+ *
+ * @note Unlike some mainloops, tevent timers are one-time events. To set up
+ * a recurring event, it is necessary to call tevent_add_timer() again during
+ * the handler processing.
+ *
+ * @note Due to the internal mainloop processing, a timer set to run
+ * immediately will do so after any other pending timers fire, but before
+ * any further file descriptor or signal handling events fire. Callers should
+ * not rely on this behavior!
+ */
+struct tevent_timer *tevent_add_timer(struct tevent_context *ev,
+                                      TALLOC_CTX *mem_ctx,
+                                      struct timeval next_event,
+                                      tevent_timer_handler_t handler,
+                                      void *private_data);
+#else
+/*
+ * Real implementation behind tevent_add_timer(). The macro below passes the
+ * handler expression as a string (#handler) and the calling source
+ * location (__location__) as the two extra arguments.
+ */
+struct tevent_timer *_tevent_add_timer(struct tevent_context *ev,
+                                      TALLOC_CTX *mem_ctx,
+                                      struct timeval next_event,
+                                      tevent_timer_handler_t handler,
+                                      void *private_data,
+                                      const char *handler_name,
+                                      const char *location);
+#define tevent_add_timer(ev, mem_ctx, next_event, handler, private_data) \
+       _tevent_add_timer(ev, mem_ctx, next_event, handler, private_data, \
+                         #handler, __location__)
+#endif
+
+#ifdef DOXYGEN
+/**
+ * Initialize an immediate event object
+ *
+ * This object can be used to trigger an event to occur immediately after
+ * returning from the current event (before any other event occurs)
+ *
+ * @param[in] mem_ctx  The talloc memory context to use as the parent
+ *
+ * @return An empty tevent_immediate object, or NULL on memory allocation
+ * error. Use tevent_schedule_immediate to populate and use it.
+ *
+ * @note Available as of tevent 0.9.8
+ */
+struct tevent_immediate *tevent_create_immediate(TALLOC_CTX *mem_ctx);
+#else
+/*
+ * Real implementation behind tevent_create_immediate(). The macro below
+ * passes the calling source location (__location__) as the extra argument.
+ */
+struct tevent_immediate *_tevent_create_immediate(TALLOC_CTX *mem_ctx,
+                                                 const char *location);
+#define tevent_create_immediate(mem_ctx) \
+       _tevent_create_immediate(mem_ctx, __location__)
+#endif
+
+#ifdef DOXYGEN
+
+/**
+ * Schedule an event for immediate execution. This event will occur
+ * immediately after returning from the current event (before any other
+ * event occurs)
+ *
+ * @param[in] im       The tevent_immediate object to populate and use
+ * @param[in] ctx      The tevent_context to run this event
+ * @param[in] handler  The event handler to run when this event fires
+ * @param[in] private_data  Data to pass to the event handler
+ */
+void tevent_schedule_immediate(struct tevent_immediate *im,
+                struct tevent_context *ctx,
+                tevent_immediate_handler_t handler,
+                void *private_data);
+#else
+void _tevent_schedule_immediate(struct tevent_immediate *im,
+                               struct tevent_context *ctx,
+                               tevent_immediate_handler_t handler,
+                               void *private_data,
+                               const char *handler_name,
+                               const char *location);
+/*
+ * Wrapper that records the handler name and call site for
+ * _tevent_schedule_immediate().
+ *
+ * No trailing semicolon here: the caller supplies it, so the macro expands
+ * to a single expression statement and stays safe inside unbraced if/else
+ * bodies (consistent with the other wrapper macros in this header).
+ */
+#define tevent_schedule_immediate(im, ctx, handler, private_data) \
+       _tevent_schedule_immediate(im, ctx, handler, private_data, \
+                                  #handler, __location__)
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief Add a tevent signal handler
+ *
+ * tevent_add_signal() creates a new event for handling a signal the next
+ * time through the mainloop. It implements a very simple traditional signal
+ * handler whose only purpose is to add the handler event into the mainloop.
+ *
+ * @param[in]  ev       The event context to work on.
+ *
+ * @param[in]  mem_ctx  The talloc memory context to use.
+ *
+ * @param[in]  signum   The signal to trap
+ *
+ * @param[in]  handler  The callback handler for the signal.
+ *
+ * @param[in]  sa_flags sigaction flags for this signal handler.
+ *
+ * @param[in]  private_data  The private data passed to the callback handler.
+ *
+ * @return The newly-created signal handler event, or NULL on error.
+ *
+ * @note To cancel a signal handler, call talloc_free() on the event returned
+ * from this function.
+ */
+struct tevent_signal *tevent_add_signal(struct tevent_context *ev,
+                     TALLOC_CTX *mem_ctx,
+                     int signum,
+                     int sa_flags,
+                     tevent_signal_handler_t handler,
+                     void *private_data);
+#else
+struct tevent_signal *_tevent_add_signal(struct tevent_context *ev,
+                                        TALLOC_CTX *mem_ctx,
+                                        int signum,
+                                        int sa_flags,
+                                        tevent_signal_handler_t handler,
+                                        void *private_data,
+                                        const char *handler_name,
+                                        const char *location);
+#define tevent_add_signal(ev, mem_ctx, signum, sa_flags, handler, private_data) \
+       _tevent_add_signal(ev, mem_ctx, signum, sa_flags, handler, private_data, \
+                          #handler, __location__)
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief Pass a single time through the mainloop
+ *
+ * This will process any appropriate signal, immediate, fd and timer events
+ *
+ * @param[in]  ev The event context to process
+ *
+ * @return Zero on success, nonzero if an internal error occurred
+ */
+int tevent_loop_once(struct tevent_context *ev);
+#else
+int _tevent_loop_once(struct tevent_context *ev, const char *location);
+#define tevent_loop_once(ev) \
+       _tevent_loop_once(ev, __location__)
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief Run the mainloop
+ *
+ * The mainloop will run until there are no events remaining to be processed
+ *
+ * @param[in]  ev The event context to process
+ *
+ * @return Zero if all events have been processed. Nonzero if an internal
+ * error occurred.
+ */
+int tevent_loop_wait(struct tevent_context *ev);
+#else
+int _tevent_loop_wait(struct tevent_context *ev, const char *location);
+#define tevent_loop_wait(ev) \
+       _tevent_loop_wait(ev, __location__)
+#endif
+
+
+/**
+ * Assign a function to run when a tevent_fd is freed
+ *
+ * This function is a destructor for the tevent_fd. It does not automatically
+ * close the file descriptor. If this is the desired behavior, then it must be
+ * performed by the close_fn.
+ *
+ * @param[in] fde       File descriptor event on which to set the destructor
+ * @param[in] close_fn  Destructor to execute when fde is freed
+ */
+void tevent_fd_set_close_fn(struct tevent_fd *fde,
+                           tevent_fd_close_fn_t close_fn);
+
+/**
+ * Automatically close the file descriptor when the tevent_fd is freed
+ *
+ * This function calls close(fd) internally.
+ *
+ * @param[in] fde  File descriptor event to auto-close
+ */
+void tevent_fd_set_auto_close(struct tevent_fd *fde);
+
+/**
+ * Return the flags set on this file descriptor event
+ *
+ * @param[in] fde  File descriptor event to query
+ *
+ * @return The flags set on the event. See #TEVENT_FD_READ and
+ * #TEVENT_FD_WRITE
+ */
+uint16_t tevent_fd_get_flags(struct tevent_fd *fde);
+
+/**
+ * Set flags on a file descriptor event
+ *
+ * @param[in] fde    File descriptor event to set
+ * @param[in] flags  Flags to set on the event. See #TEVENT_FD_READ and
+ * #TEVENT_FD_WRITE
+ */
+void tevent_fd_set_flags(struct tevent_fd *fde, uint16_t flags);
+
+/**
+ * Query whether tevent supports signal handling
+ *
+ * @param[in] ev  An initialized tevent context
+ *
+ * @return True if this platform and tevent context support signal handling
+ */
+bool tevent_signal_support(struct tevent_context *ev);
+
+void tevent_set_abort_fn(void (*abort_fn)(const char *reason));
+
+/* bits for file descriptor event flags */
+
+/**
+ * Monitor a file descriptor for data to be read
+ */
+#define TEVENT_FD_READ 1
+/**
+ * Monitor a file descriptor for write availability
+ */
+#define TEVENT_FD_WRITE 2
+
+/**
+ * Convenience function for declaring a tevent_fd writable
+ */
+#define TEVENT_FD_WRITEABLE(fde) \
+       tevent_fd_set_flags(fde, tevent_fd_get_flags(fde) | TEVENT_FD_WRITE)
+
+/**
+ * Convenience function for declaring a tevent_fd readable
+ */
+#define TEVENT_FD_READABLE(fde) \
+       tevent_fd_set_flags(fde, tevent_fd_get_flags(fde) | TEVENT_FD_READ)
+
+/**
+ * Convenience function for declaring a tevent_fd non-writable
+ */
+#define TEVENT_FD_NOT_WRITEABLE(fde) \
+       tevent_fd_set_flags(fde, tevent_fd_get_flags(fde) & ~TEVENT_FD_WRITE)
+
+/**
+ * Convenience function for declaring a tevent_fd non-readable
+ */
+#define TEVENT_FD_NOT_READABLE(fde) \
+       tevent_fd_set_flags(fde, tevent_fd_get_flags(fde) & ~TEVENT_FD_READ)
+
+/**
+ * Debug level of tevent
+ */
+enum tevent_debug_level {
+       TEVENT_DEBUG_FATAL,
+       TEVENT_DEBUG_ERROR,
+       TEVENT_DEBUG_WARNING,
+       TEVENT_DEBUG_TRACE
+};
+
+/**
+ * @brief The tevent debug callback.
+ *
+ * @param[in]  context  The memory context to use.
+ *
+ * @param[in]  level    The debug level.
+ *
+ * @param[in]  fmt      The format string.
+ *
+ * @param[in]  ap       The arguments for the format string.
+ */
+typedef void (*tevent_debug_fn)(void *context,
+                               enum tevent_debug_level level,
+                               const char *fmt,
+                               va_list ap) PRINTF_ATTRIBUTE(3,0);
+
+/**
+ * Set destination for tevent debug messages
+ *
+ * @param[in] ev        Event context to debug
+ * @param[in] debug     Function to handle output printing
+ * @param[in] context   The context to pass to the debug function.
+ *
+ * @return Always returns 0 as of version 0.9.8
+ *
+ * @note Default is to emit no debug messages
+ */
+int tevent_set_debug(struct tevent_context *ev,
+                    tevent_debug_fn debug,
+                    void *context);
+
+/**
+ * Designate stderr for debug message output
+ *
+ * @param[in] ev     Event context to debug
+ *
+ * @note This function will only output TEVENT_DEBUG_FATAL, TEVENT_DEBUG_ERROR
+ * and TEVENT_DEBUG_WARNING messages. For TEVENT_DEBUG_TRACE, please define a
+ * function for tevent_set_debug()
+ */
+int tevent_set_debug_stderr(struct tevent_context *ev);
+
+enum tevent_trace_point {
+       /**
+        * Corresponds to a trace point just before waiting
+        */
+       TEVENT_TRACE_BEFORE_WAIT,
+       /**
+        * Corresponds to a trace point just after waiting
+        */
+       TEVENT_TRACE_AFTER_WAIT,
+#define TEVENT_HAS_LOOP_ONCE_TRACE_POINTS 1
+       /**
+        * Corresponds to a trace point just before calling
+        * the loop_once() backend function.
+        */
+       TEVENT_TRACE_BEFORE_LOOP_ONCE,
+       /**
+        * Corresponds to a trace point right after the
+        * loop_once() backend function has returned.
+        */
+       TEVENT_TRACE_AFTER_LOOP_ONCE,
+};
+
+typedef void (*tevent_trace_callback_t)(enum tevent_trace_point,
+                                       void *private_data);
+
+/**
+ * Register a callback to be called at certain trace points
+ *
+ * @param[in] ev             Event context
+ * @param[in] cb             Trace callback
+ * @param[in] private_data   Data to be passed to callback
+ *
+ * @note The callback will be called at trace points defined by
+ * tevent_trace_point.  Call with NULL to reset.
+ */
+void tevent_set_trace_callback(struct tevent_context *ev,
+                              tevent_trace_callback_t cb,
+                              void *private_data);
+
+/**
+ * Retrieve the current trace callback
+ *
+ * @param[in] ev             Event context
+ * @param[out] cb            Registered trace callback
+ * @param[out] private_data  Registered data to be passed to callback
+ *
+ * @note This can be used to allow one component that wants to
+ * register a callback to respect the callback that another component
+ * has already registered.
+ */
+void tevent_get_trace_callback(struct tevent_context *ev,
+                              tevent_trace_callback_t *cb,
+                              void *private_data);
+
+/**
+ * @}
+ */
+
+/**
+ * @defgroup tevent_request The tevent request functions.
+ * @ingroup tevent
+ *
+ * A tevent_req represents an asynchronous computation.
+ *
+ * The tevent_req group of API calls is the recommended way of
+ * programming async computations within tevent. In particular the
+ * file descriptor (tevent_add_fd) and timer (tevent_add_timer) events
+ * are considered too low-level to be used in larger computations. To
+ * read and write from and to sockets, Samba provides two calls on top
+ * of tevent_add_fd: read_packet_send/recv and writev_send/recv. These
+ * requests are much easier to compose than the low-level event
+ * handlers called from tevent_add_fd.
+ *
+ * A lot of the simplicity tevent_req has brought to the notoriously
+ * hairy async programming came via a set of conventions that every
+ * async computation programmed should follow. One central piece of
+ * these conventions is the naming of routines and variables.
+ *
+ * Every async computation needs a name (sensibly called "computation"
+ * down from here). From this name quite a few naming conventions are
+ * derived.
+ *
+ * Every computation that requires local state needs a
+ * @code
+ * struct computation_state {
+ *     int local_var;
+ * };
+ * @endcode
+ * Even if no local variables are required, such a state struct should
+ * be created containing a dummy variable. Quite a few helper
+ * functions and macros (for example tevent_req_create()) assume such
+ * a state struct.
+ *
+ * An async computation is started by a computation_send
+ * function. When it is finished, its result can be received by a
+ * computation_recv function. For an example how to set up an async
+ * computation, see the code example in the documentation for
+ * tevent_req_create() and tevent_req_post(). The prototypes for _send
+ * and _recv functions should follow some conventions:
+ *
+ * @code
+ * struct tevent_req *computation_send(TALLOC_CTX *mem_ctx,
+ *                                     struct tevent_context *ev,
+ *                                     ... further args);
+ * int computation_recv(struct tevent_req *req, ... further output args);
+ * @endcode
+ *
+ * The "int" result of computation_recv() depends on the result the
+ * sync version of the function would have, "int" is just an example
+ * here.
+ *
+ * Another important piece of the conventions is that the program flow
+ * is interrupted as little as possible. Because a blocking
+ * sub-computation requires that the flow needs to continue in a
+ * separate function that is the logical sequel of some computation,
+ * it should lexically follow sending off the blocking
+ * sub-computation. Setting the callback function via
+ * tevent_req_set_callback() requires referencing a function lexically
+ * below the call to tevent_req_set_callback(), forward declarations
+ * are required. A lot of the async computations thus begin with a
+ * sequence of declarations such as
+ *
+ * @code
+ * static void computation_step1_done(struct tevent_req *subreq);
+ * static void computation_step2_done(struct tevent_req *subreq);
+ * static void computation_step3_done(struct tevent_req *subreq);
+ * @endcode
+ *
+ * It really helps readability a lot to do these forward declarations,
+ * because the lexically sequential program flow makes the async
+ * computations almost as clear to read as a normal, sync program
+ * flow.
+ *
+ * It is up to the user of the async computation to talloc_free it
+ * after it has finished. If an async computation should be aborted,
+ * the tevent_req structure can be talloc_free'ed. After it has
+ * finished, it should be talloc_free'ed by the API user.
+ *
+ * @{
+ */
+
+/**
+ * An async request moves from TEVENT_REQ_INIT to
+ * TEVENT_REQ_IN_PROGRESS. All other states are valid after a request
+ * has finished.
+ */
+enum tevent_req_state {
+       /**
+        * We are creating the request
+        */
+       TEVENT_REQ_INIT,
+       /**
+        * We are waiting for the request to complete
+        */
+       TEVENT_REQ_IN_PROGRESS,
+       /**
+        * The request is finished successfully
+        */
+       TEVENT_REQ_DONE,
+       /**
+        * A user error has occurred. The user error has been
+        * indicated by tevent_req_error(), it can be retrieved via
+        * tevent_req_is_error().
+        */
+       TEVENT_REQ_USER_ERROR,
+       /**
+        * Request timed out after the timeout set by tevent_req_set_endtime.
+        */
+       TEVENT_REQ_TIMED_OUT,
+       /**
+        * An internal allocation has failed, or tevent_req_nomem has
+        * been given a NULL pointer as the first argument.
+        */
+       TEVENT_REQ_NO_MEMORY,
+       /**
+        * The request has been received by the caller. No further
+        * action is valid.
+        */
+       TEVENT_REQ_RECEIVED
+};
+
+/**
+ * @brief An async request
+ */
+struct tevent_req;
+
+/**
+ * @brief A tevent request callback function.
+ *
+ * @param[in]  req      The tevent async request which executed this callback.
+ */
+typedef void (*tevent_req_fn)(struct tevent_req *req);
+
+/**
+ * @brief Set an async request callback.
+ *
+ * See the documentation of tevent_req_post() for an example how this
+ * is supposed to be used.
+ *
+ * @param[in]  req      The async request to set the callback.
+ *
+ * @param[in]  fn       The callback function to set.
+ *
+ * @param[in]  pvt      A pointer to private data to pass to the async request
+ *                      callback.
+ */
+void tevent_req_set_callback(struct tevent_req *req, tevent_req_fn fn, void *pvt);
+
+#ifdef DOXYGEN
+/**
+ * @brief Get the private data cast to the given type for a callback from
+ *        a tevent request structure.
+ *
+ * @code
+ * static void computation_done(struct tevent_req *subreq) {
+ *     struct tevent_req *req = tevent_req_callback_data(subreq, struct tevent_req);
+ *     struct computation_state *state = tevent_req_data(req, struct computation_state);
+ *     .... more things, eventually maybe call tevent_req_done(req);
+ * }
+ * @endcode
+ *
+ * @param[in]  req      The structure to get the callback data from.
+ *
+ * @param[in]  type     The type of the private callback data to get.
+ *
+ * @return              The type casted private data, or NULL if not set.
+ */
+void *tevent_req_callback_data(struct tevent_req *req, #type);
+#else
+void *_tevent_req_callback_data(struct tevent_req *req);
+#define tevent_req_callback_data(_req, _type) \
+       talloc_get_type_abort(_tevent_req_callback_data(_req), _type)
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief Get the private data for a callback from a tevent request structure.
+ *
+ * @param[in]  req      The structure to get the callback data from.
+ *
+ * @return              The private data or NULL if not set.
+ */
+void *tevent_req_callback_data_void(struct tevent_req *req);
+#else
+#define tevent_req_callback_data_void(_req) \
+       _tevent_req_callback_data(_req)
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief Get the private data from a tevent request structure.
+ *
+ * When the tevent_req has been created by tevent_req_create, the
+ * result of tevent_req_data() is the state variable created by
+ * tevent_req_create() as a child of the req.
+ *
+ * @param[in]  req      The structure to get the private data from.
+ *
+ * @param[in]  type    The type of the private data
+ *
+ * @return              The private data or NULL if not set.
+ */
+void *tevent_req_data(struct tevent_req *req, #type);
+#else
+void *_tevent_req_data(struct tevent_req *req);
+#define tevent_req_data(_req, _type) \
+       talloc_get_type_abort(_tevent_req_data(_req), _type)
+#endif
+
+/**
+ * @brief The print function which can be set for a tevent async request.
+ *
+ * @param[in]  req      The tevent async request.
+ *
+ * @param[in]  ctx      A talloc memory context which can be used to allocate
+ *                      memory.
+ *
+ * @return              An allocated string buffer to print.
+ *
+ * Example:
+ * @code
+ *   static char *my_print(struct tevent_req *req, TALLOC_CTX *mem_ctx)
+ *   {
+ *     struct my_data *data = tevent_req_data(req, struct my_data);
+ *     char *result;
+ *
+ *     result = tevent_req_default_print(req, mem_ctx);
+ *     if (result == NULL) {
+ *       return NULL;
+ *     }
+ *
+ *     return talloc_asprintf_append_buffer(result, "foo=%d, bar=%d",
+ *       data->foo, data->bar);
+ *   }
+ * @endcode
+ */
+typedef char *(*tevent_req_print_fn)(struct tevent_req *req, TALLOC_CTX *ctx);
+
+/**
+ * @brief This function sets a print function for the given request.
+ *
+ * This function can be used to setup a print function for the given request.
+ * This will be triggered if the tevent_req_print() function was
+ * called on the given request.
+ *
+ * @param[in]  req      The request to use.
+ *
+ * @param[in]  fn       A pointer to the print function
+ *
+ * @note This function should only be used for debugging.
+ */
+void tevent_req_set_print_fn(struct tevent_req *req, tevent_req_print_fn fn);
+
+/**
+ * @brief The default print function for creating debug messages.
+ *
+ * The function should not be used by users of the async API,
+ * but custom print function can use it and append custom text
+ * to the string.
+ *
+ * @param[in]  req      The request to be printed.
+ *
+ * @param[in]  mem_ctx  The memory context for the result.
+ *
+ * @return              Text representation of request.
+ *
+ */
+char *tevent_req_default_print(struct tevent_req *req, TALLOC_CTX *mem_ctx);
+
+/**
+ * @brief Print a tevent_req structure in debug messages.
+ *
+ * This function should be used by callers of the async API.
+ *
+ * @param[in]  mem_ctx  The memory context for the result.
+ *
+ * @param[in] req       The request to be printed.
+ *
+ * @return              Text representation of request.
+ */
+char *tevent_req_print(TALLOC_CTX *mem_ctx, struct tevent_req *req);
+
+/**
+ * @brief A typedef for a cancel function for a tevent request.
+ *
+ * @param[in]  req      The tevent request calling this function.
+ *
+ * @return              True if the request could be canceled, false if not.
+ */
+typedef bool (*tevent_req_cancel_fn)(struct tevent_req *req);
+
+/**
+ * @brief This function sets a cancel function for the given tevent request.
+ *
+ * This function can be used to setup a cancel function for the given request.
+ * This will be triggered if the tevent_req_cancel() function was
+ * called on the given request.
+ *
+ * @param[in]  req      The request to use.
+ *
+ * @param[in]  fn       A pointer to the cancel function.
+ */
+void tevent_req_set_cancel_fn(struct tevent_req *req, tevent_req_cancel_fn fn);
+
+#ifdef DOXYGEN
+/**
+ * @brief Try to cancel the given tevent request.
+ *
+ * This function can be used to cancel the given request.
+ *
+ * It is only possible to cancel a request when the implementation
+ * has registered a cancel function via the tevent_req_set_cancel_fn().
+ *
+ * @param[in]  req      The request to use.
+ *
+ * @return              This function returns true if the request is cancelable,
+ *                      otherwise false is returned.
+ *
+ * @note Even if the function returns true, the caller needs to wait
+ *       for the function to complete normally.
+ *       Only the _recv() function of the given request indicates
+ *       if the request was really canceled.
+ */
+bool tevent_req_cancel(struct tevent_req *req);
+#else
+bool _tevent_req_cancel(struct tevent_req *req, const char *location);
+#define tevent_req_cancel(req) \
+       _tevent_req_cancel(req, __location__)
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief Create an async tevent request.
+ *
+ * The new async request will be initialized in state TEVENT_REQ_IN_PROGRESS.
+ *
+ * @code
+ * struct tevent_req *req;
+ * struct computation_state *state;
+ * req = tevent_req_create(mem_ctx, &state, struct computation_state);
+ * @endcode
+ *
+ * Tevent_req_create() creates the state variable as a talloc child of
+ * its result. The state variable should be used as the talloc parent
+ * for all temporary variables that are allocated during the async
+ * computation. This way, when the user of the async computation frees
+ * the request, the state as a talloc child will be free'd along with
+ * all the temporary variables hanging off the state.
+ *
+ * @param[in] mem_ctx   The memory context for the result.
+ * @param[in] pstate    Pointer to the private request state.
+ * @param[in] type      The type of the private request state.
+ *
+ * @return              A new async request. NULL on error.
+ */
+struct tevent_req *tevent_req_create(TALLOC_CTX *mem_ctx,
+                                    void **pstate, #type);
+#else
+struct tevent_req *_tevent_req_create(TALLOC_CTX *mem_ctx,
+                                     void *pstate,
+                                     size_t state_size,
+                                     const char *type,
+                                     const char *location);
+
+#define tevent_req_create(_mem_ctx, _pstate, _type) \
+       _tevent_req_create((_mem_ctx), (_pstate), sizeof(_type), \
+                          #_type, __location__)
+#endif
+
+/**
+ * @brief Set a timeout for an async request.
+ *
+ * @param[in]  req      The request to set the timeout for.
+ *
+ * @param[in]  ev       The event context to use for the timer.
+ *
+ * @param[in]  endtime  The endtime of the request.
+ *
+ * @return              True if succeeded, false if not.
+ */
+bool tevent_req_set_endtime(struct tevent_req *req,
+                           struct tevent_context *ev,
+                           struct timeval endtime);
+
+#ifdef DOXYGEN
+/**
+ * @brief Call the notify callback of the given tevent request manually.
+ *
+ * @param[in]  req      The tevent request to call the notify function from.
+ *
+ * @see tevent_req_set_callback()
+ */
+void tevent_req_notify_callback(struct tevent_req *req);
+#else
+void _tevent_req_notify_callback(struct tevent_req *req, const char *location);
+#define tevent_req_notify_callback(req)                \
+       _tevent_req_notify_callback(req, __location__)
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief An async request has successfully finished.
+ *
+ * This function is to be used by implementors of async requests. When a
+ * request is successfully finished, this function calls the user's completion
+ * function.
+ *
+ * @param[in]  req       The finished request.
+ */
+void tevent_req_done(struct tevent_req *req);
+#else
+void _tevent_req_done(struct tevent_req *req,
+                     const char *location);
+#define tevent_req_done(req) \
+       _tevent_req_done(req, __location__)
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief An async request has seen an error.
+ *
+ * This function is to be used by implementors of async requests. When a
+ * request cannot be completed successfully, the implementation should call this
+ * function with the appropriate status code.
+ *
+ * If error is 0 the function returns false and does nothing more.
+ *
+ * @param[in]  req      The request with an error.
+ *
+ * @param[in]  error    The error code.
+ *
+ * @return              True if an error was set on the request, false if error is 0.
+ *
+ * @code
+ * int error = first_function();
+ * if (tevent_req_error(req, error)) {
+ *      return;
+ * }
+ *
+ * error = second_function();
+ * if (tevent_req_error(req, error)) {
+ *      return;
+ * }
+ *
+ * tevent_req_done(req);
+ * return;
+ * @endcode
+ */
+bool tevent_req_error(struct tevent_req *req,
+                     uint64_t error);
+#else
+bool _tevent_req_error(struct tevent_req *req,
+                      uint64_t error,
+                      const char *location);
+#define tevent_req_error(req, error) \
+       _tevent_req_error(req, error, __location__)
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief Helper function for nomem check.
+ *
+ * Convenience helper to easily check alloc failure within a callback
+ * implementing the next step of an async request.
+ *
+ * @param[in]  p        The pointer to be checked.
+ *
+ * @param[in]  req      The request being processed.
+ *
+ * @code
+ * p = talloc(mem_ctx, bla);
+ * if (tevent_req_nomem(p, req)) {
+ *      return;
+ * }
+ * @endcode
+ */
+bool tevent_req_nomem(const void *p,
+                     struct tevent_req *req);
+#else
+bool _tevent_req_nomem(const void *p,
+                      struct tevent_req *req,
+                      const char *location);
+#define tevent_req_nomem(p, req) \
+       _tevent_req_nomem(p, req, __location__)
+#endif
+
+#ifdef DOXYGEN
+/**
+ * @brief Indicate out of memory to a request
+ *
+ * @param[in]  req      The request being processed.
+ */
+void tevent_req_oom(struct tevent_req *req);
+#else
+void _tevent_req_oom(struct tevent_req *req,
+                    const char *location);
+#define tevent_req_oom(req) \
+       _tevent_req_oom(req, __location__)
+#endif
+
+/**
+ * @brief Finish a request before the caller had the chance to set the callback.
+ *
+ * An implementation of an async request might find that it can either finish
+ * the request without waiting for an external event, or it can not even start
+ * the engine. To present the illusion of a callback to the user of the API,
+ * the implementation can call this helper function which triggers an
+ * immediate event. This way the caller can use the same calling
+ * conventions, independent of whether the request was actually deferred.
+ *
+ * @code
+ * struct tevent_req *computation_send(TALLOC_CTX *mem_ctx,
+ *                                     struct tevent_context *ev)
+ * {
+ *     struct tevent_req *req, *subreq;
+ *     struct computation_state *state;
+ *     req = tevent_req_create(mem_ctx, &state, struct computation_state);
+ *     if (req == NULL) {
+ *         return NULL;
+ *     }
+ *     subreq = subcomputation_send(state, ev);
+ *     if (tevent_req_nomem(subreq, req)) {
+ *         return tevent_req_post(req, ev);
+ *     }
+ *     tevent_req_set_callback(subreq, computation_done, req);
+ *     return req;
+ * }
+ * @endcode
+ *
+ * @param[in]  req      The finished request.
+ *
+ * @param[in]  ev       The tevent_context for the immediate event.
+ *
+ * @return              The given request will be returned.
+ */
+struct tevent_req *tevent_req_post(struct tevent_req *req,
+                                  struct tevent_context *ev);
+
+/**
+ * @brief Finish multiple requests within one function
+ *
+ * Normally tevent_req_notify_callback() and all wrappers
+ * (e.g. tevent_req_done() and tevent_req_error())
+ * need to be the last thing an event handler should call.
+ * This is because the callback is likely to destroy the
+ * context of the current function.
+ *
+ * If a function wants to notify more than one caller,
+ * it is dangerous if it just triggers multiple callbacks
+ * in a row. With tevent_req_defer_callback() it is possible
+ * to set an event context that will be used to defer the callback
+ * via an immediate event (similar to tevent_req_post()).
+ *
+ * @code
+ * struct complete_state {
+ *       struct tevent_context *ev;
+ *
+ *       struct tevent_req **reqs;
+ * };
+ *
+ * void complete(struct complete_state *state)
+ * {
+ *       size_t i, c = talloc_array_length(state->reqs);
+ *
+ *       for (i=0; i < c; i++) {
+ *            tevent_req_defer_callback(state->reqs[i], state->ev);
+ *            tevent_req_done(state->reqs[i]);
+ *       }
+ * }
+ * @endcode
+ *
+ * @param[in]  req      The finished request.
+ *
+ * @param[in]  ev       The tevent_context for the immediate event.
+ *
+ * @note This function does not return a value.
+ */
+void tevent_req_defer_callback(struct tevent_req *req,
+                              struct tevent_context *ev);
+
+/**
+ * @brief Check if the given request is still in progress.
+ *
+ * It is typically used by sync wrapper functions.
+ *
+ * @param[in]  req      The request to poll.
+ *
+ * @return              The boolean form of "is in progress".
+ */
+bool tevent_req_is_in_progress(struct tevent_req *req);
+
+/**
+ * @brief Actively poll for the given request to finish.
+ *
+ * This function is typically used by sync wrapper functions.
+ *
+ * @param[in]  req      The request to poll.
+ *
+ * @param[in]  ev       The tevent_context to be used.
+ *
+ * @return              On success true is returned. If a critical error has
+ *                      happened in the tevent loop layer false is returned.
+ *                      This is not the return value of the given request!
+ *
+ * @note This should only be used if the given tevent context was created by the
+ * caller, to avoid event loop nesting.
+ *
+ * @code
+ * req = tstream_writev_queue_send(mem_ctx,
+ *                                 ev_ctx,
+ *                                 tstream,
+ *                                 send_queue,
+ *                                 iov, 2);
+ * ok = tevent_req_poll(req, tctx->ev);
+ * rc = tstream_writev_queue_recv(req, &sys_errno);
+ * TALLOC_FREE(req);
+ * @endcode
+ */
+bool tevent_req_poll(struct tevent_req *req,
+                    struct tevent_context *ev);
+
+/**
+ * @brief Get the tevent request state and the actual error set by
+ * tevent_req_error.
+ *
+ * @code
+ * int computation_recv(struct tevent_req *req, uint64_t *perr)
+ * {
+ *     enum tevent_req_state state;
+ *     uint64_t err;
+ *     if (tevent_req_is_error(req, &state, &err)) {
+ *         *perr = err;
+ *         return -1;
+ *     }
+ *     return 0;
+ * }
+ * @endcode
+ *
+ * @param[in]  req      The tevent request to get the error from.
+ *
+ * @param[out] state    A pointer to store the tevent request error state.
+ *
+ * @param[out] error    A pointer to store the error set by tevent_req_error().
+ *
+ * @return              True if the function could set error and state, false
+ *                      otherwise.
+ *
+ * @see tevent_req_error()
+ */
+bool tevent_req_is_error(struct tevent_req *req,
+                        enum tevent_req_state *state,
+                        uint64_t *error);
+
+/**
+ * @brief Use as the last action of a _recv() function.
+ *
+ * This function destroys the attached private data.
+ *
+ * @param[in]  req      The finished request.
+ */
+void tevent_req_received(struct tevent_req *req);
+
+/**
+ * @brief Create a tevent subrequest at a given time.
+ *
+ * The idea is to always use the same syntax for tevent requests.
+ *
+ * @param[in]  mem_ctx  The talloc memory context to use.
+ *
+ * @param[in]  ev       The event handle to setup the request.
+ *
+ * @param[in]  wakeup_time The time to wakeup and execute the request.
+ *
+ * @return              The new subrequest, NULL on error.
+ *
+ * Example:
+ * @code
+ *   static void my_callback_wakeup_done(struct tevent_req *subreq)
+ *   {
+ *     struct tevent_req *req = tevent_req_callback_data(subreq,
+ *                              struct tevent_req);
+ *     bool ok;
+ *
+ *     ok = tevent_wakeup_recv(subreq);
+ *     TALLOC_FREE(subreq);
+ *     if (!ok) {
+ *         tevent_req_error(req, -1);
+ *         return;
+ *     }
+ *     ...
+ *   }
+ * @endcode
+ *
+ * @code
+ *   subreq = tevent_wakeup_send(mem_ctx, ev, wakeup_time);
+ *   if (tevent_req_nomem(subreq, req)) {
+ *     return false;
+ *   }
+ *   tevent_req_set_callback(subreq, my_callback_wakeup_done, req);
+ * @endcode
+ *
+ * @see tevent_wakeup_recv()
+ */
+struct tevent_req *tevent_wakeup_send(TALLOC_CTX *mem_ctx,
+                                     struct tevent_context *ev,
+                                     struct timeval wakeup_time);
+
+/**
+ * @brief Check if the wakeup has been correctly executed.
+ *
+ * This function needs to be called in the callback function set after calling
+ * tevent_wakeup_send().
+ *
+ * @param[in]  req      The tevent request to check.
+ *
+ * @return              True on success, false otherwise.
+ *
+ * @see tevent_wakeup_send()
+ */
+bool tevent_wakeup_recv(struct tevent_req *req);
+
+/* @} */
+
+/**
+ * @defgroup tevent_helpers The tevent helper functions
+ * @ingroup tevent
+ *
+ * @todo description
+ *
+ * @{
+ */
+
+/**
+ * @brief Compare two timeval values.
+ *
+ * @param[in]  tv1      The first timeval value to compare.
+ *
+ * @param[in]  tv2      The second timeval value to compare.
+ *
+ * @return              0 if they are equal.
+ *                      1 if the first time is greater than the second.
+ *                      -1 if the first time is smaller than the second.
+ */
+int tevent_timeval_compare(const struct timeval *tv1,
+                          const struct timeval *tv2);
+
+/**
+ * @brief Get a zero timeval value.
+ *
+ * @return              A zero timeval value.
+ */
+struct timeval tevent_timeval_zero(void);
+
+/**
+ * @brief Get a timeval value for the current time.
+ *
+ * @return              A timeval value with the current time.
+ */
+struct timeval tevent_timeval_current(void);
+
+/**
+ * @brief Get a timeval structure with the given values.
+ *
+ * @param[in]  secs     The seconds to set.
+ *
+ * @param[in]  usecs    The microseconds to set.
+ *
+ * @return              A timeval structure with the given values.
+ */
+struct timeval tevent_timeval_set(uint32_t secs, uint32_t usecs);
+
+/**
+ * @brief Get the difference between two timeval values.
+ *
+ * @param[in]  tv1      The first timeval.
+ *
+ * @param[in]  tv2      The second timeval.
+ *
+ * @return              A timeval structure with the difference between the
+ *                      first and the second value.
+ */
+struct timeval tevent_timeval_until(const struct timeval *tv1,
+                                   const struct timeval *tv2);
+
+/**
+ * @brief Check if a given timeval structure is zero.
+ *
+ * @param[in]  tv       The timeval to check if it is zero.
+ *
+ * @return              True if it is zero, false otherwise.
+ */
+bool tevent_timeval_is_zero(const struct timeval *tv);
+
+/**
+ * @brief Add the given amount of time to a timeval structure.
+ *
+ * @param[in]  tv        The timeval structure to add the time.
+ *
+ * @param[in]  secs      The seconds to add to the timeval.
+ *
+ * @param[in]  usecs     The microseconds to add to the timeval.
+ *
+ * @return               The timeval structure with the new time.
+ */
+struct timeval tevent_timeval_add(const struct timeval *tv, uint32_t secs,
+                                 uint32_t usecs);
+
+/**
+ * @brief Get a timeval in the future with a specified offset from now.
+ *
+ * @param[in]  secs     The seconds of the offset from now.
+ *
+ * @param[in]  usecs    The microseconds of the offset from now.
+ *
+ * @return              A timeval with the given offset in the future.
+ */
+struct timeval tevent_timeval_current_ofs(uint32_t secs, uint32_t usecs);
+
+/* @} */
+
+
+/**
+ * @defgroup tevent_queue The tevent queue functions
+ * @ingroup tevent
+ *
+ * A tevent_queue is used to queue up async requests that must be
+ * serialized. For example writing buffers into a socket must be
+ * serialized. Writing a large lump of data into a socket can require
+ * multiple write(2) or send(2) system calls. If more than one async
+ * request is outstanding to write large buffers into a socket, every
+ * request must individually be completed before the next one begins,
+ * even if multiple syscalls are required.
+ *
+ * Take a look at @ref tevent_queue_tutorial for more details.
+ * @{
+ */
+
+struct tevent_queue;
+struct tevent_queue_entry;
+
+#ifdef DOXYGEN
+/**
+ * @brief Create and start a tevent queue.
+ *
+ * @param[in]  mem_ctx  The talloc memory context to allocate the queue.
+ *
+ * @param[in]  name     The name to use to identify the queue.
+ *
+ * @return              An allocated tevent queue on success, NULL on error.
+ *
+ * @see tevent_queue_start()
+ * @see tevent_queue_stop()
+ */
+struct tevent_queue *tevent_queue_create(TALLOC_CTX *mem_ctx,
+                                        const char *name);
+#else
+struct tevent_queue *_tevent_queue_create(TALLOC_CTX *mem_ctx,
+                                         const char *name,
+                                         const char *location);
+
+#define tevent_queue_create(_mem_ctx, _name) \
+       _tevent_queue_create((_mem_ctx), (_name), __location__)
+#endif
+
+/**
+ * @brief A callback trigger function run by the queue.
+ *
+ * @param[in]  req      The tevent request the trigger function is executed on.
+ *
+ * @param[in]  private_data The private data pointer specified by
+ *                          tevent_queue_add().
+ *
+ * @see tevent_queue_add()
+ * @see tevent_queue_add_entry()
+ * @see tevent_queue_add_optimize_empty()
+ */
+typedef void (*tevent_queue_trigger_fn_t)(struct tevent_req *req,
+                                         void *private_data);
+
+/**
+ * @brief Add a tevent request to the queue.
+ *
+ * @param[in]  queue    The queue to add the request.
+ *
+ * @param[in]  ev       The event handle to use for the request.
+ *
+ * @param[in]  req      The tevent request to add to the queue.
+ *
+ * @param[in]  trigger  The function triggered by the queue when the request
+ *                      is called. Since tevent 0.9.14 it's possible to
+ *                      pass NULL, in order to just add a "blocker" to the
+ *                      queue.
+ *
+ * @param[in]  private_data The private data passed to the trigger function.
+ *
+ * @return              True if the request has been successfully added, false
+ *                      otherwise.
+ */
+bool tevent_queue_add(struct tevent_queue *queue,
+                     struct tevent_context *ev,
+                     struct tevent_req *req,
+                     tevent_queue_trigger_fn_t trigger,
+                     void *private_data);
+
+/**
+ * @brief Add a tevent request to the queue.
+ *
+ * The request can be removed from the queue by calling talloc_free()
+ * (or a similar function) on the returned queue entry. This
+ * is the only difference to tevent_queue_add().
+ *
+ * @param[in]  queue    The queue to add the request.
+ *
+ * @param[in]  ev       The event handle to use for the request.
+ *
+ * @param[in]  req      The tevent request to add to the queue.
+ *
+ * @param[in]  trigger  The function triggered by the queue when the request
+ *                      is called. Since tevent 0.9.14 it's possible to
+ *                      pass NULL, in order to just add a "blocker" to the
+ *                      queue.
+ *
+ * @param[in]  private_data The private data passed to the trigger function.
+ *
+ * @return              a pointer to the tevent_queue_entry if the request
+ *                      has been successfully added, NULL otherwise.
+ *
+ * @see tevent_queue_add()
+ * @see tevent_queue_add_optimize_empty()
+ */
+struct tevent_queue_entry *tevent_queue_add_entry(
+                                       struct tevent_queue *queue,
+                                       struct tevent_context *ev,
+                                       struct tevent_req *req,
+                                       tevent_queue_trigger_fn_t trigger,
+                                       void *private_data);
+
+/**
+ * @brief Add a tevent request to the queue using a possible optimization.
+ *
+ * This tries to optimize for the empty queue case and may call
+ * the trigger function directly. This is the only difference compared
+ * to tevent_queue_add_entry().
+ *
+ * The caller needs to be prepared that the trigger function has
+ * already called tevent_req_notify_callback(), tevent_req_error(),
+ * tevent_req_done() or a similar function.
+ *
+ * The request can be removed from the queue by calling talloc_free()
+ * (or a similar function) on the returned queue entry.
+ *
+ * @param[in]  queue    The queue to add the request.
+ *
+ * @param[in]  ev       The event handle to use for the request.
+ *
+ * @param[in]  req      The tevent request to add to the queue.
+ *
+ * @param[in]  trigger  The function triggered by the queue when the request
+ *                      is called. Since tevent 0.9.14 it's possible to
+ *                      pass NULL, in order to just add a "blocker" to the
+ *                      queue.
+ *
+ * @param[in]  private_data The private data passed to the trigger function.
+ *
+ * @return              a pointer to the tevent_queue_entry if the request
+ *                      has been successfully added, NULL otherwise.
+ *
+ * @see tevent_queue_add()
+ * @see tevent_queue_add_entry()
+ */
+struct tevent_queue_entry *tevent_queue_add_optimize_empty(
+                                       struct tevent_queue *queue,
+                                       struct tevent_context *ev,
+                                       struct tevent_req *req,
+                                       tevent_queue_trigger_fn_t trigger,
+                                       void *private_data);
+
+/**
+ * @brief Start a tevent queue.
+ *
+ * The queue is started by default.
+ *
+ * @param[in]  queue    The queue to start.
+ */
+void tevent_queue_start(struct tevent_queue *queue);
+
+/**
+ * @brief Stop a tevent queue.
+ *
+ * The queue is started by default.
+ *
+ * @param[in]  queue    The queue to stop.
+ */
+void tevent_queue_stop(struct tevent_queue *queue);
+
+/**
+ * @brief Get the length of the queue.
+ *
+ * @param[in]  queue    The queue to get the length from.
+ *
+ * @return              The number of elements.
+ */
+size_t tevent_queue_length(struct tevent_queue *queue);
+
+/**
+ * @brief Is the tevent queue running.
+ *
+ * The queue is started by default.
+ *
+ * @param[in]  queue    The queue.
+ *
+ * @return              Whether the queue is running or not.
+ */
+bool tevent_queue_running(struct tevent_queue *queue);
+
+typedef int (*tevent_nesting_hook)(struct tevent_context *ev,
+                                  void *private_data,
+                                  uint32_t level,
+                                  bool begin,
+                                  void *stack_ptr,
+                                  const char *location);
+#ifdef TEVENT_DEPRECATED
+/*
+ * _DEPRECATED_ marks a declaration as deprecated on compilers that
+ * support it (gcc 3.1 and later) and expands to nothing elsewhere.
+ * Major and minor version are compared as a pair so that releases
+ * such as 4.0 or 5.0 are not excluded by their minor number of 0.
+ */
+#ifndef _DEPRECATED_
+#if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))
+#define _DEPRECATED_ __attribute__ ((deprecated))
+#else
+#define _DEPRECATED_
+#endif
+#endif
+void tevent_loop_allow_nesting(struct tevent_context *ev) _DEPRECATED_;
+void tevent_loop_set_nesting_hook(struct tevent_context *ev,
+                                 tevent_nesting_hook hook,
+                                 void *private_data) _DEPRECATED_;
+int _tevent_loop_until(struct tevent_context *ev,
+                      bool (*finished)(void *private_data),
+                      void *private_data,
+                      const char *location) _DEPRECATED_;
+#define tevent_loop_until(ev, finished, private_data) \
+       _tevent_loop_until(ev, finished, private_data, __location__)
+#endif
+
+int tevent_re_initialise(struct tevent_context *ev);
+
+/* @} */
+
+/**
+ * @defgroup tevent_ops The tevent operation functions
+ * @ingroup tevent
+ *
+ * The following structure and registration functions are exclusively
+ * needed for people writing and plugging in a different event engine.
+ * There is nothing useful for normal tevent users in here.
+ * @{
+ */
+
+/*
+ * Table of entry points implemented by an event backend. A backend
+ * fills in one of these and hands it, together with a name, to
+ * tevent_register_backend().
+ *
+ * NOTE(review): the handler_name and location string arguments look
+ * like debugging aids supplied by the generic layer -- confirm against
+ * the tevent_add_*() wrappers.
+ */
+struct tevent_ops {
+       /* context init: backend specific setup of a new tevent_context */
+       int (*context_init)(struct tevent_context *ev);
+
+       /* fd_event functions: create an fd event and access its close
+        * function and flags */
+       struct tevent_fd *(*add_fd)(struct tevent_context *ev,
+                                   TALLOC_CTX *mem_ctx,
+                                   int fd, uint16_t flags,
+                                   tevent_fd_handler_t handler,
+                                   void *private_data,
+                                   const char *handler_name,
+                                   const char *location);
+       void (*set_fd_close_fn)(struct tevent_fd *fde,
+                               tevent_fd_close_fn_t close_fn);
+       uint16_t (*get_fd_flags)(struct tevent_fd *fde);
+       void (*set_fd_flags)(struct tevent_fd *fde, uint16_t flags);
+
+       /* timed_event functions: next_event is the time the handler is due */
+       struct tevent_timer *(*add_timer)(struct tevent_context *ev,
+                                         TALLOC_CTX *mem_ctx,
+                                         struct timeval next_event,
+                                         tevent_timer_handler_t handler,
+                                         void *private_data,
+                                         const char *handler_name,
+                                         const char *location);
+
+       /* immediate event functions: schedule an already allocated
+        * tevent_immediate on the given context */
+       void (*schedule_immediate)(struct tevent_immediate *im,
+                                  struct tevent_context *ev,
+                                  tevent_immediate_handler_t handler,
+                                  void *private_data,
+                                  const char *handler_name,
+                                  const char *location);
+
+       /* signal functions: signum/sa_flags select the signal to watch */
+       struct tevent_signal *(*add_signal)(struct tevent_context *ev,
+                                           TALLOC_CTX *mem_ctx,
+                                           int signum, int sa_flags,
+                                           tevent_signal_handler_t handler,
+                                           void *private_data,
+                                           const char *handler_name,
+                                           const char *location);
+
+       /* loop functions: run the event loop of this backend */
+       int (*loop_once)(struct tevent_context *ev, const char *location);
+       int (*loop_wait)(struct tevent_context *ev, const char *location);
+};
+
+bool tevent_register_backend(const char *name, const struct tevent_ops *ops);
+
+/* @} */
+
+/**
+ * @defgroup tevent_compat The tevent compatibility functions
+ * @ingroup tevent
+ *
+ * The following definitions are useful only for compatibility with the
+ * implementation originally developed within the samba4 code and will
+ * soon be removed. Please NEVER use them in new code.
+ *
+ * @todo Ignore it?
+ *
+ * @{
+ */
+
+#ifdef TEVENT_COMPAT_DEFINES
+
+#define event_context  tevent_context
+#define event_ops      tevent_ops
+#define fd_event       tevent_fd
+#define timed_event    tevent_timer
+#define signal_event   tevent_signal
+
+#define event_fd_handler_t     tevent_fd_handler_t
+#define event_timed_handler_t  tevent_timer_handler_t
+#define event_signal_handler_t tevent_signal_handler_t
+
+#define event_context_init(mem_ctx) \
+       tevent_context_init(mem_ctx)
+
+#define event_context_init_byname(mem_ctx, name) \
+       tevent_context_init_byname(mem_ctx, name)
+
+#define event_backend_list(mem_ctx) \
+       tevent_backend_list(mem_ctx)
+
+#define event_set_default_backend(backend) \
+       tevent_set_default_backend(backend)
+
+#define event_add_fd(ev, mem_ctx, fd, flags, handler, private_data) \
+       tevent_add_fd(ev, mem_ctx, fd, flags, handler, private_data)
+
+#define event_add_timed(ev, mem_ctx, next_event, handler, private_data) \
+       tevent_add_timer(ev, mem_ctx, next_event, handler, private_data)
+
+#define event_add_signal(ev, mem_ctx, signum, sa_flags, handler, private_data) \
+       tevent_add_signal(ev, mem_ctx, signum, sa_flags, handler, private_data)
+
+#define event_loop_once(ev) \
+       tevent_loop_once(ev)
+
+#define event_loop_wait(ev) \
+       tevent_loop_wait(ev)
+
+#define event_get_fd_flags(fde) \
+       tevent_fd_get_flags(fde)
+
+#define event_set_fd_flags(fde, flags) \
+       tevent_fd_set_flags(fde, flags)
+
+#define EVENT_FD_READ          TEVENT_FD_READ
+#define EVENT_FD_WRITE         TEVENT_FD_WRITE
+
+#define EVENT_FD_WRITEABLE(fde) \
+       TEVENT_FD_WRITEABLE(fde)
+
+#define EVENT_FD_READABLE(fde) \
+       TEVENT_FD_READABLE(fde)
+
+#define EVENT_FD_NOT_WRITEABLE(fde) \
+       TEVENT_FD_NOT_WRITEABLE(fde)
+
+#define EVENT_FD_NOT_READABLE(fde) \
+       TEVENT_FD_NOT_READABLE(fde)
+
+#define ev_debug_level         tevent_debug_level
+
+#define EV_DEBUG_FATAL         TEVENT_DEBUG_FATAL
+#define EV_DEBUG_ERROR         TEVENT_DEBUG_ERROR
+#define EV_DEBUG_WARNING       TEVENT_DEBUG_WARNING
+#define EV_DEBUG_TRACE         TEVENT_DEBUG_TRACE
+
+#define ev_set_debug(ev, debug, context) \
+       tevent_set_debug(ev, debug, context)
+
+/* compat alias for tevent_set_debug_stderr(); forwards its own argument */
+#define ev_set_debug_stderr(_ev) tevent_set_debug_stderr(_ev)
+
+#endif /* TEVENT_COMPAT_DEFINES */
+
+/* @} */
+
+#endif /* __TEVENT_H__ */
diff --git a/ctdb/lib/tevent/tevent.pc.in b/ctdb/lib/tevent/tevent.pc.in
new file mode 100644 (file)
index 0000000..1091ff0
--- /dev/null
@@ -0,0 +1,12 @@
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: tevent
+Description: An event system library
+Version: @PACKAGE_VERSION@
+Requires: talloc
+Libs: -L${libdir} -ltevent
+Cflags: @LIB_RPATH@ -I${includedir}
+URL: http://samba.org/
diff --git a/ctdb/lib/tevent/tevent.py b/ctdb/lib/tevent/tevent.py
new file mode 100644 (file)
index 0000000..c296544
--- /dev/null
@@ -0,0 +1,29 @@
+#!/usr/bin/python
+#
+#   Python integration for tevent
+#
+#   Copyright (C) Jelmer Vernooij 2011
+#
+#     ** NOTE! The following LGPL license applies to the tevent
+#     ** library. This does NOT imply that all of Samba is released
+#     ** under the LGPL
+#
+#   This library is free software; you can redistribute it and/or
+#   modify it under the terms of the GNU Lesser General Public
+#   License as published by the Free Software Foundation; either
+#   version 3 of the License, or (at your option) any later version.
+#
+#   This library is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#   Lesser General Public License for more details.
+#
+#   You should have received a copy of the GNU Lesser General Public
+#   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+
+from _tevent import (
+    __version__,
+    backend_list,
+    Context,
+    Signal,
+    )
diff --git a/ctdb/lib/tevent/tevent_debug.c b/ctdb/lib/tevent/tevent_debug.c
new file mode 100644 (file)
index 0000000..31da7b9
--- /dev/null
@@ -0,0 +1,119 @@
+/*
+   Unix SMB/CIFS implementation.
+
+   Copyright (C) Andrew Tridgell 2005
+   Copyright (C) Jelmer Vernooij 2005
+
+     ** NOTE! The following LGPL license applies to the tevent
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "tevent.h"
+#include "tevent_internal.h"
+
+/********************************************************************
+ * Debug wrapper functions, modeled (with lots of code copied as is)
+ * after the ev debug wrapper functions
+ ********************************************************************/
+
+/*
+  this allows the user to choose their own debug function
+
+  ev      - event context whose debug hooks are replaced
+  debug   - vprintf-style callback receiving (context, level, fmt, ap);
+            NULL is accepted, tevent_debug() then drops all messages
+  context - opaque pointer stored alongside the callback and passed back
+            to it as first argument
+
+  always returns 0; ev itself is dereferenced and must not be NULL
+*/
+int tevent_set_debug(struct tevent_context *ev,
+                    void (*debug)(void *context,
+                                  enum tevent_debug_level level,
+                                  const char *fmt,
+                                  va_list ap) PRINTF_ATTRIBUTE(3,0),
+                    void *context)
+{
+       ev->debug_ops.debug = debug;
+       ev->debug_ops.context = context;
+       return 0;
+}
+
+/*
+  debug function for tevent_set_debug_stderr
+
+  writes the formatted message to stderr when level compares
+  <= TEVENT_DEBUG_WARNING and silently drops it otherwise;
+  private_data is not used
+*/
+static void tevent_debug_stderr(void *private_data,
+                               enum tevent_debug_level level,
+                               const char *fmt,
+                               va_list ap) PRINTF_ATTRIBUTE(3,0);
+static void tevent_debug_stderr(void *private_data,
+                               enum tevent_debug_level level,
+                               const char *fmt, va_list ap)
+{
+       if (level <= TEVENT_DEBUG_WARNING) {
+               vfprintf(stderr, fmt, ap);
+       }
+}
+
+/*
+  convenience function to setup debug messages on stderr
+  messages whose level compares <= TEVENT_DEBUG_WARNING are printed,
+  i.e. TEVENT_DEBUG_WARNING and anything more severe (assumes the enum
+  is ordered most-severe-first -- TODO confirm against tevent.h)
+
+  the event context itself is registered as the callback context;
+  returns whatever tevent_set_debug() returns
+*/
+int tevent_set_debug_stderr(struct tevent_context *ev)
+{
+       return tevent_set_debug(ev, tevent_debug_stderr, ev);
+}
+
+/*
+ * Emit one debug message through the callback installed on the context.
+ *
+ * Nothing is emitted unless the application registered a callback with
+ * tevent_set_debug(): a library should stay silent by default and leave
+ * the choice of destination to its user. A NULL context is tolerated
+ * and treated the same way.
+ */
+void tevent_debug(struct tevent_context *ev, enum tevent_debug_level level,
+                 const char *fmt, ...)
+{
+       va_list args;
+
+       if (ev == NULL || ev->debug_ops.debug == NULL) {
+               /* no context or no sink registered: drop the message */
+               return;
+       }
+
+       va_start(args, fmt);
+       ev->debug_ops.debug(ev->debug_ops.context, level, fmt, args);
+       va_end(args);
+}
+
+/*
+  install the trace callback of an event context
+
+  cb may be NULL, in which case tevent_trace_point_callback() does nothing;
+  private_data is stored and handed back to cb on every trace point
+*/
+void tevent_set_trace_callback(struct tevent_context *ev,
+                              tevent_trace_callback_t cb,
+                              void *private_data)
+{
+       ev->tracing.callback = cb;
+       ev->tracing.private_data = private_data;
+}
+
+/*
+  fetch the trace callback currently installed on an event context
+
+  cb           - receives the callback pointer
+  private_data - despite its void* type this must be the address of a
+                 void* variable; the stored private data is written
+                 through it
+
+  both output pointers are dereferenced unconditionally and must not be NULL
+*/
+void tevent_get_trace_callback(struct tevent_context *ev,
+                              tevent_trace_callback_t *cb,
+                              void *private_data)
+{
+       *cb = ev->tracing.callback;
+       *(void**)private_data = ev->tracing.private_data;
+}
+
+/*
+ * Report trace point tp to the callback registered with
+ * tevent_set_trace_callback(); does nothing when none is registered.
+ */
+void tevent_trace_point_callback(struct tevent_context *ev,
+                                enum tevent_trace_point tp)
+{
+       if (ev->tracing.callback == NULL) {
+               return;
+       }
+
+       ev->tracing.callback(tp, ev->tracing.private_data);
+}
diff --git a/ctdb/lib/tevent/tevent_epoll.c b/ctdb/lib/tevent/tevent_epoll.c
new file mode 100644 (file)
index 0000000..599c190
--- /dev/null
@@ -0,0 +1,950 @@
+/* 
+   Unix SMB/CIFS implementation.
+
+   main select loop and event handling - epoll implementation
+
+   Copyright (C) Andrew Tridgell       2003-2005
+   Copyright (C) Stefan Metzmacher     2005-2013
+   Copyright (C) Jeremy Allison                2013
+
+     ** NOTE! The following LGPL license applies to the tevent
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/select.h"
+#include "tevent.h"
+#include "tevent_internal.h"
+#include "tevent_util.h"
+
+/* private state of the epoll backend, kept in tevent_context::additional_data */
+struct epoll_event_context {
+       /* a pointer back to the generic event_context */
+       struct tevent_context *ev;
+
+       /* when using epoll this is the handle from epoll_create */
+       int epoll_fd;
+
+       /* pid that created epoll_fd; compared with getpid() to notice a fork */
+       pid_t pid;
+
+       /* when set, epoll_panic() uses replay=true regardless of its argument */
+       bool panic_force_replay;
+       /* if non-NULL, epoll_panic() stores true here so a caller can see it ran */
+       bool *panic_state;
+       /* optional hook called by epoll_panic(); without one the process aborts */
+       bool (*panic_fallback)(struct tevent_context *ev, bool replay);
+};
+
+/*
+ * epoll private bits kept in tevent_fd::additional_flags.
+ * NOTE(review): the precise meaning of each bit is given by the code
+ * that sets and tests it further down in this file -- confirm there.
+ */
+#define EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT     (1<<0)
+#define EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR  (1<<1)
+#define EPOLL_ADDITIONAL_FD_FLAG_GOT_ERROR     (1<<2)
+#define EPOLL_ADDITIONAL_FD_FLAG_HAS_MPX       (1<<3)
+
+#ifdef TEST_PANIC_FALLBACK
+
+/*
+ * Fault injection used to exercise the panic fallback path in tests.
+ *
+ * Returns true, with errno set to EINVAL, when the wrapped epoll call
+ * should pretend to fail. That can only happen once a panic_fallback is
+ * installed, and then for about every second call.
+ */
+static bool epoll_test_inject_failure(struct epoll_event_context *epoll_ev)
+{
+       if (epoll_ev->panic_fallback == NULL) {
+               return false;
+       }
+
+       /* 50% of the time, fail... */
+       if ((random() % 2) != 0) {
+               return false;
+       }
+
+       errno = EINVAL;
+       return true;
+}
+
+/* epoll_create() with injected failures, see epoll_test_inject_failure() */
+static int epoll_create_panic_fallback(struct epoll_event_context *epoll_ev,
+                                      int size)
+{
+       if (epoll_test_inject_failure(epoll_ev)) {
+               return -1;
+       }
+
+       return epoll_create(size);
+}
+
+/* epoll_ctl() with injected failures, see epoll_test_inject_failure() */
+static int epoll_ctl_panic_fallback(struct epoll_event_context *epoll_ev,
+                                   int epfd, int op, int fd,
+                                   struct epoll_event *event)
+{
+       if (epoll_test_inject_failure(epoll_ev)) {
+               return -1;
+       }
+
+       return epoll_ctl(epfd, op, fd, event);
+}
+
+/* epoll_wait() with injected failures, see epoll_test_inject_failure() */
+static int epoll_wait_panic_fallback(struct epoll_event_context *epoll_ev,
+                                    int epfd,
+                                    struct epoll_event *events,
+                                    int maxevents,
+                                    int timeout)
+{
+       if (epoll_test_inject_failure(epoll_ev)) {
+               return -1;
+       }
+
+       return epoll_wait(epfd, events, maxevents, timeout);
+}
+
+/*
+ * From here on every epoll_*() call in this file is routed through the
+ * wrappers above; each call site needs a variable named epoll_ev in scope.
+ */
+#define epoll_create(_size) \
+       epoll_create_panic_fallback(epoll_ev, _size)
+#define epoll_ctl(_epfd, _op, _fd, _event) \
+       epoll_ctl_panic_fallback(epoll_ev,_epfd, _op, _fd, _event)
+#define epoll_wait(_epfd, _events, _maxevents, _timeout) \
+       epoll_wait_panic_fallback(epoll_ev, _epfd, _events, _maxevents, _timeout)
+#endif
+
+/*
+  called to set the panic fallback function.
+
+  Returns false when the context carries no backend data or that data
+  cannot be obtained as a struct epoll_event_context; true once the
+  fallback has been stored.
+*/
+_PRIVATE_ bool tevent_epoll_set_panic_fallback(struct tevent_context *ev,
+                               bool (*panic_fallback)(struct tevent_context *ev,
+                                                      bool replay))
+{
+       struct epoll_event_context *epoll_ev = NULL;
+
+       if (ev->additional_data != NULL) {
+               epoll_ev = talloc_get_type(ev->additional_data,
+                                          struct epoll_event_context);
+       }
+       if (epoll_ev == NULL) {
+               return false;
+       }
+
+       epoll_ev->panic_fallback = panic_fallback;
+       return true;
+}
+
+/*
+  called when an epoll call fails
+
+  Frees the epoll backend state of the event context and then either
+  hands control to the registered panic_fallback or aborts the process.
+
+  epoll_ev - backend context; it is freed in here (via
+             ev->additional_data) and must not be used by the caller
+             afterwards
+  reason   - short description of the failed call, used in the log line
+  replay   - forwarded to panic_fallback; forced to true when
+             epoll_ev->panic_force_replay is set
+*/
+static void epoll_panic(struct epoll_event_context *epoll_ev,
+                       const char *reason, bool replay)
+{
+       struct tevent_context *ev = epoll_ev->ev;
+       bool (*panic_fallback)(struct tevent_context *ev, bool replay);
+       /*
+        * Capture errno right away: freeing the context below runs
+        * epoll_ctx_destructor(), whose close() may overwrite it before
+        * the failure is logged.
+        */
+       int saved_errno = errno;
+
+       /* everything needed from epoll_ev is copied out before the free */
+       panic_fallback = epoll_ev->panic_fallback;
+
+       if (epoll_ev->panic_state != NULL) {
+               *epoll_ev->panic_state = true;
+       }
+
+       if (epoll_ev->panic_force_replay) {
+               replay = true;
+       }
+
+       TALLOC_FREE(ev->additional_data);
+
+       if (panic_fallback == NULL) {
+               tevent_debug(ev, TEVENT_DEBUG_FATAL,
+                       "%s (%s) replay[%u] - calling abort()\n",
+                       reason, strerror(saved_errno), (unsigned)replay);
+               abort();
+       }
+
+       tevent_debug(ev, TEVENT_DEBUG_ERROR,
+                    "%s (%s) replay[%u] - calling panic_fallback\n",
+                    reason, strerror(saved_errno), (unsigned)replay);
+
+       if (!panic_fallback(ev, replay)) {
+               /* Fallback failed. */
+               tevent_debug(ev, TEVENT_DEBUG_FATAL,
+                       "%s (%s) replay[%u] - calling abort()\n",
+                       reason, strerror(saved_errno), (unsigned)replay);
+               abort();
+       }
+}
+
+/*
+  map from TEVENT_FD_* to EPOLLIN/EPOLLOUT
+
+  Any requested direction additionally subscribes to EPOLLERR and
+  EPOLLHUP; with neither TEVENT_FD_READ nor TEVENT_FD_WRITE set the
+  result is 0.
+*/
+static uint32_t epoll_map_flags(uint16_t flags)
+{
+       uint32_t events = 0;
+
+       if (flags & TEVENT_FD_READ) {
+               events |= EPOLLIN;
+       }
+       if (flags & TEVENT_FD_WRITE) {
+               events |= EPOLLOUT;
+       }
+       if (events != 0) {
+               events |= (EPOLLERR | EPOLLHUP);
+       }
+
+       return events;
+}
+
+/*
+ free the epoll fd
+
+ Installed as talloc destructor of the epoll context by epoll_init_ctx().
+ Always returns 0.
+*/
+static int epoll_ctx_destructor(struct epoll_event_context *epoll_ev)
+{
+       /*
+        * epoll_fd is already -1 when epoll_check_reopen() failed to
+        * recreate the handle before the context got freed. Skip the
+        * pointless close(-1), which would only clobber errno.
+        */
+       if (epoll_ev->epoll_fd != -1) {
+               close(epoll_ev->epoll_fd);
+               epoll_ev->epoll_fd = -1;
+       }
+       return 0;
+}
+
+/*
+ init the epoll fd
+
+ Creates the epoll handle, tries to mark it close-on-exec (a failure
+ there is only logged), records the current pid and installs the
+ destructor that closes the handle again.
+
+ Returns 0 on success, -1 if the epoll handle could not be created.
+*/
+static int epoll_init_ctx(struct epoll_event_context *epoll_ev)
+{
+       int fd = epoll_create(64);
+
+       epoll_ev->epoll_fd = fd;
+       if (fd == -1) {
+               tevent_debug(epoll_ev->ev, TEVENT_DEBUG_FATAL,
+                            "Failed to create epoll handle.\n");
+               return -1;
+       }
+
+       if (!ev_set_close_on_exec(fd)) {
+               tevent_debug(epoll_ev->ev, TEVENT_DEBUG_WARNING,
+                            "Failed to set close-on-exec, file descriptor may be leaked to children.\n");
+       }
+
+       epoll_ev->pid = getpid();
+       talloc_set_destructor(epoll_ev, epoll_ctx_destructor);
+
+       return 0;
+}
+
+static void epoll_update_event(struct epoll_event_context *epoll_ev, struct tevent_fd *fde);
+
+/*
+  reopen the epoll handle when our pid changes
+  see http://junkcode.samba.org/ftp/unpacked/junkcode/epoll_fork.c for a
+  demonstration of why this is needed
+
+  Does nothing while getpid() still matches the recorded pid. Otherwise
+  the old handle is closed, a new one is created and every fd event of
+  the tevent context is handed to epoll_update_event() again after its
+  HAS_EVENT flag has been cleared.
+
+  While re-adding, panic_state temporarily points at a local flag so that
+  an epoll_panic() triggered from epoll_update_event() is noticed. In that
+  case the panic_state that was set on entry (if any) is set to true and
+  the function returns at once without touching epoll_ev again. On the
+  normal path panic_state is left as NULL.
+ */
+static void epoll_check_reopen(struct epoll_event_context *epoll_ev)
+{
+       struct tevent_fd *fde;
+       bool *caller_panic_state = epoll_ev->panic_state;
+       bool panic_triggered = false;
+
+       if (epoll_ev->pid == getpid()) {
+               return;
+       }
+
+       close(epoll_ev->epoll_fd);
+       epoll_ev->epoll_fd = epoll_create(64);
+       if (epoll_ev->epoll_fd == -1) {
+               epoll_panic(epoll_ev, "epoll_create() failed", false);
+               return;
+       }
+
+       if (!ev_set_close_on_exec(epoll_ev->epoll_fd)) {
+               tevent_debug(epoll_ev->ev, TEVENT_DEBUG_WARNING,
+                            "Failed to set close-on-exec, file descriptor may be leaked to children.\n");
+       }
+
+       epoll_ev->pid = getpid();
+       epoll_ev->panic_state = &panic_triggered;
+       for (fde=epoll_ev->ev->fd_events;fde;fde=fde->next) {
+               fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
+               epoll_update_event(epoll_ev, fde);
+
+               if (panic_triggered) {
+                       if (caller_panic_state != NULL) {
+                               *caller_panic_state = true;
+                       }
+                       return;
+               }
+       }
+       epoll_ev->panic_state = NULL;
+}
+
+/*
+ epoll cannot add the same file descriptor twice, once
+ with read, once with write which is allowed by the
+ tevent backend. Multiplex the existing fde, flag it
+ as such so we can search for the correct fde on
+ event triggering.
+*/
+
+static int epoll_add_multiplex_fd(struct epoll_event_context *epoll_ev,
+                                 struct tevent_fd *add_fde)
+{
+       struct epoll_event event;
+       struct tevent_fd *mpx_fde;
+       int ret;
+
+       /* Find the existing fde that caused the EEXIST error. */
+       for (mpx_fde = epoll_ev->ev->fd_events; mpx_fde; mpx_fde = mpx_fde->next) {
+               if (mpx_fde->fd != add_fde->fd) {
+                       continue;
+               }
+
+               if (mpx_fde == add_fde) {
+                       continue;
+               }
+
+               break;
+       }
+       if (mpx_fde == NULL) {
+               tevent_debug(epoll_ev->ev, TEVENT_DEBUG_FATAL,
+                            "can't find multiplex fde for fd[%d]\n",
+                            add_fde->fd);
+               return -1;
+       }
+
+       if (mpx_fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_MPX) {
+               /* Logic error. Can't have more than 2 multiplexed fde's. */
+               tevent_debug(epoll_ev->ev, TEVENT_DEBUG_FATAL,
+                            "multiplex fde for fd[%d] is already multiplexed\n",
+                            mpx_fde->fd);
+               return -1;
+       }
+
+       /*
+        * The multiplex fde must have the same fd, and also
+        * already have an epoll event attached.
+        */
+       if (!(mpx_fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT)) {
+               /* Logic error. The existing fde has no epoll event attached. */
+               tevent_debug(epoll_ev->ev, TEVENT_DEBUG_FATAL,
+                            "multiplex fde for fd[%d] has no event\n",
+                            mpx_fde->fd);
+               return -1;
+       }
+
+       /* Modify the mpx_fde to add in the new flags. */
+       ZERO_STRUCT(event);
+       event.events = epoll_map_flags(mpx_fde->flags);
+       event.events |= epoll_map_flags(add_fde->flags);
+       event.data.ptr = mpx_fde;
+       ret = epoll_ctl(epoll_ev->epoll_fd, EPOLL_CTL_MOD, mpx_fde->fd, &event);
+       if (ret != 0 && errno == EBADF) {
+               tevent_debug(epoll_ev->ev, TEVENT_DEBUG_ERROR,
+                            "EPOLL_CTL_MOD EBADF for "
+                            "add_fde[%p] mpx_fde[%p] fd[%d] - disabling\n",
+                            add_fde, mpx_fde, add_fde->fd);
+               DLIST_REMOVE(epoll_ev->ev->fd_events, mpx_fde);
+               mpx_fde->event_ctx = NULL;
+               DLIST_REMOVE(epoll_ev->ev->fd_events, add_fde);
+               add_fde->event_ctx = NULL;
+               return 0;
+       } else if (ret != 0) {
+               return ret;
+       }
+
+       /*
+        * Make each fde->additional_data pointers point at each other
+        * so we can look them up from each other. They are now paired.
+        */
+       mpx_fde->additional_data = (struct tevent_fd *)add_fde;
+       add_fde->additional_data = (struct tevent_fd *)mpx_fde;
+
+       /* Now flag both fde's as being multiplexed. */
+       mpx_fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_HAS_MPX;
+       add_fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_HAS_MPX;
+
+       /* we need to keep the GOT_ERROR flag */
+       if (mpx_fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_GOT_ERROR) {
+               add_fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_GOT_ERROR;
+       }
+
+       return 0;
+}
+
+/*
+ add the epoll event to the given fd_event
+*/
+static void epoll_add_event(struct epoll_event_context *epoll_ev, struct tevent_fd *fde)
+{
+       struct epoll_event event;
+       int ret;
+       struct tevent_fd *mpx_fde = NULL;
+
+       fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
+       fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
+
+       if (fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_MPX) {
+               /*
+                * This is a multiplexed fde, we need to include both
+                * flags in the modified event.
+                */
+               mpx_fde = talloc_get_type_abort(fde->additional_data,
+                                               struct tevent_fd);
+
+               mpx_fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
+               mpx_fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
+       }
+
+       ZERO_STRUCT(event);
+       event.events = epoll_map_flags(fde->flags);
+       if (mpx_fde != NULL) {
+               event.events |= epoll_map_flags(mpx_fde->flags);
+       }
+       event.data.ptr = fde;
+       ret = epoll_ctl(epoll_ev->epoll_fd, EPOLL_CTL_ADD, fde->fd, &event);
+       if (ret != 0 && errno == EBADF) {
+               tevent_debug(epoll_ev->ev, TEVENT_DEBUG_ERROR,
+                            "EPOLL_CTL_ADD EBADF for "
+                            "fde[%p] mpx_fde[%p] fd[%d] - disabling\n",
+                            fde, mpx_fde, fde->fd);
+               DLIST_REMOVE(epoll_ev->ev->fd_events, fde);
+               fde->event_ctx = NULL;
+               if (mpx_fde != NULL) {
+                       DLIST_REMOVE(epoll_ev->ev->fd_events, mpx_fde);
+                       mpx_fde->event_ctx = NULL;
+               }
+               return;
+       } else if (ret != 0 && errno == EEXIST && mpx_fde == NULL) {
+               ret = epoll_add_multiplex_fd(epoll_ev, fde);
+               if (ret != 0) {
+                       epoll_panic(epoll_ev, "epoll_add_multiplex_fd failed",
+                                   false);
+                       return;
+               }
+       } else if (ret != 0) {
+               epoll_panic(epoll_ev, "EPOLL_CTL_ADD failed", false);
+               return;
+       }
+
+       fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
+       /* only if we want to read we want to tell the event handler about errors */
+       if (fde->flags & TEVENT_FD_READ) {
+               fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
+       }
+
+       if (mpx_fde == NULL) {
+               return;
+       }
+
+       mpx_fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
+       /* only if we want to read we want to tell the event handler about errors */
+       if (mpx_fde->flags & TEVENT_FD_READ) {
+               mpx_fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
+       }
+}
+
+/*
+ delete the epoll event for given fd_event
+*/
+static void epoll_del_event(struct epoll_event_context *epoll_ev, struct tevent_fd *fde)
+{
+       struct epoll_event event;
+       int ret;
+       struct tevent_fd *mpx_fde = NULL;
+
+       fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
+       fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
+
+       if (fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_MPX) {
+               /*
+                * This is a multiplexed fde, we need to modify both events.
+                */
+               mpx_fde = talloc_get_type_abort(fde->additional_data,
+                                               struct tevent_fd);
+
+               mpx_fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
+               mpx_fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
+       }
+
+       ZERO_STRUCT(event);
+       ret = epoll_ctl(epoll_ev->epoll_fd, EPOLL_CTL_DEL, fde->fd, &event);
+       if (ret != 0 && errno == ENOENT) {
+               /*
+                * This can happen after an epoll_check_reopen
+                * within epoll_event_fd_destructor.
+                */
+               tevent_debug(epoll_ev->ev, TEVENT_DEBUG_TRACE,
+                            "EPOLL_CTL_DEL ignoring ENOENT for fd[%d]\n",
+                            fde->fd);
+               return;
+       } else if (ret != 0 && errno == EBADF) {
+               tevent_debug(epoll_ev->ev, TEVENT_DEBUG_WARNING,
+                            "EPOLL_CTL_DEL EBADF for "
+                            "fde[%p] mpx_fde[%p] fd[%d] - disabling\n",
+                            fde, mpx_fde, fde->fd);
+               DLIST_REMOVE(epoll_ev->ev->fd_events, fde);
+               fde->event_ctx = NULL;
+               if (mpx_fde != NULL) {
+                       DLIST_REMOVE(epoll_ev->ev->fd_events, mpx_fde);
+                       mpx_fde->event_ctx = NULL;
+               }
+               return;
+       } else if (ret != 0) {
+               epoll_panic(epoll_ev, "EPOLL_CTL_DEL failed", false);
+               return;
+       }
+}
+
+/*
+ change the epoll event to the given fd_event
+*/
+static void epoll_mod_event(struct epoll_event_context *epoll_ev, struct tevent_fd *fde)
+{
+       struct tevent_fd *mpx_fde = NULL;
+       struct epoll_event event;
+       int ret;
+
+       fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
+       fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
+
+       if (fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_MPX) {
+               /*
+                * This is a multiplexed fde, we need to include both
+                * flags in the modified event.
+                */
+               mpx_fde = talloc_get_type_abort(fde->additional_data,
+                                               struct tevent_fd);
+
+               mpx_fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
+               mpx_fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
+       }
+
+       ZERO_STRUCT(event);
+       event.events = epoll_map_flags(fde->flags);
+       if (mpx_fde != NULL) {
+               event.events |= epoll_map_flags(mpx_fde->flags);
+       }
+       event.data.ptr = fde;
+       ret = epoll_ctl(epoll_ev->epoll_fd, EPOLL_CTL_MOD, fde->fd, &event);
+       if (ret != 0 && errno == EBADF) {
+               tevent_debug(epoll_ev->ev, TEVENT_DEBUG_ERROR,
+                            "EPOLL_CTL_MOD EBADF for "
+                            "fde[%p] mpx_fde[%p] fd[%d] - disabling\n",
+                            fde, mpx_fde, fde->fd);
+               DLIST_REMOVE(epoll_ev->ev->fd_events, fde);
+               fde->event_ctx = NULL;
+               if (mpx_fde != NULL) {
+                       DLIST_REMOVE(epoll_ev->ev->fd_events, mpx_fde);
+                       mpx_fde->event_ctx = NULL;
+               }
+               return;
+       } else if (ret != 0) {
+               epoll_panic(epoll_ev, "EPOLL_CTL_MOD failed", false);
+               return;
+       }
+
+       fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
+       /* only if we want to read we want to tell the event handler about errors */
+       if (fde->flags & TEVENT_FD_READ) {
+               fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
+       }
+
+       if (mpx_fde == NULL) {
+               return;
+       }
+
+       mpx_fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
+       /* only if we want to read we want to tell the event handler about errors */
+       if (mpx_fde->flags & TEVENT_FD_READ) {
+               mpx_fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR;
+       }
+}
+
+static void epoll_update_event(struct epoll_event_context *epoll_ev, struct tevent_fd *fde)
+{
+       bool got_error = (fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_GOT_ERROR);
+       bool want_read = (fde->flags & TEVENT_FD_READ);
+       bool want_write= (fde->flags & TEVENT_FD_WRITE);
+       struct tevent_fd *mpx_fde = NULL;
+
+       if (fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_MPX) {
+               /*
+                * work out what the multiplexed fde wants.
+                */
+               mpx_fde = talloc_get_type_abort(fde->additional_data,
+                                               struct tevent_fd);
+
+               if (mpx_fde->flags & TEVENT_FD_READ) {
+                       want_read = true;
+               }
+
+               if (mpx_fde->flags & TEVENT_FD_WRITE) {
+                       want_write = true;
+               }
+       }
+
+       /* there's already an event */
+       if (fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT) {
+               if (want_read || (want_write && !got_error)) {
+                       epoll_mod_event(epoll_ev, fde);
+                       return;
+               }
+               /* 
+                * if we want to match the select behavior, we need to remove the epoll_event
+                * when the caller isn't interested in events.
+                *
+                * this is because epoll reports EPOLLERR and EPOLLHUP, even without asking for them
+                */
+               epoll_del_event(epoll_ev, fde);
+               return;
+       }
+
+       /* there's no epoll_event attached to the fde */
+       if (want_read || (want_write && !got_error)) {
+               epoll_add_event(epoll_ev, fde);
+               return;
+       }
+}
+
+/*
+  Cope with epoll returning EPOLLHUP|EPOLLERR on an event.
+  Return true if there's nothing else to do, false if
+  this event needs further handling.
+*/
+static bool epoll_handle_hup_or_err(struct epoll_event_context *epoll_ev,
+                               struct tevent_fd *fde)
+{
+       if (fde == NULL) {
+               /* Nothing to do if no event. */
+               return true;
+       }
+
+       fde->additional_flags |= EPOLL_ADDITIONAL_FD_FLAG_GOT_ERROR;
+       /*
+        * if we only wait for TEVENT_FD_WRITE, we should not tell the
+        * event handler about it, and remove the epoll_event,
+        * as we only report errors when waiting for read events,
+        * to match the select() behavior
+        */
+       if (!(fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_REPORT_ERROR)) {
+               /*
+                * Do the same as the poll backend and
+                * remove the writeable flag.
+                */
+               fde->flags &= ~TEVENT_FD_WRITE;
+               return true;
+       }
+       /* This has TEVENT_FD_READ set, we're not finished. */
+       return false;
+}
+
+/*
+  event loop handling using epoll
+*/
+static int epoll_event_loop(struct epoll_event_context *epoll_ev, struct timeval *tvalp)
+{
+       int ret, i;
+#define MAXEVENTS 1
+       struct epoll_event events[MAXEVENTS];
+       int timeout = -1;
+       int wait_errno;
+
+       if (tvalp) {
+               /* it's better to trigger timed events a bit later than too early */
+               timeout = ((tvalp->tv_usec+999) / 1000) + (tvalp->tv_sec*1000);
+       }
+
+       if (epoll_ev->ev->signal_events &&
+           tevent_common_check_signal(epoll_ev->ev)) {
+               return 0;
+       }
+
+       tevent_trace_point_callback(epoll_ev->ev, TEVENT_TRACE_BEFORE_WAIT);
+       ret = epoll_wait(epoll_ev->epoll_fd, events, MAXEVENTS, timeout);
+       wait_errno = errno;
+       tevent_trace_point_callback(epoll_ev->ev, TEVENT_TRACE_AFTER_WAIT);
+
+       if (ret == -1 && wait_errno == EINTR && epoll_ev->ev->signal_events) {
+               if (tevent_common_check_signal(epoll_ev->ev)) {
+                       return 0;
+               }
+       }
+
+       if (ret == -1 && wait_errno != EINTR) {
+               epoll_panic(epoll_ev, "epoll_wait() failed", true);
+               return -1;
+       }
+
+       if (ret == 0 && tvalp) {
+               /* we don't care about a possible delay here */
+               tevent_common_loop_timer_delay(epoll_ev->ev);
+               return 0;
+       }
+
+       for (i=0;i<ret;i++) {
+               struct tevent_fd *fde = talloc_get_type(events[i].data.ptr, 
+                                                      struct tevent_fd);
+               uint16_t flags = 0;
+               struct tevent_fd *mpx_fde = NULL;
+
+               if (fde == NULL) {
+                       epoll_panic(epoll_ev, "epoll_wait() gave bad data", true);
+                       return -1;
+               }
+               if (fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_MPX) {
+                       /*
+                        * Save off the multiplexed event in case we need
+                        * to use it to call the handler function.
+                        */
+                       mpx_fde = talloc_get_type_abort(fde->additional_data,
+                                                       struct tevent_fd);
+               }
+               if (events[i].events & (EPOLLHUP|EPOLLERR)) {
+                       bool handled_fde = epoll_handle_hup_or_err(epoll_ev, fde);
+                       bool handled_mpx = epoll_handle_hup_or_err(epoll_ev, mpx_fde);
+
+                       if (handled_fde && handled_mpx) {
+                               epoll_update_event(epoll_ev, fde);
+                               continue;
+                       }
+
+                       if (!handled_mpx) {
+                               /*
+                                * If the mpx event was the one that needs
+                                * further handling, it's the TEVENT_FD_READ
+                                * event so switch over and call that handler.
+                                */
+                               fde = mpx_fde;
+                               mpx_fde = NULL;
+                       }
+                       flags |= TEVENT_FD_READ;
+               }
+               if (events[i].events & EPOLLIN) flags |= TEVENT_FD_READ;
+               if (events[i].events & EPOLLOUT) flags |= TEVENT_FD_WRITE;
+
+               if (flags & TEVENT_FD_WRITE) {
+                       if (fde->flags & TEVENT_FD_WRITE) {
+                               mpx_fde = NULL;
+                       }
+                       if (mpx_fde && mpx_fde->flags & TEVENT_FD_WRITE) {
+                               fde = mpx_fde;
+                               mpx_fde = NULL;
+                       }
+               }
+
+               if (mpx_fde) {
+                       /* Ensure we got the right fde. */
+                       if ((flags & fde->flags) == 0) {
+                               fde = mpx_fde;
+                               mpx_fde = NULL;
+                       }
+               }
+
+               /*
+                * make sure we only pass the flags
+                * the handler is expecting.
+                */
+               flags &= fde->flags;
+               if (flags) {
+                       fde->handler(epoll_ev->ev, fde, flags, fde->private_data);
+                       break;
+               }
+       }
+
+       return 0;
+}
+
+/*
+  create an epoll_event_context structure.
+*/
+static int epoll_event_context_init(struct tevent_context *ev)
+{
+       int ret;
+       struct epoll_event_context *epoll_ev;
+
+       /*
+        * We might be called during tevent_re_initialise()
+        * which means we need to free our old additional_data.
+        */
+       TALLOC_FREE(ev->additional_data);
+
+       epoll_ev = talloc_zero(ev, struct epoll_event_context);
+       if (!epoll_ev) return -1;
+       epoll_ev->ev = ev;
+       epoll_ev->epoll_fd = -1;
+
+       ret = epoll_init_ctx(epoll_ev);
+       if (ret != 0) {
+               talloc_free(epoll_ev);
+               return ret;
+       }
+
+       ev->additional_data = epoll_ev;
+       return 0;
+}
+
+/*
+  destroy an fd_event
+*/
+static int epoll_event_fd_destructor(struct tevent_fd *fde)
+{
+       struct tevent_context *ev = fde->event_ctx;
+       struct epoll_event_context *epoll_ev = NULL;
+       bool panic_triggered = false;
+       struct tevent_fd *mpx_fde = NULL;
+       int flags = fde->flags;
+
+       if (ev == NULL) {
+               return tevent_common_fd_destructor(fde);
+       }
+
+       epoll_ev = talloc_get_type_abort(ev->additional_data,
+                                        struct epoll_event_context);
+
+       /*
+        * we must remove the event from the list
+        * otherwise a panic fallback handler may
+        * reuse invalid memory
+        */
+       DLIST_REMOVE(ev->fd_events, fde);
+
+       if (fde->additional_flags & EPOLL_ADDITIONAL_FD_FLAG_HAS_MPX) {
+               mpx_fde = talloc_get_type_abort(fde->additional_data,
+                                               struct tevent_fd);
+
+               fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_HAS_MPX;
+               mpx_fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_HAS_MPX;
+
+               fde->additional_data = NULL;
+               mpx_fde->additional_data = NULL;
+
+               fde->additional_flags &= ~EPOLL_ADDITIONAL_FD_FLAG_HAS_EVENT;
+       }
+
+       epoll_ev->panic_state = &panic_triggered;
+       epoll_check_reopen(epoll_ev);
+       if (panic_triggered) {
+               return tevent_common_fd_destructor(fde);
+       }
+
+       if (mpx_fde != NULL) {
+               epoll_update_event(epoll_ev, mpx_fde);
+               if (panic_triggered) {
+                       return tevent_common_fd_destructor(fde);
+               }
+       }
+
+       fde->flags = 0;
+       epoll_update_event(epoll_ev, fde);
+       fde->flags = flags;
+       if (panic_triggered) {
+               return tevent_common_fd_destructor(fde);
+       }
+       epoll_ev->panic_state = NULL;
+
+       return tevent_common_fd_destructor(fde);
+}
+
+/*
+  add a fd based event
+  return NULL on failure (negative fd or memory allocation error)
+*/
+static struct tevent_fd *epoll_event_add_fd(struct tevent_context *ev, TALLOC_CTX *mem_ctx,
+                                           int fd, uint16_t flags,
+                                           tevent_fd_handler_t handler,
+                                           void *private_data,
+                                           const char *handler_name,
+                                           const char *location)
+{
+       struct epoll_event_context *epoll_ev = talloc_get_type_abort(ev->additional_data,
+                                                          struct epoll_event_context);
+       struct tevent_fd *fde;
+       bool panic_triggered = false;
+
+       fde = tevent_common_add_fd(ev, mem_ctx, fd, flags,
+                                  handler, private_data,
+                                  handler_name, location);
+       if (!fde) return NULL;
+
+       talloc_set_destructor(fde, epoll_event_fd_destructor);
+
+       epoll_ev->panic_state = &panic_triggered;
+       epoll_check_reopen(epoll_ev);
+       if (panic_triggered) {
+               return fde;
+       }
+       epoll_ev->panic_state = NULL;
+
+       epoll_update_event(epoll_ev, fde);
+
+       return fde;
+}
+
+/*
+  set the fd event flags and update the epoll registration to match
+*/
+static void epoll_event_set_fd_flags(struct tevent_fd *fde, uint16_t flags)
+{
+       struct tevent_context *ev;
+       struct epoll_event_context *epoll_ev;
+       bool panic_triggered = false;
+
+       if (fde->flags == flags) return;
+
+       ev = fde->event_ctx;
+       epoll_ev = talloc_get_type_abort(ev->additional_data, struct epoll_event_context);
+
+       fde->flags = flags;
+
+       epoll_ev->panic_state = &panic_triggered;
+       epoll_check_reopen(epoll_ev);
+       if (panic_triggered) {
+               return;
+       }
+       epoll_ev->panic_state = NULL;
+
+       epoll_update_event(epoll_ev, fde);
+}
+
+/*
+  do a single event loop using the events defined in ev
+*/
+static int epoll_event_loop_once(struct tevent_context *ev, const char *location)
+{
+       struct epoll_event_context *epoll_ev = talloc_get_type_abort(ev->additional_data,
+                                                          struct epoll_event_context);
+       struct timeval tval;
+       bool panic_triggered = false;
+
+       if (ev->signal_events &&
+           tevent_common_check_signal(ev)) {
+               return 0;
+       }
+
+       if (ev->immediate_events &&
+           tevent_common_loop_immediate(ev)) {
+               return 0;
+       }
+
+       tval = tevent_common_loop_timer_delay(ev);
+       if (tevent_timeval_is_zero(&tval)) {
+               return 0;
+       }
+
+       epoll_ev->panic_state = &panic_triggered;
+       epoll_ev->panic_force_replay = true;
+       epoll_check_reopen(epoll_ev);
+       if (panic_triggered) {
+               errno = EINVAL;
+               return -1;
+       }
+       epoll_ev->panic_force_replay = false;
+       epoll_ev->panic_state = NULL;
+
+       return epoll_event_loop(epoll_ev, &tval);
+}
+
+static const struct tevent_ops epoll_event_ops = {
+       .context_init           = epoll_event_context_init,
+       .add_fd                 = epoll_event_add_fd,
+       .set_fd_close_fn        = tevent_common_fd_set_close_fn,
+       .get_fd_flags           = tevent_common_fd_get_flags,
+       .set_fd_flags           = epoll_event_set_fd_flags,
+       .add_timer              = tevent_common_add_timer_v2,
+       .schedule_immediate     = tevent_common_schedule_immediate,
+       .add_signal             = tevent_common_add_signal,
+       .loop_once              = epoll_event_loop_once,
+       .loop_wait              = tevent_common_loop_wait,
+};
+
+_PRIVATE_ bool tevent_epoll_init(void)
+{
+       return tevent_register_backend("epoll", &epoll_event_ops);
+}
diff --git a/ctdb/lib/tevent/tevent_fd.c b/ctdb/lib/tevent/tevent_fd.c
new file mode 100644 (file)
index 0000000..455961b
--- /dev/null
@@ -0,0 +1,95 @@
+/*
+   Unix SMB/CIFS implementation.
+
+   common events code for fd events
+
+   Copyright (C) Stefan Metzmacher 2009
+
+     ** NOTE! The following LGPL license applies to the tevent
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "tevent.h"
+#include "tevent_internal.h"
+#include "tevent_util.h"
+
+int tevent_common_fd_destructor(struct tevent_fd *fde)
+{
+       if (fde->event_ctx) {
+               DLIST_REMOVE(fde->event_ctx->fd_events, fde);
+       }
+
+       if (fde->close_fn) {
+               fde->close_fn(fde->event_ctx, fde, fde->fd, fde->private_data);
+               fde->fd = -1;
+       }
+
+       return 0;
+}
+
+struct tevent_fd *tevent_common_add_fd(struct tevent_context *ev, TALLOC_CTX *mem_ctx,
+                                      int fd, uint16_t flags,
+                                      tevent_fd_handler_t handler,
+                                      void *private_data,
+                                      const char *handler_name,
+                                      const char *location)
+{
+       struct tevent_fd *fde;
+
+       /* tevent will crash later on select() if we save
+        * a negative file descriptor. Better to fail here
+        * so that consumers will be able to debug it
+        */
+       if (fd < 0) return NULL;
+
+       fde = talloc(mem_ctx?mem_ctx:ev, struct tevent_fd);
+       if (!fde) return NULL;
+
+       fde->event_ctx          = ev;
+       fde->fd                 = fd;
+       fde->flags              = flags;
+       fde->handler            = handler;
+       fde->close_fn           = NULL;
+       fde->private_data       = private_data;
+       fde->handler_name       = handler_name;
+       fde->location           = location;
+       fde->additional_flags   = 0;
+       fde->additional_data    = NULL;
+
+       DLIST_ADD(ev->fd_events, fde);
+
+       talloc_set_destructor(fde, tevent_common_fd_destructor);
+
+       return fde;
+}
+uint16_t tevent_common_fd_get_flags(struct tevent_fd *fde)
+{
+       return fde->flags;
+}
+
+void tevent_common_fd_set_flags(struct tevent_fd *fde, uint16_t flags)
+{
+       if (fde->flags == flags) return;
+       fde->flags = flags;
+}
+
+void tevent_common_fd_set_close_fn(struct tevent_fd *fde,
+                                  tevent_fd_close_fn_t close_fn)
+{
+       fde->close_fn = close_fn;
+}
diff --git a/ctdb/lib/tevent/tevent_immediate.c b/ctdb/lib/tevent/tevent_immediate.c
new file mode 100644 (file)
index 0000000..1ac293e
--- /dev/null
@@ -0,0 +1,139 @@
+/*
+   Unix SMB/CIFS implementation.
+
+   common events code for immediate events
+
+   Copyright (C) Stefan Metzmacher 2009
+
+     ** NOTE! The following LGPL license applies to the tevent
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "tevent.h"
+#include "tevent_internal.h"
+#include "tevent_util.h"
+
+static void tevent_common_immediate_cancel(struct tevent_immediate *im)
+{
+       if (!im->event_ctx) {
+               return;
+       }
+
+       tevent_debug(im->event_ctx, TEVENT_DEBUG_TRACE,
+                    "Cancel immediate event %p \"%s\"\n",
+                    im, im->handler_name);
+
+       /* let the backend free im->additional_data */
+       if (im->cancel_fn) {
+               im->cancel_fn(im);
+       }
+
+       DLIST_REMOVE(im->event_ctx->immediate_events, im);
+       im->event_ctx           = NULL;
+       im->handler             = NULL;
+       im->private_data        = NULL;
+       im->handler_name        = NULL;
+       im->schedule_location   = NULL;
+       im->cancel_fn           = NULL;
+       im->additional_data     = NULL;
+
+       talloc_set_destructor(im, NULL);
+}
+
+/*
+  destroy an immediate event
+*/
+static int tevent_common_immediate_destructor(struct tevent_immediate *im)
+{
+       tevent_common_immediate_cancel(im);
+       return 0;
+}
+
+/*
+ * schedule an immediate event on the given event context
+ */
+void tevent_common_schedule_immediate(struct tevent_immediate *im,
+                                     struct tevent_context *ev,
+                                     tevent_immediate_handler_t handler,
+                                     void *private_data,
+                                     const char *handler_name,
+                                     const char *location)
+{
+       tevent_common_immediate_cancel(im);
+
+       if (!handler) {
+               return;
+       }
+
+       im->event_ctx           = ev;
+       im->handler             = handler;
+       im->private_data        = private_data;
+       im->handler_name        = handler_name;
+       im->schedule_location   = location;
+       im->cancel_fn           = NULL;
+       im->additional_data     = NULL;
+
+       DLIST_ADD_END(ev->immediate_events, im, struct tevent_immediate *);
+       talloc_set_destructor(im, tevent_common_immediate_destructor);
+
+       tevent_debug(ev, TEVENT_DEBUG_TRACE,
+                    "Schedule immediate event \"%s\": %p\n",
+                    handler_name, im);
+}
+
+/*
+  trigger the first immediate event and return true
+  if no event was triggered return false
+*/
+bool tevent_common_loop_immediate(struct tevent_context *ev)
+{
+       struct tevent_immediate *im = ev->immediate_events;
+       tevent_immediate_handler_t handler;
+       void *private_data;
+
+       if (!im) {
+               return false;
+       }
+
+       tevent_debug(ev, TEVENT_DEBUG_TRACE,
+                    "Run immediate event \"%s\": %p\n",
+                    im->handler_name, im);
+
+       /*
+        * remember the handler and then clear the event
+        * the handler might reschedule the event
+        */
+       handler = im->handler;
+       private_data = im->private_data;
+
+       DLIST_REMOVE(im->event_ctx->immediate_events, im);
+       im->event_ctx           = NULL;
+       im->handler             = NULL;
+       im->private_data        = NULL;
+       im->handler_name        = NULL;
+       im->schedule_location   = NULL;
+       im->cancel_fn           = NULL;
+       im->additional_data     = NULL;
+
+       talloc_set_destructor(im, NULL);
+
+       handler(ev, im, private_data);
+
+       return true;
+}
+
diff --git a/ctdb/lib/tevent/tevent_internal.h b/ctdb/lib/tevent/tevent_internal.h
new file mode 100644 (file)
index 0000000..b239e74
--- /dev/null
@@ -0,0 +1,345 @@
+/* 
+   Unix SMB/CIFS implementation.
+
+   generalised event loop handling
+
+   INTERNAL STRUCTS. THERE ARE NO API GUARANTEES.
+   External users should only ever have to include this header when 
+   implementing new tevent backends.
+
+   Copyright (C) Stefan Metzmacher 2005-2009
+
+     ** NOTE! The following LGPL license applies to the tevent
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+/**
+ * @brief State of one asynchronous request
+ *
+ * Groups the completion callback set by the API user (async), the
+ * implementation's private state and hooks (data, private_print,
+ * private_cancel) and the bookkeeping in internal.
+ */
+struct tevent_req {
+       /**
+        * @brief What to do on completion
+        *
+        * This is used for the user of an async request, fn is called when
+        * the request completes, either successfully or with an error.
+        */
+       struct {
+               /**
+                * @brief Completion function
+                * Completion function, to be filled by the API user
+                */
+               tevent_req_fn fn;
+               /**
+                * @brief Private data for the completion function
+                */
+               void *private_data;
+       } async;
+
+       /**
+        * @brief Private state pointer for the actual implementation
+        *
+        * The implementation doing the work for the async request needs to
+        * keep around current data like for example a fd event. The user of
+        * an async request should not touch this.
+        */
+       void *data;
+
+       /**
+        * @brief A function to overwrite the default print function
+        *
+        * The implementation doing the work may want to implement a
+        * custom function to print the text representation of the async
+        * request.
+        */
+       tevent_req_print_fn private_print;
+
+       /**
+        * @brief A function to cancel the request
+        *
+        * The implementation might want to set a function
+        * that is called when the tevent_req_cancel() function
+        * was called.
+        */
+       tevent_req_cancel_fn private_cancel;
+
+       /**
+        * @brief Internal state of the request
+        *
+        * Callers should only access this via functions and never directly.
+        */
+       struct {
+               /**
+                * @brief The talloc type of the data pointer
+                *
+                * This is filled by the tevent_req_create() macro.
+                *
+                * This is for debugging only.
+                */
+               const char *private_type;
+
+               /**
+                * @brief The location where the request was created
+                *
+                * This uses the __location__ macro via the tevent_req_create()
+                * macro.
+                *
+                * This is for debugging only.
+                */
+               const char *create_location;
+
+               /**
+                * @brief The location where the request was finished
+                *
+                * This uses the __location__ macro via the tevent_req_done(),
+                * tevent_req_error() or tevent_req_nomem() macro.
+                *
+                * This is for debugging only.
+                */
+               const char *finish_location;
+
+               /**
+                * @brief The location where the request was canceled
+                *
+                * This uses the __location__ macro via the
+                * tevent_req_cancel() macro.
+                *
+                * This is for debugging only.
+                */
+               const char *cancel_location;
+
+               /**
+                * @brief The external state - will be queried by the caller
+                *
+                * While the async request is being processed, state will remain in
+                * TEVENT_REQ_IN_PROGRESS. A request is finished if
+                * req->state>=TEVENT_REQ_DONE.
+                */
+               enum tevent_req_state state;
+
+               /**
+                * @brief status code when finished
+                *
+                * This status can be queried in the async completion function. It
+                * will be set to 0 when everything went fine.
+                */
+               uint64_t error;
+
+               /**
+                * @brief the immediate event used by tevent_req_post
+                *
+                */
+               struct tevent_immediate *trigger;
+
+               /**
+                * @brief An event context which will be used to
+                *        defer the _tevent_req_notify_callback().
+                */
+               struct tevent_context *defer_callback_ev;
+
+               /**
+                * @brief the timer event if tevent_req_set_endtime was used
+                *
+                */
+               struct tevent_timer *timer;
+       } internal;
+};
+
+/*
+ * A file descriptor event.
+ *
+ * prev/next make it a doubly linked list element (see the DLIST_*
+ * macros); event_ctx points back to the owning context and may be
+ * set to NULL by a backend when the event gets detached.
+ */
+struct tevent_fd {
+       struct tevent_fd *prev, *next;
+       struct tevent_context *event_ctx;
+       /* the descriptor being watched */
+       int fd;
+       uint16_t flags; /* see TEVENT_FD_* flags */
+       /* called when one of the requested conditions fires */
+       tevent_fd_handler_t handler;
+       /* optional hook, set via tevent_common_fd_set_close_fn() */
+       tevent_fd_close_fn_t close_fn;
+       /* this is private for the specific handler */
+       void *private_data;
+       /* this is for debugging only! */
+       const char *handler_name;
+       const char *location;
+       /* this is private for the events_ops implementation */
+       uint64_t additional_flags;
+       void *additional_data;
+};
+
+/*
+ * A timed event: handler is due at the absolute time next_event.
+ * prev/next make it a doubly linked list element.
+ */
+struct tevent_timer {
+       struct tevent_timer *prev, *next;
+       struct tevent_context *event_ctx;
+       /* point in time at which the handler becomes due */
+       struct timeval next_event;
+       tevent_timer_handler_t handler;
+       /* this is private for the specific handler */
+       void *private_data;
+       /* this is for debugging only! */
+       const char *handler_name;
+       const char *location;
+       /* this is private for the events_ops implementation */
+       void *additional_data;
+};
+
+/*
+ * An immediate event.
+ *
+ * While scheduled it is linked into tevent_context.immediate_events
+ * through prev/next; tevent_common_loop_immediate() unlinks it and
+ * resets the scheduling fields before calling the handler.
+ */
+struct tevent_immediate {
+       struct tevent_immediate *prev, *next;
+       /* owning context while scheduled, NULL otherwise */
+       struct tevent_context *event_ctx;
+       tevent_immediate_handler_t handler;
+       /* this is private for the specific handler */
+       void *private_data;
+       /* this is for debugging only! */
+       const char *handler_name;
+       const char *create_location;
+       /* set each time the event gets scheduled */
+       const char *schedule_location;
+       /* this is private for the events_ops implementation */
+       void (*cancel_fn)(struct tevent_immediate *im);
+       void *additional_data;
+};
+
+/*
+ * A signal event: handler is associated with signal number signum.
+ * prev/next make it a doubly linked list element.
+ */
+struct tevent_signal {
+       struct tevent_signal *prev, *next;
+       struct tevent_context *event_ctx;
+       /* signal number this event is registered for */
+       int signum;
+       /* NOTE(review): presumably the sigaction() sa_flags to use - confirm */
+       int sa_flags;
+       tevent_signal_handler_t handler;
+       /* this is private for the specific handler */
+       void *private_data;
+       /* this is for debugging only! */
+       const char *handler_name;
+       const char *location;
+       /* this is private for the events_ops implementation */
+       void *additional_data;
+};
+
+/*
+ * Debug output hook of a tevent_context: a printf-style callback
+ * taking a va_list, plus the opaque context pointer handed to it.
+ */
+struct tevent_debug_ops {
+       void (*debug)(void *context, enum tevent_debug_level level,
+                     const char *fmt, va_list ap) PRINTF_ATTRIBUTE(3,0);
+       void *context;
+};
+
+/* printf-style debug helper used by the event code, e.g. for TEVENT_DEBUG_TRACE messages */
+void tevent_debug(struct tevent_context *ev, enum tevent_debug_level level,
+                 const char *fmt, ...) PRINTF_ATTRIBUTE(3,4);
+
+/*
+ * The generic event context.
+ *
+ * Holds the backend's operation table, the lists of pending events
+ * maintained by the common code, and per-context state for signal
+ * handling, debugging, nesting and tracing.
+ */
+struct tevent_context {
+       /* the specific events implementation */
+       const struct tevent_ops *ops;
+
+       /* list of fd events - used by common code */
+       struct tevent_fd *fd_events;
+
+       /* list of timed events - used by common code */
+       struct tevent_timer *timer_events;
+
+       /* list of immediate events - used by common code */
+       struct tevent_immediate *immediate_events;
+
+       /* list of signal events - used by common code */
+       struct tevent_signal *signal_events;
+
+       /* this is private for the events_ops implementation */
+       void *additional_data;
+
+       /* pipe hack used with signal handlers */
+       struct tevent_fd *pipe_fde;
+       int pipe_fds[2];
+
+       /* debugging operations */
+       struct tevent_debug_ops debug_ops;
+
+       /* info about the nesting status */
+       struct {
+               bool allowed;
+               uint32_t level;
+               tevent_nesting_hook hook_fn;
+               void *hook_private;
+       } nesting;
+
+       /* optional trace callback and its private pointer */
+       struct {
+               tevent_trace_callback_t callback;
+               void *private_data;
+       } tracing;
+
+       /*
+        * an optimization pointer into timer_events
+        * used by common code via
+        * tevent_common_add_timer_v2()
+        */
+       struct tevent_timer *last_zero_timer;
+};
+
+/* backend lookup by name */
+const struct tevent_ops *tevent_find_ops_byname(const char *name);
+
+/* context level helpers shared by the backends */
+int tevent_common_context_destructor(struct tevent_context *ev);
+int tevent_common_loop_wait(struct tevent_context *ev,
+                           const char *location);
+
+/* fd event helpers shared by the backends */
+int tevent_common_fd_destructor(struct tevent_fd *fde);
+struct tevent_fd *tevent_common_add_fd(struct tevent_context *ev,
+                                      TALLOC_CTX *mem_ctx,
+                                      int fd,
+                                      uint16_t flags,
+                                      tevent_fd_handler_t handler,
+                                      void *private_data,
+                                      const char *handler_name,
+                                      const char *location);
+void tevent_common_fd_set_close_fn(struct tevent_fd *fde,
+                                  tevent_fd_close_fn_t close_fn);
+uint16_t tevent_common_fd_get_flags(struct tevent_fd *fde);
+void tevent_common_fd_set_flags(struct tevent_fd *fde, uint16_t flags);
+
+/* timer event helpers shared by the backends */
+struct tevent_timer *tevent_common_add_timer(struct tevent_context *ev,
+                                            TALLOC_CTX *mem_ctx,
+                                            struct timeval next_event,
+                                            tevent_timer_handler_t handler,
+                                            void *private_data,
+                                            const char *handler_name,
+                                            const char *location);
+struct tevent_timer *tevent_common_add_timer_v2(struct tevent_context *ev,
+                                               TALLOC_CTX *mem_ctx,
+                                               struct timeval next_event,
+                                               tevent_timer_handler_t handler,
+                                               void *private_data,
+                                               const char *handler_name,
+                                               const char *location);
+struct timeval tevent_common_loop_timer_delay(struct tevent_context *);
+
+/* immediate event helpers shared by the backends */
+void tevent_common_schedule_immediate(struct tevent_immediate *im,
+                                     struct tevent_context *ev,
+                                     tevent_immediate_handler_t handler,
+                                     void *private_data,
+                                     const char *handler_name,
+                                     const char *location);
+/* runs at most one pending immediate event; true if one was run */
+bool tevent_common_loop_immediate(struct tevent_context *ev);
+
+/* signal event helpers shared by the backends */
+struct tevent_signal *tevent_common_add_signal(struct tevent_context *ev,
+                                              TALLOC_CTX *mem_ctx,
+                                              int signum,
+                                              int sa_flags,
+                                              tevent_signal_handler_t handler,
+                                              void *private_data,
+                                              const char *handler_name,
+                                              const char *location);
+int tevent_common_check_signal(struct tevent_context *ev);
+void tevent_cleanup_pending_signal_handlers(struct tevent_signal *se);
+
+/* per-backend init entry points (epoll only when HAVE_EPOLL is defined) */
+bool tevent_standard_init(void);
+bool tevent_select_init(void);
+bool tevent_poll_init(void);
+/* used by the "standard" backend when falling back to "poll" */
+void tevent_poll_event_add_fd_internal(struct tevent_context *ev,
+                                      struct tevent_fd *fde);
+bool tevent_poll_mt_init(void);
+#ifdef HAVE_EPOLL
+bool tevent_epoll_init(void);
+bool tevent_epoll_set_panic_fallback(struct tevent_context *ev,
+                       bool (*panic_fallback)(struct tevent_context *ev,
+                                              bool replay));
+#endif
+
+
+/* reports a trace point for ev; see tevent_context.tracing */
+void tevent_trace_point_callback(struct tevent_context *ev,
+                                enum tevent_trace_point);
diff --git a/ctdb/lib/tevent/tevent_liboop.c b/ctdb/lib/tevent/tevent_liboop.c
new file mode 100644 (file)
index 0000000..68be76b
--- /dev/null
@@ -0,0 +1,292 @@
+/* 
+   Unix SMB/CIFS implementation.
+   main select loop and event handling
+   wrapper for http://git.lysator.liu.se/liboop/
+
+   Copyright (C) Stefan Metzmacher 2005
+
+     ** NOTE! The following LGPL license applies to the tevent
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "events.h"
+#include "events_internal.h"
+
+#include <oop.h>
+
+/*
+ !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
+
+ NOTE: this code compiles fine, but is completely *UNTESTED*
+       and is only committed as an example
+
+ !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!  
+*/
+
+/*
+  talloc destructor for the event context: delete the liboop
+  system source kept in ev->additional_data
+*/
+static int oop_event_context_destructor(struct tevent_context *ev)
+{
+       oop_source_sys *source = ev->additional_data;
+
+       oop_sys_delete(source);
+       return 0;
+}
+
+/*
+  attach a liboop system source to the event context.
+
+  private_data may carry an existing oop_source_sys; it is then used
+  as is and no destructor is installed. Otherwise a new source is
+  created and deleted again together with ev.
+  returns 0 on success, -1 if the source could not be created
+*/
+static int oop_event_context_init(struct tevent_context *ev, void *private_data)
+{
+       oop_source_sys *source = private_data;
+
+       if (source != NULL) {
+               /* caller supplied source */
+               ev->additional_data = source;
+               return 0;
+       }
+
+       source = oop_sys_new();
+       if (source == NULL) {
+               return -1;
+       }
+
+       /* we created it, so we delete it when ev goes away */
+       talloc_set_destructor(ev, oop_event_context_destructor);
+       ev->additional_data = source;
+
+       return 0;
+}
+
+/*
+  liboop fd callback: forward read/write readiness to the tevent
+  handler of the fd event passed in ptr.
+  returns OOP_CONTINUE after a dispatched read or write, and
+  OOP_ERROR for a mismatching fd or any other event type
+*/
+static void *oop_event_fd_handler(oop_source *oop, int fd, oop_event oop_type, void *ptr)
+{
+       struct tevent_fd *fde = ptr;
+
+       if (fd != fde->fd) {
+               return OOP_ERROR;
+       }
+
+       if (oop_type == OOP_READ) {
+               fde->handler(fde->event_ctx, fde, EVENT_FD_READ, fde->private_data);
+               return OOP_CONTINUE;
+       }
+
+       if (oop_type == OOP_WRITE) {
+               fde->handler(fde->event_ctx, fde, EVENT_FD_WRITE, fde->private_data);
+               return OOP_CONTINUE;
+       }
+
+       /* OOP_EXCEPTION, OOP_NUM_EVENTS and anything unexpected */
+       return OOP_ERROR;
+}
+
+/*
+  talloc destructor of an fd event: cancel the liboop registrations
+  matching fde->flags and close the fd if EVENT_FD_AUTOCLOSE is set
+*/
+static int oop_event_fd_destructor(struct tevent_fd *fde)
+{
+       oop_source_sys *oop_sys = fde->event_ctx->additional_data;
+       oop_source *src = oop_sys_source(oop_sys);
+
+       if (fde->flags & EVENT_FD_READ) {
+               src->cancel_fd(src, fde->fd, OOP_READ);
+       }
+       if (fde->flags & EVENT_FD_WRITE) {
+               src->cancel_fd(src, fde->fd, OOP_WRITE);
+       }
+
+       if (!(fde->flags & EVENT_FD_AUTOCLOSE)) {
+               return 0;
+       }
+
+       close(fde->fd);
+       fde->fd = -1;
+
+       return 0;
+}
+
+/*
+  add a fd based event
+  return NULL on failure (memory allocation error)
+
+  The new tevent_fd is a talloc child of mem_ctx (or of ev when
+  mem_ctx is NULL) and is registered with the liboop source for
+  every direction requested in flags.
+*/
+static struct tevent_fd *oop_event_add_fd(struct tevent_context *ev, TALLOC_CTX *mem_ctx,
+                                        int fd, uint16_t flags,
+                                        event_fd_handler_t handler,
+                                        void *private_data)
+{
+       struct tevent_fd *fde;
+       oop_source_sys *oop_sys = ev->additional_data;
+       oop_source *oop = oop_sys_source(oop_sys);
+
+       /*
+        * talloc_zero() instead of talloc(): this backend never fills
+        * in prev/next, close_fn, handler_name or location, and
+        * talloc() would leave them holding uninitialised memory.
+        */
+       fde = talloc_zero(mem_ctx?mem_ctx:ev, struct tevent_fd);
+       if (!fde) return NULL;
+
+       fde->event_ctx          = ev;
+       fde->fd                 = fd;
+       fde->flags              = flags;
+       fde->handler            = handler;
+       fde->private_data       = private_data;
+       fde->additional_flags   = 0;
+       fde->additional_data    = NULL;
+
+       if (fde->flags & EVENT_FD_READ)
+               oop->on_fd(oop, fde->fd, OOP_READ, oop_event_fd_handler, fde);
+       if (fde->flags & EVENT_FD_WRITE)
+               oop->on_fd(oop, fde->fd, OOP_WRITE, oop_event_fd_handler, fde);
+
+       /* the destructor cancels the registrations made above */
+       talloc_set_destructor(fde, oop_event_fd_destructor);
+
+       return fde;
+}
+
+/*
+  report the flags currently stored on the fd event
+*/
+static uint16_t oop_event_get_fd_flags(struct tevent_fd *fde)
+{
+       uint16_t current = fde->flags;
+
+       return current;
+}
+
+/*
+  set the fd event flags
+
+  Only the directions whose state actually changes are cancelled or
+  registered with liboop; the new flags are stored afterwards.
+*/
+static void oop_event_set_fd_flags(struct tevent_fd *fde, uint16_t flags)
+{
+       oop_source_sys *oop_sys = fde->event_ctx->additional_data;
+       oop_source *src = oop_sys_source(oop_sys);
+       const int had_read   = (fde->flags & EVENT_FD_READ) != 0;
+       const int want_read  = (flags & EVENT_FD_READ) != 0;
+       const int had_write  = (fde->flags & EVENT_FD_WRITE) != 0;
+       const int want_write = (flags & EVENT_FD_WRITE) != 0;
+
+       if (had_read && !want_read) {
+               src->cancel_fd(src, fde->fd, OOP_READ);
+       } else if (!had_read && want_read) {
+               src->on_fd(src, fde->fd, OOP_READ, oop_event_fd_handler, fde);
+       }
+
+       if (had_write && !want_write) {
+               src->cancel_fd(src, fde->fd, OOP_WRITE);
+       } else if (!had_write && want_write) {
+               src->on_fd(src, fde->fd, OOP_WRITE, oop_event_fd_handler, fde);
+       }
+
+       fde->flags = flags;
+}
+
+static int oop_event_timed_destructor(struct tevent_timer *te);
+
+/*
+  talloc destructor that always returns -1.
+  oop_event_timed_handler() installs it while the user handler runs,
+  to deny the handler to free the event.
+*/
+static int oop_event_timed_deny_destructor(struct tevent_timer *te)
+{
+       return -1;
+}
+
+/*
+  liboop timer callback: run the tevent handler of the timer passed
+  in ptr, then free the timer. Always returns OOP_CONTINUE.
+*/
+static void *oop_event_timed_handler(oop_source *oop, struct timeval t, void *ptr)
+{
+       struct tevent_timer *te = ptr;
+
+       /* deny the handler to free the event */
+       talloc_set_destructor(te, oop_event_timed_deny_destructor);
+       te->handler(te->event_ctx, te, t, te->private_data);
+
+       /* restore the real destructor, then free the event ourselves */
+       talloc_set_destructor(te, oop_event_timed_destructor);
+       talloc_free(te);
+
+       return OOP_CONTINUE;
+}
+
+/*
+  talloc destructor of a timed event: cancel the liboop timer that
+  was registered for te->next_event
+*/
+static int oop_event_timed_destructor(struct tevent_timer *te)
+{
+       oop_source_sys *oop_sys = te->event_ctx->additional_data;
+       oop_source *src = oop_sys_source(oop_sys);
+
+       src->cancel_time(src, te->next_event, oop_event_timed_handler, te);
+       return 0;
+}
+
+/*
+  add a timed event
+  return NULL on failure (memory allocation error)
+
+  The new tevent_timer is a talloc child of mem_ctx (or of ev when
+  mem_ctx is NULL) and is registered with the liboop source for the
+  time next_event.
+*/
+static struct tevent_timer *oop_event_add_timed(struct tevent_context *ev, TALLOC_CTX *mem_ctx,
+                                              struct timeval next_event,
+                                              event_timed_handler_t handler,
+                                              void *private_data)
+{
+       oop_source_sys *oop_sys = ev->additional_data;
+       oop_source *oop = oop_sys_source(oop_sys);
+       struct tevent_timer *te;
+
+       /*
+        * talloc_zero() instead of talloc(): this backend never fills
+        * in prev/next, handler_name or location, and talloc() would
+        * leave them holding uninitialised memory.
+        */
+       te = talloc_zero(mem_ctx?mem_ctx:ev, struct tevent_timer);
+       if (te == NULL) return NULL;
+
+       te->event_ctx           = ev;
+       te->next_event          = next_event;
+       te->handler             = handler;
+       te->private_data        = private_data;
+       te->additional_data     = NULL;
+
+       oop->on_time(oop, te->next_event, oop_event_timed_handler, te);
+
+       /* the destructor cancels the registration made above */
+       talloc_set_destructor(te, oop_event_timed_destructor);
+
+       return te;
+}
+
+/*
+  run one iteration of the liboop system source attached to ev
+  returns 0 if liboop reports OOP_CONTINUE, -1 otherwise
+*/
+static int oop_event_loop_once(struct tevent_context *ev)
+{
+       oop_source_sys *source = ev->additional_data;
+
+       return (oop_sys_run_once(source) == OOP_CONTINUE) ? 0 : -1;
+}
+
+/*
+  run the liboop system source attached to ev until it returns
+  returns 0 if liboop reports OOP_CONTINUE, -1 otherwise
+*/
+static int oop_event_loop_wait(struct tevent_context *ev)
+{
+       oop_source_sys *source = ev->additional_data;
+
+       return (oop_sys_run(source) == OOP_CONTINUE) ? 0 : -1;
+}
+
+/*
+  operation table of the liboop backend: fd events, timers and the
+  loop functions are implemented in this file, signal events use
+  common_event_add_signal
+*/
+static const struct event_ops event_oop_ops = {
+       .context_init   = oop_event_context_init,
+       .add_fd         = oop_event_add_fd,
+       .get_fd_flags   = oop_event_get_fd_flags,
+       .set_fd_flags   = oop_event_set_fd_flags,
+       .add_timer      = oop_event_add_timed,
+       .add_signal     = common_event_add_signal,
+       .loop_once      = oop_event_loop_once,
+       .loop_wait      = oop_event_loop_wait,
+};
+
+/*
+  return the operation table of the liboop backend
+*/
+const struct event_ops *event_liboop_get_ops(void)
+{
+       return &event_oop_ops;
+}
diff --git a/ctdb/lib/tevent/tevent_poll.c b/ctdb/lib/tevent/tevent_poll.c
new file mode 100644 (file)
index 0000000..92fcc44
--- /dev/null
@@ -0,0 +1,725 @@
+/*
+   Unix SMB/CIFS implementation.
+   main select loop and event handling
+   Copyright (C) Andrew Tridgell       2003-2005
+   Copyright (C) Stefan Metzmacher     2005-2009
+
+     ** NOTE! The following LGPL license applies to the tevent
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/select.h"
+#include "tevent.h"
+#include "tevent_util.h"
+#include "tevent_internal.h"
+
+/*
+ * Private state of the poll backend, stored in
+ * tevent_context.additional_data.
+ */
+struct poll_event_context {
+       /* a pointer back to the generic event_context */
+       struct tevent_context *ev;
+
+       /*
+        * A DLIST for fresh fde's added by poll_event_add_fd but not
+        * picked up yet by poll_event_loop_once
+        */
+       struct tevent_fd *fresh;
+       /*
+        * A DLIST for disabled fde's.
+        */
+       struct tevent_fd *disabled;
+       /*
+        * one or more events were deleted or disabled
+        */
+       bool deleted;
+
+       /*
+        * These two arrays are maintained together.
+        * An fde that lives in the arrays keeps its index in
+        * fde->additional_flags; UINT64_MAX there means it is on
+        * the fresh or disabled list instead.
+        */
+       struct pollfd *fds;
+       struct tevent_fd **fdes;
+       unsigned num_fds;
+
+       /*
+        * Signal fd to wake the poll() thread
+        * This is the write end of a pipe whose read end is kept in
+        * fds[0]; it is -1 when no pipe was set up.
+        */
+       int signal_fd;
+
+       /* information for exiting from the event loop */
+       int exit_code;
+};
+
+/*
+  talloc destructor of the poll backend state.
+
+  Detaches all fd events still sitting on the fresh and disabled
+  lists (their event_ctx becomes NULL) and closes both ends of the
+  wake-up pipe if one was created.
+*/
+static int poll_event_context_destructor(struct poll_event_context *poll_ev)
+{
+       struct tevent_fd *fd, *fn;
+
+       /* fetch the successor first: DLIST_REMOVE unlinks the current element */
+       for (fd = poll_ev->fresh; fd; fd = fn) {
+               fn = fd->next;
+               fd->event_ctx = NULL;
+               DLIST_REMOVE(poll_ev->fresh, fd);
+       }
+
+       for (fd = poll_ev->disabled; fd; fd = fn) {
+               fn = fd->next;
+               fd->event_ctx = NULL;
+               DLIST_REMOVE(poll_ev->disabled, fd);
+       }
+
+       if (poll_ev->signal_fd == -1) {
+               /*
+                * Non-threaded, no signal pipe
+                */
+               return 0;
+       }
+
+       /* write end of the wake-up pipe */
+       close(poll_ev->signal_fd);
+       poll_ev->signal_fd = -1;
+
+       if (poll_ev->num_fds == 0) {
+               return 0;
+       }
+       /* read end of the wake-up pipe, kept in slot 0 */
+       if (poll_ev->fds[0].fd != -1) {
+               close(poll_ev->fds[0].fd);
+               poll_ev->fds[0].fd = -1;
+       }
+       return 0;
+}
+
+/*
+  create a poll_event_context structure.
+
+  The new state is a zeroed talloc child of ev without a wake-up
+  pipe (signal_fd == -1).
+  returns 0 on success, -1 on allocation failure
+*/
+static int poll_event_context_init(struct tevent_context *ev)
+{
+       struct poll_event_context *poll_ev;
+
+       /*
+        * we might be called during tevent_re_initialise()
+        * which means we need to free our old additional_data
+        * in order to detach old fd events from the
+        * poll_ev->fresh list
+        */
+       TALLOC_FREE(ev->additional_data);
+
+       poll_ev = talloc_zero(ev, struct poll_event_context);
+       if (poll_ev == NULL) {
+               return -1;
+       }
+       poll_ev->ev = ev;
+       /* no wake-up pipe unless poll_event_context_init_mt() adds one */
+       poll_ev->signal_fd = -1;
+       ev->additional_data = poll_ev;
+       talloc_set_destructor(poll_ev, poll_event_context_destructor);
+       return 0;
+}
+
+/*
+  add O_NONBLOCK to the file status flags of fd
+  returns false if reading or writing the flags fails
+*/
+static bool set_nonblock(int fd)
+{
+       int fl = fcntl(fd, F_GETFL, 0);
+
+       if (fl == -1) {
+               return false;
+       }
+
+       if (fcntl(fd, F_SETFL, fl | O_NONBLOCK) == -1) {
+               return false;
+       }
+
+       return true;
+}
+
+/*
+  create a poll_event_context structure with a wake-up pipe.
+
+  On top of poll_event_context_init() this creates a non-blocking
+  pipe: the write end becomes poll_ev->signal_fd and the read end
+  is placed in poll_ev->fds[0].
+  returns 0 on success, -1 on failure
+*/
+static int poll_event_context_init_mt(struct tevent_context *ev)
+{
+       struct poll_event_context *poll_ev;
+       struct pollfd *pfd;
+       int fds[2];
+       int ret;
+
+       ret = poll_event_context_init(ev);
+       if (ret == -1) {
+               return ret;
+       }
+
+       poll_ev = talloc_get_type_abort(
+               ev->additional_data, struct poll_event_context);
+
+       /* a single pollfd entry, reserved for the pipe's read end */
+       poll_ev->fds = talloc_zero(poll_ev, struct pollfd);
+       if (poll_ev->fds == NULL) {
+               return -1;
+       }
+
+       ret = pipe(fds);
+       if (ret == -1) {
+               return -1;
+       }
+
+       if (!set_nonblock(fds[0]) || !set_nonblock(fds[1])) {
+               close(fds[0]);
+               close(fds[1]);
+               return -1;
+       }
+
+       poll_ev->signal_fd = fds[1];
+
+       pfd = &poll_ev->fds[0];
+       pfd->fd = fds[0];
+       pfd->events = (POLLIN|POLLHUP);
+
+       poll_ev->num_fds = 1;
+
+       /* same destructor poll_event_context_init() already installed */
+       talloc_set_destructor(poll_ev, poll_event_context_destructor);
+
+       return 0;
+}
+
+/*
+  write a single zero byte to the signal fd, retrying while the
+  write is interrupted (EINTR). Does nothing without a signal fd.
+*/
+static void poll_event_wake_pollthread(struct poll_event_context *poll_ev)
+{
+       char wake = 0;
+       ssize_t written;
+
+       if (poll_ev->signal_fd == -1) {
+               return;
+       }
+
+       for (;;) {
+               written = write(poll_ev->signal_fd, &wake, sizeof(wake));
+               if ((written != -1) || (errno != EINTR)) {
+                       break;
+               }
+       }
+}
+
+/*
+  read from the fd in poll_ev->fds[0] until a read returns less
+  than a full buffer. Does nothing without a signal fd or without
+  any pollfd entries.
+*/
+static void poll_event_drain_signal_fd(struct poll_event_context *poll_ev)
+{
+       char scratch[16];
+       ssize_t nread;
+       int pipe_rd;
+
+       if ((poll_ev->signal_fd == -1) || (poll_ev->num_fds < 1)) {
+               return;
+       }
+
+       pipe_rd = poll_ev->fds[0].fd;
+
+       for (;;) {
+               nread = read(pipe_rd, scratch, sizeof(scratch));
+               if (nread != sizeof(scratch)) {
+                       break;
+               }
+       }
+}
+
+/*
+  destroy an fd_event
+
+  An fde still on the fresh or disabled list is simply unlinked from
+  it. An fde that already lives in the poll arrays has its slot
+  cleared and the context is marked as having deletions.
+  The common fd destructor runs in every case.
+*/
+static int poll_event_fd_destructor(struct tevent_fd *fde)
+{
+       struct tevent_context *ev = fde->event_ctx;
+       struct poll_event_context *poll_ev;
+       uint64_t del_idx = fde->additional_flags;
+
+       /* already detached from its context: no backend state to fix up */
+       if (ev == NULL) {
+               goto done;
+       }
+
+       poll_ev = talloc_get_type_abort(
+               ev->additional_data, struct poll_event_context);
+
+       /* UINT64_MAX: not in the arrays, additional_data holds the list head */
+       if (del_idx == UINT64_MAX) {
+               struct tevent_fd **listp =
+                       (struct tevent_fd **)fde->additional_data;
+
+               DLIST_REMOVE((*listp), fde);
+               goto done;
+       }
+
+       /* leave a hole in the array and flag it via poll_ev->deleted */
+       poll_ev->fdes[del_idx] = NULL;
+       poll_ev->deleted = true;
+       poll_event_wake_pollthread(poll_ev);
+done:
+       return tevent_common_fd_destructor(fde);
+}
+
+/*
+  schedule an immediate event through the common code and then
+  wake the poll thread
+*/
+static void poll_event_schedule_immediate(struct tevent_immediate *im,
+                                         struct tevent_context *ev,
+                                         tevent_immediate_handler_t handler,
+                                         void *private_data,
+                                         const char *handler_name,
+                                         const char *location)
+{
+       struct poll_event_context *backend;
+
+       backend = talloc_get_type_abort(ev->additional_data,
+                                       struct poll_event_context);
+
+       tevent_common_schedule_immediate(im, ev, handler, private_data,
+                                        handler_name, location);
+
+       poll_event_wake_pollthread(backend);
+}
+
+/*
+  Private function called by "standard" backend fallback.
+  Note this only allows fallback to "poll" backend, not "poll-mt".
+
+  Puts fde on the fresh list (flags != 0) or on the disabled list
+  (flags == 0), records that list in fde->additional_data, marks
+  the fde as not being in the poll arrays (UINT64_MAX) and installs
+  the poll backend's fd destructor.
+*/
+_PRIVATE_ void tevent_poll_event_add_fd_internal(struct tevent_context *ev,
+                                                struct tevent_fd *fde)
+{
+       struct poll_event_context *backend = talloc_get_type_abort(
+               ev->additional_data, struct poll_event_context);
+       struct tevent_fd **target =
+               (fde->flags != 0) ? &backend->fresh : &backend->disabled;
+
+       fde->additional_flags   = UINT64_MAX;
+       fde->additional_data    = target;
+
+       DLIST_ADD((*target), fde);
+       talloc_set_destructor(fde, poll_event_fd_destructor);
+}
+
+/*
+  add a fd based event
+  return NULL on failure (memory allocation error)
+
+  NULL is also returned for a negative fd. The new fde is a talloc
+  child of mem_ctx (or of ev when mem_ctx is NULL) and is only
+  queued here; it is not yet part of the poll arrays.
+*/
+static struct tevent_fd *poll_event_add_fd(struct tevent_context *ev,
+                                          TALLOC_CTX *mem_ctx,
+                                          int fd, uint16_t flags,
+                                          tevent_fd_handler_t handler,
+                                          void *private_data,
+                                          const char *handler_name,
+                                          const char *location)
+{
+       struct poll_event_context *poll_ev = talloc_get_type_abort(
+               ev->additional_data, struct poll_event_context);
+       struct tevent_fd *fde;
+
+       if (fd < 0) {
+               return NULL;
+       }
+
+       fde = talloc(mem_ctx ? mem_ctx : ev, struct tevent_fd);
+       if (fde == NULL) {
+               return NULL;
+       }
+       fde->event_ctx          = ev;
+       fde->fd                 = fd;
+       fde->flags              = flags;
+       fde->handler            = handler;
+       fde->close_fn           = NULL;
+       fde->private_data       = private_data;
+       fde->handler_name       = handler_name;
+       fde->location           = location;
+       fde->additional_flags   = UINT64_MAX;
+       fde->additional_data    = NULL;
+
+       /* links the fde into the fresh or disabled list and sets the destructor */
+       tevent_poll_event_add_fd_internal(ev, fde);
+       poll_event_wake_pollthread(poll_ev);
+
+       /*
+        * poll_event_loop_poll will take care of the rest in
+        * poll_event_setup_fresh
+        */
+       return fde;
+}
+
+/*
+  set the fd event flags
+
+  Three cases are handled:
+   - the fde is not in the pollfd array yet (additional_flags is
+     UINT64_MAX): it is re-queued on the fresh or disabled list,
+     depending on the new flags
+   - the fde is in the array and the new flags are 0: its slot is
+     cleared and it is moved to the disabled list
+   - otherwise the poll events of its slot are updated in place
+
+  Does nothing if the fde is no longer attached to an event context.
+*/
+static void poll_event_set_fd_flags(struct tevent_fd *fde, uint16_t flags)
+{
+       struct tevent_context *ev = fde->event_ctx;
+       struct poll_event_context *poll_ev;
+       /* index into poll_ev->fds/fdes, or UINT64_MAX if not in the array */
+       uint64_t idx = fde->additional_flags;
+       uint16_t pollflags;
+
+       if (ev == NULL) {
+               return;
+       }
+       poll_ev = talloc_get_type_abort(
+               ev->additional_data, struct poll_event_context);
+
+       fde->flags = flags;
+
+       if (idx == UINT64_MAX) {
+               /*
+                * additional_data holds the address of the list head
+                * (fresh or disabled) the fde is currently linked on.
+                */
+               struct tevent_fd **listp =
+                       (struct tevent_fd **)fde->additional_data;
+
+               /*
+                * We move it between the fresh and disabled lists.
+                */
+               DLIST_REMOVE((*listp), fde);
+               tevent_poll_event_add_fd_internal(ev, fde);
+               poll_event_wake_pollthread(poll_ev);
+               return;
+       }
+
+       if (fde->flags == 0) {
+               /*
+                * We need to remove it from the array
+                * and move it to the disabled list.
+                *
+                * The slot is only cleared here; poll_event_setup_fresh
+                * compacts the arrays when poll_ev->deleted is set.
+                */
+               poll_ev->fdes[idx] = NULL;
+               poll_ev->deleted = true;
+               DLIST_REMOVE(ev->fd_events, fde);
+               tevent_poll_event_add_fd_internal(ev, fde);
+               poll_event_wake_pollthread(poll_ev);
+               return;
+       }
+
+       /* translate the tevent flags into poll(2) event bits */
+       pollflags = 0;
+
+       if (flags & TEVENT_FD_READ) {
+               pollflags |= (POLLIN|POLLHUP);
+       }
+       if (flags & TEVENT_FD_WRITE) {
+               pollflags |= (POLLOUT);
+       }
+       poll_ev->fds[idx].events = pollflags;
+
+       poll_event_wake_pollthread(poll_ev);
+}
+
+/*
+  Bring the pollfd array in sync with the fd events before calling poll().
+
+  First, if poll_ev->deleted is set, slots whose fdes[] entry is NULL are
+  removed by moving the last array element into the hole. Then every
+  tevent_fd on the fresh list is appended to fds[]/fdes[] (growing the
+  arrays if needed) and moved to ev->fd_events.
+
+  Returns false if growing one of the arrays fails, true otherwise.
+*/
+static bool poll_event_setup_fresh(struct tevent_context *ev,
+                                  struct poll_event_context *poll_ev)
+{
+       struct tevent_fd *fde, *next;
+       unsigned num_fresh, num_fds;
+
+       if (poll_ev->deleted) {
+               /*
+                * Compaction starts at slot 1 when signal_fd is in use
+                * (presumably slot 0 is reserved for it - TODO confirm
+                * against the context init code).
+                */
+               unsigned first_fd = (poll_ev->signal_fd != -1) ? 1 : 0;
+               unsigned i;
+
+               /* i is only advanced when slot i is known to be in use */
+               for (i=first_fd; i < poll_ev->num_fds;) {
+                       fde = poll_ev->fdes[i];
+                       if (fde != NULL) {
+                               i++;
+                               continue;
+                       }
+
+                       /*
+                        * This slot was cleared (e.g. the fde was
+                        * talloc_free()'ed or disabled). Delete it
+                        * from the arrays
+                        */
+                       poll_ev->num_fds -= 1;
+                       if (poll_ev->num_fds == i) {
+                               /* the hole was the last element */
+                               break;
+                       }
+                       /* fill the hole with the last element ... */
+                       poll_ev->fds[i] = poll_ev->fds[poll_ev->num_fds];
+                       poll_ev->fdes[i] = poll_ev->fdes[poll_ev->num_fds];
+                       /* ... and tell the moved fde its new index */
+                       if (poll_ev->fdes[i] != NULL) {
+                               poll_ev->fdes[i]->additional_flags = i;
+                       }
+               }
+       }
+       poll_ev->deleted = false;
+
+       if (poll_ev->fresh == NULL) {
+               return true;
+       }
+
+       /* count the fresh entries to know how large the arrays must be */
+       num_fresh = 0;
+       for (fde = poll_ev->fresh; fde; fde = fde->next) {
+               num_fresh += 1;
+       }
+       num_fds = poll_ev->num_fds + num_fresh;
+
+       /*
+        * We check the length of fdes here. It is the last one
+        * enlarged, so if the realloc for poll_ev->fdes fails,
+        * poll_ev->fds will have at least the size of poll_ev->fdes
+        */
+
+       if (num_fds >= talloc_array_length(poll_ev->fdes)) {
+               struct pollfd *tmp_fds;
+               struct tevent_fd **tmp_fdes;
+               unsigned array_length;
+
+               array_length = (num_fds + 15) & ~15; /* round up to 16 */
+
+               tmp_fds = talloc_realloc(
+                       poll_ev, poll_ev->fds, struct pollfd, array_length);
+               if (tmp_fds == NULL) {
+                       return false;
+               }
+               poll_ev->fds = tmp_fds;
+
+               tmp_fdes = talloc_realloc(
+                       poll_ev, poll_ev->fdes, struct tevent_fd *,
+                       array_length);
+               if (tmp_fdes == NULL) {
+                       return false;
+               }
+               poll_ev->fdes = tmp_fdes;
+       }
+
+       /* append each fresh fde to the arrays and move it to fd_events */
+       for (fde = poll_ev->fresh; fde; fde = next) {
+               struct pollfd *pfd;
+
+               pfd = &poll_ev->fds[poll_ev->num_fds];
+
+               pfd->fd = fde->fd;
+               pfd->events = 0;
+               pfd->revents = 0;
+
+               if (fde->flags & TEVENT_FD_READ) {
+                       pfd->events |= (POLLIN|POLLHUP);
+               }
+               if (fde->flags & TEVENT_FD_WRITE) {
+                       pfd->events |= (POLLOUT);
+               }
+
+               /* remember the array index in the fde itself */
+               fde->additional_flags = poll_ev->num_fds;
+               poll_ev->fdes[poll_ev->num_fds] = fde;
+
+               /* fetch the successor before the fde is unlinked */
+               next = fde->next;
+               DLIST_REMOVE(poll_ev->fresh, fde);
+               DLIST_ADD(ev->fd_events, fde);
+
+               poll_ev->num_fds += 1;
+       }
+       return true;
+}
+
+/*
+  event loop handling using poll()
+
+  Runs a single poll() cycle: pending signals are checked first, the
+  pollfd array is synced via poll_event_setup_fresh(), poll() is called
+  with the timeout derived from tvalp (NULL means wait forever) and at
+  most one ready fd handler is invoked.
+
+  Returns -1 if poll_event_setup_fresh() fails, 0 in all other cases.
+*/
+static int poll_event_loop_poll(struct tevent_context *ev,
+                               struct timeval *tvalp)
+{
+       struct poll_event_context *poll_ev = talloc_get_type_abort(
+               ev->additional_data, struct poll_event_context);
+       int pollrtn;
+       int timeout = -1;
+       int poll_errno;
+       struct tevent_fd *fde = NULL;
+       struct tevent_fd *next = NULL;
+       unsigned i;
+
+       if (ev->signal_events && tevent_common_check_signal(ev)) {
+               return 0;
+       }
+
+       if (tvalp != NULL) {
+               /* convert to milliseconds, rounding the usec part up */
+               timeout = tvalp->tv_sec * 1000;
+               timeout += (tvalp->tv_usec + 999) / 1000;
+       }
+
+       poll_event_drain_signal_fd(poll_ev);
+
+       if (!poll_event_setup_fresh(ev, poll_ev)) {
+               return -1;
+       }
+
+       tevent_trace_point_callback(poll_ev->ev, TEVENT_TRACE_BEFORE_WAIT);
+       pollrtn = poll(poll_ev->fds, poll_ev->num_fds, timeout);
+       /* save errno, the trace callback may clobber it */
+       poll_errno = errno;
+       tevent_trace_point_callback(poll_ev->ev, TEVENT_TRACE_AFTER_WAIT);
+
+       if (pollrtn == -1 && poll_errno == EINTR && ev->signal_events) {
+               tevent_common_check_signal(ev);
+               return 0;
+       }
+
+       if (pollrtn == 0 && tvalp) {
+               /* we don't care about a possible delay here */
+               tevent_common_loop_timer_delay(ev);
+               return 0;
+       }
+
+       if (pollrtn <= 0) {
+               /*
+                * No fd's ready
+                */
+               return 0;
+       }
+
+       /* at least one file descriptor is ready - check
+          which ones and call the handler, being careful to allow
+          the handler to remove itself when called */
+
+       for (fde = ev->fd_events; fde; fde = next) {
+               /*
+                * additional_flags is 64 bit wide and UINT64_MAX marks
+                * "not in the array"; a narrower type would truncate
+                * the marker and make the check below always false.
+                */
+               uint64_t idx = fde->additional_flags;
+               struct pollfd *pfd;
+               uint16_t flags = 0;
+
+               /*
+                * Remember the successor now: the fde may be unlinked
+                * from ev->fd_events below (POLLNVAL handling or
+                * TEVENT_FD_NOT_WRITEABLE dropping the last flag), after
+                * which fde->next no longer refers to this list.
+                */
+               next = fde->next;
+
+               if (idx == UINT64_MAX) {
+                       continue;
+               }
+
+               pfd = &poll_ev->fds[idx];
+
+               if (pfd->revents & POLLNVAL) {
+                       /*
+                        * the socket is dead! this should never
+                        * happen as the socket should have first been
+                        * made readable and that should have removed
+                        * the event, so this must be a bug.
+                        *
+                        * We ignore it here to match the epoll
+                        * behavior.
+                        */
+                       tevent_debug(ev, TEVENT_DEBUG_ERROR,
+                                    "POLLNVAL on fde[%p] fd[%d] - disabling\n",
+                                    fde, pfd->fd);
+                       poll_ev->fdes[idx] = NULL;
+                       poll_ev->deleted = true;
+                       DLIST_REMOVE(ev->fd_events, fde);
+                       fde->event_ctx = NULL;
+                       continue;
+               }
+
+               if (pfd->revents & (POLLHUP|POLLERR)) {
+                       /* If we only wait for TEVENT_FD_WRITE, we
+                          should not tell the event handler about it,
+                          and remove the writable flag, as we only
+                          report errors when waiting for read events
+                          to match the select behavior. */
+                       if (!(fde->flags & TEVENT_FD_READ)) {
+                               TEVENT_FD_NOT_WRITEABLE(fde);
+                               continue;
+                       }
+                       flags |= TEVENT_FD_READ;
+               }
+               if (pfd->revents & POLLIN) {
+                       flags |= TEVENT_FD_READ;
+               }
+               if (pfd->revents & POLLOUT) {
+                       flags |= TEVENT_FD_WRITE;
+               }
+               /*
+                * Note that fde->flags could be changed when using
+                * the poll_mt backend together with threads,
+                * that's why we need to check pfd->revents and fde->flags
+                */
+               flags &= fde->flags;
+               if (flags != 0) {
+                       /*
+                        * Move the fde to the end of the list so other
+                        * ready fds get their turn on the next cycle,
+                        * then run exactly one handler per call.
+                        */
+                       DLIST_DEMOTE(ev->fd_events, fde, struct tevent_fd);
+                       fde->handler(ev, fde, flags, fde->private_data);
+                       return 0;
+               }
+       }
+
+       /* no handler ran: sweep the array for slots reporting POLLNVAL */
+       for (i = 0; i < poll_ev->num_fds; i++) {
+               if (poll_ev->fds[i].revents & POLLNVAL) {
+                       /*
+                        * the socket is dead! this should never
+                        * happen as the socket should have first been
+                        * made readable and that should have removed
+                        * the event, so this must be a bug or
+                        * a race in the poll_mt usage.
+                        */
+                       fde = poll_ev->fdes[i];
+                       tevent_debug(ev, TEVENT_DEBUG_WARNING,
+                                    "POLLNVAL on dangling fd[%d] fde[%p] - disabling\n",
+                                    poll_ev->fds[i].fd, fde);
+                       poll_ev->fdes[i] = NULL;
+                       poll_ev->deleted = true;
+                       if (fde != NULL) {
+                               DLIST_REMOVE(ev->fd_events, fde);
+                               fde->event_ctx = NULL;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+/*
+  Run one iteration of the event loop for ev.
+
+  Signals are processed first, then immediate events, then expired
+  timers; only if none of those did any work do we fall through to
+  poll() with the delay until the next timer as timeout.
+*/
+static int poll_event_loop_once(struct tevent_context *ev,
+                               const char *location)
+{
+       struct timeval delay;
+
+       if (ev->signal_events) {
+               if (tevent_common_check_signal(ev)) {
+                       return 0;
+               }
+       }
+
+       if (ev->immediate_events) {
+               if (tevent_common_loop_immediate(ev)) {
+                       return 0;
+               }
+       }
+
+       delay = tevent_common_loop_timer_delay(ev);
+       if (!tevent_timeval_is_zero(&delay)) {
+               return poll_event_loop_poll(ev, &delay);
+       }
+
+       return 0;
+}
+
+/*
+  Keep running the event loop until no events of any kind are left
+  (including fd events still sitting on the fresh or disabled lists).
+
+  Returns the first non-zero result of _tevent_loop_once(), or 0 once
+  the context has run out of events.
+*/
+static int poll_event_loop_wait(struct tevent_context *ev,
+                               const char *location)
+{
+       struct poll_event_context *pctx = talloc_get_type_abort(
+               ev->additional_data, struct poll_event_context);
+
+       for (;;) {
+               int rc;
+
+               /* stop as soon as nothing is pending any more */
+               if (!(ev->fd_events ||
+                     ev->timer_events ||
+                     ev->immediate_events ||
+                     ev->signal_events ||
+                     pctx->fresh ||
+                     pctx->disabled)) {
+                       break;
+               }
+
+               rc = _tevent_loop_once(ev, location);
+               if (rc != 0) {
+                       tevent_debug(ev, TEVENT_DEBUG_FATAL,
+                                    "_tevent_loop_once() failed: %d - %s\n",
+                                    rc, strerror(errno));
+                       return rc;
+               }
+       }
+
+       tevent_debug(ev, TEVENT_DEBUG_WARNING,
+                    "poll_event_loop_wait() out of events\n");
+       return 0;
+}
+
+/*
+  Operations table of the "poll" backend (registered by
+  tevent_poll_init()). fd handling and the loop functions are
+  poll-specific; timers, immediates, signals and the fd close/flag
+  getters use the tevent_common_* implementations.
+*/
+static const struct tevent_ops poll_event_ops = {
+       .context_init           = poll_event_context_init,
+       .add_fd                 = poll_event_add_fd,
+       .set_fd_close_fn        = tevent_common_fd_set_close_fn,
+       .get_fd_flags           = tevent_common_fd_get_flags,
+       .set_fd_flags           = poll_event_set_fd_flags,
+       .add_timer              = tevent_common_add_timer_v2,
+       .schedule_immediate     = tevent_common_schedule_immediate,
+       .add_signal             = tevent_common_add_signal,
+       .loop_once              = poll_event_loop_once,
+       .loop_wait              = poll_event_loop_wait,
+};
+
+/*
+  Register the "poll" backend with tevent.
+  Returns the result of tevent_register_backend().
+*/
+_PRIVATE_ bool tevent_poll_init(void)
+{
+       bool registered;
+
+       registered = tevent_register_backend("poll", &poll_event_ops);
+       return registered;
+}
+
+/*
+  Operations table of the "poll_mt" backend (registered by
+  tevent_poll_mt_init()). It differs from poll_event_ops only in
+  .context_init and .schedule_immediate, which use the poll-specific
+  poll_event_context_init_mt and poll_event_schedule_immediate.
+*/
+static const struct tevent_ops poll_event_mt_ops = {
+       .context_init           = poll_event_context_init_mt,
+       .add_fd                 = poll_event_add_fd,
+       .set_fd_close_fn        = tevent_common_fd_set_close_fn,
+       .get_fd_flags           = tevent_common_fd_get_flags,
+       .set_fd_flags           = poll_event_set_fd_flags,
+       .add_timer              = tevent_common_add_timer_v2,
+       .schedule_immediate     = poll_event_schedule_immediate,
+       .add_signal             = tevent_common_add_signal,
+       .loop_once              = poll_event_loop_once,
+       .loop_wait              = poll_event_loop_wait,
+};
+
+/*
+  Register the "poll_mt" backend with tevent.
+  Returns the result of tevent_register_backend().
+*/
+_PRIVATE_ bool tevent_poll_mt_init(void)
+{
+       bool registered;
+
+       registered = tevent_register_backend("poll_mt", &poll_event_mt_ops);
+       return registered;
+}
diff --git a/ctdb/lib/tevent/tevent_queue.c b/ctdb/lib/tevent/tevent_queue.c
new file mode 100644 (file)
index 0000000..4750675
--- /dev/null
@@ -0,0 +1,300 @@
+/*
+   Unix SMB/CIFS implementation.
+   Infrastructure for async requests
+   Copyright (C) Volker Lendecke 2008
+   Copyright (C) Stefan Metzmacher 2009
+
+     ** NOTE! The following LGPL license applies to the tevent
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "tevent.h"
+#include "tevent_internal.h"
+#include "tevent_util.h"
+
+/*
+  One pending element of a tevent_queue. Entries are allocated as talloc
+  children of their request and unlink themselves from the queue in
+  their destructor.
+*/
+struct tevent_queue_entry {
+       /* links for the doubly linked list rooted at queue->list */
+       struct tevent_queue_entry *prev, *next;
+       /* the queue this entry belongs to */
+       struct tevent_queue *queue;
+
+       /*
+        * true once the trigger function has been called, or from the
+        * start if the entry has no trigger (a pure blocker)
+        */
+       bool triggered;
+
+       /* the request passed to the trigger function */
+       struct tevent_req *req;
+       /* event context used to schedule the queue's immediate event */
+       struct tevent_context *ev;
+
+       /* function called when the entry reaches the head of the queue */
+       tevent_queue_trigger_fn_t trigger;
+       /* opaque pointer handed to the trigger function */
+       void *private_data;
+};
+
+/*
+  A queue of requests; only the entry at the head of the list is
+  triggered.
+*/
+struct tevent_queue {
+       /* talloc'ed copy of the name given at creation time */
+       const char *name;
+       /* location string passed to _tevent_queue_create() */
+       const char *location;
+
+       /* entries are only triggered while this is true */
+       bool running;
+       /* immediate event reused to trigger the head entry */
+       struct tevent_immediate *immediate;
+
+       /* number of entries currently linked on list */
+       size_t length;
+       /* head of the entry list */
+       struct tevent_queue_entry *list;
+};
+
+static void tevent_queue_immediate_trigger(struct tevent_context *ev,
+                                          struct tevent_immediate *im,
+                                          void *private_data);
+
+/*
+  talloc destructor of a queue entry: unlink the entry from its queue
+  and, if the queue is running and the new head has not been triggered
+  yet, schedule the immediate event that will trigger it.
+*/
+static int tevent_queue_entry_destructor(struct tevent_queue_entry *e)
+{
+       struct tevent_queue *queue = e->queue;
+
+       if (queue == NULL) {
+               return 0;
+       }
+
+       DLIST_REMOVE(queue->list, e);
+       queue->length -= 1;
+
+       /* only a running queue with an untriggered head needs a kick */
+       if (queue->running &&
+           queue->list != NULL &&
+           !queue->list->triggered) {
+               tevent_schedule_immediate(queue->immediate,
+                                         queue->list->ev,
+                                         tevent_queue_immediate_trigger,
+                                         queue);
+       }
+
+       return 0;
+}
+
+/*
+  talloc destructor of a queue: stop the queue and free every entry
+  that is still linked on it.
+*/
+static int tevent_queue_destructor(struct tevent_queue *q)
+{
+       struct tevent_queue_entry *head;
+
+       /* stopped first, so the entry destructors do not reschedule */
+       q->running = false;
+
+       /* each talloc_free() unlinks the head via the entry destructor */
+       for (head = q->list; head != NULL; head = q->list) {
+               talloc_free(head);
+       }
+
+       return 0;
+}
+
+/*
+  Allocate a new queue as talloc child of mem_ctx.
+
+  The name is duplicated onto the queue, an immediate event is created
+  for later use and the queue starts out in the running state.
+
+  Returns NULL if any of the allocations fails.
+*/
+struct tevent_queue *_tevent_queue_create(TALLOC_CTX *mem_ctx,
+                                         const char *name,
+                                         const char *location)
+{
+       struct tevent_queue *q = talloc_zero(mem_ctx, struct tevent_queue);
+
+       if (q == NULL) {
+               return NULL;
+       }
+
+       q->name = talloc_strdup(q, name);
+       if (q->name == NULL) {
+               goto fail;
+       }
+
+       q->immediate = tevent_create_immediate(q);
+       if (q->immediate == NULL) {
+               goto fail;
+       }
+
+       q->location = location;
+
+       /* a fresh queue is running unless explicitly stopped */
+       q->running = true;
+
+       talloc_set_destructor(q, tevent_queue_destructor);
+       return q;
+
+fail:
+       talloc_free(q);
+       return NULL;
+}
+
+/*
+  Immediate event handler: call the trigger function of the entry at
+  the head of the queue.
+
+  private_data is the tevent_queue. Nothing happens if the queue has
+  been stopped, if it has become empty since the immediate event was
+  scheduled, or if the current head has already been triggered.
+*/
+static void tevent_queue_immediate_trigger(struct tevent_context *ev,
+                                          struct tevent_immediate *im,
+                                          void *private_data)
+{
+       struct tevent_queue *q = talloc_get_type(private_data,
+                                 struct tevent_queue);
+
+       if (!q->running) {
+               return;
+       }
+
+       /*
+        * The entry this event was scheduled for may have been freed
+        * in the meantime; the entry destructor does not cancel the
+        * pending immediate when the queue becomes empty.
+        */
+       if (!q->list) {
+               return;
+       }
+
+       /*
+        * The head may have changed since scheduling: never call a
+        * trigger twice, and never call the NULL trigger of a blocker
+        * entry (those are marked triggered from the start).
+        */
+       if (q->list->triggered) {
+               return;
+       }
+
+       q->list->triggered = true;
+       q->list->trigger(q->list->req, q->list->private_data);
+}
+
+/*
+  Append a new entry for req to the end of the queue.
+
+  The entry is allocated as talloc child of req. If the queue is running
+  and its head has not been triggered yet, the head is triggered either
+  directly (only if allow_direct is true, the queue was empty and req
+  has no callback set yet) or via the queue's immediate event.
+
+  Returns the new entry, or NULL on memory allocation failure.
+*/
+static struct tevent_queue_entry *tevent_queue_add_internal(
+                                       struct tevent_queue *queue,
+                                       struct tevent_context *ev,
+                                       struct tevent_req *req,
+                                       tevent_queue_trigger_fn_t trigger,
+                                       void *private_data,
+                                       bool allow_direct)
+{
+       struct tevent_queue_entry *e;
+
+       e = talloc_zero(req, struct tevent_queue_entry);
+       if (e == NULL) {
+               return NULL;
+       }
+
+       e->queue = queue;
+       e->req = req;
+       e->ev = ev;
+       e->trigger = trigger;
+       e->private_data = private_data;
+
+       /*
+        * if there is no trigger, it is just a blocker
+        */
+       if (trigger == NULL) {
+               e->triggered = true;
+       }
+
+       if (queue->length > 0) {
+               /*
+                * if there are already entries in the
+                * queue do not optimize.
+                */
+               allow_direct = false;
+       }
+
+       if (req->async.fn != NULL) {
+               /*
+                * If the caller wants to optimize for the
+                * empty queue case, call the trigger only
+                * if there is no callback defined for the
+                * request yet.
+                */
+               allow_direct = false;
+       }
+
+       DLIST_ADD_END(queue->list, e, struct tevent_queue_entry *);
+       queue->length++;
+       /* from here on freeing the entry also unlinks it from the queue */
+       talloc_set_destructor(e, tevent_queue_entry_destructor);
+
+       if (!queue->running) {
+               return e;
+       }
+
+       /* the head is already being processed (or is a blocker) */
+       if (queue->list->triggered) {
+               return e;
+       }
+
+       /*
+        * If allowed we directly call the trigger
+        * avoiding possible delays caused by
+        * an immediate event.
+        */
+       if (allow_direct) {
+               queue->list->triggered = true;
+               queue->list->trigger(queue->list->req,
+                                    queue->list->private_data);
+               return e;
+       }
+
+       /* otherwise let the event loop trigger the head entry */
+       tevent_schedule_immediate(queue->immediate,
+                                 queue->list->ev,
+                                 tevent_queue_immediate_trigger,
+                                 queue);
+
+       return e;
+}
+
+/*
+  Add req to the queue without the direct-trigger optimization.
+  Returns false if the queue entry could not be allocated.
+*/
+bool tevent_queue_add(struct tevent_queue *queue,
+                     struct tevent_context *ev,
+                     struct tevent_req *req,
+                     tevent_queue_trigger_fn_t trigger,
+                     void *private_data)
+{
+       return tevent_queue_add_internal(queue, ev, req,
+                                        trigger, private_data,
+                                        false) != NULL;
+}
+
+/*
+  Add req to the queue without the direct-trigger optimization and
+  hand the new entry back to the caller (NULL on allocation failure).
+*/
+struct tevent_queue_entry *tevent_queue_add_entry(
+                                       struct tevent_queue *queue,
+                                       struct tevent_context *ev,
+                                       struct tevent_req *req,
+                                       tevent_queue_trigger_fn_t trigger,
+                                       void *private_data)
+{
+       struct tevent_queue_entry *entry;
+
+       entry = tevent_queue_add_internal(queue, ev, req,
+                                         trigger, private_data, false);
+       return entry;
+}
+
+/*
+  Add req to the queue, allowing the trigger to be called directly
+  (instead of via an immediate event) when the queue is empty.
+  Returns the new entry, or NULL on allocation failure.
+*/
+struct tevent_queue_entry *tevent_queue_add_optimize_empty(
+                                       struct tevent_queue *queue,
+                                       struct tevent_context *ev,
+                                       struct tevent_req *req,
+                                       tevent_queue_trigger_fn_t trigger,
+                                       void *private_data)
+{
+       struct tevent_queue_entry *entry;
+
+       entry = tevent_queue_add_internal(queue, ev, req,
+                                         trigger, private_data, true);
+       return entry;
+}
+
+/*
+  Put the queue into the running state. If it was stopped and its head
+  entry has not been triggered yet, schedule the immediate event that
+  triggers it.
+*/
+void tevent_queue_start(struct tevent_queue *queue)
+{
+       if (queue->running) {
+               /* nothing to do, the queue is running already */
+               return;
+       }
+
+       queue->running = true;
+
+       /* an empty queue or an already triggered head needs no kick */
+       if (queue->list == NULL || queue->list->triggered) {
+               return;
+       }
+
+       tevent_schedule_immediate(queue->immediate,
+                                 queue->list->ev,
+                                 tevent_queue_immediate_trigger,
+                                 queue);
+}
+
+/*
+  Mark the queue as not running. Entries stay queued; no further
+  triggers are issued until tevent_queue_start() is called.
+*/
+void tevent_queue_stop(struct tevent_queue *queue)
+{
+       queue->running = false;
+}
+
+/*
+  Return the number of entries currently in the queue.
+*/
+size_t tevent_queue_length(struct tevent_queue *queue)
+{
+       return queue->length;
+}
+
+/*
+  Return true if the queue is in the running state.
+*/
+bool tevent_queue_running(struct tevent_queue *queue)
+{
+       return queue->running;
+}
diff --git a/ctdb/lib/tevent/tevent_req.c b/ctdb/lib/tevent/tevent_req.c
new file mode 100644 (file)
index 0000000..d8d0c5f
--- /dev/null
@@ -0,0 +1,294 @@
+/*
+   Unix SMB/CIFS implementation.
+   Infrastructure for async requests
+   Copyright (C) Volker Lendecke 2008
+   Copyright (C) Stefan Metzmacher 2009
+
+     ** NOTE! The following LGPL license applies to the tevent
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "tevent.h"
+#include "tevent_internal.h"
+#include "tevent_util.h"
+
+/*
+  Default printer for a request: formats the request pointer, creation
+  location, state, error code, state data name/pointer and timer into a
+  string allocated on mem_ctx.
+*/
+char *tevent_req_default_print(struct tevent_req *req, TALLOC_CTX *mem_ctx)
+{
+       /* the error is printed twice: decimal and hexadecimal */
+       unsigned long long err = (unsigned long long)req->internal.error;
+       const char *data_name = talloc_get_name(req->data);
+
+       return talloc_asprintf(mem_ctx,
+                              "tevent_req[%p/%s]: state[%d] error[%lld (0x%llX)] "
+                              " state[%s (%p)] timer[%p]",
+                              req, req->internal.create_location,
+                              req->internal.state,
+                              err,
+                              err,
+                              data_name,
+                              req->data,
+                              req->internal.timer
+                              );
+}
+
+/*
+  Print a request using its private print function if one is set,
+  falling back to tevent_req_default_print() otherwise.
+*/
+char *tevent_req_print(TALLOC_CTX *mem_ctx, struct tevent_req *req)
+{
+       if (req->private_print) {
+               return req->private_print(req, mem_ctx);
+       }
+
+       return tevent_req_default_print(req, mem_ctx);
+}
+
+/*
+  Create a new request on mem_ctx together with its zeroed state data.
+
+  pdata is the address of the caller's state pointer; on success it
+  receives the data_size bytes of state, which are allocated as talloc
+  child of the request and named after type.
+
+  Returns NULL if any allocation fails.
+*/
+struct tevent_req *_tevent_req_create(TALLOC_CTX *mem_ctx,
+                                   void *pdata,
+                                   size_t data_size,
+                                   const char *type,
+                                   const char *location)
+{
+       struct tevent_req *req = talloc_zero(mem_ctx, struct tevent_req);
+       void *state;
+
+       if (req == NULL) {
+               return NULL;
+       }
+
+       req->internal.private_type      = type;
+       req->internal.create_location   = location;
+       req->internal.finish_location   = NULL;
+       req->internal.state             = TEVENT_REQ_IN_PROGRESS;
+       req->internal.defer_callback_ev = NULL;
+
+       /* immediate event later used by tevent_req_post() */
+       req->internal.trigger = tevent_create_immediate(req);
+       if (req->internal.trigger == NULL) {
+               goto fail;
+       }
+
+       state = talloc_zero_size(req, data_size);
+       if (state == NULL) {
+               goto fail;
+       }
+       talloc_set_name_const(state, type);
+
+       req->data = state;
+
+       /* hand the state back through the caller supplied pointer */
+       *(void **)pdata = state;
+       return req;
+
+fail:
+       talloc_free(req);
+       return NULL;
+}
+
+/*
+  Record where the request finished and notify its owner.
+
+  If a deferred callback context was set, the notification is posted
+  to that context (and the deferral is cleared); otherwise the
+  callback, if any, is invoked right away.
+*/
+void _tevent_req_notify_callback(struct tevent_req *req, const char *location)
+{
+       struct tevent_context *deferred_ev = req->internal.defer_callback_ev;
+
+       req->internal.finish_location = location;
+
+       if (deferred_ev) {
+               (void)tevent_req_post(req, deferred_ev);
+               req->internal.defer_callback_ev = NULL;
+               return;
+       }
+
+       if (req->async.fn == NULL) {
+               return;
+       }
+       req->async.fn(req);
+}
+
+/*
+  Set the final state of the request and notify its callback.
+  location is recorded as the place where the request finished.
+*/
+static void tevent_req_finish(struct tevent_req *req,
+                             enum tevent_req_state state,
+                             const char *location)
+{
+       req->internal.state = state;
+       _tevent_req_notify_callback(req, location);
+}
+
+/*
+  Finish the request with state TEVENT_REQ_DONE.
+*/
+void _tevent_req_done(struct tevent_req *req,
+                     const char *location)
+{
+       tevent_req_finish(req, TEVENT_REQ_DONE, location);
+}
+
+/*
+  Finish the request with state TEVENT_REQ_USER_ERROR if error is
+  non-zero.
+
+  Returns true if the request was finished, false if error was 0 (in
+  which case the request is left untouched).
+*/
+bool _tevent_req_error(struct tevent_req *req,
+                      uint64_t error,
+                      const char *location)
+{
+       if (error != 0) {
+               req->internal.error = error;
+               tevent_req_finish(req, TEVENT_REQ_USER_ERROR, location);
+               return true;
+       }
+
+       return false;
+}
+
+/*
+  Finish the request with state TEVENT_REQ_NO_MEMORY.
+*/
+void _tevent_req_oom(struct tevent_req *req, const char *location)
+{
+       tevent_req_finish(req, TEVENT_REQ_NO_MEMORY, location);
+}
+
+/*
+  Helper for checking allocation results: if p is NULL the request is
+  finished with an out-of-memory state and true is returned, otherwise
+  nothing happens and false is returned.
+*/
+bool _tevent_req_nomem(const void *p,
+                      struct tevent_req *req,
+                      const char *location)
+{
+       if (p == NULL) {
+               _tevent_req_oom(req, location);
+               return true;
+       }
+
+       return false;
+}
+
+/**
+ * @internal
+ *
+ * @brief Immediate event callback scheduled by tevent_req_post().
+ *
+ * Re-runs the finish step of the request with the state and finish
+ * location already stored in it, which notifies the callback.
+ *
+ * @param[in]  ev            The event context the immediate ran on.
+ *
+ * @param[in]  im            The immediate event.
+ *
+ * @param[in]  private_data  The async request to be finished.
+ */
+static void tevent_req_trigger(struct tevent_context *ev,
+                              struct tevent_immediate *im,
+                              void *private_data)
+{
+       struct tevent_req *req;
+
+       req = talloc_get_type(private_data, struct tevent_req);
+
+       tevent_req_finish(req,
+                         req->internal.state,
+                         req->internal.finish_location);
+}
+
+/*
+  Schedule the request's internal immediate event on ev, with
+  tevent_req_trigger() as handler, so the request is finished from the
+  event loop instead of from the current call stack.
+
+  Returns req to allow use in a return statement.
+*/
+struct tevent_req *tevent_req_post(struct tevent_req *req,
+                                  struct tevent_context *ev)
+{
+       tevent_schedule_immediate(req->internal.trigger,
+                                 ev, tevent_req_trigger, req);
+       return req;
+}
+
+/*
+  Remember ev as the context on which the next callback notification
+  is to be posted (see _tevent_req_notify_callback()) instead of being
+  delivered directly.
+*/
+void tevent_req_defer_callback(struct tevent_req *req,
+                              struct tevent_context *ev)
+{
+       req->internal.defer_callback_ev = ev;
+}
+
+/*
+  Return true while the request is still in state
+  TEVENT_REQ_IN_PROGRESS.
+*/
+bool tevent_req_is_in_progress(struct tevent_req *req)
+{
+       return (req->internal.state == TEVENT_REQ_IN_PROGRESS);
+}
+
+/*
+  Release the resources held by a request whose result has been
+  consumed: the state data, the immediate trigger and the timer are
+  freed, the private print function is cleared and the state becomes
+  TEVENT_REQ_RECEIVED. The request itself is not freed.
+*/
+void tevent_req_received(struct tevent_req *req)
+{
+       TALLOC_FREE(req->data);
+       req->private_print = NULL;
+
+       TALLOC_FREE(req->internal.trigger);
+       TALLOC_FREE(req->internal.timer);
+
+       req->internal.state = TEVENT_REQ_RECEIVED;
+}
+
+/*
+  Run the event loop on ev until the request is no longer in progress.
+
+  Returns false as soon as tevent_loop_once() reports a failure, true
+  once the request has left the in-progress state.
+*/
+bool tevent_req_poll(struct tevent_req *req,
+                    struct tevent_context *ev)
+{
+       while (tevent_req_is_in_progress(req)) {
+               if (tevent_loop_once(ev) != 0) {
+                       return false;
+               }
+       }
+
+       return true;
+}
+
+/*
+  Check whether the request ended in anything other than
+  TEVENT_REQ_DONE.
+
+  Returns false for TEVENT_REQ_DONE without touching the out
+  parameters. Otherwise *state receives the request state, *error is
+  set only for TEVENT_REQ_USER_ERROR, and true is returned.
+*/
+bool tevent_req_is_error(struct tevent_req *req, enum tevent_req_state *state,
+                       uint64_t *error)
+{
+       switch (req->internal.state) {
+       case TEVENT_REQ_DONE:
+               return false;
+       case TEVENT_REQ_USER_ERROR:
+               *error = req->internal.error;
+               break;
+       default:
+               break;
+       }
+
+       *state = req->internal.state;
+       return true;
+}
+
+/*
+  Timer handler installed by tevent_req_set_endtime(): frees the
+  request's timer and finishes the request with state
+  TEVENT_REQ_TIMED_OUT. private_data is the request.
+*/
+static void tevent_req_timedout(struct tevent_context *ev,
+                              struct tevent_timer *te,
+                              struct timeval now,
+                              void *private_data)
+{
+       struct tevent_req *req = talloc_get_type(private_data,
+                                struct tevent_req);
+
+       TALLOC_FREE(req->internal.timer);
+
+       /* this function's name is recorded as the finish location */
+       tevent_req_finish(req, TEVENT_REQ_TIMED_OUT, __FUNCTION__);
+}
+
+/*
+  Arm (or re-arm) the timeout of a request: any existing timer is freed
+  and a new one firing at endtime is added on ev, with
+  tevent_req_timedout() as handler.
+
+  Returns false if the timer could not be allocated, in which case the
+  request has been finished via tevent_req_nomem().
+*/
+bool tevent_req_set_endtime(struct tevent_req *req,
+                           struct tevent_context *ev,
+                           struct timeval endtime)
+{
+       TALLOC_FREE(req->internal.timer);
+
+       req->internal.timer = tevent_add_timer(ev, req, endtime,
+                                              tevent_req_timedout,
+                                              req);
+
+       return !tevent_req_nomem(req->internal.timer, req);
+}
+
+/*
+  Set the function to be called when the request finishes, together
+  with the private pointer later returned by
+  _tevent_req_callback_data().
+*/
+void tevent_req_set_callback(struct tevent_req *req, tevent_req_fn fn, void *pvt)
+{
+       req->async.fn = fn;
+       req->async.private_data = pvt;
+}
+
+/*
+  Return the private pointer stored by tevent_req_set_callback().
+*/
+void *_tevent_req_callback_data(struct tevent_req *req)
+{
+       return req->async.private_data;
+}
+
+/*
+  Return the state data of the request (the memory allocated by
+  _tevent_req_create(); NULL after tevent_req_received()).
+*/
+void *_tevent_req_data(struct tevent_req *req)
+{
+       return req->data;
+}
+
+/*
+  Set the function tevent_req_print() uses instead of the default
+  printer for this request.
+*/
+void tevent_req_set_print_fn(struct tevent_req *req, tevent_req_print_fn fn)
+{
+       req->private_print = fn;
+}
+
+/*
+  Set the function _tevent_req_cancel() calls to cancel this request.
+*/
+void tevent_req_set_cancel_fn(struct tevent_req *req, tevent_req_cancel_fn fn)
+{
+       req->private_cancel = fn;
+}
+
+/*
+  Ask the request to cancel itself.
+
+  Returns false if no cancel function has been set, otherwise the
+  result of the request's cancel function.
+*/
+bool _tevent_req_cancel(struct tevent_req *req, const char *location)
+{
+       if (req->private_cancel != NULL) {
+               return req->private_cancel(req);
+       }
+
+       return false;
+}
diff --git a/ctdb/lib/tevent/tevent_select.c b/ctdb/lib/tevent/tevent_select.c
new file mode 100644 (file)
index 0000000..bfce246
--- /dev/null
@@ -0,0 +1,277 @@
+/* 
+   Unix SMB/CIFS implementation.
+   main select loop and event handling
+   Copyright (C) Andrew Tridgell       2003-2005
+   Copyright (C) Stefan Metzmacher     2005-2009
+
+     ** NOTE! The following LGPL license applies to the tevent
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/select.h"
+#include "tevent.h"
+#include "tevent_util.h"
+#include "tevent_internal.h"
+
+/*
+  private state of the select() backend; select_event_context_init()
+  stores it in tevent_context.additional_data
+*/
+struct select_event_context {
+       /* a pointer back to the generic event_context */
+       struct tevent_context *ev;
+
+       /* the maximum file descriptor number in fd_events,
+          or EVENT_INVALID_MAXFD when it has to be recalculated */
+       int maxfd;
+
+       /* information for exiting from the event loop */
+       int exit_code;
+};
+
+/*
+  allocate a zeroed select_event_context as talloc child of ev and
+  attach it to ev->additional_data.
+  returns 0 on success, -1 if the allocation failed
+*/
+static int select_event_context_init(struct tevent_context *ev)
+{
+       struct select_event_context *ctx;
+
+       /*
+        * tevent_re_initialise() may call us again on a context that
+        * already carries backend data; release the old one first.
+        */
+       TALLOC_FREE(ev->additional_data);
+
+       ctx = talloc_zero(ev, struct select_event_context);
+       if (ctx == NULL) {
+               return -1;
+       }
+
+       ctx->ev = ev;
+       ev->additional_data = ctx;
+
+       return 0;
+}
+
+/*
+  recalculate the maxfd by walking all fd events of the context
+  (the result is 0 when there are none)
+*/
+static void calc_maxfd(struct select_event_context *select_ev)
+{
+       const struct tevent_fd *cur = select_ev->ev->fd_events;
+       int highest = 0;
+
+       while (cur != NULL) {
+               if (cur->fd > highest) {
+                       highest = cur->fd;
+               }
+               cur = cur->next;
+       }
+
+       select_ev->maxfd = highest;
+}
+
+
+/* to mark select_ev->maxfd invalid
+ * this means we need to recalculate it (see calc_maxfd())
+ * before it is passed to select() again
+ */
+#define EVENT_INVALID_MAXFD (-1)
+
+/*
+  destroy an fd_event: if the event owned the highest descriptor the
+  cached maxfd is invalidated, then the common destructor is run
+*/
+static int select_event_fd_destructor(struct tevent_fd *fde)
+{
+       struct tevent_context *owner = fde->event_ctx;
+
+       if (owner != NULL) {
+               struct select_event_context *select_ev =
+                       talloc_get_type(owner->additional_data,
+                                       struct select_event_context);
+
+               if (fde->fd == select_ev->maxfd) {
+                       select_ev->maxfd = EVENT_INVALID_MAXFD;
+               }
+       }
+
+       return tevent_common_fd_destructor(fde);
+}
+
+/*
+  add a fd based event
+  return NULL on failure: either tevent_common_add_fd() failed, or fd is
+  outside [0, FD_SETSIZE), in which case errno is set to EBADF
+*/
+static struct tevent_fd *select_event_add_fd(struct tevent_context *ev, TALLOC_CTX *mem_ctx,
+                                            int fd, uint16_t flags,
+                                            tevent_fd_handler_t handler,
+                                            void *private_data,
+                                            const char *handler_name,
+                                            const char *location)
+{
+       struct select_event_context *select_ev;
+       struct tevent_fd *new_fde;
+
+       select_ev = talloc_get_type(ev->additional_data,
+                                   struct select_event_context);
+
+       /* only descriptors that fit into an fd_set can be watched */
+       if (!(fd >= 0 && fd < FD_SETSIZE)) {
+               errno = EBADF;
+               return NULL;
+       }
+
+       new_fde = tevent_common_add_fd(ev, mem_ctx, fd, flags,
+                                      handler, private_data,
+                                      handler_name, location);
+       if (new_fde == NULL) {
+               return NULL;
+       }
+
+       /* an invalidated maxfd stays invalid, it is recalculated later */
+       if (select_ev->maxfd != EVENT_INVALID_MAXFD) {
+               if (new_fde->fd > select_ev->maxfd) {
+                       select_ev->maxfd = new_fde->fd;
+               }
+       }
+       talloc_set_destructor(new_fde, select_event_fd_destructor);
+
+       return new_fde;
+}
+
+/*
+  event loop handling using select()
+
+  builds the read/write fd_sets from ev->fd_events, calls select() with
+  tvalp as timeout and then runs the handler of at most one ready fd.
+  returns 0 on success (also when only signals or timers were handled)
+  and -1 on a fatal error: an fd event outside [0, FD_SETSIZE) (errno is
+  set to EBADF) or EBADF reported by select() (exit_code is set to EBADF)
+*/
+static int select_event_loop_select(struct select_event_context *select_ev, struct timeval *tvalp)
+{
+       fd_set r_fds, w_fds;
+       struct tevent_fd *fde;
+       int selrtn;
+       int select_errno;
+
+       /* we maybe need to recalculate the maxfd */
+       if (select_ev->maxfd == EVENT_INVALID_MAXFD) {
+               calc_maxfd(select_ev);
+       }
+
+       FD_ZERO(&r_fds);
+       FD_ZERO(&w_fds);
+
+       /* setup any fd events */
+       for (fde = select_ev->ev->fd_events; fde; fde = fde->next) {
+               if (fde->fd < 0 || fde->fd >= FD_SETSIZE) {
+                       tevent_debug(select_ev->ev, TEVENT_DEBUG_FATAL,
+                                    "ERROR: EBADF fd[%d] >= %d "
+                                    "select_event_loop_once\n",
+                                    fde->fd, FD_SETSIZE);
+                       errno = EBADF;
+                       return -1;
+               }
+
+               if (fde->flags & TEVENT_FD_READ) {
+                       FD_SET(fde->fd, &r_fds);
+               }
+               if (fde->flags & TEVENT_FD_WRITE) {
+                       FD_SET(fde->fd, &w_fds);
+               }
+       }
+
+       /* handle signals that are already pending instead of waiting */
+       if (select_ev->ev->signal_events &&
+           tevent_common_check_signal(select_ev->ev)) {
+               return 0;
+       }
+
+       tevent_trace_point_callback(select_ev->ev, TEVENT_TRACE_BEFORE_WAIT);
+       selrtn = select(select_ev->maxfd+1, &r_fds, &w_fds, NULL, tvalp);
+       /* keep select()'s errno: the trace callback below runs before
+          the result is examined */
+       select_errno = errno;
+       tevent_trace_point_callback(select_ev->ev, TEVENT_TRACE_AFTER_WAIT);
+
+       /* interrupted while signal events are registered: run the
+          pending signal handlers and report success */
+       if (selrtn == -1 && select_errno == EINTR &&
+           select_ev->ev->signal_events) {
+               tevent_common_check_signal(select_ev->ev);
+               return 0;
+       }
+
+       if (selrtn == -1 && select_errno == EBADF) {
+               /* the socket is dead! this should never
+                  happen as the socket should have first been
+                  made readable and that should have removed
+                  the event, so this must be a bug. This is a
+                  fatal error. */
+               tevent_debug(select_ev->ev, TEVENT_DEBUG_FATAL,
+                            "ERROR: EBADF on select_event_loop_once\n");
+               select_ev->exit_code = EBADF;
+               return -1;
+       }
+
+       /* timeout: no fd is ready */
+       if (selrtn == 0 && tvalp) {
+               /* we don't care about a possible delay here */
+               tevent_common_loop_timer_delay(select_ev->ev);
+               return 0;
+       }
+
+       if (selrtn > 0) {
+               /* at least one file descriptor is ready - check
+                  which ones and call the handler, being careful to allow
+                  the handler to remove itself when called */
+               for (fde = select_ev->ev->fd_events; fde; fde = fde->next) {
+                       uint16_t flags = 0;
+
+                       if (FD_ISSET(fde->fd, &r_fds) && (fde->flags & TEVENT_FD_READ)) {
+                               flags |= TEVENT_FD_READ;
+                       }
+                       if (FD_ISSET(fde->fd, &w_fds) && (fde->flags & TEVENT_FD_WRITE)) {
+                               flags |= TEVENT_FD_WRITE;
+                       }
+                       if (flags) {
+                               /* NOTE(review): DLIST_DEMOTE presumably moves fde
+                                  to the end of the list so that other fds are
+                                  looked at first next time - confirm against
+                                  the macro definition */
+                               DLIST_DEMOTE(select_ev->ev->fd_events, fde, struct tevent_fd);
+                               fde->handler(select_ev->ev, fde, flags, fde->private_data);
+                               /* only one handler is dispatched per call; fde
+                                  is not touched again after its handler ran */
+                               break;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+/*
+  run one iteration of the event loop of ev: the first kind of work
+  found, in the order pending signals, immediate events, expired
+  timers, ends the iteration with 0. Otherwise the result of
+  select_event_loop_select() with the delay to the next timer as
+  timeout is returned. location is not used here.
+*/
+static int select_event_loop_once(struct tevent_context *ev, const char *location)
+{
+       struct select_event_context *select_ev;
+       struct timeval next_timeout;
+
+       select_ev = talloc_get_type(ev->additional_data,
+                                   struct select_event_context);
+
+       if (ev->signal_events != NULL) {
+               if (tevent_common_check_signal(ev)) {
+                       return 0;
+               }
+       }
+
+       if (ev->immediate_events != NULL) {
+               if (tevent_common_loop_immediate(ev)) {
+                       return 0;
+               }
+       }
+
+       /* a zero delay ends this iteration without calling select() */
+       next_timeout = tevent_common_loop_timer_delay(ev);
+       if (tevent_timeval_is_zero(&next_timeout)) {
+               return 0;
+       }
+
+       return select_event_loop_select(select_ev, &next_timeout);
+}
+
+/*
+  operations table of the "select" backend: context_init, add_fd and
+  loop_once are implemented in this file, all other hooks use the
+  tevent_common_* implementations
+*/
+static const struct tevent_ops select_event_ops = {
+       .context_init           = select_event_context_init,
+       .add_fd                 = select_event_add_fd,
+       .set_fd_close_fn        = tevent_common_fd_set_close_fn,
+       .get_fd_flags           = tevent_common_fd_get_flags,
+       .set_fd_flags           = tevent_common_fd_set_flags,
+       .add_timer              = tevent_common_add_timer_v2,
+       .schedule_immediate     = tevent_common_schedule_immediate,
+       .add_signal             = tevent_common_add_signal,
+       .loop_once              = select_event_loop_once,
+       .loop_wait              = tevent_common_loop_wait,
+};
+
+/*
+  register this backend under the name "select"
+  returns the result of tevent_register_backend()
+*/
+_PRIVATE_ bool tevent_select_init(void)
+{
+       bool registered;
+
+       registered = tevent_register_backend("select", &select_event_ops);
+
+       return registered;
+}
diff --git a/ctdb/lib/tevent/tevent_signal.c b/ctdb/lib/tevent/tevent_signal.c
new file mode 100644 (file)
index 0000000..b5a56ef
--- /dev/null
@@ -0,0 +1,484 @@
+/* 
+   Unix SMB/CIFS implementation.
+
+   common events code for signal events
+
+   Copyright (C) Andrew Tridgell       2007
+
+     ** NOTE! The following LGPL license applies to the tevent
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "tevent.h"
+#include "tevent_internal.h"
+#include "tevent_util.h"
+
+/* maximum number of SA_SIGINFO signals to hold in the queue.
+  It is the size of each per-signal siginfo_t ring buffer and the
+  modulus used to compute offsets into it.
+  NB. This *MUST* be a power of 2, in order for the ring buffer
+  wrap to work correctly. Thanks to Petr Vandrovec <petr@vandrovec.name>
+  for this. */
+
+#define TEVENT_SA_INFO_QUEUE_COUNT 64
+
+/*
+  a pair of counters: "count" is advanced when an event is recorded and
+  "seen" when it has been processed; events are pending while the two
+  differ
+*/
+struct tevent_sigcounter {
+       uint32_t count;
+       uint32_t seen;
+};
+
+/* record one more event */
+#define TEVENT_SIG_INCREMENT(s) (s).count++
+/* mark n events as processed */
+#define TEVENT_SIG_SEEN(s, n) (s).seen += (n)
+/* true while there are recorded events that were not processed yet */
+#define TEVENT_SIG_PENDING(s) ((s).seen != (s).count)
+
+/*
+  element of the per-signal handler lists in sig_state->sig_handlers[];
+  each element points to the tevent_signal it was created for
+*/
+struct tevent_common_signal_list {
+       struct tevent_common_signal_list *prev, *next;
+       struct tevent_signal *se;
+};
+
+/*
+  the poor design of signals means that this table must be static global
+
+  it is allocated on first use by tevent_common_add_signal() and shared
+  by all event contexts; all arrays are indexed by signal number
+*/
+static struct tevent_sig_state {
+       /* list of registered handlers per signal */
+       struct tevent_common_signal_list *sig_handlers[TEVENT_NUM_SIGNALS+1];
+       /* the signal action that was in place before ours was installed */
+       struct sigaction *oldact[TEVENT_NUM_SIGNALS+1];
+       /* deliveries recorded/processed per signal */
+       struct tevent_sigcounter signal_count[TEVENT_NUM_SIGNALS+1];
+       /* deliveries recorded/processed over all signals */
+       struct tevent_sigcounter got_signal;
+#ifdef SA_SIGINFO
+       /* with SA_SIGINFO we get quite a lot of info per signal */
+       /* per signal ring buffer of TEVENT_SA_INFO_QUEUE_COUNT entries */
+       siginfo_t *sig_info[TEVENT_NUM_SIGNALS+1];
+       /* advanced when a signal gets blocked because its ring buffer
+          is full */
+       struct tevent_sigcounter sig_blocked[TEVENT_NUM_SIGNALS+1];
+#endif
+} *sig_state;
+
+/*
+  number of events recorded in the sigcounter that have not been
+  marked as seen yet (unsigned 32 bit difference of count and seen)
+*/
+static uint32_t tevent_sig_count(struct tevent_sigcounter s)
+{
+       uint32_t outstanding = s.count - s.seen;
+
+       return outstanding;
+}
+
+/*
+  signal handler - redirects to registered signals
+
+  it only records that signum was delivered (per-signal and global
+  counter) and writes one byte to the pipe of the event contexts that
+  have a handler registered for signum; the registered handlers are run
+  later by tevent_common_check_signal(). errno is preserved.
+*/
+static void tevent_common_signal_handler(int signum)
+{
+       char c = 0;
+       struct tevent_common_signal_list *sl;
+       struct tevent_context *ev = NULL;
+       int saved_errno = errno;
+
+       TEVENT_SIG_INCREMENT(sig_state->signal_count[signum]);
+       TEVENT_SIG_INCREMENT(sig_state->got_signal);
+
+       /* Write to each unique event context. */
+       /* (only consecutive list entries of the same context are
+          collapsed into a single write) */
+       for (sl = sig_state->sig_handlers[signum]; sl; sl = sl->next) {
+               if (sl->se->event_ctx && sl->se->event_ctx != ev) {
+                       ev = sl->se->event_ctx;
+                       /* doesn't matter if this pipe overflows */
+                       (void) write(ev->pipe_fds[1], &c, 1);
+               }
+       }
+
+       /* write() may have changed errno */
+       errno = saved_errno;
+}
+
+#ifdef SA_SIGINFO
+/*
+  signal handler with SA_SIGINFO - redirects to registered signals
+
+  copies *info into the ring buffer sig_state->sig_info[signum][] at the
+  first free slot, then records the delivery via
+  tevent_common_signal_handler(). When this delivery filled the ring
+  buffer the signal is blocked; tevent_common_check_signal() unblocks it
+  after the queued entries were processed.
+*/
+static void tevent_common_signal_handler_info(int signum, siginfo_t *info,
+                                             void *uctx)
+{
+       /* number of queued entries before this delivery is recorded */
+       uint32_t count = tevent_sig_count(sig_state->signal_count[signum]);
+       /* sig_state->signal_count[signum].seen % TEVENT_SA_INFO_QUEUE_COUNT
+        * is the base of the unprocessed signals in the ringbuffer. */
+       uint32_t ofs = (sig_state->signal_count[signum].seen + count) %
+                               TEVENT_SA_INFO_QUEUE_COUNT;
+       sig_state->sig_info[signum][ofs] = *info;
+
+       tevent_common_signal_handler(signum);
+
+       /* handle SA_SIGINFO */
+       if (count+1 == TEVENT_SA_INFO_QUEUE_COUNT) {
+               /* we've filled the info array - block this signal until
+                  these ones are delivered */
+#ifdef HAVE_UCONTEXT_T
+               /*
+                * This is the only way for this to work.
+                * By default signum is blocked inside this
+                * signal handler using a temporary mask,
+                * but what we really need to do now is
+                * block it in the caller's mask, so it
+                * stays blocked when the temporary signal
+                * handler mask is replaced when we return
+                * from here. The caller's mask can be found
+                * in the ucontext_t passed in as the
+                * void *uctx argument.
+                */
+               ucontext_t *ucp = (ucontext_t *)uctx;
+               sigaddset(&ucp->uc_sigmask, signum);
+#else
+               /*
+                * WARNING !!! WARNING !!!!
+                *
+                * This code doesn't work.
+                * By default signum is blocked inside this
+                * signal handler, but calling sigprocmask
+                * modifies the temporary signal mask being
+                * used *inside* this handler, which will be
+                * replaced by the caller's signal mask once
+                * we return from here. See Samba
+                * bug #9550 for details.
+                */
+               sigset_t set;
+               sigemptyset(&set);
+               sigaddset(&set, signum);
+               sigprocmask(SIG_BLOCK, &set, NULL);
+#endif
+               /* remember that signum has to be unblocked again */
+               TEVENT_SIG_INCREMENT(sig_state->sig_blocked[signum]);
+       }
+}
+#endif
+
+/*
+  talloc destructor of a handler list element: unlink it from the
+  handler list of its signal, unless that list is already empty
+*/
+static int tevent_common_signal_list_destructor(struct tevent_common_signal_list *sl)
+{
+       int signum = sl->se->signum;
+
+       if (sig_state->sig_handlers[signum] != NULL) {
+               DLIST_REMOVE(sig_state->sig_handlers[signum], sl);
+       }
+
+       return 0;
+}
+
+/*
+  destroy a signal event
+*/
+static int tevent_signal_destructor(struct tevent_signal *se)
+{
+       struct tevent_common_signal_list *sl;
+       sl = talloc_get_type(se->additional_data,
+                            struct tevent_common_signal_list);
+
+       if (se->event_ctx) {
+               DLIST_REMOVE(se->event_ctx->signal_events, se);
+       }
+
+       talloc_free(sl);
+
+       if (sig_state->sig_handlers[se->signum] == NULL) {
+               /* restore old handler, if any */
+               if (sig_state->oldact[se->signum]) {
+                       sigaction(se->signum, sig_state->oldact[se->signum], NULL);
+                       sig_state->oldact[se->signum] = NULL;
+               }
+#ifdef SA_SIGINFO
+               if (se->sa_flags & SA_SIGINFO) {
+                       if (sig_state->sig_info[se->signum]) {
+                               talloc_free(sig_state->sig_info[se->signum]);
+                               sig_state->sig_info[se->signum] = NULL;
+                       }
+               }
+#endif
+       }
+
+       return 0;
+}
+
+/*
+  this is part of the pipe hack needed to avoid the signal race condition:
+  fd handler of the read end of the signal pipe, it just drains the
+  bytes written by the signal handler
+*/
+static void signal_pipe_handler(struct tevent_context *ev, struct tevent_fd *fde,
+                               uint16_t flags, void *_private)
+{
+       char drain[16];
+
+       /* it's non-blocking, doesn't matter if we read too much */
+       (void) read(fde->fd, drain, sizeof(drain));
+}
+
+/*
+  add a signal event
+  return NULL on failure (memory allocation error)
+*/
+struct tevent_signal *tevent_common_add_signal(struct tevent_context *ev,
+                                              TALLOC_CTX *mem_ctx,
+                                              int signum,
+                                              int sa_flags,
+                                              tevent_signal_handler_t handler,
+                                              void *private_data,
+                                              const char *handler_name,
+                                              const char *location)
+{
+       struct tevent_signal *se;
+       struct tevent_common_signal_list *sl;
+       sigset_t set, oldset;
+
+       if (signum >= TEVENT_NUM_SIGNALS) {
+               errno = EINVAL;
+               return NULL;
+       }
+
+       /* the sig_state needs to be on a global context as it can last across
+          multiple event contexts */
+       if (sig_state == NULL) {
+               sig_state = talloc_zero(NULL, struct tevent_sig_state);
+               if (sig_state == NULL) {
+                       return NULL;
+               }
+       }
+
+       se = talloc(mem_ctx?mem_ctx:ev, struct tevent_signal);
+       if (se == NULL) return NULL;
+
+       se->event_ctx           = ev;
+       se->signum              = signum;
+       se->sa_flags            = sa_flags;
+       se->handler             = handler;
+       se->private_data        = private_data;
+       se->handler_name        = handler_name;
+       se->location            = location;
+       se->additional_data     = NULL;
+
+       sl = talloc(se, struct tevent_common_signal_list);
+       if (!sl) {
+               talloc_free(se);
+               return NULL;
+       }
+       sl->se = se;
+       se->additional_data     = sl;
+
+       /* Ensure, no matter the destruction order, that we always have a handle on the global sig_state */
+       if (!talloc_reference(se, sig_state)) {
+               talloc_free(se);
+               return NULL;
+       }
+
+       /* we need to setup the pipe hack handler if not already
+          setup */
+       if (ev->pipe_fde == NULL) {
+               if (pipe(ev->pipe_fds) == -1) {
+                       talloc_free(se);
+                       return NULL;
+               }
+               ev_set_blocking(ev->pipe_fds[0], false);
+               ev_set_blocking(ev->pipe_fds[1], false);
+               ev->pipe_fde = tevent_add_fd(ev, ev, ev->pipe_fds[0],
+                                            TEVENT_FD_READ,
+                                            signal_pipe_handler, NULL);
+               if (!ev->pipe_fde) {
+                       close(ev->pipe_fds[0]);
+                       close(ev->pipe_fds[1]);
+                       talloc_free(se);
+                       return NULL;
+               }
+       }
+
+       /* only install a signal handler if not already installed */
+       if (sig_state->sig_handlers[signum] == NULL) {
+               struct sigaction act;
+               ZERO_STRUCT(act);
+               act.sa_handler = tevent_common_signal_handler;
+               act.sa_flags = sa_flags;
+#ifdef SA_SIGINFO
+               if (sa_flags & SA_SIGINFO) {
+                       act.sa_handler   = NULL;
+                       act.sa_sigaction = tevent_common_signal_handler_info;
+                       if (sig_state->sig_info[signum] == NULL) {
+                               sig_state->sig_info[signum] =
+                                       talloc_zero_array(sig_state, siginfo_t,
+                                                         TEVENT_SA_INFO_QUEUE_COUNT);
+                               if (sig_state->sig_info[signum] == NULL) {
+                                       talloc_free(se);
+                                       return NULL;
+                               }
+                       }
+               }
+#endif
+               sig_state->oldact[signum] = talloc(sig_state, struct sigaction);
+               if (sig_state->oldact[signum] == NULL) {
+                       talloc_free(se);
+                       return NULL;                    
+               }
+               if (sigaction(signum, &act, sig_state->oldact[signum]) == -1) {
+                       talloc_free(se);
+                       return NULL;
+               }
+       }
+
+       DLIST_ADD(se->event_ctx->signal_events, se);
+
+       /* Make sure the signal doesn't come in while we're mangling list. */
+       sigemptyset(&set);
+       sigaddset(&set, signum);
+       sigprocmask(SIG_BLOCK, &set, &oldset);
+       DLIST_ADD(sig_state->sig_handlers[signum], sl);
+       sigprocmask(SIG_SETMASK, &oldset, NULL);
+
+       talloc_set_destructor(se, tevent_signal_destructor);
+       talloc_set_destructor(sl, tevent_common_signal_list_destructor);
+
+       return se;
+}
+
+/*
+  helper to detect that a talloc parent was freed: "myself" points to
+  the variable holding the pointer to this object, and the destructor
+  sets that variable to NULL
+*/
+struct tevent_se_exists {
+       struct tevent_se_exists **myself;
+};
+
+/* talloc destructor: NULL out the variable that pointed to s */
+static int tevent_se_exists_destructor(struct tevent_se_exists *s)
+{
+       *s->myself = NULL;
+       return 0;
+}
+
+/*
+  check if a signal is pending
+  return != 0 if a signal was pending
+
+  for every signal number with unprocessed deliveries each registered
+  handler is called (once per queued siginfo_t for SA_SIGINFO events,
+  otherwise once with the number of deliveries). Afterwards the
+  deliveries are marked as seen and a signal that was blocked because
+  its siginfo queue was full is unblocked again.
+*/
+int tevent_common_check_signal(struct tevent_context *ev)
+{
+       int i;
+
+       if (!sig_state || !TEVENT_SIG_PENDING(sig_state->got_signal)) {
+               return 0;
+       }
+       
+       for (i=0;i<TEVENT_NUM_SIGNALS+1;i++) {
+               struct tevent_common_signal_list *sl, *next;
+               /* work on a snapshot of the counter: only the
+                  deliveries counted here are processed and marked
+                  as seen below */
+               struct tevent_sigcounter counter = sig_state->signal_count[i];
+               uint32_t count = tevent_sig_count(counter);
+#ifdef SA_SIGINFO
+               /* Ensure we null out any stored siginfo_t entries
+                * after processing for debugging purposes. */
+               bool clear_processed_siginfo = false;
+#endif
+
+               if (count == 0) {
+                       continue;
+               }
+               for (sl=sig_state->sig_handlers[i];sl;sl=next) {
+                       struct tevent_signal *se = sl->se;
+                       struct tevent_se_exists *exists;
+
+                       /* fetched up front, sl may be gone after the
+                          handler has run */
+                       next = sl->next;
+
+                       /*
+                        * We have to be careful to not touch "se"
+                        * after it was deleted in its handler. Thus
+                        * we allocate a child whose destructor will
+                        * tell by nulling out itself that its parent
+                        * is gone.
+                        */
+                       exists = talloc(se, struct tevent_se_exists);
+                       if (exists == NULL) {
+                               /* out of memory: this handler is
+                                  not called in this round */
+                               continue;
+                       }
+                       exists->myself = &exists;
+                       talloc_set_destructor(
+                               exists, tevent_se_exists_destructor);
+
+#ifdef SA_SIGINFO
+                       if (se->sa_flags & SA_SIGINFO) {
+                               uint32_t j;
+
+                               clear_processed_siginfo = true;
+
+                               for (j=0;j<count;j++) {
+                                       /* sig_state->signal_count[i].seen
+                                        * % TEVENT_SA_INFO_QUEUE_COUNT is
+                                        * the base position of the unprocessed
+                                        * signals in the ringbuffer. */
+                                       uint32_t ofs = (counter.seen + j)
+                                               % TEVENT_SA_INFO_QUEUE_COUNT;
+                                       se->handler(ev, se, i, 1,
+                                                   (void*)&sig_state->sig_info[i][ofs], 
+                                                   se->private_data);
+                                       /* the handler freed se */
+                                       if (!exists) {
+                                               break;
+                                       }
+                               }
+#ifdef SA_RESETHAND
+                               /* SA_RESETHAND events are removed after
+                                  their handler ran */
+                               if (exists && (se->sa_flags & SA_RESETHAND)) {
+                                       talloc_free(se);
+                               }
+#endif
+                               talloc_free(exists);
+                               continue;
+                       }
+#endif
+                       se->handler(ev, se, i, count, NULL, se->private_data);
+#ifdef SA_RESETHAND
+                       /* SA_RESETHAND events are removed after their
+                          handler ran */
+                       if (exists && (se->sa_flags & SA_RESETHAND)) {
+                               talloc_free(se);
+                       }
+#endif
+                       talloc_free(exists);
+               }
+
+#ifdef SA_SIGINFO
+               if (clear_processed_siginfo) {
+                       uint32_t j;
+                       for (j=0;j<count;j++) {
+                               uint32_t ofs = (counter.seen + j)
+                                       % TEVENT_SA_INFO_QUEUE_COUNT;
+                               memset((void*)&sig_state->sig_info[i][ofs],
+                                       '\0',
+                                       sizeof(siginfo_t));
+                       }
+               }
+#endif
+
+               TEVENT_SIG_SEEN(sig_state->signal_count[i], count);
+               TEVENT_SIG_SEEN(sig_state->got_signal, count);
+
+#ifdef SA_SIGINFO
+               if (TEVENT_SIG_PENDING(sig_state->sig_blocked[i])) {
+                       /* We'd filled the queue, unblock the
+                          signal now the queue is empty again.
+                          Note we MUST do this after the
+                          TEVENT_SIG_SEEN(sig_state->signal_count[i], count)
+                          call to prevent a new signal running
+                          out of room in the sig_state->sig_info[i][]
+                          ring buffer. */
+                       sigset_t set;
+                       sigemptyset(&set);
+                       sigaddset(&set, i);
+                       TEVENT_SIG_SEEN(sig_state->sig_blocked[i],
+                                tevent_sig_count(sig_state->sig_blocked[i]));
+                       sigprocmask(SIG_UNBLOCK, &set, NULL);
+               }
+#endif
+       }
+
+       return 1;
+}
+
+/*
+  detach a signal event from the global signal state without freeing
+  the event itself: its list element is unlinked from the handler list
+  of the signal and, if that was the last handler, the previous signal
+  action is restored and the saved copy released
+*/
+void tevent_cleanup_pending_signal_handlers(struct tevent_signal *se)
+{
+       struct tevent_common_signal_list *sl;
+       sl = talloc_get_type(se->additional_data,
+                            struct tevent_common_signal_list);
+
+       /* called directly: sl stays allocated, it is only unlinked */
+       tevent_common_signal_list_destructor(sl);
+
+       if (sig_state->sig_handlers[se->signum] == NULL) {
+               if (sig_state->oldact[se->signum]) {
+                       sigaction(se->signum, sig_state->oldact[se->signum], NULL);
+                       /* the saved action is a talloc child of the
+                          long-lived sig_state: free it instead of just
+                          dropping the pointer */
+                       talloc_free(sig_state->oldact[se->signum]);
+                       sig_state->oldact[se->signum] = NULL;
+               }
+       }
+       return;
+}
diff --git a/ctdb/lib/tevent/tevent_standard.c b/ctdb/lib/tevent/tevent_standard.c
new file mode 100644 (file)
index 0000000..2584994
--- /dev/null
@@ -0,0 +1,233 @@
+/* 
+   Unix SMB/CIFS implementation.
+   main select loop and event handling
+   Copyright (C) Stefan Metzmacher      2013
+   Copyright (C) Jeremy Allison         2013
+
+     ** NOTE! The following LGPL license applies to the tevent
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+  This is SAMBA's default event loop code
+
+  - we try to use epoll if configure detected support for it
+    otherwise we use poll()
+  - if epoll is broken on the system or the kernel doesn't support it
+    at runtime we fallback to poll()
+*/
+
+#include "replace.h"
+#include "tevent.h"
+#include "tevent_util.h"
+#include "tevent_internal.h"
+
+/*
+  Per-context state of the "standard" backend, which sits on top of
+  the "epoll" and "poll" backends.
+*/
+struct std_event_glue {
+       /* ops of the "epoll" backend; NULL once poll is being used */
+       const struct tevent_ops *epoll_ops;
+       /* ops of the "poll" backend, used as the fallback */
+       const struct tevent_ops *poll_ops;
+       /* writable ops copy installed as ev->ops; talloc child of this struct */
+       struct tevent_ops *glue_ops;
+       /* 'replay' argument recorded by std_fallback_to_poll() */
+       bool fallback_replay;
+};
+
+static int std_event_context_init(struct tevent_context *ev);
+
+/*
+  Ops registered for the "standard" backend. Only context_init is set:
+  on first use std_event_context_init() replaces ev->ops with a
+  per-context copy of the epoll or poll ops.
+*/
+static const struct tevent_ops std_event_ops = {
+       .context_init           = std_event_context_init,
+};
+
+/*
+  Called when epoll failed at runtime: move this context over to the
+  poll backend. If we return false here, the caller should abort().
+*/
+static bool std_fallback_to_poll(struct tevent_context *ev, bool replay)
+{
+       struct std_event_glue *glue =
+               talloc_get_type_abort(talloc_parent(ev->ops),
+                                     struct std_event_glue);
+       struct tevent_fd *fde;
+
+       /* Remembered for std_event_loop_once(). */
+       glue->fallback_replay = replay;
+
+       /* From now on epoll is out of the picture ... */
+       glue->epoll_ops = NULL;
+
+       /* ... and our custom ops mirror poll, keeping our own init hook. */
+       *glue->glue_ops = *glue->poll_ops;
+       glue->glue_ops->context_init = std_event_context_init;
+
+       /* Bring up the poll backend for this context. */
+       if (glue->poll_ops->context_init(ev) != 0) {
+               return false;
+       }
+
+       /*
+        * Hand every existing file descriptor event over to the poll
+        * backend. Each one is unlinked from ev->fd_events before it
+        * is re-added, so its successor has to be saved first.
+        */
+       fde = ev->fd_events;
+       while (fde != NULL) {
+               struct tevent_fd *next = fde->next;
+
+               DLIST_REMOVE(ev->fd_events, fde);
+               tevent_poll_event_add_fd_internal(ev, fde);
+
+               fde = next;
+       }
+
+       return true;
+}
+
+/*
+  loop_once implementation installed while the epoll backend is in use.
+
+  glue->epoll_ops is deliberately tested only AFTER the call into epoll:
+  std_fallback_to_poll() sets it to NULL, so finding it NULL here means
+  the switch to poll happened during this very call. Do not move the
+  check in front of the call.
+  NOTE(review): presumably the epoll backend invokes the fallback from
+  inside its loop_once - confirm in tevent_epoll.c.
+*/
+static int std_event_loop_once(struct tevent_context *ev, const char *location)
+{
+       void *glue_ptr = talloc_parent(ev->ops);
+       struct std_event_glue *glue =
+               talloc_get_type_abort(glue_ptr,
+               struct std_event_glue);
+       int ret;
+
+       ret = glue->epoll_ops->loop_once(ev, location);
+       if (glue->epoll_ops != NULL) {
+               /* No fallback */
+               return ret;
+       }
+
+       if (!glue->fallback_replay) {
+               /*
+                * The problem happened while modifying an event.
+                * An event handler was triggered in this case
+                * and there is no need to call loop_once() again.
+                */
+               return ret;
+       }
+
+       /* The fallback asked for a replay: run the iteration again on poll. */
+       return glue->poll_ops->loop_once(ev, location);
+}
+
+/*
+  loop_wait implementation installed while the epoll backend is in use.
+
+  As in std_event_loop_once(), glue->epoll_ops is tested only after the
+  call: std_fallback_to_poll() sets it to NULL, so a NULL value here
+  means we switched to poll while epoll's loop_wait was running, and the
+  wait is continued on the poll backend.
+*/
+static int std_event_loop_wait(struct tevent_context *ev, const char *location)
+{
+       void *glue_ptr = talloc_parent(ev->ops);
+       struct std_event_glue *glue =
+               talloc_get_type_abort(glue_ptr,
+               struct std_event_glue);
+       int ret;
+
+       ret = glue->epoll_ops->loop_wait(ev, location);
+       if (glue->epoll_ops != NULL) {
+               /* No fallback */
+               return ret;
+       }
+
+       return glue->poll_ops->loop_wait(ev, location);
+}
+/*
+  Initialize the "standard" backend for 'ev'.
+
+  The epoll backend is used when it is registered and can be
+  initialized; it is then given std_fallback_to_poll() as the switch
+  function to call if epoll fails at runtime. Otherwise the poll
+  backend is used directly.
+
+  Returns -1 on allocation failure or if no "poll" backend is
+  registered, otherwise the return value of the chosen backend's
+  context_init().
+*/
+static int std_event_context_init(struct tevent_context *ev)
+{
+       struct std_event_glue *glue;
+       int ret;
+
+       /*
+        * If this is the first initialization
+        * we need to set up the allocated ops
+        * pointers.
+        */
+
+       if (ev->ops == &std_event_ops) {
+               glue = talloc_zero(ev, struct std_event_glue);
+               if (glue == NULL) {
+                       return -1;
+               }
+
+               /* May be NULL, in which case we go straight to poll. */
+               glue->epoll_ops = tevent_find_ops_byname("epoll");
+
+               glue->poll_ops = tevent_find_ops_byname("poll");
+               if (glue->poll_ops == NULL) {
+                       /*
+                        * ev->ops still points at std_event_ops, so
+                        * drop the unused glue here, exactly like the
+                        * allocation failure path below does.
+                        */
+                       talloc_free(glue);
+                       return -1;
+               }
+
+               /*
+                * Allocate space for our custom ops.
+                * Allocate as a child of the glue structure
+                * so we can easily get to it using talloc_parent.
+                */
+               glue->glue_ops = talloc_zero(glue, struct tevent_ops);
+               if (glue->glue_ops == NULL) {
+                       talloc_free(glue);
+                       return -1;
+               }
+
+               ev->ops = glue->glue_ops;
+       } else {
+               /* Re-initialization: ev->ops is our glue_ops already. */
+               void *glue_ptr = talloc_parent(ev->ops);
+               glue = talloc_get_type_abort(glue_ptr, struct std_event_glue);
+       }
+
+       if (glue->epoll_ops != NULL) {
+               /*
+                * Set custom_ops the same as epoll,
+                * except re-init using std_event_context_init()
+                * and use std_event_loop_once() to add the
+                * ability to fallback to a poll backend on
+                * epoll runtime error.
+                */
+               *glue->glue_ops = *glue->epoll_ops;
+               glue->glue_ops->context_init = std_event_context_init;
+               glue->glue_ops->loop_once = std_event_loop_once;
+               glue->glue_ops->loop_wait = std_event_loop_wait;
+
+               ret = glue->epoll_ops->context_init(ev);
+               if (ret == -1) {
+                       goto fallback;
+               }
+#ifdef HAVE_EPOLL
+               if (!tevent_epoll_set_panic_fallback(ev, std_fallback_to_poll)) {
+                       /* Throw away the epoll state before using poll. */
+                       TALLOC_FREE(ev->additional_data);
+                       goto fallback;
+               }
+#endif
+
+               return ret;
+       }
+
+fallback:
+       glue->epoll_ops = NULL;
+
+       /*
+        * Set custom_ops the same as poll.
+        */
+       *glue->glue_ops = *glue->poll_ops;
+       glue->glue_ops->context_init = std_event_context_init;
+
+       return glue->poll_ops->context_init(ev);
+}
+
+/*
+  Register this backend under the name "standard".
+  Returns the result of tevent_register_backend().
+*/
+_PRIVATE_ bool tevent_standard_init(void)
+{
+       return tevent_register_backend("standard", &std_event_ops);
+}
diff --git a/ctdb/lib/tevent/tevent_timed.c b/ctdb/lib/tevent/tevent_timed.c
new file mode 100644 (file)
index 0000000..920d39f
--- /dev/null
@@ -0,0 +1,355 @@
+/* 
+   Unix SMB/CIFS implementation.
+
+   common events code for timed events
+
+   Copyright (C) Andrew Tridgell       2003-2006
+   Copyright (C) Stefan Metzmacher     2005-2009
+
+     ** NOTE! The following LGPL license applies to the tevent
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "system/time.h"
+#include "tevent.h"
+#include "tevent_internal.h"
+#include "tevent_util.h"
+
+/**
+  compare two timeval structures, seconds first, then microseconds.
+  Return -1 if tv1 < tv2
+  Return 0 if tv1 == tv2
+  Return 1 if tv1 > tv2
+*/
+int tevent_timeval_compare(const struct timeval *tv1, const struct timeval *tv2)
+{
+       if (tv1->tv_sec != tv2->tv_sec) {
+               return (tv1->tv_sec > tv2->tv_sec) ? 1 : -1;
+       }
+       if (tv1->tv_usec != tv2->tv_usec) {
+               return (tv1->tv_usec > tv2->tv_usec) ? 1 : -1;
+       }
+       return 0;
+}
+
+/**
+  return a timeval with both seconds and microseconds set to zero
+*/
+struct timeval tevent_timeval_zero(void)
+{
+       struct timeval zero = { .tv_sec = 0, .tv_usec = 0 };
+
+       return zero;
+}
+
+/**
+  return a timeval for the current time, as reported by gettimeofday()
+*/
+struct timeval tevent_timeval_current(void)
+{
+       struct timeval tv;
+       /* the return value of gettimeofday() is not checked */
+       gettimeofday(&tv, NULL);
+       return tv;
+}
+
+/**
+  build a timeval from the given seconds and microseconds;
+  the values are stored as passed, without normalization
+*/
+struct timeval tevent_timeval_set(uint32_t secs, uint32_t usecs)
+{
+       struct timeval result = { .tv_sec = secs, .tv_usec = usecs };
+
+       return result;
+}
+
+/**
+  return the time remaining from *tv1 until *tv2 (that is *tv2 - *tv1)
+  as a timeval; if tv1 is not earlier than tv2 a zero timeval is
+  returned
+*/
+struct timeval tevent_timeval_until(const struct timeval *tv1,
+                                   const struct timeval *tv2)
+{
+       struct timeval delta;
+
+       if (tevent_timeval_compare(tv1, tv2) >= 0) {
+               return tevent_timeval_zero();
+       }
+
+       delta.tv_sec = tv2->tv_sec - tv1->tv_sec;
+
+       if (tv1->tv_usec <= tv2->tv_usec) {
+               delta.tv_usec = tv2->tv_usec - tv1->tv_usec;
+               return delta;
+       }
+
+       /* borrow one second for the microsecond part */
+       delta.tv_sec--;
+       delta.tv_usec = 1000000 - (tv1->tv_usec - tv2->tv_usec);
+       return delta;
+}
+
+/**
+  return true if both the seconds and the microseconds of a timeval
+  are zero
+*/
+bool tevent_timeval_is_zero(const struct timeval *tv)
+{
+       if (tv->tv_sec != 0) {
+               return false;
+       }
+       return tv->tv_usec == 0;
+}
+
+/**
+  return *tv advanced by secs seconds and usecs microseconds;
+  whole seconds contained in the microsecond sum are carried over
+  into tv_sec
+*/
+struct timeval tevent_timeval_add(const struct timeval *tv, uint32_t secs,
+                                 uint32_t usecs)
+{
+       struct timeval sum = *tv;
+
+       /* microseconds first, carrying whole seconds over */
+       sum.tv_usec += usecs;
+       sum.tv_sec += sum.tv_usec / 1000000;
+       sum.tv_usec %= 1000000;
+
+       sum.tv_sec += secs;
+
+       return sum;
+}
+
+/**
+  return a timeval in the future with a specified offset:
+  the current time plus secs seconds and usecs microseconds
+*/
+struct timeval tevent_timeval_current_ofs(uint32_t secs, uint32_t usecs)
+{
+       struct timeval tv = tevent_timeval_current();
+       return tevent_timeval_add(&tv, secs, usecs);
+}
+
+/*
+  talloc destructor of a timed event: unlink it from the timer list of
+  its event context (nothing to do if it has no event context).
+  Always returns 0.
+*/
+static int tevent_common_timed_destructor(struct tevent_timer *te)
+{
+       struct tevent_context *ev = te->event_ctx;
+
+       if (ev == NULL) {
+               return 0;
+       }
+
+       tevent_debug(ev, TEVENT_DEBUG_TRACE,
+                    "Destroying timer event %p \"%s\"\n",
+                    te, te->handler_name);
+
+       /* keep last_zero_timer pointing at an element still in the list */
+       if (te == ev->last_zero_timer) {
+               ev->last_zero_timer = DLIST_PREV(te);
+       }
+       DLIST_REMOVE(ev->timer_events, te);
+
+       return 0;
+}
+
+/*
+  destructor that always returns -1; tevent_common_loop_timer_delay()
+  installs it while the handler of a timed event runs, to deny the
+  handler to free the event
+*/
+static int tevent_common_timed_deny_destructor(struct tevent_timer *te)
+{
+       return -1;
+}
+
+/*
+  add a timed event, keeping ev->timer_events ordered by next_event
+
+  The event is allocated on mem_ctx, or on ev if mem_ctx is NULL.
+  With optimize_zero set, an event with a zero next_event is inserted
+  directly after ev->last_zero_timer instead of searching the list.
+
+  return NULL on failure (memory allocation error)
+*/
+static struct tevent_timer *tevent_common_add_timer_internal(
+                                       struct tevent_context *ev,
+                                       TALLOC_CTX *mem_ctx,
+                                       struct timeval next_event,
+                                       tevent_timer_handler_t handler,
+                                       void *private_data,
+                                       const char *handler_name,
+                                       const char *location,
+                                       bool optimize_zero)
+{
+       struct tevent_timer *te;
+       struct tevent_timer *insert_after = NULL;
+
+       te = talloc((mem_ctx != NULL) ? mem_ctx : ev, struct tevent_timer);
+       if (te == NULL) {
+               return NULL;
+       }
+
+       te->event_ctx           = ev;
+       te->next_event          = next_event;
+       te->handler             = handler;
+       te->private_data        = private_data;
+       te->handler_name        = handler_name;
+       te->location            = location;
+       te->additional_data     = NULL;
+
+       /* an empty list cannot contain a zero timer */
+       if (ev->timer_events == NULL) {
+               ev->last_zero_timer = NULL;
+       }
+
+       if (optimize_zero && tevent_timeval_is_zero(&te->next_event)) {
+               /*
+                * Some callers use zero tevent_timer
+                * instead of tevent_immediate events.
+                *
+                * As these can happen very often,
+                * we remember the last zero timer
+                * in the list and insert right behind it.
+                */
+               insert_after = ev->last_zero_timer;
+               ev->last_zero_timer = te;
+       } else {
+               /*
+                * Search backwards from the tail, because it's much
+                * more likely that timers are added at the end of
+                * the list. Stop at the first event the new one does
+                * not come before.
+                */
+               insert_after = DLIST_TAIL(ev->timer_events);
+               while (insert_after != NULL &&
+                      tevent_timeval_compare(&te->next_event,
+                                             &insert_after->next_event) < 0) {
+                       insert_after = DLIST_PREV(insert_after);
+               }
+       }
+
+       DLIST_ADD_AFTER(ev->timer_events, te, insert_after);
+
+       talloc_set_destructor(te, tevent_common_timed_destructor);
+
+       tevent_debug(ev, TEVENT_DEBUG_TRACE,
+                    "Added timed event \"%s\": %p\n",
+                    handler_name, te);
+       return te;
+}
+
+/*
+  add a timed event without the last_zero_timer optimization
+  (see tevent_common_add_timer_v2() for the optimized variant)
+
+  returns what tevent_common_add_timer_internal() returns
+*/
+struct tevent_timer *tevent_common_add_timer(struct tevent_context *ev,
+                                            TALLOC_CTX *mem_ctx,
+                                            struct timeval next_event,
+                                            tevent_timer_handler_t handler,
+                                            void *private_data,
+                                            const char *handler_name,
+                                            const char *location)
+{
+       /*
+        * do not use optimization, there are broken Samba
+        * versions which use tevent_common_add_timer()
+        * without using tevent_common_loop_timer_delay(),
+        * it just uses DLIST_REMOVE(ev->timer_events, te)
+        * and would leave ev->last_zero_timer behind.
+        */
+       return tevent_common_add_timer_internal(ev, mem_ctx, next_event,
+                                               handler, private_data,
+                                               handler_name, location,
+                                               false);
+}
+
+/*
+  add a timed event with the last_zero_timer optimization enabled
+
+  returns what tevent_common_add_timer_internal() returns
+*/
+struct tevent_timer *tevent_common_add_timer_v2(struct tevent_context *ev,
+                                               TALLOC_CTX *mem_ctx,
+                                               struct timeval next_event,
+                                               tevent_timer_handler_t handler,
+                                               void *private_data,
+                                               const char *handler_name,
+                                               const char *location)
+{
+       /*
+        * Here we turn on last_zero_timer optimization
+        */
+       return tevent_common_add_timer_internal(ev, mem_ctx, next_event,
+                                               handler, private_data,
+                                               handler_name, location,
+                                               true);
+}
+
+/*
+  do a single event loop using the events defined in ev
+
+  Only the first entry of ev->timer_events is looked at. If it is due,
+  it is removed from the list, its handler is called once and the
+  event is freed.
+
+  return the delay until the next timed event,
+  or zero if a timed event was triggered.
+  If there are no timed events at all, 30 seconds are returned.
+*/
+struct timeval tevent_common_loop_timer_delay(struct tevent_context *ev)
+{
+       struct timeval current_time = tevent_timeval_zero();
+       struct tevent_timer *te = ev->timer_events;
+
+       if (!te) {
+               /* have a default tick time of 30 seconds. This guarantees
+                  that code that uses its own timeout checking will be
+                  able to proceed eventually */
+               return tevent_timeval_set(30, 0);
+       }
+
+       /*
+        * work out the right timeout for the next timed event
+        *
+        * avoid the syscall to gettimeofday() if the timed event should
+        * be triggered directly
+        *
+        * if there's a delay till the next timed event, we're done
+        * with just returning the delay
+        */
+       if (!tevent_timeval_is_zero(&te->next_event)) {
+               struct timeval delay;
+
+               current_time = tevent_timeval_current();
+
+               delay = tevent_timeval_until(&current_time, &te->next_event);
+               if (!tevent_timeval_is_zero(&delay)) {
+                       return delay;
+               }
+       }
+
+       /*
+        * ok, we have a timed event that we'll process ...
+        */
+
+       /* deny the handler to free the event */
+       talloc_set_destructor(te, tevent_common_timed_deny_destructor);
+
+       /* We need to remove the timer from the list before calling the
+        * handler because in a semi-async inner event loop called from the
+        * handler we don't want to come across this event again -- vl */
+       if (ev->last_zero_timer == te) {
+               ev->last_zero_timer = DLIST_PREV(te);
+       }
+       DLIST_REMOVE(ev->timer_events, te);
+
+       tevent_debug(te->event_ctx, TEVENT_DEBUG_TRACE,
+                    "Running timer event %p \"%s\"\n",
+                    te, te->handler_name);
+
+       /*
+        * If the timed event was registered for a zero current_time,
+        * then we pass a zero timeval here too! To avoid the
+        * overhead of gettimeofday() calls.
+        *
+        * otherwise we pass the current time
+        */
+       te->handler(ev, te, current_time, te->private_data);
+
+       /* The destructor isn't necessary anymore, we've already removed the
+        * event from the list. */
+       talloc_set_destructor(te, NULL);
+
+       tevent_debug(te->event_ctx, TEVENT_DEBUG_TRACE,
+                    "Ending timer event %p \"%s\"\n",
+                    te, te->handler_name);
+
+       /* with the deny destructor removed this really frees the event */
+       talloc_free(te);
+
+       return tevent_timeval_zero();
+}
+
diff --git a/ctdb/lib/tevent/tevent_util.c b/ctdb/lib/tevent/tevent_util.c
new file mode 100644 (file)
index 0000000..16af8f3
--- /dev/null
@@ -0,0 +1,107 @@
+/*
+   Unix SMB/CIFS implementation.
+
+   Copyright (C) Andrew Tridgell 2005
+   Copyright (C) Jelmer Vernooij 2005
+
+     ** NOTE! The following LGPL license applies to the tevent
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "talloc.h"
+#include "tevent.h"
+#include "tevent_internal.h"
+#include "tevent_util.h"
+#include <fcntl.h>
+
+/**
+  return the number of elements in a NULL terminated string list;
+  a NULL list has zero elements
+*/
+size_t ev_str_list_length(const char **list)
+{
+       size_t count = 0;
+
+       if (list == NULL) {
+               return 0;
+       }
+
+       while (list[count] != NULL) {
+               count++;
+       }
+
+       return count;
+}
+
+/**
+  add an entry to a string list
+
+  A copy of 's' is appended to 'list' (NULL means an empty list) and
+  the possibly relocated, NULL terminated list is returned; the copy
+  is a talloc child of the returned list.
+
+  On failure NULL is returned and 'list' is left untouched and valid.
+*/
+const char **ev_str_list_add(const char **list, const char *s)
+{
+       size_t len = ev_str_list_length(list);
+       const char **ret;
+       char *copy;
+
+       /*
+        * Copy the string before growing the array: once
+        * talloc_realloc() has succeeded the caller's 'list' pointer
+        * may be stale, so nothing is allowed to fail after that point.
+        */
+       copy = talloc_strdup(NULL, s);
+       if (copy == NULL) return NULL;
+
+       ret = talloc_realloc(NULL, list, const char *, len+2);
+       if (ret == NULL) {
+               talloc_free(copy);
+               return NULL;
+       }
+
+       /* hang the copy below the list, as before */
+       ret[len] = talloc_steal(ret, copy);
+       ret[len+1] = NULL;
+
+       return ret;
+}
+
+
+/**
+ Set a fd into blocking/nonblocking mode. Uses POSIX O_NONBLOCK if available,
+ else
+  if SYSV use O_NDELAY
+  if BSD use FNDELAY
+
+ set == true selects blocking mode, set == false nonblocking mode.
+ Returns -1 if reading the flags fails, otherwise the result of
+ fcntl(F_SETFL).
+**/
+
+int ev_set_blocking(int fd, bool set)
+{
+       int flags;
+#ifdef O_NONBLOCK
+#define FLAG_TO_SET O_NONBLOCK
+#else
+#ifdef SYSV
+#define FLAG_TO_SET O_NDELAY
+#else /* BSD */
+#define FLAG_TO_SET FNDELAY
+#endif
+#endif
+
+       flags = fcntl(fd, F_GETFL, 0);
+       if (flags == -1) {
+               return -1;
+       }
+
+       /* blocking means the nonblock flag is cleared */
+       flags = set ? (flags & ~FLAG_TO_SET) : (flags | FLAG_TO_SET);
+
+       return fcntl( fd, F_SETFL, flags);
+#undef FLAG_TO_SET
+}
+
+/**
+  set the FD_CLOEXEC flag on a fd
+
+  returns true on success, false if one of the fcntl() calls fails or
+  if FD_CLOEXEC is not defined on this platform
+*/
+bool ev_set_close_on_exec(int fd)
+{
+#ifdef FD_CLOEXEC
+       int flags = fcntl(fd, F_GETFD, 0);
+
+       if (flags < 0) {
+               return false;
+       }
+       if (fcntl(fd, F_SETFD, flags | FD_CLOEXEC) == -1) {
+               return false;
+       }
+       return true;
+#else
+       return false;
+#endif
+}
diff --git a/ctdb/lib/tevent/tevent_util.h b/ctdb/lib/tevent/tevent_util.h
new file mode 100644 (file)
index 0000000..311be60
--- /dev/null
@@ -0,0 +1,192 @@
+/*
+   Unix SMB/CIFS implementation.
+
+   Copyright (C) Andrew Tridgell 1998-2010
+   Copyright (C) Jelmer Vernooij 2005
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* To use these macros you must have a structure containing a next and
+   prev pointer */
+
+#ifndef _DLINKLIST_H
+#define _DLINKLIST_H
+
+/*
+  February 2010 - changed list format to have a prev pointer from the
+  list head. This makes DLIST_ADD_END() O(1) even though we only have
+  one list pointer.
+
+  The scheme is as follows:
+
+     1) with no entries in the list:
+          list_head == NULL
+
+     2) with 1 entry in the list:
+          list_head->next == NULL
+          list_head->prev == list_head
+
+     3) with 2 entries in the list:
+          list_head->next == element2
+          list_head->prev == element2
+         element2->prev == list_head
+         element2->next == NULL
+
+     4) with N entries in the list:
+          list_head->next == element2
+          list_head->prev == elementN
+         elementN->prev == element{N-1}
+         elementN->next == NULL
+
+  This allows us to find the tail of the list by using
+  list_head->prev, which means we can add to the end of the list in
+  O(1) time
+
+
+  Note that the 'type' arguments below are no longer needed, but
+  are kept for now to prevent an incompatible argument change
+ */
+
+
+/*
+   add an element at the front of a list
+
+   The new head takes over the tail pointer that the old head kept in
+   its prev member. Both arguments are evaluated more than once, so
+   they must not have side effects.
+*/
+#define DLIST_ADD(list, p) \
+do { \
+        if (!(list)) { \
+               (p)->prev = (list) = (p);  \
+               (p)->next = NULL; \
+       } else { \
+               (p)->prev = (list)->prev; \
+               (list)->prev = (p); \
+               (p)->next = (list); \
+               (list) = (p); \
+       } \
+} while (0)
+
+/*
+   remove an element from a list
+   Note that the element doesn't have to be in the list. If it
+   isn't then this is a no-op
+
+   Three cases are handled: p is the head (the tail pointer moves to
+   the new head), p is the tail (the head's prev is moved back by one),
+   or p is somewhere in between. Afterwards the next and prev pointers
+   of p are reset to NULL.
+*/
+#define DLIST_REMOVE(list, p) \
+do { \
+       if ((p) == (list)) { \
+               if ((p)->next) (p)->next->prev = (p)->prev; \
+               (list) = (p)->next; \
+       } else if ((list) && (p) == (list)->prev) {     \
+               (p)->prev->next = NULL; \
+               (list)->prev = (p)->prev; \
+       } else { \
+               if ((p)->prev) (p)->prev->next = (p)->next; \
+               if ((p)->next) (p)->next->prev = (p)->prev; \
+       } \
+       if ((p) != (list)) (p)->next = (p)->prev = NULL;        \
+} while (0)
+
+/*
+   find the head of the list given any element in it.
+   Note that this costs O(N), so you should avoid this macro
+   if at all possible!
+
+   The walk stops at the element for which DLIST_PREV() returns NULL,
+   which is the head.
+*/
+#define DLIST_HEAD(p, result_head) \
+do { \
+       (result_head) = (p); \
+       while (DLIST_PREV(result_head)) (result_head) = (result_head)->prev; \
+} while(0)
+
+/* return the last element in the list (the head's prev pointer),
+   or NULL for an empty list */
+#define DLIST_TAIL(list) ((list)?(list)->prev:NULL)
+
+/* return the previous element in the list, or NULL if p is the head.
+   The head is recognised by its prev pointer leading to the tail,
+   i.e. to an element whose next pointer is NULL. */
+#define DLIST_PREV(p) (((p)->prev && (p)->prev->next != NULL)?(p)->prev:NULL)
+
+/* insert 'p' after the given element 'el' in a list. If el is NULL then
+   this is the same as a DLIST_ADD().
+   If 'el' was the tail, the tail pointer kept in (list)->prev is moved
+   to 'p'. */
+#define DLIST_ADD_AFTER(list, p, el) \
+do { \
+        if (!(list) || !(el)) { \
+               DLIST_ADD(list, p); \
+       } else { \
+               (p)->prev = (el);   \
+               (p)->next = (el)->next;         \
+               (el)->next = (p);               \
+               if ((p)->next) (p)->next->prev = (p);   \
+               if ((list)->prev == (el)) (list)->prev = (p); \
+       }\
+} while (0)
+
+
+/*
+   add to the end of a list.
+   Note that 'type' is ignored
+
+   The tail is found through (list)->prev, so no list walk is needed.
+*/
+#define DLIST_ADD_END(list, p, type)                   \
+do { \
+       if (!(list)) { \
+               DLIST_ADD(list, p); \
+       } else { \
+               DLIST_ADD_AFTER(list, p, (list)->prev); \
+       } \
+} while (0)
+
+/* promote an element to the front of a list */
+#define DLIST_PROMOTE(list, p) \
+do { \
+          DLIST_REMOVE(list, p); \
+          DLIST_ADD(list, p); \
+} while (0)
+
+/*
+   demote an element to the end of a list.
+   Note that 'type' is ignored
+   (NULL is passed on as the equally ignored 'type' of DLIST_ADD_END)
+*/
+#define DLIST_DEMOTE(list, p, type)                    \
+do { \
+       DLIST_REMOVE(list, p); \
+       DLIST_ADD_END(list, p, NULL);           \
+} while (0)
+
+/*
+   concatenate two lists - putting all elements of the 2nd list at the
+   end of the first list.
+   Note that 'type' is ignored
+
+   If list1 is empty it simply becomes list2. Otherwise the old tail of
+   list1 is linked to the head of list2, list1 takes over list2's tail
+   pointer and (list2)->prev is rewritten to point at the old tail of
+   list1.
+*/
+#define DLIST_CONCATENATE(list1, list2, type)  \
+do { \
+       if (!(list1)) { \
+               (list1) = (list2); \
+       } else { \
+               (list1)->prev->next = (list2); \
+               if (list2) { \
+                       void *_tmplist = (void *)(list1)->prev; \
+                       (list1)->prev = (list2)->prev; \
+                       (list2)->prev = _tmplist; \
+               } \
+       } \
+} while (0)
+
+#endif /* _DLINKLIST_H */
+
+const char **ev_str_list_add(const char **list, const char *s);
+int ev_set_blocking(int fd, bool set);
+size_t ev_str_list_length(const char **list);
+bool ev_set_close_on_exec(int fd);
+
+/* Defined here so we can build against older talloc versions that don't
+ * have this define yet.
+ * Frees ctx and then sets it to NULL; ctx is evaluated twice and has
+ * to be an assignable expression. */
+#ifndef TALLOC_FREE
+#define TALLOC_FREE(ctx) do { talloc_free(ctx); ctx=NULL; } while(0)
+#endif
diff --git a/ctdb/lib/tevent/tevent_wakeup.c b/ctdb/lib/tevent/tevent_wakeup.c
new file mode 100644 (file)
index 0000000..82c3942
--- /dev/null
@@ -0,0 +1,70 @@
+/*
+   Unix SMB/CIFS implementation.
+   Infrastructure for async requests
+   Copyright (C) Volker Lendecke 2008
+   Copyright (C) Stefan Metzmacher 2009
+
+     ** NOTE! The following LGPL license applies to the tevent
+     ** library. This does NOT imply that all of Samba is released
+     ** under the LGPL
+
+   This library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 3 of the License, or (at your option) any later version.
+
+   This library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this library; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "replace.h"
+#include "tevent.h"
+#include "tevent_internal.h"
+#include "tevent_util.h"
+
+/* state of a request created by tevent_wakeup_send() */
+struct tevent_wakeup_state {
+       /* the wakeup_time that was passed to tevent_wakeup_send() */
+       struct timeval wakeup_time;
+};
+
+/*
+  create a request whose endtime is set to wakeup_time
+
+  Returns NULL if the request cannot be created. If setting the
+  endtime fails, the request is handed to tevent_req_post() and the
+  result of that call is returned.
+*/
+struct tevent_req *tevent_wakeup_send(TALLOC_CTX *mem_ctx,
+                                     struct tevent_context *ev,
+                                     struct timeval wakeup_time)
+{
+       struct tevent_wakeup_state *state;
+       struct tevent_req *req;
+
+       req = tevent_req_create(mem_ctx, &state,
+                               struct tevent_wakeup_state);
+       if (req == NULL) {
+               return NULL;
+       }
+       state->wakeup_time = wakeup_time;
+
+       if (tevent_req_set_endtime(req, ev, wakeup_time)) {
+               return req;
+       }
+
+       return tevent_req_post(req, ev);
+}
+
+/*
+  return true only if the request ended in the
+  TEVENT_REQ_TIMED_OUT state, false in every other case
+*/
+bool tevent_wakeup_recv(struct tevent_req *req)
+{
+       enum tevent_req_state state;
+       uint64_t error;
+
+       if (!tevent_req_is_error(req, &state, &error)) {
+               return false;
+       }
+
+       return (state == TEVENT_REQ_TIMED_OUT);
+}
+
diff --git a/ctdb/lib/tevent/wscript b/ctdb/lib/tevent/wscript
new file mode 100755 (executable)
index 0000000..02bddb8
--- /dev/null
@@ -0,0 +1,138 @@
+#!/usr/bin/env python
+
+APPNAME = 'tevent'
+VERSION = '0.9.18'
+
+blddir = 'bin'
+
+import sys, os
+
+# find the buildtools directory: start in the current directory and keep
+# prepending '../' until a 'buildtools' subdirectory exists or the path
+# has grown to 5 '/'-separated components, then make wafsamba importable
+srcdir = '.'
+while not os.path.exists(srcdir+'/buildtools') and len(srcdir.split('/')) < 5:
+    srcdir = '../' + srcdir
+sys.path.insert(0, srcdir + '/buildtools/wafsamba')
+
+import wafsamba, samba_dist, Options, Logs
+
+samba_dist.DIST_DIRS('lib/tevent:. lib/replace:lib/replace lib/talloc:lib/talloc buildtools:buildtools')
+
+def set_options(opt):
+    opt.BUILTIN_DEFAULT('replace')
+    opt.PRIVATE_EXTENSION_DEFAULT('tevent', noextension='tevent')
+    opt.RECURSE('lib/replace')
+    opt.RECURSE('lib/talloc')
+    if opt.IN_LAUNCH_DIR():
+        opt.add_option('--disable-python',
+                       help=("disable the pytevent module"),
+                       action="store_true", dest='disable_python', default=False)
+
+
+def configure(conf):
+    conf.RECURSE('lib/replace')
+    conf.RECURSE('lib/talloc')
+
+    conf.env.standalone_tevent = conf.IN_LAUNCH_DIR()
+
+    if not conf.env.standalone_tevent:
+        if conf.CHECK_BUNDLED_SYSTEM_PKG('tevent', minversion=VERSION,
+                                     onlyif='talloc', implied_deps='replace talloc'):
+            conf.define('USING_SYSTEM_TEVENT', 1)
+            if conf.CHECK_BUNDLED_SYSTEM_PYTHON('pytevent', 'tevent', minversion=VERSION):
+                conf.define('USING_SYSTEM_PYTEVENT', 1)
+
+    if conf.CHECK_FUNCS('epoll_create', headers='sys/epoll.h'):
+        conf.DEFINE('HAVE_EPOLL', 1)
+
+    tevent_num_signals = 64
+    v = conf.CHECK_VALUEOF('NSIG', headers='signal.h')
+    if v is not None:
+        tevent_num_signals = max(tevent_num_signals, v)
+    v = conf.CHECK_VALUEOF('_NSIG', headers='signal.h')
+    if v is not None:
+        tevent_num_signals = max(tevent_num_signals, v)
+    v = conf.CHECK_VALUEOF('SIGRTMAX', headers='signal.h')
+    if v is not None:
+        tevent_num_signals = max(tevent_num_signals, v)
+    v = conf.CHECK_VALUEOF('SIGRTMIN', headers='signal.h')
+    if v is not None:
+        tevent_num_signals = max(tevent_num_signals, v*2)
+
+    if not conf.CONFIG_SET('USING_SYSTEM_TEVENT'):
+        conf.DEFINE('TEVENT_NUM_SIGNALS', tevent_num_signals)
+
+    conf.env.disable_python = getattr(Options.options, 'disable_python', False)
+
+    if not conf.env.disable_python:
+        # also disable if we don't have the python libs installed
+        conf.find_program('python', var='PYTHON')
+        conf.check_tool('python')
+        conf.check_python_version((2,4,2))
+        conf.SAMBA_CHECK_PYTHON_HEADERS(mandatory=False)
+        if not conf.env.HAVE_PYTHON_H:
+            Logs.warn('Disabling pytevent as python devel libs not found')
+            conf.env.disable_python = True
+
+    conf.SAMBA_CONFIG_H()
+
+    conf.SAMBA_CHECK_UNDEFINED_SYMBOL_FLAGS()
+
+def build(bld):
+    # Build the bundled dependencies first.
+    bld.RECURSE('lib/replace')
+    bld.RECURSE('lib/talloc')
+
+    # Sources that are always part of the library.
+    SRC = '''tevent.c tevent_debug.c tevent_fd.c tevent_immediate.c
+             tevent_queue.c tevent_req.c tevent_select.c
+             tevent_poll.c
+             tevent_signal.c tevent_standard.c tevent_timed.c tevent_util.c tevent_wakeup.c'''
+
+    # The epoll backend is only added when HAVE_EPOLL is set.
+    if bld.CONFIG_SET('HAVE_EPOLL'):
+        SRC += ' tevent_epoll.c'
+
+    # A standalone build produces a non-private library and installs its
+    # public header; otherwise the library is built as a private one.
+    if bld.env.standalone_tevent:
+        bld.env.PKGCONFIGDIR = '${LIBDIR}/pkgconfig'
+        private_library = False
+    else:
+        private_library = True
+
+    # NOTE(review): 'enabled' repeats the condition of the enclosing 'if',
+    # so it always evaluates to True when this call is reached.
+    if not bld.CONFIG_SET('USING_SYSTEM_TEVENT'):
+        bld.SAMBA_LIBRARY('tevent',
+                          SRC,
+                          deps='replace talloc',
+                          enabled= not bld.CONFIG_SET('USING_SYSTEM_TEVENT'),
+                          includes='.',
+                          abi_directory='ABI',
+                          abi_match='tevent_* _tevent_*',
+                          vnum=VERSION,
+                          public_headers='tevent.h',
+                          public_headers_install=not private_library,
+                          pc_files='tevent.pc',
+                          private_library=private_library)
+
+    # The python binding is skipped when a system pytevent is used or
+    # python support was disabled.
+    if not bld.CONFIG_SET('USING_SYSTEM_PYTEVENT') and not bld.env.disable_python:
+        bld.SAMBA_PYTHON('pytevent',
+                         'pytevent.c',
+                         deps='tevent',
+                         realname='_tevent.so',
+                         cflags='-DPACKAGE_VERSION=\"%s\"' % VERSION)
+        # install our various python scripts for use by make test
+        bld.SAMBA_SCRIPT('tevent_python',
+                         pattern='tevent.py',
+                         installdir='python')
+
+        bld.INSTALL_WILDCARD('${PYTHONARCHDIR}', 'tevent.py', flat=False)
+
+
+# Placeholder command: runs no tests, only prints where the testsuite lives.
+def test(ctx):
+    '''test tevent'''
+    print("The tevent testsuite is part of smbtorture in samba4")
+
+
+# Delegates to samba_dist.dist(); the directories to package are the ones
+# registered with samba_dist.DIST_DIRS() at the top of this script.
+def dist():
+    '''makes a tarball for distribution'''
+    samba_dist.dist()
+
+# Delegates to samba_utils.reconfigure(); samba_utils is imported inside
+# the function rather than at module level.
+def reconfigure(ctx):
+    '''reconfigure if config scripts have changed'''
+    import samba_utils
+    samba_utils.reconfigure(ctx)
diff --git a/ctdb/lib/util/db_wrap.c b/ctdb/lib/util/db_wrap.c
new file mode 100644 (file)
index 0000000..1b2bf7e
--- /dev/null
@@ -0,0 +1,103 @@
+/* 
+   Unix SMB/CIFS implementation.
+
+   database wrap functions
+
+   Copyright (C) Andrew Tridgell 2004
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+  the stupidity of the unix fcntl locking design forces us to never
+  allow a database file to be opened twice in the same process. These
+  wrappers provide convenient access to a tdb or ldb, taking advantage
+  of talloc destructors to ensure that only a single open is done
+*/
+
+#include "includes.h"
+#include "lib/util/dlinklist.h"
+#include "tdb.h"
+#include "db_wrap.h"
+
+static struct tdb_wrap *tdb_list;
+
+
+
+/*
+  destroy the last connection to a tdb
+
+  talloc destructor installed by tdb_wrap_open(): closes the underlying
+  tdb and unlinks the wrapper from the global tdb_list. Always returns 0.
+*/
+static int tdb_wrap_destructor(struct tdb_wrap *w)
+{
+       tdb_close(w->tdb);
+       DLIST_REMOVE(tdb_list, w);
+       return 0;
+}
+
+/*
+  tdb logging callback: forwards messages with a level up to and
+  including TDB_DEBUG_ERROR to the debug log, prefixed with the name of
+  the tdb. Messages with a higher level value are dropped.
+
+  The message is formatted first and the tdb name is passed as a plain
+  argument, so a '%' character in the database name is never interpreted
+  as a conversion specification, and no variable-length array sized by
+  the name and format is placed on the stack.
+*/
+static void log_fn(struct tdb_context *tdb, enum tdb_debug_level level, const char *fmt, ...)
+{
+       va_list ap;
+       char *msg = NULL;
+       int ret;
+
+       if (level > TDB_DEBUG_ERROR) {
+               return;
+       }
+
+       this_log_level = level;
+
+       va_start(ap, fmt);
+       ret = vasprintf(&msg, fmt, ap);
+       va_end(ap);
+       if (ret == -1) {
+               /* still report which database tried to log something */
+               do_debug("%s:failed to format tdb log message\n", tdb_name(tdb));
+               return;
+       }
+
+       /* do_debug() dispatches through the do_debug_v hook, as before */
+       do_debug("%s:%s", tdb_name(tdb), msg);
+       free(msg);
+}
+
+
+/*
+  wrapped connection to a tdb database
+
+  If a wrapper with the same 'name' is already on tdb_list, a talloc
+  reference to it is returned and the remaining arguments are not used.
+  Otherwise the tdb is opened, a destructor is installed and the new
+  wrapper is added to tdb_list. Returns NULL when an allocation or the
+  open fails.
+
+  to close just talloc_free() the tdb_wrap pointer
+ */
+struct tdb_wrap *tdb_wrap_open(TALLOC_CTX *mem_ctx,
+                              const char *name, int hash_size, int tdb_flags,
+                              int open_flags, mode_t mode)
+{
+       struct tdb_logging_context log_ctx = {
+               .log_fn = log_fn,
+               .log_private = NULL,
+       };
+       struct tdb_wrap *wrap;
+
+       /* reuse an existing wrapper opened under the same name */
+       for (wrap = tdb_list; wrap != NULL; wrap = wrap->next) {
+               if (strcmp(name, wrap->name) != 0) {
+                       continue;
+               }
+               return talloc_reference(mem_ctx, wrap);
+       }
+
+       wrap = talloc(mem_ctx, struct tdb_wrap);
+       if (wrap == NULL) {
+               return NULL;
+       }
+
+       wrap->name = talloc_strdup(wrap, name);
+       if (wrap->name == NULL) {
+               talloc_free(wrap);
+               return NULL;
+       }
+
+       wrap->tdb = tdb_open_ex(name, hash_size, tdb_flags,
+                               open_flags, mode, &log_ctx, NULL);
+       if (wrap->tdb == NULL) {
+               talloc_free(wrap);
+               return NULL;
+       }
+
+       /* from here on freeing the wrapper closes the tdb and unlinks it */
+       talloc_set_destructor(wrap, tdb_wrap_destructor);
+       DLIST_ADD(tdb_list, wrap);
+
+       return wrap;
+}
diff --git a/ctdb/lib/util/db_wrap.h b/ctdb/lib/util/db_wrap.h
new file mode 100644 (file)
index 0000000..5ae5fd1
--- /dev/null
@@ -0,0 +1,36 @@
+/* 
+   Unix SMB/CIFS implementation.
+
+   database wrap headers
+
+   Copyright (C) Andrew Tridgell 2004
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _DB_WRAP_H
+#define _DB_WRAP_H
+
+/* One wrapped tdb handle; instances are chained through next/prev. */
+struct tdb_wrap {
+       struct tdb_context *tdb;        /* the wrapped tdb handle */
+
+       const char *name;               /* name the database was opened with */
+       struct tdb_wrap *next, *prev;   /* list linkage */
+};
+
+/* Open (or share) a wrapped tdb; the result hangs off mem_ctx. */
+struct tdb_wrap *tdb_wrap_open(TALLOC_CTX *mem_ctx,
+                              const char *name, int hash_size, int tdb_flags,
+                              int open_flags, mode_t mode);
+
+#endif /* _DB_WRAP_H */
diff --git a/ctdb/lib/util/debug.c b/ctdb/lib/util/debug.c
new file mode 100644 (file)
index 0000000..e9365d8
--- /dev/null
@@ -0,0 +1,136 @@
+/*
+   Unix SMB/CIFS implementation.
+   ctdb debug functions
+   Copyright (C) Volker Lendecke 2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/time.h"
+#include <unistd.h>
+#include <ctype.h>
+
+/*
+  Default debug sink: formats the message and writes it to stderr,
+  prefixed with a local timestamp with microseconds, the debug_extra
+  string and the process id.
+
+  If the local time cannot be determined or formatted, the raw seconds
+  value is printed in place of the date so the message is still emitted.
+*/
+static void _do_debug_v(const char *format, va_list ap)
+{
+       struct timeval t;
+       char *s = NULL;
+       struct tm *tm;
+       time_t secs;
+       char tbuf[100];
+       int ret;
+
+       ret = vasprintf(&s, format, ap);
+       if (ret == -1) {
+               fprintf(stderr, "vasprintf failed in _do_debug_v, cannot print debug message.\n");
+               fflush(stderr);
+               return;
+       }
+
+       t = timeval_current();
+       /* localtime() wants a time_t; tv_sec need not have exactly that type */
+       secs = t.tv_sec;
+       tm = localtime(&secs);
+
+       /* never pass a NULL tm to strftime() or print an unset buffer */
+       if (tm == NULL ||
+           strftime(tbuf, sizeof(tbuf), "%Y/%m/%d %H:%M:%S", tm) == 0) {
+               snprintf(tbuf, sizeof(tbuf), "%ld", (long)secs);
+       }
+
+       fprintf(stderr, "%s.%06u [%s%5u]: %s", tbuf, (unsigned)t.tv_usec,
+               debug_extra, (unsigned)getpid(), s);
+       fflush(stderr);
+       free(s);
+}
+
+/* default logging function */
+void (*do_debug_v)(const char *, va_list ap) = _do_debug_v;
+const char *debug_extra = "";
+
+/*
+  printf-style front end: packs the variable arguments into a va_list
+  and passes them to whatever function do_debug_v currently points at.
+*/
+void do_debug(const char *format, ...)
+{
+       va_list ap;
+
+       va_start(ap, format);
+       do_debug_v(format, ap);
+       va_end(ap);
+}
+
+
+/*
+  Default sink for continuation output: formats the message and writes
+  it to stderr as is, without the timestamp/pid prefix that
+  _do_debug_v() adds.
+*/
+static void _do_debug_add_v(const char *format, va_list ap)
+{
+       char *text = NULL;
+
+       if (vasprintf(&text, format, ap) == -1) {
+               fprintf(stderr, "vasprintf failed in _do_debug_add_v, cannot print debug message.\n");
+       } else {
+               fputs(text, stderr);
+               free(text);
+       }
+       fflush(stderr);
+}
+
+/* default logging function */
+void (*do_debug_add_v)(const char *, va_list ap) = _do_debug_add_v;
+
+/*
+  printf-style front end for continuation output: packs the variable
+  arguments into a va_list and passes them to whatever function
+  do_debug_add_v currently points at.
+*/
+void do_debug_add(const char *format, ...)
+{
+       va_list ap;
+
+       va_start(ap, format);
+       do_debug_add_v(format, ap);
+       va_end(ap);
+}
+
+/*
+  Emit 'len' bytes of 'buf' as characters at debug level 'level';
+  bytes for which isprint() is false are shown as '.'.
+*/
+static void print_asc(int level, const uint8_t *buf, size_t len)
+{
+       /* index with the same unsigned type as 'len': no signed/unsigned
+          comparison and no int overflow for very large lengths */
+       size_t i;
+       for (i=0;i<len;i++) {
+               DEBUGADD(level,("%c", isprint(buf[i])?buf[i]:'.'));
+       }
+}
+
+/*
+  Hex dump 'len' bytes of 'buf' at debug level 'level'.
+
+  Each output row starts with the offset in hex, followed by up to 16
+  bytes in hex (with an extra space after the 8th) and the same bytes
+  as characters in two groups of 8. Nothing is printed when len is 0 or
+  when DEBUGLVL(level) is false.
+*/
+void dump_data(int level, const uint8_t *buf, size_t len)
+{
+       int i=0;
+
+       if (len<=0) return;
+
+       if (!DEBUGLVL(level)) return;
+
+       DEBUG(level, (__location__ " dump data of size %i:\n", (int)len));
+       DEBUGADD(level,("[%03X] ",i));
+       for (i=0;i<len;) {
+               DEBUGADD(level,("%02X ",(int)buf[i]));
+               i++;
+               if (i%8 == 0) DEBUGADD(level,(" "));
+               if (i%16 == 0) {
+                       /* a full row of 16 bytes: append its character view */
+                       print_asc(level,&buf[i-16],8); DEBUGADD(level,(" "));
+                       print_asc(level,&buf[i-8],8); DEBUGADD(level,("\n"));
+                       if (i<len) DEBUGADD(level,("[%03X] ",i));
+               }
+       }
+       if (i%16) {
+               /* partial last row: pad the hex columns, then print the
+                  remaining bytes as characters in up to two groups */
+               int n;
+               n = 16 - (i%16);
+               DEBUGADD(level,(" "));
+               if (n>8) DEBUGADD(level,(" "));
+               while (n--) DEBUGADD(level,("   "));
+               n = MIN(8,i%16);
+               print_asc(level,&buf[i-(i%16)],n); DEBUGADD(level,( " " ));
+               n = (i%16) - n;
+               if (n>0) print_asc(level,&buf[i-n],n);
+               DEBUGADD(level,("\n"));
+       }
+       DEBUG(level, (__location__ " dump data of size %i finished\n", (int)len));
+}
+
diff --git a/ctdb/lib/util/debug.h b/ctdb/lib/util/debug.h
new file mode 100644 (file)
index 0000000..27490a3
--- /dev/null
@@ -0,0 +1,27 @@
+/*
+   Unix SMB/CIFS implementation.
+   ctdb debug functions
+   Copyright (C) Volker Lendecke 2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+  The three objects below are declared 'extern': without it every source
+  file including this header emits its own tentative definition, which
+  fails to link when the compiler does not merge common symbols
+  (-fno-common). NOTE(review): a program that includes this header must
+  link an object that actually defines them.
+*/
+extern void (*do_debug_v)(const char *, va_list ap);
+extern const char *debug_extra;
+extern void (*do_debug_add_v)(const char *, va_list ap);
+void log_ringbuffer(const char *format, ...);
+void do_debug(const char *format, ...) PRINTF_ATTRIBUTE(1, 2);
+void do_debug_add(const char *format, ...) PRINTF_ATTRIBUTE(1, 2);
+void dump_data(int level, const uint8_t *buf1, size_t len);
+
diff --git a/ctdb/lib/util/dlinklist.h b/ctdb/lib/util/dlinklist.h
new file mode 100644 (file)
index 0000000..6d525f9
--- /dev/null
@@ -0,0 +1,181 @@
+/* 
+   Unix SMB/CIFS implementation.
+   some simple double linked list macros
+
+   Copyright (C) Andrew Tridgell 1998-2010
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* To use these macros you must have a structure containing a next and
+   prev pointer */
+
+#ifndef _DLINKLIST_H
+#define _DLINKLIST_H
+
+/*
+  February 2010 - changed list format to have a prev pointer from the
+  list head. This makes DLIST_ADD_END() O(1) even though we only have
+  one list pointer.
+
+  The scheme is as follows:
+
+     1) with no entries in the list:
+          list_head == NULL
+
+     2) with 1 entry in the list:
+          list_head->next == NULL
+          list_head->prev == list_head
+
+     3) with 2 entries in the list:
+          list_head->next == element2
+          list_head->prev == element2
+         element2->prev == list_head
+         element2->next == NULL
+
+     4) with N entries in the list:
+          list_head->next == element2
+          list_head->prev == elementN
+         elementN->prev == element{N-1}
+         elementN->next == NULL
+
+  This allows us to find the tail of the list by using
+  list_head->prev, which means we can add to the end of the list in
+  O(1) time
+
+
+  Note that the 'type' arguments below are no longer needed, but
+  are kept for now to prevent an incompatible argument change
+ */
+
+
+/*
+   add an element at the front of a list
+
+   For an empty list p becomes the head and its prev points at itself.
+   Otherwise p takes over the old head's prev pointer (the tail) and the
+   old head's prev is set to p. Both arguments are evaluated more than
+   once, so they must be free of side effects.
+*/
+#define DLIST_ADD(list, p) \
+do { \
+        if (!(list)) { \
+               (p)->prev = (list) = (p);  \
+               (p)->next = NULL; \
+       } else { \
+               (p)->prev = (list)->prev; \
+               (list)->prev = (p); \
+               (p)->next = (list); \
+               (list) = (p); \
+       } \
+} while (0)
+
+/*
+   remove an element from a list
+   Note that the element doesn't have to be in the list. If it
+   isn't, and its next and prev pointers are NULL, then this does
+   nothing apart from setting them to NULL again.
+
+   Three cases are handled: p is the head (the next element, if any,
+   inherits the head's prev pointer and becomes the head), p is the
+   tail (the head's prev pointer is moved to p's predecessor), or p is
+   anything else (its neighbours are linked to each other). Finally p's
+   own pointers are cleared unless p is still the list head.
+*/
+#define DLIST_REMOVE(list, p) \
+do { \
+       if ((p) == (list)) { \
+               if ((p)->next) (p)->next->prev = (p)->prev; \
+               (list) = (p)->next; \
+       } else if ((list) && (p) == (list)->prev) {     \
+               (p)->prev->next = NULL; \
+               (list)->prev = (p)->prev; \
+       } else { \
+               if ((p)->prev) (p)->prev->next = (p)->next; \
+               if ((p)->next) (p)->next->prev = (p)->prev; \
+       } \
+       if ((p) != (list)) (p)->next = (p)->prev = NULL;        \
+} while (0)
+
+/*
+   find the head of the list given any element in it.
+   Note that this costs O(N), so you should avoid this macro
+   if at all possible!
+
+   Walks backwards with DLIST_PREV() until it yields NULL and stores
+   the element reached in result_head.
+*/
+#define DLIST_HEAD(p, result_head) \
+do { \
+       (result_head) = (p); \
+       while (DLIST_PREV(result_head)) (result_head) = (result_head)->prev; \
+} while(0)
+
+/* return the last element in the list (the head's prev pointer), or
+   NULL for an empty list */
+#define DLIST_TAIL(list) ((list)?(list)->prev:NULL)
+
+/* return the previous element in the list. Yields NULL when p->prev is
+   NULL or when p->prev has no successor, which is how the list head is
+   recognised (its prev points at the tail, whose next is NULL) */
+#define DLIST_PREV(p) (((p)->prev && (p)->prev->next != NULL)?(p)->prev:NULL)
+
+/* insert 'p' after the given element 'el' in a list. If el is NULL
+   (or the list is empty) then this is the same as a DLIST_ADD().
+   When 'el' was the tail, the head's prev pointer is updated so that
+   it points at 'p' */
+#define DLIST_ADD_AFTER(list, p, el) \
+do { \
+        if (!(list) || !(el)) { \
+               DLIST_ADD(list, p); \
+       } else { \
+               (p)->prev = (el);   \
+               (p)->next = (el)->next;         \
+               (el)->next = (p);               \
+               if ((p)->next) (p)->next->prev = (p);   \
+               if ((list)->prev == (el)) (list)->prev = (p); \
+       }\
+} while (0)
+
+
+/*
+   add to the end of a list.
+   Note that 'type' is ignored
+
+   An empty list is handled by DLIST_ADD(); otherwise p is inserted
+   after the current tail, which is reached through the head's prev
+   pointer without walking the list.
+*/
+#define DLIST_ADD_END(list, p, type)                   \
+do { \
+       if (!(list)) { \
+               DLIST_ADD(list, p); \
+       } else { \
+               DLIST_ADD_AFTER(list, p, (list)->prev); \
+       } \
+} while (0)
+
+/* promote an element to the front of a list: it is removed and then
+   added again at the head */
+#define DLIST_PROMOTE(list, p) \
+do { \
+          DLIST_REMOVE(list, p); \
+          DLIST_ADD(list, p); \
+} while (0)
+
+/*
+   demote an element to the end of a list.
+   Note that 'type' is ignored
+
+   The element is removed and then added again with DLIST_ADD_END(),
+   which is passed NULL for its own unused 'type' argument.
+*/
+#define DLIST_DEMOTE(list, p, type)                    \
+do { \
+       DLIST_REMOVE(list, p); \
+       DLIST_ADD_END(list, p, NULL);           \
+} while (0)
+
+/*
+   concatenate two lists - putting all elements of the 2nd list at the
+   end of the first list.
+   Note that 'type' is ignored
+
+   If list1 is empty it simply becomes list2. Otherwise list2's head is
+   linked after list1's tail, list1's head takes list2's tail as its
+   prev pointer and list2's former head gets list1's old tail as prev.
+   The list2 variable itself is not modified by this macro.
+*/
+#define DLIST_CONCATENATE(list1, list2, type)  \
+do { \
+       if (!(list1)) { \
+               (list1) = (list2); \
+       } else { \
+               (list1)->prev->next = (list2); \
+               if (list2) { \
+                       void *_tmplist = (void *)(list1)->prev; \
+                       (list1)->prev = (list2)->prev; \
+                       (list2)->prev = _tmplist; \
+               } \
+       } \
+} while (0)
+
+#endif /* _DLINKLIST_H */
diff --git a/ctdb/lib/util/fault.c b/ctdb/lib/util/fault.c
new file mode 100644 (file)
index 0000000..3dddd0e
--- /dev/null
@@ -0,0 +1,238 @@
+/* 
+   Unix SMB/CIFS implementation.
+   Critical Fault handling
+   Copyright (C) Andrew Tridgell 1992-1998
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/wait.h"
+#include "system/filesys.h"
+
+/**
+ * @file
+ * @brief Fault handling
+ */
+
+/* the registered fault handler: a single slot holding the name it was
+   registered under and the function to call; filled in by
+   register_fault_handler() and consulted by sig_fault() */
+static struct {
+       const char *name;
+       void (*fault_handler)(int sig);
+} fault_handlers;
+
+static const char *progname;
+
+#ifdef HAVE_BACKTRACE
+#include <execinfo.h>
+#elif HAVE_LIBEXC_H
+#include <libexc.h>
+#endif
+
+/**
+ * Write backtrace to debug log
+ *
+ * Uses backtrace()/backtrace_symbols() when HAVE_BACKTRACE is defined,
+ * trace_back_stack() when HAVE_LIBEXC is set, and otherwise only logs
+ * that no implementation is available. At most BACKTRACE_STACK_SIZE
+ * frames are collected.
+ */
+_PUBLIC_ void call_backtrace(void)
+{
+#ifdef HAVE_BACKTRACE
+#ifndef BACKTRACE_STACK_SIZE
+#define BACKTRACE_STACK_SIZE 64
+#endif
+       void *backtrace_stack[BACKTRACE_STACK_SIZE];
+       size_t backtrace_size;
+       char **backtrace_strings;
+
+       /* get the backtrace (stack frames) */
+       backtrace_size = backtrace(backtrace_stack,BACKTRACE_STACK_SIZE);
+       backtrace_strings = backtrace_symbols(backtrace_stack, backtrace_size);
+
+       DEBUG(0, ("BACKTRACE: %lu stack frames:\n", 
+                 (unsigned long)backtrace_size));
+
+       if (backtrace_strings) {
+               int i;
+
+               for (i = 0; i < backtrace_size; i++)
+                       DEBUGADD(0, (" #%u %s\n", i, backtrace_strings[i]));
+
+               /* Leak the backtrace_strings, rather than risk what free() might do */
+       }
+
+#elif HAVE_LIBEXC
+
+#define NAMESIZE 32 /* Arbitrary */
+#ifndef BACKTRACE_STACK_SIZE
+#define BACKTRACE_STACK_SIZE 64
+#endif
+
+       /* The IRIX libexc library provides an API for unwinding the stack. See
+        * libexc(3) for details. Apparently trace_back_stack leaks memory, but
+        * since we are about to abort anyway, it hardly matters.
+        *
+        * Note that if we panicked due to a SIGSEGV or SIGBUS (or similar) this
+        * will fail with a nasty message upon failing to open the /proc entry.
+        */
+       {
+               uint64_t        addrs[BACKTRACE_STACK_SIZE];
+               char *          names[BACKTRACE_STACK_SIZE];
+               char            namebuf[BACKTRACE_STACK_SIZE * NAMESIZE];
+
+               int             i;
+               int             levels;
+
+               ZERO_ARRAY(addrs);
+               ZERO_ARRAY(names);
+               ZERO_ARRAY(namebuf);
+
+               /* give every frame its own NAMESIZE-byte slice of namebuf */
+               for (i = 0; i < BACKTRACE_STACK_SIZE; i++) {
+                       names[i] = namebuf + (i * NAMESIZE);
+               }
+
+               levels = trace_back_stack(0, addrs, names,
+                               BACKTRACE_STACK_SIZE, NAMESIZE);
+
+               DEBUG(0, ("BACKTRACE: %d stack frames:\n", levels));
+               for (i = 0; i < levels; i++) {
+                       DEBUGADD(0, (" #%d 0x%llx %s\n", i, addrs[i], names[i]));
+               }
+     }
+#undef NAMESIZE
+#else
+       DEBUG(0, ("call_backtrace: not implemented\n"));
+#endif
+}
+
+_PUBLIC_ const char *panic_action = NULL;
+_PUBLIC_ void (*pre_panic_action_hook)(void) = NULL;
+_PUBLIC_ void (*post_panic_action_hook)(void) = NULL;
+
+/**
+ Something really nasty happened - panic !
+**/
+_PUBLIC_ void smb_panic(const char *why)
+{
+       int result;
+
+       if (panic_action && *panic_action) {
+               char pidstr[20];
+               char cmdstring[200];
+               strlcpy(cmdstring, panic_action, sizeof(cmdstring));
+               snprintf(pidstr, sizeof(pidstr), "%u", getpid());
+               all_string_sub(cmdstring, "%PID%", pidstr, sizeof(cmdstring));
+               if (progname) {
+                       all_string_sub(cmdstring, "%PROG%", progname, sizeof(cmdstring));
+               }
+               DEBUG(0, ("smb_panic(): calling panic action [%s]\n", cmdstring));
+
+               if (pre_panic_action_hook) {
+                       pre_panic_action_hook();
+               }
+
+               result = system(cmdstring);
+
+               if (post_panic_action_hook) {
+                       post_panic_action_hook();
+               }
+
+               if (result == -1)
+                       DEBUG(0, ("smb_panic(): fork failed in panic action: %s\n",
+                                 strerror(errno)));
+               else
+                       DEBUG(0, ("smb_panic(): action returned status %d\n",
+                                 WEXITSTATUS(result)));
+       }
+       DEBUG(0,("PANIC: %s\n", why));
+
+       call_backtrace();
+
+#ifdef SIGABRT
+       CatchSignal(SIGABRT, SIG_DFL);
+#endif
+       abort();
+}
+
+/**
+report a fault
+
+Logs the signal number, program name and pid, then calls smb_panic().
+A static counter guards against recursion: if this function is entered
+again (for example because reporting itself faults) the process exits
+immediately with _exit(1).
+**/
+_NORETURN_ static void fault_report(int sig)
+{
+       static int counter;
+
+       if (counter) _exit(1);
+
+       /* arm the guard: without this the check above could never fire */
+       counter++;
+
+       DEBUG(0,("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n"));
+       /* progname stays NULL when fault_setup() was not given a name */
+       DEBUG(0,("INTERNAL ERROR: Signal %d in %s pid %d",sig,
+                progname ? progname : "(unknown)", (int)getpid()));
+       DEBUG(0,("\nPlease read the file BUGS.txt in the distribution\n"));
+       DEBUG(0,("=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n"));
+
+       smb_panic("internal error");
+
+       exit(1);
+}
+
+/**
+catch serious errors
+
+Signal handler installed by fault_setup(): gives a registered fault
+handler the first chance to deal with the signal and then falls back
+to fault_report().
+**/
+_NORETURN_ static void sig_fault(int sig)
+{
+       if (fault_handlers.fault_handler) {
+               /* we have a fault handler, call it. It may not return. */
+               fault_handlers.fault_handler(sig);
+       }
+       /* If it returns or doesn't exist, use regular reporter */
+       fault_report(sig);
+}
+
+/**
+setup our fault handlers
+
+Remembers the program name (only the first one given is kept; the
+pointer is stored, not copied) and installs sig_fault() for SIGSEGV,
+SIGBUS, SIGABRT and SIGFPE, for each of those that is defined.
+**/
+_PUBLIC_ void fault_setup(const char *pname)
+{
+       if (progname == NULL) {
+               progname = pname;
+       }
+#ifdef SIGSEGV
+       CatchSignal(SIGSEGV, sig_fault);
+#endif
+#ifdef SIGBUS
+       CatchSignal(SIGBUS, sig_fault);
+#endif
+#ifdef SIGABRT
+       CatchSignal(SIGABRT, sig_fault);
+#endif
+#ifdef SIGFPE
+       CatchSignal(SIGFPE, sig_fault);
+#endif
+}
+
+/**
+  register a fault handler. 
+  Should only be called once in the execution of smbd.
+
+  Only one handler can be registered: returns true when the slot was
+  free and has now been filled, false (leaving the existing handler in
+  place) when a handler name was already recorded.
+*/
+_PUBLIC_ bool register_fault_handler(const char *name, 
+                                    void (*fault_handler)(int sig))
+{
+       bool vacant = (fault_handlers.name == NULL);
+
+       if (vacant) {
+               fault_handlers.name = name;
+               fault_handlers.fault_handler = fault_handler;
+
+               DEBUG(2,("fault handler '%s' registered\n", name));
+       } else {
+               /* it's already registered! */
+               DEBUG(2,("fault handler '%s' already registered - failed '%s'\n", 
+                        fault_handlers.name, name));
+       }
+
+       return vacant;
+}
diff --git a/ctdb/lib/util/fault.m4 b/ctdb/lib/util/fault.m4
new file mode 100644 (file)
index 0000000..da077af
--- /dev/null
@@ -0,0 +1,15 @@
+dnl Detect backtrace support: the execinfo.h header plus the backtrace
+dnl function. NOTE(review): the _EXT macros are project helpers that
+dnl presumably record the library needed for backtrace in EXECINFO_LIBS;
+dnl confirm against their definitions.
+AC_CHECK_HEADERS(execinfo.h)
+AC_SEARCH_LIBS_EXT(backtrace, [execinfo], EXECINFO_LIBS)
+AC_CHECK_FUNC_EXT(backtrace, $EXECINFO_LIBS)
+
+
+dnl EXECINFO is enabled only when both the header and the function were
+dnl found; in that case the current CFLAGS, CPPFLAGS and LDFLAGS are
+dnl copied into the EXECINFO_ variables.
+if test x"$ac_cv_header_execinfo_h" = x"yes" -a x"$ac_cv_func_ext_backtrace" = x"yes";then
+       SMB_ENABLE(EXECINFO, YES)
+       EXECINFO_CFLAGS="$CFLAGS"
+       EXECINFO_CPPFLAGS="$CPPFLAGS"
+       EXECINFO_LDFLAGS="$LDFLAGS"
+else
+       SMB_ENABLE(EXECINFO,NO)
+fi
+
+SMB_EXT_LIB(EXECINFO, [${EXECINFO_LIBS}], [${EXECINFO_CFLAGS}], [${EXECINFO_CPPFLAGS}], [${EXECINFO_LDFLAGS}])
diff --git a/ctdb/lib/util/idtree.c b/ctdb/lib/util/idtree.c
new file mode 100644 (file)
index 0000000..09dc237
--- /dev/null
@@ -0,0 +1,387 @@
+/* 
+   Unix SMB/CIFS implementation.
+
+   very efficient functions to manage mapping a id (such as a fnum) to
+   a pointer. This is used for fnum and search id allocation.
+
+   Copyright (C) Andrew Tridgell 2004
+
+   This code is derived from lib/idr.c in the 2.6 Linux kernel, which was 
+   written by Jim Houston jim.houston@ccur.com, and is
+   Copyright (C) 2002 by Concurrent Computer Corporation
+    
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+  see the section marked "public interface" below for documentation
+*/
+
+/**
+ * @file
+ */
+
+#include "includes.h"
+
+#define IDR_BITS 5 /* id bits decoded per tree level */
+#define IDR_FULL 0xfffffffful /* bitmap value when all 32 slots of a layer are taken */
+#if 0 /* unused */
+#define TOP_LEVEL_FULL (IDR_FULL >> 30)
+#endif
+#define IDR_SIZE (1 << IDR_BITS) /* slots per layer */
+#define IDR_MASK ((1 << IDR_BITS)-1) /* extracts the slot index of one level */
+#define MAX_ID_SHIFT (sizeof(int)*8 - 1) /* ids are non-negative ints */
+#define MAX_ID_BIT (1U << MAX_ID_SHIFT)
+#define MAX_ID_MASK (MAX_ID_BIT - 1)
+#define MAX_LEVEL ((MAX_ID_SHIFT + IDR_BITS - 1) / IDR_BITS) /* parenthesised so it expands safely inside larger expressions */
+#define IDR_FREE_MAX (MAX_LEVEL + MAX_LEVEL) /* target length of the spare-layer list */
+
+#define set_bit(bit, v) ((v) |= (1U<<(bit))) /* 1U: bit can be 31 and 1<<31 overflows a signed int */
+#define clear_bit(bit, v) ((v) &= ~(1U<<(bit)))
+#define test_bit(bit, v) ((v) & (1U<<(bit)))
+                                  
+struct idr_layer {
+       uint32_t                 bitmap; /* bit m set: slot m (leaf) or the subtree under slot m is full */
+       struct idr_layer        *ary[IDR_SIZE]; /* child layers, or the user pointers in a leaf; ary[0] doubles as the free-list link */
+       int                      count; /* number of slots of ary in use */
+};
+
+struct idr_context {
+       struct idr_layer *top; /* root layer of the tree */
+       struct idr_layer *id_free; /* spare layers, chained through ary[0] */
+       int               layers; /* current depth of the tree */
+       int               id_free_cnt; /* number of layers on the id_free list */
+};
+
+static struct idr_layer *alloc_layer(struct idr_context *idp) /* pop a spare layer off id_free; NULL if the list is empty */
+{
+       struct idr_layer *p;
+
+       if (!(p = idp->id_free))
+               return NULL;
+       idp->id_free = p->ary[0]; /* ary[0] holds the link to the next spare layer */
+       idp->id_free_cnt--;
+       p->ary[0] = NULL; /* hand the layer out with a clean slot 0 */
+       return p;
+}
+
+static int find_next_bit(uint32_t bm, int maxid, int n) /* index of the first set bit of bm at or above n; maxid if none below maxid */
+{
+       while (n<maxid && !test_bit(n, bm)) n++; /* linear scan upward from n */
+       return n;
+}
+
+static void free_layer(struct idr_context *idp, struct idr_layer *p) /* push p onto the spare-layer list (memory is not released) */
+{
+       p->ary[0] = idp->id_free; /* ary[0] is reused as the list link */
+       idp->id_free = p;
+       idp->id_free_cnt++;
+}
+
+static int idr_pre_get(struct idr_context *idp) /* top the spare list up to IDR_FREE_MAX layers; 1 on success, 0 if an allocation fails */
+{
+       while (idp->id_free_cnt < IDR_FREE_MAX) {
+               struct idr_layer *new = talloc_zero(idp, struct idr_layer); /* spare layers are talloc children of the context */
+               if(new == NULL)
+                       return (0);
+               free_layer(idp, new);
+       }
+       return 1;
+}
+
+static int sub_alloc(struct idr_context *idp, void *ptr, int *starting_id) /* returns the new id, -1 on failure, or -2 (with *starting_id advanced) when the tree has to grow */
+{
+       int n, m, sh;
+       struct idr_layer *p, *new;
+       struct idr_layer *pa[MAX_LEVEL+1]; /* pa[l+1] is the parent of the layer visited at level l */
+       unsigned int l, id, oid;
+       uint32_t bm;
+
+       memset(pa, 0, sizeof(pa));
+
+       id = *starting_id;
+restart:
+       p = idp->top;
+       l = idp->layers;
+       pa[l--] = NULL; /* NULL sentinel above the top layer */
+       while (1) {
+               /*
+                * We run around this while until we reach the leaf node...
+                */
+               n = (id >> (IDR_BITS*l)) & IDR_MASK; /* slot that id selects at level l */
+               bm = ~p->bitmap; /* inverted: set bits now mark slots that are not full */
+               m = find_next_bit(bm, IDR_SIZE, n);
+               if (m == IDR_SIZE) {
+                       /* no space available go back to previous layer. */
+                       l++;
+                       oid = id;
+                       id = (id | ((1 << (IDR_BITS*l))-1)) + 1; /* first id past the exhausted subtree */
+
+                       /* if already at the top layer, we need to grow */
+                       if (!(p = pa[l])) {
+                               *starting_id = id;
+                               return -2;
+                       }
+
+                       /* If we need to go up one layer, continue the
+                        * loop; otherwise, restart from the top.
+                        */
+                       sh = IDR_BITS * (l + 1);
+                       if (oid >> sh == id >> sh)
+                               continue;
+                       else
+                               goto restart;
+               }
+               if (m != n) {
+                       sh = IDR_BITS*l;
+                       id = ((id >> sh) ^ n ^ m) << sh; /* swap slot n for m at this level and zero the lower levels */
+               }
+               if ((id >= MAX_ID_BIT) || (id < 0)) /* NOTE(review): id is unsigned, so "id < 0" can never be true */
+                       return -1;
+               if (l == 0)
+                       break;
+               /*
+                * Create the layer below if it is missing.
+                */
+               if (!p->ary[m]) {
+                       if (!(new = alloc_layer(idp)))
+                               return -1;
+                       p->ary[m] = new;
+                       p->count++;
+               }
+               pa[l--] = p;
+               p = p->ary[m];
+       }
+       /*
+        * We have reached the leaf node, plant the
+        * users pointer and return the raw id.
+        */
+       p->ary[m] = (struct idr_layer *)ptr;
+       set_bit(m, p->bitmap);
+       p->count++;
+       /*
+        * If this layer is full mark the bit in the layer above
+        * to show that this part of the radix tree is full.
+        * This may complete the layer above and require walking
+        * up the radix tree.
+        */
+       n = id;
+       while (p->bitmap == IDR_FULL) {
+               if (!(p = pa[++l]))
+                       break;
+               n = n >> IDR_BITS;
+               set_bit((n & IDR_MASK), p->bitmap);
+       }
+       return(id);
+}
+
+static int idr_get_new_above_int(struct idr_context *idp, void *ptr, int starting_id) /* store ptr under the first free id >= starting_id; returns that id or -1 */
+{
+       struct idr_layer *p, *new;
+       int layers, v, id;
+
+       idr_pre_get(idp); /* result ignored: a short spare list shows up as alloc_layer() == NULL below */
+       
+       id = starting_id;
+build_up:
+       p = idp->top;
+       layers = idp->layers;
+       if (!p) { /* empty tree: start with a single layer */
+               if (!(p = alloc_layer(idp)))
+                       return -1;
+               layers = 1;
+       }
+       /*
+        * Add a new layer to the top of the tree if the requested
+        * id is larger than the currently allocated space.
+        */
+       while ((layers < MAX_LEVEL) && (id >= (1 << (layers*IDR_BITS)))) {
+               layers++;
+               if (!p->count) /* an empty top layer is simply reused one level higher */
+                       continue;
+               if (!(new = alloc_layer(idp))) {
+                       /*
+                        * The allocation failed.  If we built part of
+                        * the structure tear it down.
+                        */
+                       for (new = p; p && p != idp->top; new = p) {
+                               p = p->ary[0];
+                               new->ary[0] = NULL;
+                               new->bitmap = new->count = 0;
+                               free_layer(idp, new);
+                       }
+                       return -1;
+               }
+               new->ary[0] = p; /* the old top becomes slot 0 of the new top */
+               new->count = 1;
+               if (p->bitmap == IDR_FULL)
+                       set_bit(0, new->bitmap);
+               p = new;
+       }
+       idp->top = p;
+       idp->layers = layers;
+       v = sub_alloc(idp, ptr, &id);
+       if (v == -2) /* sub_alloc ran off the top: grow the tree and retry from the advanced id */
+               goto build_up;
+       return(v);
+}
+
+static int sub_remove(struct idr_context *idp, int shift, int id) /* clear id from the tree; 0 if it was set, -1 otherwise */
+{
+       struct idr_layer *p = idp->top;
+       struct idr_layer **pa[1+MAX_LEVEL]; /* addresses of the pointers followed on the way down */
+       struct idr_layer ***paa = &pa[0];
+       int n;
+
+       *paa = NULL; /* NULL sentinel below the entry for idp->top */
+       *++paa = &idp->top;
+
+       while ((shift > 0) && p) {
+               n = (id >> shift) & IDR_MASK;
+               clear_bit(n, p->bitmap); /* this subtree is about to gain a free slot */
+               *++paa = &p->ary[n];
+               p = p->ary[n];
+               shift -= IDR_BITS;
+       }
+       n = id & IDR_MASK;
+       if (p != NULL && test_bit(n, p->bitmap)) {
+               clear_bit(n, p->bitmap);
+               p->ary[n] = NULL;
+               while(*paa && ! --((**paa)->count)){ /* recycle every layer that has become empty, bottom up */
+                       free_layer(idp, **paa);
+                       **paa-- = NULL;
+               }
+               if ( ! *paa ) /* reached the sentinel: even the top layer was recycled */
+                       idp->layers = 0;
+               return 0;
+       }
+       return -1;
+}
+
+static void *_idr_find(struct idr_context *idp, int id) /* pointer stored under id, or NULL if id is not in the tree */
+{
+       int n;
+       struct idr_layer *p;
+
+       n = idp->layers * IDR_BITS; /* number of id bits the current tree decodes */
+       p = idp->top;
+       /*
+        * Reject ids with bits above what the current tree decodes: the
+        * walk below masks every level and would alias them onto a low id.
+        */
+       if (n < 31 &&
+           ((id & ~(~0U << MAX_ID_SHIFT)) >> n)) { /* ~0U: left-shifting a negative int is undefined */
+            return NULL;
+       }
+
+       /* Mask off upper bits we don't use for the search. */
+       id &= MAX_ID_MASK;
+
+       while (n >= IDR_BITS && p) { /* descend one level per IDR_BITS, stopping at a missing child */
+               n -= IDR_BITS;
+               p = p->ary[(id >> n) & IDR_MASK];
+       }
+       return((void *)p);
+}
+
+static int _idr_remove(struct idr_context *idp, int id) /* remove id and shrink the tree where possible; 0 on success, -1 if id was not set */
+{
+       struct idr_layer *p;
+
+       /* Mask off upper bits we don't use for the search. */
+       id &= MAX_ID_MASK; /* NOTE(review): ids beyond the current tree depth are not rejected before the masked walk in sub_remove - confirm */
+
+       if (sub_remove(idp, (idp->layers - 1) * IDR_BITS, id) == -1) {
+               return -1;
+       }
+
+       if ( idp->top && idp->top->count == 1 && 
+            (idp->layers > 1) &&
+            idp->top->ary[0]) {
+               /* We can drop a layer */
+               p = idp->top->ary[0];
+               idp->top->bitmap = idp->top->count = 0;
+               free_layer(idp, idp->top);
+               idp->top = p;
+               --idp->layers;
+       }
+       while (idp->id_free_cnt >= IDR_FREE_MAX) { /* trim the spare list back below IDR_FREE_MAX */
+               p = alloc_layer(idp);
+               talloc_free(p);
+       }
+       return 0;
+}
+
+/************************************************************************
+  this is the public interface
+**************************************************************************/
+
+/**
+  initialise a idr tree. The context return value must be passed to
+  all subsequent idr calls. To destroy the idr tree use talloc_free()
+  on this context. The context is a zeroed talloc child of mem_ctx.
+ */
+_PUBLIC_ struct idr_context *idr_init(TALLOC_CTX *mem_ctx)
+{
+       return talloc_zero(mem_ctx, struct idr_context);
+}
+
+/**
+  allocate the next available id (searching from 0), and assign 'ptr' into its slot.
+  you can retrieve later this pointer using idr_find(). returns the id, or -1 on failure or if the id would exceed 'limit'
+*/
+_PUBLIC_ int idr_get_new(struct idr_context *idp, void *ptr, int limit)
+{
+       int ret = idr_get_new_above_int(idp, ptr, 0);
+       if (ret > limit) { /* over the limit: release the slot again and report failure */
+               idr_remove(idp, ret);
+               return -1;
+       }
+       return ret;
+}
+
+/**
+   allocate a new id, giving the first available value greater than or
+   equal to the given starting id. returns the id, or -1 on failure or if the id would exceed 'limit'
+*/
+_PUBLIC_ int idr_get_new_above(struct idr_context *idp, void *ptr, int starting_id, int limit)
+{
+       int ret = idr_get_new_above_int(idp, ptr, starting_id);
+       if (ret > limit) { /* over the limit: release the slot again and report failure */
+               idr_remove(idp, ret);
+               return -1;
+       }
+       return ret;
+}
+
+/**
+  find a pointer value previously set with idr_get_new given an id; thin public wrapper around _idr_find()
+*/
+_PUBLIC_ void *idr_find(struct idr_context *idp, int id)
+{
+       return _idr_find(idp, id);
+}
+
+/**
+  remove an id from the idr tree; returns 0 on success, or -1 (after logging a warning) if the id was not set
+*/
+_PUBLIC_ int idr_remove(struct idr_context *idp, int id)
+{
+       int ret;
+       ret = _idr_remove((struct idr_context *)idp, id);
+       if (ret != 0) {
+               DEBUG(0,("WARNING: attempt to remove unset id %d in idtree\n", id));
+       }
+       return ret;
+}
diff --git a/ctdb/lib/util/signal.c b/ctdb/lib/util/signal.c
new file mode 100644 (file)
index 0000000..ead947e
--- /dev/null
@@ -0,0 +1,144 @@
+/* 
+   Unix SMB/CIFS implementation.
+   signal handling functions
+
+   Copyright (C) Andrew Tridgell 1998
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/wait.h"
+
+/**
+ * @file
+ * @brief Signal handling
+ */
+
+/****************************************************************************
+ Catch child exits and reap the child zombie status (signum itself is unused).
+****************************************************************************/
+
+static void sig_cld(int signum)
+{
+       while (waitpid((pid_t)-1,(int *)NULL, WNOHANG) > 0) /* reap every child that has already exited, without blocking */
+               ;
+
+       /*
+        * Turns out it's *really* important not to
+        * restore the signal handler here if we have real POSIX
+        * signal handling. If we do, then we get the signal re-delivered
+        * immediately - hey presto - instant loop ! JRA.
+        */
+
+#if !defined(HAVE_SIGACTION)
+       CatchSignal(SIGCLD, sig_cld); /* without sigaction() CatchSignal uses signal(), so the handler is installed again here */
+#endif
+}
+
+/****************************************************************************
+catch child exits - leave status; no waitpid() is done here, so the exit status stays available
+****************************************************************************/
+
+static void sig_cld_leave_status(int signum)
+{
+       /*
+        * Turns out it's *really* important not to
+        * restore the signal handler here if we have real POSIX
+        * signal handling. If we do, then we get the signal re-delivered
+        * immediately - hey presto - instant loop ! JRA.
+        */
+
+#if !defined(HAVE_SIGACTION)
+       CatchSignal(SIGCLD, sig_cld_leave_status); /* without sigaction() CatchSignal uses signal(), so the handler is installed again here */
+#endif
+}
+
+/**
+ Block sigs: block (block == true) or unblock (block == false) the single signal signum.
+**/
+
+void BlockSignals(bool block, int signum)
+{
+#ifdef HAVE_SIGPROCMASK
+       sigset_t set;
+       sigemptyset(&set);
+       sigaddset(&set,signum); /* the set holds only signum, other signals are left alone */
+       sigprocmask(block?SIG_BLOCK:SIG_UNBLOCK,&set,NULL);
+#elif defined(HAVE_SIGBLOCK)
+       if (block) {
+               sigblock(sigmask(signum));
+       } else {
+               sigsetmask(siggetmask() & ~sigmask(signum));
+       }
+#else
+       /* yikes! This platform can't block signals? */
+       static int done; /* makes sure the warning is only logged once */
+       if (!done) {
+               DEBUG(0,("WARNING: No signal blocking available\n"));
+               done=1;
+       }
+#endif
+}
+
+/**
+ Catch a signal; returns the previous handler, or SIG_ERR if installing failed.
+ This should implement the following semantics:
+ 1) The handler remains installed after being called.
+ 2) The signal should be blocked during handler execution.
+**/
+
+void (*CatchSignal(int signum,void (*handler)(int )))(int)
+{
+#ifdef HAVE_SIGACTION
+       struct sigaction act;
+       struct sigaction oldact;
+
+       ZERO_STRUCT(act);
+
+       act.sa_handler = handler;
+#ifdef SA_RESTART
+       /*
+        * We *want* SIGALRM to interrupt a system call.
+        */
+       if(signum != SIGALRM)
+               act.sa_flags = SA_RESTART;
+#endif
+       sigemptyset(&act.sa_mask);
+       sigaddset(&act.sa_mask,signum); /* semantics 2): block signum while its handler runs */
+       if (sigaction(signum,&act,&oldact) != 0) return SIG_ERR; /* oldact is not filled in on failure; mirror signal() */
+       return oldact.sa_handler;
+#else /* !HAVE_SIGACTION */
+       /* FIXME: need to handle sigvec and systems with broken signal() */
+       return signal(signum, handler);
+#endif
+}
+
+/**
+ Catch SIGCLD with sig_cld, which reaps exited children so that no zombies are left.
+**/
+
+void CatchChild(void)
+{
+       CatchSignal(SIGCLD, sig_cld);
+}
+
+/**
+ Catch SIGCLD but leave the child around so its status can be reaped.
+**/
+
+void CatchChildLeaveStatus(void)
+{
+       CatchSignal(SIGCLD, sig_cld_leave_status);
+}
diff --git a/ctdb/lib/util/signal.m4 b/ctdb/lib/util/signal.m4
new file mode 100644 (file)
index 0000000..c6d7f72
--- /dev/null
@@ -0,0 +1 @@
+AC_CHECK_FUNCS(sigprocmask sigblock sigaction)
diff --git a/ctdb/lib/util/strlist.c b/ctdb/lib/util/strlist.c
new file mode 100644 (file)
index 0000000..48a6e45
--- /dev/null
@@ -0,0 +1,52 @@
+/* 
+   Unix SMB/CIFS implementation.
+   
+   Copyright (C) Andrew Tridgell 2005
+   Copyright (C) Jelmer Vernooij 2005
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/locale.h"
+
+/**
+  return the number of elements in a NULL-terminated string list (0 if list itself is NULL)
+*/
+_PUBLIC_ size_t str_list_length(const char **list)
+{
+       size_t ret;
+       for (ret=0;list && list[ret];ret++) /* noop */ ;
+       return ret;
+}
+
+
+/**
+  add a copy of s to the end of a string list; returns the (possibly moved) list, or NULL on allocation failure
+*/
+_PUBLIC_ const char **str_list_add(const char **list, const char *s)
+{
+       size_t len = str_list_length(list);
+       const char **ret;
+
+       ret = talloc_realloc(NULL, list, const char *, len+2); /* room for the new entry plus the NULL terminator */
+       if (ret == NULL) return NULL;
+
+       ret[len] = talloc_strdup(ret, s); /* the copy is a talloc child of the list */
+       if (ret[len] == NULL) return NULL; /* NOTE(review): the array has already been reallocated here, so the caller's old pointer may be stale - confirm callers cope */
+
+       ret[len+1] = NULL;
+
+       return ret;
+}
diff --git a/ctdb/lib/util/substitute.c b/ctdb/lib/util/substitute.c
new file mode 100644 (file)
index 0000000..32945a7
--- /dev/null
@@ -0,0 +1,167 @@
+/* 
+   Unix SMB/CIFS implementation.
+   Samba utility functions
+   
+   Copyright (C) Andrew Tridgell 1992-2001
+   Copyright (C) Simo Sorce      2001-2002
+   Copyright (C) Martin Pool     2003
+   Copyright (C) James Peach    2005
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+
+/**
+ * @file
+ * @brief Substitute utilities.
+ **/
+
+/**
+ Substitute a string for a pattern in another string. Make sure there is
+ enough room!
+
+ This routine looks for pattern in s and replaces it with
+ insert. It may do multiple replacements. It stops (with a log message)
+ at the first replacement that would not fit into len bytes.
+
+ Any of ` " ' ; $ % CR or LF in the insert string are replaced with _
+ if len==0 then the string cannot be extended. This is different from the old
+ use of len==0 which was for no length checks to be done.
+**/
+
+_PUBLIC_ void string_sub(char *s, const char *pattern, const char *insert, size_t len)
+{
+       char *p;
+       ssize_t ls, lp, li, i;
+
+       if (!insert || !pattern || !*pattern || !s) /* NULL arguments or an empty pattern: nothing to do */
+               return;
+
+       ls = (ssize_t)strlen(s);
+       lp = (ssize_t)strlen(pattern);
+       li = (ssize_t)strlen(insert);
+
+       if (len == 0)
+               len = ls + 1; /* len is number of *bytes* */
+
+       while (lp <= ls && (p = strstr(s, pattern))) {
+               if (ls + (li-lp) >= len) { /* the result plus its terminator would not fit */
+                       DEBUG(0,("ERROR: string overflow by %d in string_sub(%.50s, %d)\n", 
+                                (int)(ls + (li-lp) - len),
+                                pattern, (int)len));
+                       break;
+               }
+               if (li != lp) {
+                       memmove(p+li,p+lp,strlen(p+lp)+1); /* shift the tail (with its terminator) to open or close the gap */
+               }
+               for (i=0;i<li;i++) {
+                       switch (insert[i]) {
+                       case '`':
+                       case '"':
+                       case '\'':
+                       case ';':
+                       case '$':
+                       case '%':
+                       case '\r':
+                       case '\n':
+                               p[i] = '_';
+                               break;
+                       default:
+                               p[i] = insert[i];
+                       }
+               }
+               s = p + li; /* continue after the inserted text, so it is never rescanned */
+               ls += (li-lp);
+       }
+}
+
+/**
+ * Talloc'ed version of string_sub: returns a new string allocated on mem_ctx, or NULL on bad arguments or allocation failure
+ */
+_PUBLIC_ char *string_sub_talloc(TALLOC_CTX *mem_ctx, const char *s, 
+                               const char *pattern, const char *insert)
+{
+       const char *p;
+       char *ret;
+       size_t len, alloc_len;
+
+       if (insert == NULL || pattern == NULL || !*pattern || s == NULL)
+               return NULL;
+
+       /* determine length needed */
+       len = strlen(s);
+       
+       for (p = strstr(s, pattern); p != NULL; 
+            p = strstr(p+strlen(pattern), pattern)) {
+               len += strlen(insert) - strlen(pattern); /* each non-overlapping match changes the length by this much */
+       }
+
+       alloc_len = MAX(len, strlen(s))+1; /* the buffer must first hold the unmodified copy of s */
+       ret = talloc_array(mem_ctx, char, alloc_len);
+       if (ret == NULL)
+               return NULL;
+       strncpy(ret, s, alloc_len);
+       string_sub(ret, pattern, insert, alloc_len); /* substitute in place inside the copy */
+
+       ret = talloc_realloc(mem_ctx, ret, char, len+1); /* resize to the final length */
+       if (ret == NULL)
+               return NULL;
+
+       SMB_ASSERT(ret[len] == '\0');
+
+       talloc_set_name_const(ret, ret);
+
+       return ret;
+}
+
+/**
+ Similar to string_sub() but allows for any character to be substituted
+ (insert is copied verbatim). Use with caution!
+ if len==0 then the string cannot be extended. This is different from the old
+ use of len==0 which was for no length checks to be done.
+**/
+
+_PUBLIC_ void all_string_sub(char *s,const char *pattern,const char *insert, size_t len)
+{
+       char *p;
+       ssize_t ls,lp,li;
+
+       if (!insert || !pattern || !s)
+               return;
+
+       ls = (ssize_t)strlen(s);
+       lp = (ssize_t)strlen(pattern);
+       li = (ssize_t)strlen(insert);
+
+       if (!*pattern) /* an empty pattern matches nothing */
+               return;
+       
+       if (len == 0)
+               len = ls + 1; /* len is number of *bytes* */
+       
+       while (lp <= ls && (p = strstr(s,pattern))) {
+               if (ls + (li-lp) >= len) { /* the result plus its terminator would not fit */
+                       DEBUG(0,("ERROR: string overflow by %d in all_string_sub(%.50s, %d)\n", 
+                                (int)(ls + (li-lp) - len),
+                                pattern, (int)len));
+                       break;
+               }
+               if (li != lp) {
+                       memmove(p+li,p+lp,strlen(p+lp)+1); /* shift the tail (with its terminator) to open or close the gap */
+               }
+               memcpy(p, insert, li);
+               s = p + li; /* continue after the inserted text, so it is never rescanned */
+               ls += (li-lp);
+       }
+}
diff --git a/ctdb/lib/util/util.c b/ctdb/lib/util/util.c
new file mode 100644 (file)
index 0000000..af52805
--- /dev/null
@@ -0,0 +1,52 @@
+/* 
+   Unix SMB/CIFS implementation.
+   
+   Copyright (C) Andrew Tridgell 2005
+   
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+
+
+/**
+ Set a fd into blocking/nonblocking mode. Uses POSIX O_NONBLOCK if available,
+ else
+  if SYSV use O_NDELAY
+  if BSD use FNDELAY
+ set == true selects blocking mode. Returns -1 if F_GETFL fails, else the result of the F_SETFL call.
+**/
+_PUBLIC_ int set_blocking(int fd, bool set)
+{
+       int val;
+#ifdef O_NONBLOCK
+#define FLAG_TO_SET O_NONBLOCK
+#else
+#ifdef SYSV
+#define FLAG_TO_SET O_NDELAY
+#else /* BSD */
+#define FLAG_TO_SET FNDELAY
+#endif
+#endif
+
+       if((val = fcntl(fd, F_GETFL, 0)) == -1) /* read the current flags so the other bits are preserved */
+               return -1;
+       if(set) /* Turn blocking on - ie. clear nonblock flag */
+               val &= ~FLAG_TO_SET;
+       else
+               val |= FLAG_TO_SET;
+       return fcntl( fd, F_SETFL, val);
+#undef FLAG_TO_SET
+}
diff --git a/ctdb/lib/util/util.h b/ctdb/lib/util/util.h
new file mode 100644 (file)
index 0000000..467fba8
--- /dev/null
@@ -0,0 +1,651 @@
+/* 
+   Unix SMB/CIFS implementation.
+   Utility functions for Samba
+   Copyright (C) Andrew Tridgell 1992-1999
+   Copyright (C) Jelmer Vernooij 2005
+    
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _SAMBA_UTIL_H_
+#define _SAMBA_UTIL_H_
+
+/**
+ * @file
+ * @brief Helpful macros
+ */
+
+struct smbsrv_tcon;
+
+#ifdef _SAMBA_BUILD_
+extern const char *logfile;
+#endif
+extern const char *panic_action;
+extern void (*pre_panic_action_hook)(void);
+extern void (*post_panic_action_hook)(void);
+
+/**
+ * assert macros 
+ */
+#ifdef DEVELOPER
+#define SMB_ASSERT(b) do { if (!(b)) { \
+        DEBUG(0,("PANIC: assert failed at %s(%d): %s\n", \
+                __FILE__, __LINE__, #b)); smb_panic("assert failed: " #b); }} while(0)
+#else
+/* redefine the assert macro for non-developer builds */
+#define SMB_ASSERT(b) do { if (!(b)) { \
+        DEBUG(0,("PANIC: assert failed at %s(%d): %s\n", \
+           __FILE__, __LINE__, #b)); }} while (0)
+#endif
+
+#if _SAMBA_BUILD_ == 4
+#ifdef VALGRIND
+#define strlen(x) valgrind_strlen(x)
+size_t valgrind_strlen(const char *s);
+#endif
+#endif
+
+#ifndef ABS
+#define ABS(a) ((a)>0?(a):(-(a)))
+#endif
+
+/**
+ * Write backtrace to debug log
+ */
+_PUBLIC_ void call_backtrace(void);
+
+/**
+ Something really nasty happened - panic !
+**/
+_PUBLIC_ _NORETURN_ void smb_panic(const char *why);
+
+/**
+setup our fault handlers
+**/
+_PUBLIC_ void fault_setup(const char *pname);
+
+/**
+  register a fault handler. 
+  Should only be called once in the execution of smbd.
+*/
+_PUBLIC_ bool register_fault_handler(const char *name, void (*fault_handler)(int sig));
+
+/* The following definitions come from lib/util/signal.c  */
+
+
+/**
+ Block sigs.
+**/
+void BlockSignals(bool block, int signum);
+
+/**
+ Catch a signal. This should implement the following semantics:
+
+ 1) The handler remains installed after being called.
+ 2) The signal should be blocked during handler execution.
+**/
+void (*CatchSignal(int signum,void (*handler)(int )))(int);
+
+/**
+ Catch SIGCLD and reap exited children (handler sig_cld in lib/util/signal.c).
+**/
+void CatchChild(void);
+
+/**
+ Catch SIGCLD but leave the child around so its status can be reaped.
+**/
+void CatchChildLeaveStatus(void);
+
+
+/* The following definitions come from lib/util/util_str.c  */
+
+
+/**
+ Trim the specified elements off the front and back of a string.
+**/
+_PUBLIC_ bool trim_string(char *s, const char *front, const char *back);
+
+/**
+ Find the number of 'c' chars in a string
+**/
+_PUBLIC_ _PURE_ size_t count_chars(const char *s, char c);
+
+/**
+ Safe string copy into a known length string. maxlength does not
+ include the terminating zero.
+**/
+_PUBLIC_ char *safe_strcpy(char *dest,const char *src, size_t maxlength);
+
+/**
+ Safe string cat into a string. maxlength does not
+ include the terminating zero.
+**/
+_PUBLIC_ char *safe_strcat(char *dest, const char *src, size_t maxlength);
+
+/**
+ Routine to get hex characters and turn them into a 16 byte array.
+ the array can be variable length, and any non-hex-numeric
+ characters are skipped.  "0xnn" or "0Xnn" is specially catered
+ for.
+
+ valid examples: "0A5D15"; "0x15, 0x49, 0xa2"; "59\ta9\te3\n"
+
+
+**/
+_PUBLIC_ size_t strhex_to_str(char *p, size_t p_len, const char *strhex, size_t strhex_len);
+
+#ifdef _SAMBA_BUILD_
+/** 
+ * Parse a hex string and return a data blob. 
+ */
+_PUBLIC_ _PURE_ DATA_BLOB strhex_to_data_blob(TALLOC_CTX *mem_ctx, const char *strhex) ;
+#endif
+
+/**
+ * Routine to print a buffer as HEX digits, into an allocated string.
+ */
+_PUBLIC_ void hex_encode(const unsigned char *buff_in, size_t len, char **out_hex_buffer);
+
+/**
+ * talloc version of hex_encode()
+ */
+_PUBLIC_ char *hex_encode_talloc(TALLOC_CTX *mem_ctx, const unsigned char *buff_in, size_t len);
+
+/**
+ Substitute a string for a pattern in another string. Make sure there is 
+ enough room!
+
+ This routine looks for pattern in s and replaces it with 
+ insert. It may do multiple replacements.
+
+ Any of ` " ' ; $ % CR or LF in the insert string are replaced with _
+ if len==0 then the string cannot be extended. This is different from the old
+ use of len==0 which was for no length checks to be done.
+**/
+_PUBLIC_ void string_sub(char *s,const char *pattern, const char *insert, size_t len);
+
+
+_PUBLIC_ char *string_sub_talloc(TALLOC_CTX *mem_ctx, const char *s, 
+                               const char *pattern, const char *insert);
+
+/**
+ Similar to string_sub() but allows for any character to be substituted. 
+ Use with caution!
+ if len==0 then the string cannot be extended. This is different from the old
+ use of len==0 which was for no length checks to be done.
+**/
+_PUBLIC_ void all_string_sub(char *s,const char *pattern,const char *insert, size_t len);
+
+/**
+ Unescape a URL encoded string, in place.
+**/
+_PUBLIC_ void rfc1738_unescape(char *buf);
+
+/**
+  format a string into length-prefixed dotted domain format, as used in NBT
+  and in some ADS structures
+**/
+_PUBLIC_ const char *str_format_nbt_domain(TALLOC_CTX *mem_ctx, const char *s);
+
+/**
+ * Add a string to an array of strings.
+ *
+ * num should be a pointer to an integer that holds the current 
+ * number of elements in strings. It will be updated by this function.
+ */
+_PUBLIC_ bool add_string_to_array(TALLOC_CTX *mem_ctx,
+                        const char *str, const char ***strings, int *num);
+
+/**
+  variant of strcmp() that handles NULL ptrs
+**/
+_PUBLIC_ int strcmp_safe(const char *s1, const char *s2);
+
+/**
+return the number of bytes occupied by a buffer in ASCII format
+the result includes the null termination
+limited by 'n' bytes
+**/
+_PUBLIC_ size_t ascii_len_n(const char *src, size_t n);
+
+/**
+ Set a boolean variable from the text value stored in the passed string.
+ Returns true in success, false if the passed string does not correctly 
+ represent a boolean.
+**/
+_PUBLIC_ bool set_boolean(const char *boolean_string, bool *boolean);
+
+/**
+ * Parse a string containing a boolean value.
+ *
+ * val will be set to the read value.
+ *
+ * @retval true if a boolean value was parsed, false otherwise.
+ */
+_PUBLIC_ bool conv_str_bool(const char * str, bool * val);
+
+#if _SAMBA_BUILD_ == 4
+/**
+ * Convert a size specification like 16K into an integral number of bytes. 
+ **/
+_PUBLIC_ bool conv_str_size(const char * str, uint64_t * val);
+#endif
+
+/**
+ * Parse a uint64_t value from a string
+ *
+ * val will be set to the value read.
+ *
+ * @retval true if parsing was successful, false otherwise
+ */
+_PUBLIC_ bool conv_str_u64(const char * str, uint64_t * val);
+
+/**
+return the number of bytes occupied by a buffer in CH_UTF16 format
+the result includes the null termination
+**/
+_PUBLIC_ size_t utf16_len(const void *buf);
+
+/**
+return the number of bytes occupied by a buffer in CH_UTF16 format
+the result includes the null termination
+limited by 'n' bytes
+**/
+_PUBLIC_ size_t utf16_len_n(const void *src, size_t n);
+_PUBLIC_ size_t ucs2_align(const void *base_ptr, const void *p, int flags);
+
+/**
+Do a case-insensitive, whitespace-ignoring string compare.
+**/
+_PUBLIC_ int strwicmp(const char *psz1, const char *psz2);
+
+/**
+ String replace.
+**/
+_PUBLIC_ void string_replace(char *s, char oldc, char newc);
+
+/**
+ * Compare 2 strings.
+ *
+ * @note The comparison is case-insensitive.
+ **/
+_PUBLIC_ bool strequal(const char *s1, const char *s2);
+
+/* The following definitions come from lib/util/util_strlist.c  */
+#ifdef _SAMBA_BUILD_
+
+/* separators for lists */
+#ifndef LIST_SEP
+#define LIST_SEP " \t,\n\r"
+#endif
+
+/**
+  build a null terminated list of strings from an input string and a
+  separator list. The separator list must contain characters less than
+  or equal to 0x2f for this to work correctly on multi-byte strings
+*/
+_PUBLIC_ char **str_list_make(TALLOC_CTX *mem_ctx, const char *string, const char *sep);
+
+/**
+ * build a null terminated list of strings from an argv-like input string 
+ * Entries are separated by spaces and can be enclosed by quotes. 
+ * Does NOT support escaping
+ */
+_PUBLIC_ const char **str_list_make_shell(TALLOC_CTX *mem_ctx, const char *string, const char *sep);
+
+/**
+ * join a list back to one string 
+ */
+_PUBLIC_ char *str_list_join(TALLOC_CTX *mem_ctx, const char **list, char seperator);
+
+/** join a list back to one (shell-like) string; entries 
+ * separated by spaces, using quotes where necessary */
+_PUBLIC_ char *str_list_join_shell(TALLOC_CTX *mem_ctx, const char **list, char sep);
+
+/**
+  return the number of elements in a string list
+*/
+_PUBLIC_ size_t str_list_length(const char * const *list);
+
+/**
+  copy a string list
+*/
+_PUBLIC_ char **str_list_copy(TALLOC_CTX *mem_ctx, const char **list);
+
+/**
+   Return true if all the elements of the list match exactly.
+ */
+_PUBLIC_ bool str_list_equal(const char **list1, const char **list2);
+
+/**
+  add an entry to a string list
+*/
+_PUBLIC_ const char **str_list_add(const char **list, const char *s);
+
+/**
+  remove an entry from a string list
+*/
+_PUBLIC_ void str_list_remove(const char **list, const char *s);
+
+/**
+  return true if a string is in a list
+*/
+_PUBLIC_ bool str_list_check(const char **list, const char *s);
+
+/**
+  return true if a string is in a list, case insensitively
+*/
+_PUBLIC_ bool str_list_check_ci(const char **list, const char *s);
+#endif
+
+/* The following definitions come from lib/util/util_file.c  */
+
+
+#ifdef _SAMBA_BUILD_
+/**
+read a line from a file with possible \ continuation chars. 
+Blanks at the start or end of a line are stripped.
+The string will be allocated if s2 is NULL
+**/
+_PUBLIC_ char *fgets_slash(char *s2,int maxlen,XFILE *f);
+#endif
+
+/**
+ * Read one line (data until next newline or eof) and allocate it 
+ */
+_PUBLIC_ char *afdgets(int fd, TALLOC_CTX *mem_ctx, size_t hint);
+
+#ifdef _SAMBA_BUILD_
+/**
+load a file into memory from a fd.
+**/
+_PUBLIC_ char *fd_load(int fd, size_t *size, size_t maxsize, TALLOC_CTX *mem_ctx);
+
+
+char **file_lines_parse(char *p, size_t size, int *numlines, TALLOC_CTX *mem_ctx);
+
+/**
+load a file into memory
+**/
+_PUBLIC_ char *file_load(const char *fname, size_t *size, size_t maxsize, TALLOC_CTX *mem_ctx);
+#endif
+
+/**
+mmap (if possible) or read a file
+**/
+_PUBLIC_ void *map_file(const char *fname, size_t size);
+
+#ifdef _SAMBA_BUILD_
+/**
+load a file into memory and return an array of pointers to lines in the file
+must be freed with talloc_free(). 
+**/
+_PUBLIC_ char **file_lines_load(const char *fname, int *numlines, size_t maxsize, TALLOC_CTX *mem_ctx);
+#endif
+
+/**
+load a fd into memory and return an array of pointers to lines in the file
+must be freed with talloc_free(). If convert is true calls unix_to_dos on
+the list.
+**/
+_PUBLIC_ char **fd_lines_load(int fd, int *numlines, size_t maxsize, TALLOC_CTX *mem_ctx);
+
+/**
+take a list of lines and modify them to produce a list where \ continues
+a line
+**/
+_PUBLIC_ void file_lines_slashcont(char **lines);
+
+/**
+  save a lump of data into a file. Mostly used for debugging 
+*/
+_PUBLIC_ bool file_save(const char *fname, const void *packet, size_t length);
+_PUBLIC_ int vfdprintf(int fd, const char *format, va_list ap) PRINTF_ATTRIBUTE(2,0);
+_PUBLIC_ int fdprintf(int fd, const char *format, ...) PRINTF_ATTRIBUTE(2,3);
+_PUBLIC_ bool large_file_support(const char *path);
+
+/* The following definitions come from lib/util/util.c  */
+
+
+/**
+ Find a suitable temporary directory. The result should be copied immediately
+ as it may be overwritten by a subsequent call.
+**/
+_PUBLIC_ const char *tmpdir(void);
+
+/**
+ Check if a file exists - call vfs_file_exist for samba files.
+**/
+_PUBLIC_ bool file_exist(const char *fname);
+
+/**
+ Check a file's modification time.
+**/
+_PUBLIC_ time_t file_modtime(const char *fname);
+
+/**
+ Check if a directory exists.
+**/
+_PUBLIC_ bool directory_exist(const char *dname);
+
+/**
+ * Try to create the specified directory if it didn't exist.
+ *
+ * @retval true if the directory already existed and has the right permissions 
+ * or was successfully created.
+ */
+_PUBLIC_ bool directory_create_or_exist(const char *dname, uid_t uid, 
+                              mode_t dir_perms);
+
+/**
+ Set a fd into blocking/nonblocking mode. Uses POSIX O_NONBLOCK if available,
+ else
+  if SYSV use O_NDELAY
+  if BSD use FNDELAY
+**/
+_PUBLIC_ int set_blocking(int fd, bool set);
+
+/**
+ Sleep for a specified number of milliseconds.
+**/
+_PUBLIC_ void msleep(unsigned int t);
+
+/**
+ Get my own name, return in malloc'ed storage.
+**/
+_PUBLIC_ char* get_myname(void);
+
+/**
+ Return true if a string could be a pure IP address.
+**/
+_PUBLIC_ bool is_ipaddress(const char *str);
+
+/**
+ Interpret an internet address or name into an IP address in 4 byte form.
+**/
+_PUBLIC_ uint32_t interpret_addr(const char *str);
+
+/**
+ A convenient addition to interpret_addr().
+**/
+_PUBLIC_ struct in_addr interpret_addr2(const char *str);
+
+/**
+ Check if an IP is the 0.0.0.0.
+**/
+_PUBLIC_ bool is_zero_ip_v4(struct in_addr ip);
+
+/**
+ Are two IPs on the same subnet?
+**/
+_PUBLIC_ bool same_net_v4(struct in_addr ip1,struct in_addr ip2,struct in_addr mask);
+
+_PUBLIC_ bool is_ipaddress_v4(const char *str);
+
+/**
+ Check if a process exists. Does this work on all unixes?
+**/
+_PUBLIC_ bool process_exists_by_pid(pid_t pid);
+
+/**
+ Simple routine to do POSIX file locking. Cruft in NFS and 64->32 bit mapping
+ is dealt with in posix.c
+**/
+_PUBLIC_ bool fcntl_lock(int fd, int op, off_t offset, off_t count, int type);
+
+/**
+ malloc that aborts with smb_panic on fail or zero size.
+**/
+_PUBLIC_ void *smb_xmalloc(size_t size);
+
+/**
+ Memdup with smb_panic on fail.
+**/
+_PUBLIC_ void *smb_xmemdup(const void *p, size_t size);
+
+/**
+ strdup that aborts on malloc fail.
+**/
+_PUBLIC_ char *smb_xstrdup(const char *s);
+
+char *smb_xstrndup(const char *s, size_t n);
+
+/**
+ Like strdup but for memory.
+**/
+_PUBLIC_ void *memdup(const void *p, size_t size);
+
+/**
+ * see if a range of memory is all zero. A NULL pointer is considered
+ * to be all zero 
+ */
+_PUBLIC_ bool all_zero(const uint8_t *ptr, size_t size);
+
+/**
+  realloc an array, checking for integer overflow in the array size
+*/
+_PUBLIC_ void *realloc_array(void *ptr, size_t el_size, unsigned count, bool free_on_fail);
+
+void *malloc_array(size_t el_size, unsigned int count);
+
+/* The following definitions come from lib/util/fsusage.c  */
+
+
+/**
+ * Retrieve amount of free disk space.
+ * this does all of the system specific guff to get the free disk space.
+ * It is derived from code in the GNU fileutils package, but has been
+ * considerably mangled for use here 
+ *
+ * results are returned in *dfree and *dsize, in 512 byte units
+*/
+_PUBLIC_ int sys_fsusage(const char *path, uint64_t *dfree, uint64_t *dsize);
+
+/* The following definitions come from lib/util/ms_fnmatch.c  */
+
+
+/**
+ * @file
+ * @brief MS-style Filename matching
+ */
+
+#if _SAMBA_BUILD_ == 4
+/* Protocol types, as passed to ms_fnmatch(). It assumes that higher
+   protocols include lower protocols as subsets.
+   NOTE(review): the enumerators look like they are listed from lowest to
+   highest so that they can be compared numerically - confirm.
+   FIXME: Move to one of the smb-specific headers */
+enum protocol_types {
+       PROTOCOL_NONE,
+       PROTOCOL_CORE,
+       PROTOCOL_COREPLUS,
+       PROTOCOL_LANMAN1,
+       PROTOCOL_LANMAN2,
+       PROTOCOL_NT1,
+       PROTOCOL_SMB2
+};
+
+int ms_fnmatch(const char *pattern, const char *string, enum protocol_types protocol);
+
+/** a generic fnmatch function - used for non-CIFS pattern matching */
+int gen_fnmatch(const char *pattern, const char *string);
+#endif
+
+/* The following definitions come from lib/util/mutex.c  */
+
+
+#ifdef _SAMBA_BUILD_
+/**
+  register a set of mutex/rwlock handlers. 
+  Should only be called once in the execution of smbd.
+*/
+_PUBLIC_ bool register_mutex_handlers(const char *name, struct mutex_ops *ops);
+#endif
+
+/* The following definitions come from lib/util/idtree.c  */
+
+
+/**
+  initialise an idr tree. The context return value must be passed to
+  all subsequent idr calls. To destroy the idr tree use talloc_free()
+  on this context
+ */
+_PUBLIC_ struct idr_context *idr_init(TALLOC_CTX *mem_ctx);
+
+/**
+  allocate the next available id, and assign 'ptr' into its slot.
+  you can later retrieve this pointer using idr_find()
+*/
+_PUBLIC_ int idr_get_new(struct idr_context *idp, void *ptr, int limit);
+
+/**
+   allocate a new id, giving the first available value greater than or
+   equal to the given starting id
+*/
+_PUBLIC_ int idr_get_new_above(struct idr_context *idp, void *ptr, int starting_id, int limit);
+
+/**
+  allocate a new id randomly in the given range
+*/
+_PUBLIC_ int idr_get_new_random(struct idr_context *idp, void *ptr, int limit);
+
+/**
+  find a pointer value previously set with idr_get_new given an id
+*/
+_PUBLIC_ void *idr_find(struct idr_context *idp, int id);
+
+/**
+  remove an id from the idr tree
+*/
+_PUBLIC_ int idr_remove(struct idr_context *idp, int id);
+
+/* The following definitions come from lib/util/become_daemon.c  */
+
+#if _SAMBA_BUILD_ == 4
+/**
+ Become a daemon, discarding the controlling terminal.
+**/
+_PUBLIC_ void become_daemon(bool fork);
+#endif
+
+/**
+ * Load a ini-style file.
+ */
+bool pm_process( const char *fileName,
+                 bool (*sfunc)(const char *, void *),
+                 bool (*pfunc)(const char *, const char *, void *),
+                                void *userdata);
+
+bool unmap_file(void *start, size_t size);
+
+#define CONST_DISCARD(type, ptr)      ((type) ((void *) (ptr)))
+
+#endif /* _SAMBA_UTIL_H_ */
diff --git a/ctdb/lib/util/util_file.c b/ctdb/lib/util/util_file.c
new file mode 100644 (file)
index 0000000..3a90201
--- /dev/null
@@ -0,0 +1,132 @@
+/*
+  functions taken from samba4 for quick prototyping of ctdb. These are
+  not intended to remain part of ctdb
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+
+
+/**
+  load the whole contents of an open file descriptor into a talloc'ed,
+  NUL-terminated buffer.
+
+  The length is taken from fstat(). If 'size' is not NULL it receives
+  the number of bytes loaded (not counting the terminating NUL).
+  Returns NULL if fstat() fails, the allocation fails, or fewer than
+  st_size bytes could be read.
+*/
+static char *fd_load(int fd, size_t *size, TALLOC_CTX *mem_ctx)
+{
+       struct stat sbuf;
+       char *p;
+       size_t total, done = 0;
+
+       if (fstat(fd, &sbuf) != 0) return NULL;
+       if (sbuf.st_size < 0) return NULL;
+
+       total = (size_t)sbuf.st_size;
+
+       p = (char *)talloc_size(mem_ctx, total+1);
+       if (!p) return NULL;
+
+       /* read() may legitimately return fewer bytes than requested,
+          so keep reading until the whole file has been consumed */
+       while (done < total) {
+               ssize_t n = read(fd, p + done, total - done);
+               if (n <= 0) {
+                       /* read error, or end of file before st_size bytes */
+                       talloc_free(p);
+                       return NULL;
+               }
+               done += (size_t)n;
+       }
+       p[total] = 0;
+
+       if (size) *size = total;
+
+       return p;
+}
+
+
+/**
+  open 'fname' read-only and load its contents via fd_load().
+  Returns NULL for a NULL or empty name, or when open() fails;
+  otherwise returns whatever fd_load() returns.
+*/
+static char *file_load(const char *fname, size_t *size, TALLOC_CTX *mem_ctx)
+{
+       char *contents;
+       int handle;
+
+       if (fname == NULL || fname[0] == '\0') {
+               return NULL;
+       }
+
+       handle = open(fname, O_RDONLY);
+       if (handle == -1) {
+               return NULL;
+       }
+
+       contents = fd_load(handle, size, mem_ctx);
+       close(handle);
+
+       return contents;
+}
+
+
+/**
+split a buffer into lines, in place
+on failure 'p' is freed; on success 'p' becomes a talloc child of the
+returned array, whose entries point into it
+**/
+static char **file_lines_parse(char *p, size_t size, int *numlines, TALLOC_CTX *mem_ctx)
+{
+       char *end, *cur, **lines;
+       int count = 0, idx = 0;
+
+       if (p == NULL) {
+               return NULL;
+       }
+       end = p + size;
+
+       /* first pass: count the newlines to size the array */
+       for (cur = p; cur < end; cur++) {
+               if (*cur == '\n') {
+                       count++;
+               }
+       }
+
+       /* one slot per newline, one for the text after the last
+          newline, and one left NULL as a terminator */
+       lines = talloc_array(mem_ctx, char *, count+2);
+       if (lines == NULL) {
+               talloc_free(p);
+               return NULL;
+       }
+
+       talloc_steal(lines, p);
+
+       memset(lines, 0, sizeof(lines[0])*(count+2));
+       if (numlines != NULL) {
+               *numlines = count;
+       }
+
+       /* second pass: terminate each line and record where the next
+          one starts; carriage returns are blanked as well */
+       lines[0] = p;
+       for (cur = p; cur < end; cur++) {
+               if (*cur == '\n') {
+                       *cur = 0;
+                       lines[++idx] = cur + 1;
+               } else if (*cur == '\r') {
+                       *cur = 0;
+               }
+       }
+
+       return lines;
+}
+
+
+/**
+read the named file into memory and split it into an array of line pointers
+the returned array is released with talloc_free()
+**/
+_PUBLIC_ char **file_lines_load(const char *fname, int *numlines, TALLOC_CTX *mem_ctx)
+{
+       size_t nbytes;
+       char *data = file_load(fname, &nbytes, mem_ctx);
+
+       if (data == NULL) {
+               return NULL;
+       }
+
+       return file_lines_parse(data, nbytes, numlines, mem_ctx);
+}
+
+/**
+  encode a buffer as an upper-case hex string
+
+  Returns a talloc'ed, NUL-terminated string of 2*len characters, or
+  NULL if the allocation fails.
+*/
+char *hex_encode_talloc(TALLOC_CTX *mem_ctx, const unsigned char *buff_in, size_t len)
+{
+       static const char digits[] = "0123456789ABCDEF";
+       size_t i;
+       char *hex_buffer;
+
+       hex_buffer = talloc_array(mem_ctx, char, (len*2)+1);
+       if (hex_buffer == NULL) {
+               return NULL;
+       }
+
+       for (i = 0; i < len; i++) {
+               hex_buffer[i*2]     = digits[buff_in[i] >> 4];
+               hex_buffer[i*2 + 1] = digits[buff_in[i] & 0x0F];
+       }
+       /* terminate explicitly so that len == 0 yields "" */
+       hex_buffer[len*2] = '\0';
+
+       return hex_buffer;
+}
+
+/**
+  decode a hex string into a talloc'ed byte buffer
+
+  On success *len is set to the number of bytes decoded, which is
+  strlen(hex_in)/2 (a trailing odd character is ignored). Returns NULL,
+  with *len set to 0, if the allocation fails or a pair of characters
+  cannot be converted by sscanf().
+*/
+uint8_t *hex_decode_talloc(TALLOC_CTX *mem_ctx, const char *hex_in, size_t *len)
+{
+       size_t i, count;
+       unsigned int num;
+       uint8_t *buffer;
+
+       count = strlen(hex_in) / 2;
+       *len = 0;
+
+       buffer = talloc_array(mem_ctx, unsigned char, count);
+       if (buffer == NULL) {
+               return NULL;
+       }
+
+       for (i=0; i<count; i++) {
+               /* %X stores through an unsigned int pointer; a failed
+                  match would otherwise leave num uninitialised */
+               if (sscanf(&hex_in[i*2], "%02X", &num) != 1) {
+                       talloc_free(buffer);
+                       return NULL;
+               }
+               buffer[i] = (uint8_t)num;
+       }
+
+       *len = count;
+       return buffer;
+}
diff --git a/ctdb/lib/util/util_time.c b/ctdb/lib/util/util_time.c
new file mode 100644 (file)
index 0000000..be14f26
--- /dev/null
@@ -0,0 +1,108 @@
+/*
+  functions taken from samba4 for quick prototyping of ctdb. These are
+  not intended to remain part of ctdb
+*/
+
+#include "includes.h"
+#include "system/time.h"
+#include "system/filesys.h"
+
+
+/**
+  build a timeval with both fields set to zero
+*/
+struct timeval timeval_zero(void)
+{
+       struct timeval zero;
+
+       zero.tv_usec = 0;
+       zero.tv_sec = 0;
+
+       return zero;
+}
+
+/**
+  test whether both fields of a timeval are zero
+*/
+bool timeval_is_zero(const struct timeval *tv)
+{
+       if (tv->tv_sec != 0) {
+               return false;
+       }
+       return tv->tv_usec == 0;
+}
+
+/**
+  fetch the current time of day as a timeval
+*/
+struct timeval timeval_current(void)
+{
+       struct timeval now;
+
+       gettimeofday(&now, NULL);
+
+       return now;
+}
+
+/**
+  seconds, including the microsecond fraction, from *tv until now
+*/
+double timeval_elapsed(struct timeval *tv)
+{
+       struct timeval now = timeval_current();
+       double whole = now.tv_sec - tv->tv_sec;
+       double frac = (now.tv_usec - tv->tv_usec)*1.0e-6;
+
+       return whole + frac;
+}
+
+/**
+  seconds, including the microsecond fraction, from *tv to *tv2
+*/
+double timeval_delta(struct timeval *tv2, struct timeval *tv)
+{
+       double whole = tv2->tv_sec - tv->tv_sec;
+       double frac = (tv2->tv_usec - tv->tv_usec)*1.0e-6;
+
+       return whole + frac;
+}
+
+/**
+  build a timeval from explicit second and microsecond counts
+*/
+_PUBLIC_ struct timeval timeval_set(uint32_t secs, uint32_t usecs)
+{
+       struct timeval result;
+
+       result.tv_usec = usecs;
+       result.tv_sec = secs;
+
+       return result;
+}
+
+/**
+  three-way comparison of two timevals: seconds are compared first,
+  microseconds break ties. Returns 1, -1 or 0.
+*/
+_PUBLIC_ int timeval_compare(const struct timeval *tv1, const struct timeval *tv2)
+{
+       if (tv1->tv_sec != tv2->tv_sec) {
+               return (tv1->tv_sec > tv2->tv_sec) ? 1 : -1;
+       }
+       if (tv1->tv_usec != tv2->tv_usec) {
+               return (tv1->tv_usec > tv2->tv_usec) ? 1 : -1;
+       }
+       return 0;
+}
+
+/**
+  time remaining from *tv1 until *tv2, or a zero timeval when *tv1 is
+  not earlier than *tv2
+*/
+_PUBLIC_ struct timeval timeval_until(const struct timeval *tv1,
+                                     const struct timeval *tv2)
+{
+       struct timeval gap;
+
+       if (timeval_compare(tv1, tv2) >= 0) {
+               return timeval_zero();
+       }
+
+       gap.tv_sec = tv2->tv_sec - tv1->tv_sec;
+       gap.tv_usec = tv2->tv_usec - tv1->tv_usec;
+       if (gap.tv_usec < 0) {
+               /* borrow one second to keep the microseconds positive */
+               gap.tv_sec--;
+               gap.tv_usec += 1000000;
+       }
+       return gap;
+}
+
+/**
+  return *tv advanced by 'secs' seconds and 'usecs' microseconds.
+  Whole seconds contained in the microsecond sum are carried into
+  tv_sec, so for a non-negative tv->tv_usec the resulting tv_usec is
+  below one million.
+*/
+static struct timeval timeval_add(const struct timeval *tv,
+                          uint32_t secs, uint32_t usecs)
+{
+       struct timeval tv2 = *tv;
+       /* sum in 64 bits: usecs may be as large as UINT32_MAX, which
+          does not fit in tv_usec where that field is only 32 bits */
+       int64_t total_usec = (int64_t)tv2.tv_usec + usecs;
+
+       tv2.tv_sec += secs;
+       tv2.tv_sec += total_usec / 1000000;
+       tv2.tv_usec = total_usec % 1000000;
+       return tv2;
+}
+
+
+/**
+  the current time advanced by 'secs' seconds and 'usecs' microseconds
+*/
+_PUBLIC_ struct timeval timeval_current_ofs(uint32_t secs, uint32_t usecs)
+{
+       struct timeval now;
+
+       now = timeval_current();
+
+       return timeval_add(&now, secs, usecs);
+}
+
diff --git a/ctdb/packaging/RPM/ctdb.spec.in b/ctdb/packaging/RPM/ctdb.spec.in
new file mode 100644 (file)
index 0000000..b7d1bef
--- /dev/null
@@ -0,0 +1,1053 @@
+%define with_systemd  %{?_with_systemd: 1} %{?!_with_systemd: 0}
+%define initdir %{_sysconfdir}/init.d
+Name: ctdb
+Summary: Clustered TDB
+Vendor: Samba Team
+Packager: Samba Team <samba@samba.org>
+Version: @VERSION@
+Release: @RELEASE@
+Epoch: 0
+License: GNU GPL version 3
+Group: System Environment/Daemons
+URL: http://ctdb.samba.org/
+
+Source: ctdb-%{version}.tar.gz
+
+# Packages
+Requires: coreutils, sed, gawk, iptables, iproute, procps, ethtool, sudo
+# Commands - package name might vary
+Requires: /usr/bin/killall, /bin/kill, /bin/netstat
+
+Provides: ctdb = %{version}
+
+Prefix: /usr
+BuildRoot: %{_tmppath}/%{name}-%{version}-root
+
+# Allow build with system libraries
+# To enable, run rpmbuild with,
+#      "--with system_talloc"
+#      "--with system_tdb"
+#      "--with system_tevent"
+%define with_included_talloc %{?_with_system_talloc: 0} %{?!_with_system_talloc: 1}
+%define with_included_tdb %{?_with_system_tdb: 0} %{?!_with_system_tdb: 1}
+%define with_included_tevent %{?_with_system_tevent: 0} %{?!_with_system_tevent: 1}
+
+# Required minimum library versions when building with system libraries
+%define libtalloc_version 2.0.8
+%define libtdb_version 1.2.11
+%define libtevent_version 0.9.18
+
+%if ! %with_included_talloc
+BuildRequires: libtalloc-devel >= %{libtalloc_version}
+Requires: libtalloc >= %{libtalloc_version}
+%endif
+%if ! %with_included_tdb
+BuildRequires: libtdb-devel >= %{libtdb_version}
+Requires: libtdb >= %{libtdb_version}
+%endif
+%if ! %with_included_tevent
+BuildRequires: libtevent-devel >= %{libtevent_version}
+Requires: libtevent >= %{libtevent_version}
+%endif
+
+# To build the ctdb-pcp-pmda package, run rpmbuild with "--with pmda"
+%define with_pcp_pmda  %{?_with_pmda: 1} %{?!_with_pmda: 0}
+%if %with_pcp_pmda
+BuildRequires: pcp-libs-devel
+%endif
+
+%if %{with_systemd}
+BuildRequires: systemd-units
+%endif
+
+%description
+ctdb is the clustered database used by samba
+
+#######################################################################
+
+
+
+%prep
+%setup -q
+# setup the init script and sysconfig file
+%setup -T -D -n ctdb-%{version} -q
+
+%build
+
+## check for ccache
+if ccache -h >/dev/null 2>&1 ; then
+       CC="ccache gcc"
+else
+       CC="gcc"
+fi
+
+export CC
+
+## always run autogen.sh
+./autogen.sh
+
+CFLAGS="$RPM_OPT_FLAGS $EXTRA -D_GNU_SOURCE" ./configure \
+%if %with_included_talloc
+       --with-included-talloc \
+%endif
+%if %with_included_tdb
+       --with-included-tdb \
+%endif
+%if %with_included_tevent
+       --with-included-tevent \
+%endif
+%if %with_pcp_pmda
+       --enable-pmda \
+%endif
+       --prefix=%{_prefix} \
+       --sysconfdir=%{_sysconfdir} \
+       --mandir=%{_mandir} \
+       --localstatedir="/var"
+
+make docdir=%{_docdir} showflags
+make docdir=%{_docdir}
+
+%install
+# Clean up in case there is trash left from a previous build
+rm -rf $RPM_BUILD_ROOT
+
+# Create the target build directory hierarchy
+mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/sysconfig
+mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/sudoers.d
+
+make DESTDIR=$RPM_BUILD_ROOT docdir=%{_docdir} install install_tests
+
+install -m644 config/ctdb.sysconfig $RPM_BUILD_ROOT%{_sysconfdir}/sysconfig/ctdb
+
+%if %{with_systemd}
+mkdir -p $RPM_BUILD_ROOT%{_unitdir}
+# A unit file is configuration, not a program: install it without
+# executable bits (systemd warns about executable unit files)
+install -m 644 config/ctdb.service $RPM_BUILD_ROOT%{_unitdir}
+%else
+mkdir -p $RPM_BUILD_ROOT%{initdir}
+install -m755 config/ctdb.init $RPM_BUILD_ROOT%{initdir}/ctdb
+%endif
+
+cp config/events.d/README README.eventscripts
+cp config/notify.d.README README.notify.d
+
+# Remove "*.old" files
+find $RPM_BUILD_ROOT -name "*.old" -exec rm -f {} \;
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+
+#######################################################################
+## Files section                                                     ##
+#######################################################################
+
+%files
+%defattr(-,root,root)
+
+%config(noreplace) %{_sysconfdir}/sysconfig/ctdb
+%config(noreplace) %{_sysconfdir}/ctdb/notify.sh
+%config(noreplace) %{_sysconfdir}/ctdb/debug-hung-script.sh
+%config(noreplace) %{_sysconfdir}/ctdb/ctdb-crash-cleanup.sh
+%config(noreplace) %{_sysconfdir}/ctdb/gcore_trace.sh
+%config(noreplace) %{_sysconfdir}/ctdb/debug_locks.sh
+
+%if %{with_systemd}
+%{_unitdir}/ctdb.service
+%else
+%attr(755,root,root) %{initdir}/ctdb
+%endif
+
+%attr(755,root,root) %{_sysconfdir}/ctdb/notify.d
+
+%doc README COPYING NEWS
+%doc README.eventscripts README.notify.d
+%doc doc/recovery-process.txt
+%doc doc/*.html
+%doc doc/examples
+%{_sysconfdir}/sudoers.d/ctdb
+%{_sysconfdir}/ctdb/functions
+%{_sysconfdir}/ctdb/events.d/00.ctdb
+%{_sysconfdir}/ctdb/events.d/01.reclock
+%{_sysconfdir}/ctdb/events.d/10.interface
+%{_sysconfdir}/ctdb/events.d/13.per_ip_routing
+%{_sysconfdir}/ctdb/events.d/11.natgw
+%{_sysconfdir}/ctdb/events.d/11.routing
+%{_sysconfdir}/ctdb/events.d/20.multipathd
+%{_sysconfdir}/ctdb/events.d/31.clamd
+%{_sysconfdir}/ctdb/events.d/40.fs_use
+%{_sysconfdir}/ctdb/events.d/40.vsftpd
+%{_sysconfdir}/ctdb/events.d/41.httpd
+%{_sysconfdir}/ctdb/events.d/49.winbind
+%{_sysconfdir}/ctdb/events.d/50.samba
+%{_sysconfdir}/ctdb/events.d/60.nfs
+%{_sysconfdir}/ctdb/events.d/60.ganesha
+%{_sysconfdir}/ctdb/events.d/62.cnfs
+%{_sysconfdir}/ctdb/events.d/70.iscsi
+%{_sysconfdir}/ctdb/events.d/91.lvs
+%{_sysconfdir}/ctdb/events.d/99.timeout
+%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/10.statd.check
+%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/20.nfsd.check
+%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/30.lockd.check
+%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/40.mountd.check
+%config(noreplace) %{_sysconfdir}/ctdb/nfs-rpc-checks.d/50.rquotad.check
+%{_sysconfdir}/ctdb/statd-callout
+%{_sbindir}/ctdbd
+%{_sbindir}/ctdbd_wrapper
+%{_bindir}/ctdb
+%{_bindir}/ctdb_lock_helper
+%{_bindir}/smnotify
+%{_bindir}/ping_pong
+%{_bindir}/ltdbtool
+%{_bindir}/ctdb_diagnostics
+%{_bindir}/onnode
+%{_mandir}/man1/ctdb.1.gz
+%{_mandir}/man1/ctdbd.1.gz
+%{_mandir}/man1/onnode.1.gz
+%{_mandir}/man1/ltdbtool.1.gz
+%{_mandir}/man1/ping_pong.1.gz
+%{_libdir}/pkgconfig/ctdb.pc
+
+
+%package devel
+Summary: CTDB development libraries
+Group: Development/Libraries
+
+%description devel
+development libraries for ctdb
+
+%files devel
+%defattr(-,root,root)
+%{_includedir}/ctdb.h
+%{_includedir}/ctdb_client.h
+%{_includedir}/ctdb_protocol.h
+%{_includedir}/ctdb_private.h
+%{_includedir}/ctdb_typesafe_cb.h
+
+%package tests
+Summary: CTDB test suite
+Group: Development/Tools
+Requires: ctdb = %{version}
+Requires: nc
+
+%description tests
+test suite for ctdb
+
+%files tests
+%defattr(-,root,root)
+%dir %{_datadir}/%{name}-tests
+%{_datadir}/%{name}-tests/*
+%dir %{_libdir}/%{name}-tests
+%{_libdir}/%{name}-tests/*
+%{_bindir}/ctdb_run_tests
+%{_bindir}/ctdb_run_cluster_tests
+%doc tests/README
+
+%if %with_pcp_pmda
+
+%package pcp-pmda
+Summary: CTDB PCP pmda support
+Group: Development/Tools
+Requires: ctdb = %{version}
+Requires: pcp-libs
+
+%description pcp-pmda
+Performance Co-Pilot (PCP) support for CTDB
+
+%files pcp-pmda
+%dir /var/lib/pcp/pmdas/ctdb
+/var/lib/pcp/pmdas/ctdb/*
+
+%endif
+
+
+
+%changelog
+* Thu Mar 1 2012 : Version 1.13
+ - This is the new stable branch for modern features for ctdb.
+   Main new features are performance/scaling improvements for
+   concurrent fetch and fetch_lock operations.
+* Tue Nov 8 2011 : Version 1.12
+ - Add new tunable : AllowClientDBAttach that can be used to stop
+   client db access during maintenance operations
+ - Updated logging for interfaces that are missing or don't exist but are
+   configured to be used.
+ - Add timeout argument to ctdb_cmdline_client
+ - PMDA support
+ - Initial support for 'readonly' delegations for ctdb databases
+   This will when finished greatly improve performance for contended hot
+   records that are used for just read-access.
+ - New 'ctdb cattdb' command
+ - Massive updates to tests and eventscripts
+ - LCP2 ip allocation algorithm
+ - Record Fetch collapse. Collapse multiple fetch-lock requests from clients
+   to a single network fetch and defer other concurrent requests until the 
+   initial fetch completes, and then service the deferred calls locally.
+   This will greatly improve performance for contended hot records 
+   where clients request write-locks.
+* Thu Sep 1 2011 : Version 1.11
+ - Major updates.
+* Tue May 25 2010 : Version 1.10
+ - New version 1.10
+* Tue May 25 2010 : Version 1.9
+ - Lots of changes
+* Wed Mar 25 2010 : Version 1.0.114
+ - Lots of changes from Metze
+* Wed Jan 13 2010 : Version 1.0.113
+ - Incorrect use of dup2() could cause ctdb to spin eating 100% cpu.
+* Tue Jan 12 2010 : Version 1.0.112
+  - Revert the use of wbinfo --ping-dc as it is proving too unreliable.
+  - Minor testsuite changes.
+* Fri Dec 18 2009 : Version 1.0.111
+ - Fix a logging bug when an eventscript is aborted that could cause a crash.
+ - Add back cb_status that was lost in a previous commit.
+* Fri Dec 18 2009 : Version 1.0.110
+ - Metze: fix for filedescriptor leak in the new eventscript code.
+ - Rusty: fix for a crash bug in the eventscript code.
+* Thu Dec 17 2009 : Version 1.0.109
+ - Massive eventscript updates. (bz58828)
+ - Nice the daemon instead of using realtime scheduler, also use mlockall() to
+   reduce the risk of blocking due to paging.
+ - Workarounds for valgrind when forking once for each script. Valgrind consumes
+   massive cpu when terminating the scripts on virtual systems.
+ - Sync the tdb library with upstream, and use the new TDB_DISALLOW_NESTING flag.
+ - Add new command "ctdb dumpdbbackup"
+ - Start using the new tdb check framework to validate tdb files upon startup.
+ - A new framework where we can control health for individual tdb databases.
+ - Fix a crash bug in the logging code.
+ - New transaction code for persistent databases.
+ - Various other smaller fixes.
+* Mon Dec 7 2009 : Version 1.0.108
+ - Transaction updates from Michael Adam.
+ - Use the new wbinfo --ping-dc instead of -p in the eventscript for samba
+   to check if winbindd is ok.
+ - Add a better "process-exist" for samba so it will automatically
+   reap smbd's on stopped and banned nodes to reclaim subrecords.
+   This will be done a bit differently in the next release.
+ - Use a statically allocated buffer for the 'first-time' capture buffer
+   to reduce the pressure on malloc/free.
+* Wed Dec 2 2009 : Version 1.0.107
+ - fix for rusty to solve a double-free that can happen when there are
+   multiple packets queued and the connection is destroyed before
+   all packets are processed.
+* Tue Dec 1 2009 : Version 1.0.106
+ - Buildscript changes from Michael Adam
+ - Dont do a full recovery when there is a mismatch detected for ip addresses,
+   just do a less disruptive ip-reallocation
+ - When starting ctdbd, wait until all initial recoveries have finished 
+   before we issue the "startup" event.
+   So dont start services or monitoring until the cluster has
+   stabilized.
+ - Major eventscript overhaul by Ronnie, Rusty and Martins and fixes of a few
+   bugs found.
+* Thu Nov 19 2009 : Version 1.0.105
+ - Fix a bug where we could SEGV if multiple concurrent "ctdb eventscript ..."
+   are used and some of them block.
+ - Monitor the daemon from the syslog child process so we shutdown cleanly when
+   the main daemon terminates.
+ - Add a 500k line ringbuffer in memory where all log messages are stored.
+ - Add a "ctdb getlog <level>" command to pull log messages from the in memory
+   ringbuffer.
+ - From martin : fixes to cifs and nfs autotests
+ - from michael a : fix a bashism in 11.natgw
+* Fri Nov 6 2009 : Version 1.0.104
+ - Suggestion from Metze, we can now use killtcp to kill local connections
+   for nfs so change the killtcp script to kill both directions of an NFS
+   connection.
+   We used to deliberately only kill one direction in these cases due to
+   limitations.
+ - Suggestion from christian Ambach, when using natgw, try to avoid using a
+   UNHEALTHY node as the natgw master.
+ - From Michael Adam: Fix a SEGV bug in the recent change to the eventscripts
+   to allow the timeout to apply to each individual script.
+ - fix a talloc bug in the vacuuming code that produced nasty valgrind
+   warnings.
+ - From Rusty: Set up ulimit to create core files for ctdb, and spawned
+   processes by default. This is useful for debugging and testing but can be
+   disabled by setting CTDB_SUPRESS_COREFILE=yes in the sysconfig file.
+ - Remove the wbinfo -t check from the startup check that winbindd is happy.
+ - Enhance the test for bond devices so we also check if the sysadmin has
+   disabled all slave devices using "ifdown".
+* Tue Nov 3 2009 : Version 1.0.103
+ - Dont use vacuuming on persistent databases
+ - Michael A : transaction updates to persistent databases
+ - Dont activate service automatically when installing the RPM. Leave this to the admin.
+ - Create a child process to send all log messages to, to prevent a hung/slow syslogd
+   from blocking the main daemon. In this case, discard log messages instead and let the child
+   process block.
+ - Michael A: updates to log messages
+* Thu Oct 29 2009 : Version 1.0.102
+ - Wolfgang: fix for the vacuuming code
+ - Wolfgang: stronger tests for persistent database filename tests
+ - Improve the log message when we refuse to startup since wbinfo -t fails
+   to make it easier to spot in the log.
+ - Update the uptime command output and the man page to indicate that
+   "time since last ..." is from either the last recovery OR the last failover
+ - Michael A: transaction updates
+* Wed Oct 28 2009 : Version 1.0.101
+ - create a separate context for non-monitoring events so they dont interfere with the monitor event
+ - make sure to return status 0 in the callback when we abort an event
+* Wed Oct 28 2009 : Version 1.0.100
+ - Change eventscript handling to allow EventScriptTimeout for each individual script instead of for all scripts as a whole.
+ - Enhanced logging from the eventscripts, log the name and the duration for each script as it finishes.
+ - Add a check to use wbinfo -t for the startup event of samba
+ - TEMP: allow clients to attach to databases even when the node is in recovery mode
+ - dont run the monitor event as frequently after an event has failed
+ - DEBUG: in the eventloops, check the local time and warn if the time changes backward or rapidly forward
+ - From Metze, fix a bug where recovery master becoming unhealthy did not trigger an ip failover.
+ - Disable the multipath script by default
+ - Automatically re-activate the reclock checking if the reclock file is specified at runtime. Update manpage to reflect this.
+ - Add a mechanism where samba can register a SRVID and if samba unexpectedly disconnects, a message will be broadcasted to all other samba daemons.
+ - Log the pstree on hung scripts to a file in /tmp instead of /var/log/messages
+ - change ban count before unhealthy/banned to 10
+* Thu Oct 22 2009 : Version 1.0.99
+ - Fix a SEGV in the new db priority code.
+ - From Wolfgang : eliminate a ctdb_fatal() if there is a dmaster violation detected.
+ - During testing we often add/delete eventscripts at runtime. This could cause an eventscript to fail and mark the node unhealthy if an eventscript was deleted while we were listing the names. Handle the errorcode and make sure the node does not become unhealthy in this case.
+ - Lower the debuglevel for the messages when ctdb creates a filedescriptor so we dont spam the logs with these messages.
+ - Dont have the RPM automatically restart ctdb
+ - Volker : add a missing transaction_cancel() in the handling of persistent databases
+ - Treat interfaces with the name ethX* as bond devices in 10.interfaces so we do the correct test for if they are up or not.
+* Tue Oct 20 2009 : Version 1.0.98
+ - Fix for the vacuuming database from Wolfgang M
+ - Create a directory where the test framework can put temporary overrides
+   to variables and functions.
+ - Wait a lot longer before shutting down the node when the reclock file
+   is incorrectly configured, and log where it is configured.
+ - Try to avoid running the "monitor" event when databases are frozen.
+ - Add logging for every time we create a filedescriptor so we can trap
+   fd leaks.
+* Thu Oct 14 2009 : Version 1.0.97
+ - From martins : update onnode.
+   Update onnode to allow specifying an alternative nodes file from
+   the command line and also to be able to specify hostnames on the
+   list of targets :
+   onnode host1,host2,...   
+* Tue Oct 13 2009 : Version 1.0.96
+ - Add more debugging output when eventscripts have trouble. Print a 
+   "pstree -p" to the log when scripts have hung.
+ - Update the initscript,  only print the "No reclock file used" warning
+   when we do "service ctdb start", dont also print them for all other
+   actions.
+ - When changing between unhealthy/healthy state, push a request to the
+   recovery master to perform an ip reallocation   instead of waiting for the
+   recovery master to pull and check the state change.
+ - Fix a bug in the new db-priority handling where a pre-.95 recovery master
+   could no longer lock the databases on a post-.95 daemon.
+ - Always create the nfs state directories during the "monitor" event.
+   This makes it easier to configure and enable nfs at runtime.
+ - From Volker, forward-port a simpler deadlock avoiding patch from the 1.0.82
+   branch. This is a simpler version of the "db priority lock order" patch
+   that went into 1.0.95, and will be kept for a few versions until samba
+   has been updated to use the functionality from 1.0.95.
+* Mon Oct 12 2009 : Version 1.0.95
+ - Add database priorities. Allow samba to set the priority of databases
+   and lock the databases in priority order during recovery
+   to avoid a deadlock when samba locks one database then blocks indefinitely
+   while waiting for the second database to become locked.
+ - Be aggressive and ban nodes where the recovery transaction start call
+   fails.
+* Thu Oct 10 2009 : Version 1.0.94
+ - Be very aggressive and quickly ban nodes that can not freeze their databases
+* Tue Oct 8 2009 : Version 1.0.93
+ - When adding an ip, make sure to update this assignment on all nodes
+   so it wont show up as -1 on other nodes.
+ - When adding an ip and immediately deleting it, it was possible that
+   the daemon would crash accessing already freed memory.
+   Readjust the memory hierarchy so the destructors are called in the right order.
+ - Add a handshake to the recovery daemon to eliminate some rare cases where
+   addip/delip might cause a recovery to occur.
+ - updated onnode documentation from Martin S
+ - Updates to the natgw eventscript to allow disabling natgw at runtime
+* Fri Oct 2 2009 : Version 1.0.92
+ - Test updates and merge from martin
+ - Add notification for "startup"
+ - Add documentation for notification
+ - from martin, a fix for restarting vsftpd in the eventscript
+* Tue Sep 29 2009 : Version 1.0.91
+ - New vacuum and repack design from Wolfgang Mueller.
+ - Add a new eventscript 01.reclock that will first mark a node unhealthy and later ban the node if the reclock file can not be accessed.
+ - Add machinereadable output to the ctdb getreclock command
+ - merge transaction updates from Michael Adam
+ - In the new banning code, reset the culprit count to 0 for all nodes that could successfully complete a full recovery.
+ - dont mark the recovery master as a ban culprit because a node in the cluster needs a recovery. this happens naturally when using ctdb recover command so dont make this cause a node to be banned.
+* Sat Sep 12 2009 : Version 1.0.90
+ - Be more forgiving for eventscripts that hang during startup
+ - Fix for a banning bug in the new banning logic
+* Thu Sep 3 2009 : Version 1.0.89
+ - Make it possible to manage winbind independently of samba.
+ - Add new prototype banning code
+ - Overwrite the vsftpd state file instead of appending. This eliminates
+   annoying errors in the log.
+ - Redirect some iptables commands to dev null
+ - From Michael A, explicitly set the broadcast when we takeover a public ip
+ - Remove a reclock file check we no longer need
+ - Skip any persistent database files ending in .bak
+* Mon Aug 17 2009 : Version 1.0.88
+ - Add a new state for eventscripts : DISABLED.
+   Add two new commands "ctdb enablescript/disablescript" to enable/disable
+   eventscripts at runtime.
+ - Bugfixes for TDB from rusty.
+ - Merge/Port changes from upstream TDB library by rusty.
+ - Additional new tests from MartinS. Tests for stop/continue.
+ - Initial patch to rework vacuuming/repacking process from Wolfgang Mueller.
+ - Updates from Michael Adam for persistent writes.
+ - Updates from MartinS to handle the new STOPPED bit in the test framework.
+ - Make it possible to enable/disable the RECMASTER and LMASTER roles
+   at runtime. Add two new commands 
+   "ctdb setlmasterrole/setrecmasterrole on/off"
+ - Make it possible to enable/disable the natgw feature at runtime. Add
+   the command "ctdb setnatgwstate on/off"
+* Fri Jul 17 2009 : Version 1.0.87
+ - Add a new event "stopped" that is called when a node is stopped.
+ - Documentation of the STOPPED flag and the stop/continue commands
+ - Make it possible to start a node in STOPPED mode.
+ - Add a new node flag : STOPPED and commands "ctdb stop" "ctdb continue"
+   These commands are similar to "disable/enable" but will also remove the node from the vnnmap, while disable only fails all ip addresses over.
+ - tests for NFS , CIFS by martins
+ - major updates to the init script by martins
+ - Send gratuitous arps with a 1.1 second stride instead of a 1 second stride to workaround interesting "features" of common linux stacks.
+ - Various test enhancements from martins:
+   - additional other tests
+   - add tests for grat arp generation, ping during failover, ssh and failover
+   - New/updated tcp tickle tests and support functions
+   - provide better debugging when a test fails
+   - make ctdbd restarts more reliable in the tests
+   - update the "wait bar" to  make the wait progress in tests more obvious
+   - various cleanups
+ - when dispatching a message to a handler, make the message a real talloc object so that we can reparent the object in the talloc hierarchy.
+ - document the ipreallocate command
+ - Updates to enable/disable to use the ipreallocate command to block until the following ipreallocation has completed.
+ - Update the main daemon and the tools to allow debug level to be a string instead of an integer.
+ - Update the sysconfig file to show using string literals instead of numeric values for the debuglevels used.
+ - If no debuglevel is specified, make "ctdb setdebug" show the available options.
+ - When trying to allocate network packets, add explicit checks if the network transport has been shutdown before trying and failing, to make log messages easier to read. Add this extra check and logging to every place packets are allocated.
+* Tue Jun 30 2009 : Version 1.0.86
+ - Do not access the reclock at all if VerifyRecoveryLock is zero, not even try to probe it.
+ - Allow setting the reclock file as "", which means that no reclock file at all should be used.
+ - Document that a reclock file is no longer required, but that it is dangerous.
+ - Add a control that can be used to set/clear/change the reclock file in the daemon during runtime.
+ - Update the recovery daemon to poll whether a reclock file should be used and if so which file at runtime in each monitoring cycle.
+ - Automatically disable VerifyRecoveryLock everytime a user changes the location of the reclock file.
+ - do not allow the VerifyRecoveryLock to be set using ctdb setvar if there is no recovery lock file specified.
+ - Add two commands "ctdb getreclock" and "ctdb setreclock" to modify the reclock file.
+* Tue Jun 23 2009 : Version 1.0.85
+ - From William Jojo : Dont use getopt on AIX
+ - Make it possible to use "ctdb listnodes" also when the daemon is not running
+ - Provide machinereadable output to "ctdb listnodes"
+ - Dont list DELETED nodes in the ctdb listnodes output
+ - Try to avoid causing a recovery for the average case when adding/deleting/moving an ip
+ - When banning a node, drop the IPs on that node only and not all nodes.
+ - Add tests for NFS and CIFS tickles
+ - Rename 99.routing to 11.routing so it executes before NFS and LVS scripts
+ - Increase the default timeout before we deem an unresponsive recovery daemon hung and shutdown
+ - Reduce the reclock timeout to 5 seconds
+ - Spawn a child process in the recovery daemon to check the reclock file to
+   avoid blocking the process if the underlying filesystem is unresponsive
+ - fix for filedescriptor leak when a child process timesout
+ - Dont log errors if waitpid() returns -1
+ - Onnode updates by Martins
+ - Test and initscript cleanups from Martin S
+* Tue Jun 2 2009 : Version 1.0.84
+ - Fix a bug in onnode that could not handle dead nodes
+* Tue Jun 2 2009 : Version 1.0.83
+ - Document how to remove a node from a running cluster.
+ - Hide all deleted nodes from ctdb output.
+ - Lower the loglevel on some eventscript related items
+ - Dont queue packets to deleted nodes
+ - When building initial vnnmap, ignore any nonexisting nodes
+ - Add a new nodestate : DELETED that is used when deleting a node from an
+   existing cluster.
+ - dont remove the ctdb socket when shutting down. This prevents a race in the
+   initscripts when restarting ctdb quickly after stopping it.
+ - TDB nesting reworked.
+ - Remove obsolete ipmux
+ - From Flavio Carmo Junior: Add eventscript and documentation for ClamAV antivirus engine
+ - From Sumit Bose: fix the regex in the test to handle the new ctdb
+   statistics output that was recently added.
+ - change the socket type we use for gratuitous arps from the obsolete
+   AF_INET/SOCK_PACKET to instead use PF_PACKET/SOCK_RAW.
+ - Check return codes for some functions, from Sumit Bose, based on codereview by Jim Meyering.
+ - Sumit Bose: Remove structure member node_list_file that is no longer used.
+ - Sumit Bose: fix configure warning for netfilter.h
+ - Updates to the webpages by Volker.
+ - Remove error messages about missing /var/log/log.ctdb file from ctdb_diagnostics.sh from christian Ambach
+ - Additional error logs if the eventscript switching from daemon to client mode fails.
+ - track how long it takes for ctdbd and the recovery daemon to perform the rec-lock fcntl() lock attempt and show this in the ctdb statistics output.
+* Thu May 14 2009 : Version 1.0.82
+ - Update the "ctdb lvsmaster" command to return -1 on error.
+ - Add a -Y flag to "ctdb lvsmaster"
+ - RHEL5 apache leaks semaphores when crashing. Add semaphore cleanup to the 
+   41.httpd eventscript and try to restart apache when it has crashed.
+ - Fixes to some tests
+ - Add a -o option to "onnode" which will redirect all stdout to a file for
+   each of the nodes.
+ - Add a natgw and a lvs node specifier to onnode so that we can use 
+   "onnode natgw ..."
+ - Assign the natgw address to lo instead of the private network so it can also
+   be used where private and public networks are the same.
+ - Add GPL boilerplates to two missing scripts.
+ - Change the natgw prefix NATGW_ to CTDB_NATGW_
+* Fri May 8 2009 : Version 1.0.81
+ - use smbstatus -np instead of smbstatus -n in the 50.samba eventscript 
+   since this avoids performing an expensive traverse on the locking and brlock
+   databases.
+ - make ctdb automatically terminate all traverse child processes clusterwide
+   associated to a client application that terminates before the traversal is
+   completed.
+ - From Sumit Bose : fixes to AC_INIT handling.
+ - From Michael Adam, add Tridge's "ping_pong" tool to the ctdb distro since
+   this is very useful for testing the backend filesystem.
+ - From Sumit bose, add support for additional 64 bit platforms.
+ - Add a link from the webpage to Michael Adams SambaXP paper on CTDB.
+* Fri May 1 2009 : Version 1.0.80
+ - change init shutdown level to 01 for ctdb so it stops before any of the other services
+ - if we can not pull a database from a remote node during recovery, mark that node as a culprit so it becomes banned
+ - increase the loglevel when we volunteer to drop all ip addresses after being in recovery mode for too long. Make this timeout tuneable with "RecoveryDropAllIPs" and have it default to 60 seconds
+ - Add a new flag TDB_NO_NESTING to the tdb layer to prevent nested transactions which ctdb does not use and does not expect. Have ctdb set this flag to prevent nested transactions from occurring.
+ - dont unconditionally kill off ctdb and restart it on "service ctdb start". Fail "service ctdb start" with an error if ctdb is already running.
+ - Add a new tunable "VerifyRecoveryLock" that can be set to 0 to prevent the main ctdb daemon to verify that the recovery master has locked the reclock file correctly before allowing it to set the recovery mode to active.
+ - fix a cosmetic bug with ctdb statistics where certain counters could become negative.
+* Wed Apr 8 2009 : Version 1.0.79
+ - From Mathieu Parent: add a ctdb pkgconfig file
+ - Fix bug 6250
+ - add a function remove_ip to safely remove an ip from an interface, taking care to workaround an issue with linux alias interfaces.
+ - Update the natgw eventscript to use the safe remove_ip() function
+ - fix a bug in the eventscript child process that would cause the socket to be removed.
+ - dont verify nodemap on banned nodes during cluster monitoring
+ - Update the dodgy SeqnumInterval to have ms resolution
+* Tue Mar 31 2009 : Version 1.0.78
+ - Add a notify mechanism so we can send snmptraps/email to external management systems when the node becomes unhealthy
+ - include 11.natgw eventscript in the install so that the NATGW feature works
+* Tue Mar 31 2009 : Version 1.0.77
+ - Update the 99.routing eventscript to also try to add the routes (back) during a releaseip event. Similar to the reasons why we must add addresses back during releaseip in 10.interfaces
+* Wed Mar 24 2009 : Version 1.0.76
+ - Add a debugging command "xpnn" which can print the pnn of the node even when ctdbd is not running.
+ - Redo the NATGW implementation to allow multiple disjoint NATGW groups in the same cluster.
+* Tue Mar 24 2009 : Version 1.0.75
+ - Various updates to LVS
+ - Fix a bug in the killtcp control where we did not set the port correctly
+ - add a new "ctdb scriptstatus" command that shows the status of the eventscripts.
+* Mon Mar 16 2009 : Version 1.0.74
+ - Fixes to AIX from C Cowan.
+ - Fixes to ctdb_diagnostics so we collect correct GPFS data
+ - Fixes to the net conf list command in ctdb_diagnostics
+ - Check the static-routes file IFF it exists in ctdb_diagnostics
+* Wed Mar 4 2009 : Version 1.0.73
+ - Add possibility to disable the check of shares for NFS and Samba
+ - From Sumit Bose, fix dependencies so make -j works
+* Wed Feb 18 2009 : Version 1.0.72
+ - Updates to test scripts by martin s
+ - Adding a COPYING file
+ - Use netstat to check for services and ports and fallback to netcat
+   only if netstat is unavailable.
+* Mon Feb 2 2009 : Version 1.0.71
+ - Additional ipv6 fixes from Michael Adams
+* Fri Jan 16 2009 : Version 1.0.70
+ - IPv6 support is completed. this is backward compatible with ipv4-only
+   systems. To use IPv6 with samba and ctdb you need current GIT of samba 3.3
+   or michael adams samba-ctdeb branch.
+ - Many enhancements to the build system and scripts to make it more SUSE
+   friendly by Michael Adams.
+ - Change of how the naming of the package is structured. We are now
+   using "1.0.70" as a release and "-1" as the revision instead of as
+   previously using "1.0" as release and ".70" as the revision.
+   By Michael Adams.
+* Thu Dec 18 2008 : Version 1.0.69
+ - Various fixes to scripts by M Adam
+ - Dont call ctdb_fatal() when the transport is down during shutdown
+* Fri Dec 12 2008 : Version 1.0.68
+ - Fixes for monitoring of interfaces status from Michael Adam.
+ - Use -q instead of >/dev/null for grep to enhance readability of the
+   scripts from Michael Adam.
+ - Update to the "ctdb recover" command. This command now blocks until the
+   recovery has completed. This makes it much easier to use in scripts and avoids
+   the common workaround :
+      ctdb recover
+      ... loop while waiting for recovery completes ...
+      continue ...
+ - Add a CTDB_TIMEOUT variable. If set, this variable provides an automatic
+   timeout for "ctdb <command>", similar to using -T <timeout>
+ - Set a unique errorcode for "ctdb <command>" when it terminates due to a 
+   timeout so that scripts can distinguish between a hung command and what was
+   just a failure.
+ - Update "ctdb ban/unban" so that if the cluster is in recovery these commands
+   block and wait until after recovery is complete before they perform the
+   ban/unban operation. This is necessary since the recovery process can cause
+   nodes to become automatically unbanned.
+ - Update "ctdb ban/unban" to block until the recovery that will follow shortly
+   after this command has completed.
+   This makes it much easier to use in scripts and avoids the common
+   workaround :
+      ctdb ban/unban
+      ... loop while waiting for recovery completes ...
+      continue ...
+ - Bugfix for the new flags handling in 1.0.67. Abort and restart monitoring
+   if we failed to get proper nodemaps from a remote node instead of
+   dereferencing a null pointer.
+ - If ctdbd was explicitly started with the '--socket' argument, make
+   ctdbd automatically set CTDB_SOCKET to the specified argument.
+   This ensures that eventscripts spawned by the ctdb daemon will default to
+   using the same socket and talk to the correct daemon.
+   This primarily affects running multiple daemons on the same host and where 
+   you want each instance of ctdb daemons have their eventscripts talk to the
+   "correct" daemon.
+ - Update "ctdb ping" to return an error code if the ping fails so that it
+   can be used in scripts.
+ - Update to how to synchronize management of node flags across the cluster.
+* Thu Dec 3 2008 : Version 1.0.67
+ - Add a document describing the recovery process.
+ - Fix a bug in "ctdb setdebug" where it would refuse to set a negative
+   debug level.
+ - Print the list of literals for debug names if an invalid one was given
+   to "ctdb setdebug"
+ - Redesign how "ctdb reloadnodes" works and reduce the amount of tcp teardowns
+   used during this event.
+ - Make it possible to delete a public ip from all nodes at once using
+   "ctdb delip -n all"
+* Mon Nov 24 2008 : Version 1.0.66
+ - Allow to change the recmaster even when we are not frozen.
+ - Remove two redundant SAMBA_CHECK variables from the sysconf example
+ - After a node failure it can take very long before some lock operations
+   ctdb needs to perform are allowed/works with gpfs again. Workaround this
+   by treating a hang/timeout as success.
+ - Dont override CTDB_BASE if set in the shell already
+ - Always send keepalive packets regardless of whether the link is idle or not.
+ - Rewrite the disable/enable flag update logic to prevent a race between 
+   "ctdb disable/enable" and the recovery daemon when updating the flags to 
+   all nodes.
+* Thu Nov 13 2008 : Version 1.0.65
+ - Update the sysconfig example: The default debug level is 2 (NOTICE) and not
+   0 (ERROR)
+ - Add support for a CTDB_SOCKET environment variable for the ctdb command
+   line tool. If set, this overrides the default socket the ctdb tool will
+   use.
+ - Add logging of high latency operations.
+* Mon Oct 22 2008 : Version 1.0.64
+ - Add a context and a timed event so that once we have been in recovery for
+   too long we drop all public addresses.
+* Mon Oct 20 2008 : Version 1.0.63
+ - Remove logging of "periodic cleanup ..." in 50.samba
+ - When we reload a nodes file, we must detect this and reload the file also
+   in the recovery daemon before we try to dereference something beyond the end
+   of the nodes array.
+* Thu Oct 16 2008 : Version 1.0.62
+ - Allow multiple eventscripts using the same prefix number.
+   It is undefined which order scripts with the same prefix will execute in.
+* Wed Oct 15 2008 : Version 1.0.61
+ - Use "route add -net" instead of "ip route add" when adding routes in 99.routing
+ - lower the loglevel of several debug statements
+ - check the status returned from ctdb_ctrl_get_tickles() before we try to print them out to the screen.
+ - install a new eventscript 20.multipathd which can be used to monitor that multipath devices are healthy
+* Wed Oct 15 2008 : Version 1.0.60
+ - Verify that nodes we try to ban/unban are reachable and print an error otherwise.
+ - Update the client and server sides of TAKEIP/RELEASEIP/GETPUBLICIPS and GETNODEMAP to fall back to the old style ipv4-only controls if the new ipv4/ipv6 controls fail. This allows an ipv4/v6 enabled ctdb daemon to interoperate with earlier ipv4-only versions of the daemons.
+ - From Mathieu Parent : on debian systems log the package versions in ctdb diagnostics
+ - From Mathieu Parent : specify logdir location for debian (this patch was later reversed)
+ - From Michael Adams : allow # comments in nodes/public_addresses files
+* Tue Oct 7 2008 : Version 1.0.59
+ - Updated "reloadnodes" logic. Instead of bouncing the entire tcp layer it is sufficient to just close and reopen all outgoing tcp connections.
+ - New eventscript 99.routing which can be used to re-attach routes to public interfaces after a takeip event. (routes may be deleted by the kernel when we release an ip)
+ - IDR tree fix from Jim Houston
+ - Better handling of critical events if the local clock is suddenly changed forward by a lot.
+ - Fix three slow memory leaks in the recovery daemon
+ - New ctdb command : ctdb recmaster   which prints the pnn of the recmaster
+ - Onnode enhancements from Martin S : "healthy" and "connected" are now possible nodespecifiers
+ - From Martin S : doc fixes
+ - lowering some debug levels for some nonvital informational messages
+ - Make the daemon daemon monitoring stronger and allow ctdbd to detect a hung
+   recovery daemon.
+ - From C Cowan : patches to compile ipv6 under AIX
+ - zero out some structs to keep valgrind happy
+* Wed Aug 27 2008 : Version 1.0.58
+ - revert the name change tcp_tcp_client back to tcp_control_tcp so
+   samba can build.
+ - Updates to the init script from Abhijith Das <adas@redhat.com>
+* Mon Aug 25 2008 : Version 1.0.57
+ - initial support for IPv6
+* Mon Aug 11 2008 : Version 1.0.56
+ - fix a memory leak in the recovery daemon.
+* Mon Aug 11 2008 : Version 1.0.55
+ - Fix the releaseip message we send to samba.
+* Fri Aug 8 2008 : Version 1.0.54
+ - fix a looping error in the transaction code
+ - provide a more detailed error code for persistent store errors
+   so clients can make more intelligent choices on how to try to recover
+* Thu Aug 7 2008 : Version 1.0.53
+ - Remove the reclock.pnn file   it can cause gpfs to fail to umount
+ - New transaction code
+* Mon Aug 4 2008 : Version 1.0.52
+ - Send an explicit gratuitous arp when starting sending the tcp tickles.
+ - When doing failover, issue a killtcp to non-NFS/non-CIFS clients
+   so that they fail quickly. NFS and CIFS already fail and recover 
+   quickly.
+ - Update the test scripts to handle CTRL-C to kill off the test.
+* Mon Jul 28 2008 : Version 1.0.51
+ - Strip off the vlan tag from bond devices before we check in /proc
+   if the interface is up or not.
+ - Use testparm in the background in the scripts to allow probing
+   that the shares do exist.
+ - Fix a bug in the logging code to handle multiline entries better
+ - Rename private elements from private to private_data
+* Fri Jul 18 2008 : Version 1.0.50
+ - Dont assume that just because we can establish a TCP connection
+   that we are actually talking to a functioning ctdb daemon.
+   So dont mark the node as CONNECTED just because the tcp handshake
+   was successful.
+ - Dont try to set the recmaster to ourself during elections for those
+   cases we know this will fail. To remove some annoying benign but scary
+   looking entries from the log.
+ - Bugfix for eventsystem for signal handling that could cause a node to
+   hang.
+* Thu Jul 17 2008 : Version 1.0.49
+ - Update the safe persistent update fix to work with unpatched samba
+   servers.
+* Thu Jul 17 2008 : Version 1.0.48
+ - Update the spec file.
+ - Do not start new user-triggered eventscripts if we are already
+   inside recovery mode.
+ - Add two new controls to start/cancel a persistent update.
+   A client such as samba can use these to tell ctdbd that it will soon
+   be writing directly to the persistent database tdb file. So if
+   samba is -9ed before it has either done the persistent_store or
+   canceled the operation, ctdb knows that the persistent databases
+   'may' be out of sync and therefore a full blown recovery is called for.
+ - Add two new options :
+   CTDB_SAMBA_SKIP_CONF_CHECK and CTDB_SAMBA_CHECK_PORTS that can be used
+   to override what checks to do when monitoring samba health.
+   We can no longer use the smbstatus, net or testparm commands to check
+   if samba or its config is healthy since these commands may block
+   indefinitely and thus can not be used in scripts.
+* Fri Jul 11 2008 : Version 1.0.47
+ - Fix a double free bug where if a user triggered (ctdb eventscript)
+   hung and while the timeout handler was being processed a new user
+   triggered eventscript was started we would free state twice.
+ - Rewrite of onnode and associated documentation.
+* Thu Jul 10 2008 : Version 1.0.46
+ - Document both the LVS:single-ip-address and the REMOTE-NODE:wan-accelerator
+   capabilities.
+ - Add commands "ctdb pnn", "ctdb lvs", "ctdb lvsmaster".
+ - LVS improvements. LVS is the single-ip-address mode for a ctdb cluster.
+ - Fixes to suppress rpmlint warnings
+ - AIX compile fixes.
+ - Change \s to [[:space:]] in some scripts. Not all RHEL5 packages come
+   with a egrep that handles \s   even same version but different arch.
+ - Revert the change to NFS restart. CTDB should NOT attempt to restart
+   failed services.
+ - Rewrite of the waitpid() patch to use the eventsystem for handling
+   signals.
+* Tue Jul 8 2008 : Version 1.0.45
+ - Try to restart the nfs service if it has failed to respond 3 times in a row.
+ - waitpid() can block if the child does not respond promptly to SIGTERM.
+   ignore all SIGCHILD signals by setting SIGCHLD to SIG_DEF.
+   get rid of all calls to waitpid().
+ - make handling of eventscripts hanging more liberal.
+   only consider the script to have failed and making the node unhealthy
+   IF the eventscript terminated with an error
+   OR the eventscript hung 5 or more times in a row
+* Mon Jul 7 2008 : Version 1.0.44
+ - Add a CTDB_VALGRIND option to /etc/sysconfig/ctdb to make it start
+   ctdb under valgrind. Logs go to /var/log/ctdb_valgrind.PID
+ - Add a hack to show the control opcode that caused uninitialized data
+   in the valgrind output by encoding the opcode as the line number.
+ - Initialize structures and allocated memory in various places in
+   ctdb to make it valgrind-clean and remove all valgrind errors/warnings.
+ - If/when we destroy a lockwait child, also make sure we cancel any pending transactions
+ - If a transaction_commit fails, delete/cancel any pending transactions and
+   return an error instead of calling ctdb_fatal()
+ - When running ctdb under valgrind, make sure we run it with --nosetsched and also
+   ensure that we do not use mem-mapped i/o when accessing the tdb's.
+ - zero out ctdb->freeze_handle when we free/destroy a freeze-child.
+   This prevent a heap corruption/ctdb crash bug that could trigger
+   if the freeze child times out.
+ - we dont need to explicitly thaw the databases from the recovery daemon
+   since this is done implicitly when we restore the recovery mode back to normal.
+ - track when we start and stop a recovery. Add the 'time it took to complete the
+   recovery' to the 'ctdb uptime' output.
+   Ensure by tracking the start/stop recovery timestamps that we do not
+   check that the ip allocation is consistent from inside the recovery daemon
+   while a different node (recovery master) is performing a recovery.
+   This prevent a race that could cause a full recovery to trigger if the
+   'ctdb disable/enable' commands took very long.
+ - The freeze child indicates to the master daemon that all databases are locked
+   by writing data to the pipe shared with the master daemon.
+   This write sometimes fails and thus the master daemon never notices that the databases
+   are locked causing long timeouts and extra recoveries.
+   Check that the write is successful and try the write again if it failed.
+ - In each node, verify that the recmaster have the right node flags for us
+   and force a push of our flags to the recmaster if wrong.
+* Tue Jul 1 2008 : Version 1.0.43
+ - Updates and bugfixes to the specfile to keep rpmlint happy
+ - Force a global flags update after each recovery event.
+ - Verify that the recmaster agrees with our node flags and update the
+   recmaster otherwise.
+ - When writing back to the parent from a freeze-child across the pipe,
+   loop over the write in case the write failed with an error, otherwise
+   the parent will never be notified that the child has completed the operation.
+ - Automatically thaw all databases when recmaster marks us as being in normal
+   mode instead of recovery mode.
+* Fri Jun 13 2008 : Version 1.0.42
+ - When event scripts have hung/timedout more than EventScriptBanCount times
+   in a row the node will ban itself.
+ - Many updates to persistent write tests and the test scripts.
+* Wed May 28 2008 : Version 1.0.41
+ - Reactivate the safe writes to persistent databases and solve the
+   locking issues. Locking issues are solved the only possible way,
+   by using a child process to do the writes.  Expensive and slow but... . 
+* Tue May 27 2008 : Version 1.0.40
+ - Read the samba sysconfig file from the 50.samba eventscript
+ - Fix some memory hierarchical bugs in the persistent write handling
+* Thu May 22 2008 : Version 1.0.39
+ - Moved a CTDB_MANAGES_NFS, CTDB_MANAGES_ISCSI and CTDB_MANAGES_CSFTPD
+   into /etc/sysconfig/ctdb
+ - Lowered some debug messages to not fill the logfile with entries
+   that normally occur in the default configuration.
+* Fri May 16 2008 : Version 1.0.38
+ - Add machine readable output support to "ctdb getmonmode"
+ - Lots of tweaks and enhancements if the event scripts are "slow"
+ - Merge from tridge: an attempt to break the chicken-and-egg deadlock that
+   net conf introduces if used from an eventscript.
+ - Enhance tickles so we can tickle an ipv6 connection.
+ - Start adding ipv6 support : create a new container to replace sockaddr_in.
+ - Add a checksum routine for ipv6/tcp
+ - When starting up ctdb, let the init script do a tdbdump on all
+   persistent databases and verify that they are good (i.e. not corrupted).
+ - Try to use "safe transactions" when writing to a persistent database
+   that was opened with the TDB_NOSYNC flag. If we can get the transaction
+   that's great; if we can't, we have to write anyway since we can't block here.
+* Mon May 12 2008 : Version 1.0.37
+ - When we shutdown ctdb we close the transport down before we run the 
+   "shutdown" eventscripts. If ctdb decides to send a packet to a remote node
+   after we have shutdown the transport but before we have shutdown ctdbd
+   itself this could lead to a SEGV instead of a clean shutdown. Fix.
+ - When using the "exportfs" command to extract which NFS export directories
+   to monitor,  exportfs violates the "principle of least surprise" and
+   sometimes reports a single export line as two lines of text output
+   causing the monitoring to fail.
+* Fri May 9 2008 : Version 1.0.36
+ - fix a memory corruption bug that could cause the recovery daemon to crash.
+ - fix a bug with distributing public ip addresses during recovery.
+   If the node that is the recovery master did NOT use public addresses,
+   then it assumed that no other node in the cluster used them either and
+   thus skipped the entire step of reallocating public addresses.
+* Wed May 7 2008 : Version 1.0.35
+ - During recovery, when we define the new set of lmasters (vnnmap)
+   only consider those nodes that have the can-be-lmaster capability
+   when we create the vnnmap, unless there are no nodes available which
+   support this capability, in which case we allow the recmaster to
+   become lmaster capable (temporarily).
+ - Extend the async framework so that we can use parallel async calls
+   to controls that return data.
+ - If we do not have the "can be recmaster" capability, make sure we will
+   lose any recmaster elections, unless there are no nodes available that
+   have the capability, in which case we "take/win" the election anyway.
+ - Close and reopen the reclock pnn file at regular intervals.
+   Make it a non-fatal event if we occasionally fail to open/read/write
+   to this file.
+ - Monitor that the recovery daemon is still running from the main ctdb
+   daemon and shutdown the main daemon when recovery daemon has terminated.
+ - Add a "ctdb getcapabilities" command to read the capabilities off a node.
+ - Define two new capabilities : can be recmaster and can be lmaster
+   and default both capabilities to YES.
+ - Log denied tcp connection attempts with DEBUG_ERR and not DEBUG_WARNING
+* Thu Apr 24 2008 : Version 1.0.34
+ - When deleting a public ip from a node, try to migrate the ip to a different
+   node first.
+ - Change catdb to produce output similar to tdbdump
+ - When adding a new public ip address, if this ip does not exist yet in
+   the cluster, then grab the ip on the local node and activate it.
+ - When a node disagrees with the recmaster on WHO is the recmaster, then
+   mark that node as a recovery culprit so it will eventually become
+   banned.
+ - Make ctdb eventscript support the -n all argument.
+* Thu Apr 10 2008 : Version 1.0.33
+ - Add facilities to include site local adaptations to the eventscript
+   by /etc/ctdb/rc.local which will be read by all eventscripts.
+ - Add a "ctdb version" command.
+ - Secure the domain socket with proper permissions from Chris Cowan
+ - Bugfixes for AIX from Chris Cowan
+* Wed Apr 02 2008 : Version 1.0.32
+ - Add a control to have a node execute the eventscripts with arbitrary
+   command line arguments.
+ - Add a control "rddumpmemory" that will dump the talloc memory allocations
+   for the recovery daemon.
+ - Decorate the talloc memdump to produce better and easier memory leak
+   tracking. 
+ - Update the RHEL5 iscsi tgtd scripts to allow one iscsi target for each
+   public address.
+ - Add two new controls "addip/delip" that can be used to add/remove public
+   addresses to a node at runtime. After using these controls a "ctdb recover"
+   is required to make the changes take effect.
+ - Fix a couple of slow memory leaks.
+* Tue Mar 25 2008 : Version 1.0.31
+ - Add back controls to disable/enable monitoring on a node.
+ - Fix a memory leak where we used to attach CALL data to the ctdb structure
+   when performing a local call. Memory which would be lost if the call was
+   aborted.
+ - Reduce the loglevel for the log output when someone connects to a non
+   public ip address for samba.
+ - Redo and optimize the vacuuming process to send only one control to each
+   other node containing all records to be vacuumed instead of one
+   control per node per record.
+* Tue Mar 04 2008 : Version 1.0.30
+ - Update documentation for new commands and tuneables
+ - Add machinereadable output to the ip,uptime and getdebug commands
+ - Add a moveip command to manually failover/failback public ips
+ - Add NoIPFallback tuneable that prevents ip address failback
+ - Use file locking inside the CFS as alternative to verify when other nodes
+   are connected/disconnected to be able to recover from split network
+ - Add DisableWhenUnhealthy tunable
+ - Add CTDB_START_AS_DISABLED sysconfig param
+ - Add --start-as-disabled flag to ctdb
+ - Add ability to monitor for OOM condition
+* Thu Feb 21 2008 : Version 1.0.29
+ - Add a new command to make expansion of an existing cluster easier
+ - Fix bug with references to freed objects in the ctdb structure
+ - Propagate debuglevel changes to the recovery daemon
+ - Merge patches to event scripts from Mathieu Parent :
+ - MP: Simulate "service" on systems which do not provide this tool
+ - MP: Set correct permissions for events.d/README
+ - Add nice helper functions to start/stop nfs from the event scripts
+* Fri Feb 08 2008 : Version 1.0.28
+ - Fix a problem where we tried to use ethtool on non-ethernet interfaces
+ - Warn if the ipvsadm package is missing when LVS is used
+ - Don't use absolute pathnames in some of the event scripts
+ - Fix for persistent tdbs growing infinitely.
+* Wed Feb 06 2008 : Version 1.0.27
+ - Add eventscript for iscsi
+* Thu Jan 31 2008 : Version 1.0.26
+ - Fix crashbug in tdb transaction code
+* Tue Jan 29 2008 : Version 1.0.25
+ - added async recovery code
+ - make event scripts more portable
+ - fixed ctdb dumpmemory
+ - more efficient tdb allocation code
+ - improved machine readable ctdb status output
+ - added ctdb uptime
+* Wed Jan 16 2008 : Version 1.0.24
+ - added syslog support
+ - documentation updates
+* Wed Jan 16 2008 : Version 1.0.23
+ - fixed a memory leak in the recoveryd
+ - fixed a corruption bug in the new transaction code
+ - fixed a case where a packet for a disconnected client could be processed
+ - added http event script
+ - updated documentation
+* Thu Jan 10 2008 : Version 1.0.22
+ - auto-run vacuum and repack ops
+* Wed Jan 09 2008 : Version 1.0.21
+ - added ctdb vacuum and ctdb repack code
+* Sun Jan 06 2008 : Version 1.0.20
+ - new transaction based recovery code
+* Sat Jan 05 2008 : Version 1.0.19
+ - fixed non-master bug
+ - big speedup in recovery for large databases
+ - lots of changes to improve tdb and ctdb for high churn databases
+* Thu Dec 27 2007 : Version 1.0.18
+ - fixed crash bug in monitor_handler
+* Tue Dec 04 2007 : Version 1.0.17
+ - fixed bugs related to ban/unban of nodes
+ - fixed a race condition that could lead to monitoring being permanently disabled,
+   which would lead to long recovery times
+ - make deterministic IPs the default
+ - fixed a bug related to continuous recovery 
+ - added a debugging option --node-ip
diff --git a/ctdb/packaging/RPM/makerpms.sh b/ctdb/packaging/RPM/makerpms.sh
new file mode 100755 (executable)
index 0000000..9b4f139
--- /dev/null
@@ -0,0 +1,99 @@
+#!/bin/sh
+#
+# makerpms.sh  -  build RPM packages from the git sources
+#
+# Copyright (C) John H Terpstra 1998-2002
+# Copyright (C) Gerald (Jerry) Carter 2003
+# Copyright (C) Jim McDonough 2007
+# Copyright (C) Andrew Tridgell 2007
+# Copyright (C) Michael Adam 2008-2009
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+# more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, see <http://www.gnu.org/licenses/>.
+#
+
+#
+# The following allows environment variables to override the target directories
+#   the alternative is to have a file in your home directory called .rpmmacros
+#   containing the following:
+#   %_topdir  /home/mylogin/redhat
+#
+# Note: Under this directory rpm expects to find the same directories that are under the
+#   /usr/src/redhat directory
+#
+
+EXTRA_OPTIONS="$*"
+
+DIRNAME=$(dirname $0)
+TOPDIR=${DIRNAME}/../..
+
+SPECDIR=`rpm --eval %_specdir`
+SRCDIR=`rpm --eval %_sourcedir`
+
+SPECFILE="ctdb.spec"
+SPECFILE_IN="ctdb.spec.in"
+RPMBUILD="rpmbuild"
+
+mkdir -p `rpm --eval %_specdir`
+mkdir -p `rpm --eval %_sourcedir`
+mkdir -p `rpm --eval %_builddir`
+mkdir -p `rpm --eval %_srcrpmdir`
+mkdir -p `rpm --eval %_rpmdir`/noarch
+mkdir -p `rpm --eval %_rpmdir`/i386
+mkdir -p `rpm --eval %_rpmdir`/x86_64
+
+set -- $(${TOPDIR}/packaging/mkversion.sh ${TOPDIR}/include/ctdb_version.h)
+VERSION=$1
+RELEASE=$2
+if [ -z "$VERSION" -o -z "$RELEASE" ]; then
+    exit 1
+fi
+
+sed -e "s/@VERSION@/$VERSION/g" \
+    -e "s/@RELEASE@/$RELEASE/g" \
+       < ${DIRNAME}/${SPECFILE_IN} \
+       > ${DIRNAME}/${SPECFILE}
+
+${TOPDIR}/packaging/maketarball.sh ${SRCDIR}
+if [ $? -ne 0 ]; then
+       echo "Build failed!"
+       exit 1
+fi
+
+# At this point the SPECDIR and SRCDIR vaiables must have a value!
+
+##
+## copy additional source files
+##
+cp -p ${DIRNAME}/${SPECFILE} ${SPECDIR}
+
+##
+## Build
+##
+echo "$(basename $0): Getting Ready to build release package"
+
+case ${EXTRA_OPTIONS} in
+       *-b*)
+               BUILD_TARGET=""
+               ;;
+       *)
+               BUILD_TARGET="-ba"
+               ;;
+esac
+
+
+${RPMBUILD} ${BUILD_TARGET} --clean --rmsource ${EXTRA_OPTIONS} ${SPECDIR}/${SPECFILE} || exit 1
+
+echo "$(basename $0): Done."
+
+exit 0
diff --git a/ctdb/packaging/maketarball.sh b/ctdb/packaging/maketarball.sh
new file mode 100755 (executable)
index 0000000..c27e2ec
--- /dev/null
@@ -0,0 +1,111 @@
+#!/bin/sh
+#
+# maketarball.sh - create a tarball from the git branch HEAD
+#
+# Copyright (C) Michael Adam 2009
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+# more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, see <http://www.gnu.org/licenses/>.
+#
+
+#
+# Create CTDB source tarball of the current git branch HEAD.
+# The version is calculated from git tag in mkversion.sh.
+# Optional argument is the directory to which tarball is copied.
+#
+
+TARGETDIR="${1:-${PWD}}"  # Default target directory is .
+
+DIRNAME=$(dirname "$0")
+cd -P "${DIRNAME}/.."
+TOPDIR="$PWD"
+
+tmpd=$(mktemp -d) || {
+    echo "Failed to create temporary directory"
+    exit 1
+}
+
+TAR_PREFIX_TMP="ctdb-tmp"
+SPECFILE="${tmpd}/${TAR_PREFIX_TMP}/packaging/RPM/ctdb.spec"
+SPECFILE_IN="${SPECFILE}.in"
+VERSION_H="${tmpd}/${TAR_PREFIX_TMP}/include/ctdb_version.h"
+
+if echo | gzip -c --rsyncable - > /dev/null 2>&1 ; then
+       GZIP="gzip -9 --rsyncable"
+else
+       GZIP="gzip -9"
+fi
+
+echo "Creating tarball ... "
+git archive --prefix="${TAR_PREFIX_TMP}/" HEAD | ( cd "$tmpd" ; tar xf - )
+if [ $? -ne 0 ]; then
+       echo "Error calling git archive."
+       exit 1
+fi
+
+set -- $("${TOPDIR}/packaging/mkversion.sh" "$VERSION_H")
+VERSION=$1
+RELEASE=$2
+if [ -z "$VERSION" -o -z "$RELEASE" ]; then
+    exit 1
+fi
+
+sed -e "s/@VERSION@/${VERSION}/g" \
+    -e "s/@RELEASE@/$RELEASE/g" \
+       < ${SPECFILE_IN} \
+       > ${SPECFILE}
+
+TAR_PREFIX="ctdb-${VERSION}"
+TAR_BASE="ctdb-${VERSION}"
+
+cd "${tmpd}/${TAR_PREFIX_TMP}"
+./autogen.sh || {
+       echo "Error calling autogen.sh."
+       exit 1
+}
+
+make -C doc || {
+    echo "Error building docs."
+    exit 1
+}
+
+if [ "$DEBIAN_MODE" = "yes" ] ; then
+       TAR_PREFIX="ctdb-${VERSION}.orig"
+       TAR_BASE="ctdb_${VERSION}.orig"
+       rm -rf "${tmpd}/${TAR_PREFIX_TMP}/lib/popt"
+fi
+
+TAR_BALL="${TAR_BASE}.tar"
+TAR_GZ_BALL="${TAR_BALL}.gz"
+
+mv "${tmpd}/${TAR_PREFIX_TMP}" "${tmpd}/${TAR_PREFIX}"
+
+cd "$tmpd"
+tar cf "$TAR_BALL" "$TAR_PREFIX" || {
+        echo "Creation of tarball failed."
+        exit 1
+}
+
+$GZIP "$TAR_BALL" || {
+        echo "Zipping tarball failed."
+        exit 1
+}
+
+rm -rf "$TAR_PREFIX"
+
+mv "${tmpd}/${TAR_GZ_BALL}" "${TARGETDIR}/"
+
+rmdir "$tmpd"
+
+echo "Done."
+exit 0
diff --git a/ctdb/packaging/mkversion.sh b/ctdb/packaging/mkversion.sh
new file mode 100755 (executable)
index 0000000..7a550a5
--- /dev/null
@@ -0,0 +1,66 @@
+#!/bin/sh
+#
+# mkversion.sh - extract version string from git branch
+#
+# Copyright (C) Amitay Isaacs 2012
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+# more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, see <http://www.gnu.org/licenses/>.
+#
+
+#
+# Common code to generate CTDB version string
+#
+
+OUTPUT=$1
+
+if [ -z "$OUTPUT" ]; then
+    OUTPUT="include/ctdb_version.h"
+fi
+
+# We use tags and determine the version, as follows:
+# ctdb-0.9.1  (First release of 0.9).
+# ctdb-0.9.23 (23rd minor release of the 112 version)
+#
+# If we're not directly on a tag, this is a devel release; we append
+# .0.<patchnum>.<checksum>.devel to the release.
+TAG=`git describe`
+case "$TAG" in
+    ctdb-*)
+       TAG=${TAG##ctdb-}
+       case "$TAG" in
+           *-*-g*) # 0.9-168-ge6cf0e8
+               # Not exactly on tag: devel version.
+               VERSION=`echo "$TAG" | sed 's/\([^-]\+\)-\([0-9]\+\)-\(g[0-9a-f]\+\)/\1.0.\2.\3.devel/'`
+               RELEASE=1
+               ;;
+           *)
+               # An actual release version
+               VERSION=$TAG
+               RELEASE=1
+               ;;
+       esac
+       ;;
+    *)
+       echo Invalid tag "$TAG" >&2
+       ;;
+esac
+
+cat > "$OUTPUT" <<EOF
+/* This file is auto-genrated by packaging/mkversion.sh */
+
+#define CTDB_VERSION_STRING "$VERSION"
+
+EOF
+
+echo "$VERSION $RELEASE"
diff --git a/ctdb/server/ctdb_banning.c b/ctdb/server/ctdb_banning.c
new file mode 100644 (file)
index 0000000..e6df4b9
--- /dev/null
@@ -0,0 +1,176 @@
+/* 
+   ctdb banning code
+
+   Copyright (C) Ronnie Sahlberg  2009
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "includes.h"
+#include "tdb.h"
+#include "system/time.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "../include/ctdb_client.h"
+#include "../include/ctdb_private.h"
+
+
+/*
+  timed event that fires when the ban on the local node expires
+
+  The ban is only lifted if every database priority is frozen by now;
+  otherwise the node bans itself again.
+*/
+static void
+ctdb_ban_node_event(struct event_context *ev, struct timed_event *te,
+                              struct timeval t, void *private_data)
+{
+       struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+       int prio;
+
+       /* Any priority that is still not frozen means the ban must be renewed */
+       for (prio = 1; prio <= NUM_DB_PRIORITIES; prio++) {
+               if (ctdb->freeze_mode[prio] == CTDB_FREEZE_FROZEN) {
+                       continue;
+               }
+               DEBUG(DEBUG_ERR, ("Banning timedout, but still unable to freeze databases\n"));
+               ctdb_ban_self(ctdb);
+               return;
+       }
+
+       DEBUG(DEBUG_ERR,("Banning timedout\n"));
+       ctdb->nodes[ctdb->pnn]->flags &= ~NODE_FLAGS_BANNED;
+
+       /* drop the banning state now that the ban is over */
+       if (ctdb->banning_ctx == NULL) {
+               return;
+       }
+       talloc_free(ctdb->banning_ctx);
+       ctdb->banning_ctx = NULL;
+}
+
+/*
+  put the local node into the banned state: invalidate the generation,
+  start freezing all databases, release all ips and switch to recovery mode
+*/
+void ctdb_local_node_got_banned(struct ctdb_context *ctdb)
+{
+       uint32_t prio = 1;
+
+       DEBUG(DEBUG_NOTICE,("This node has been banned - forcing freeze and recovery\n"));
+
+       /* Mark the generation id as invalid so that we ignore any
+          REQ/REPLY CALL/DMASTER someone sends to us. A banned node
+          should not service database calls anymore.
+       */
+       ctdb->vnn_map->generation = INVALID_GENERATION;
+
+       /* make sure we are frozen, one database priority at a time */
+       while (prio <= NUM_DB_PRIORITIES) {
+               ctdb_start_freeze(ctdb, prio);
+               prio++;
+       }
+
+       ctdb_release_all_ips(ctdb);
+       ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+}
+
+/*
+  control handler: ban or unban a node
+
+  indata carries a struct ctdb_ban_time; a time of 0 means "unban".
+  For a remote node only the BANNED flag in our node list is updated.
+  For the local node a timed event is also set up to end the ban, and
+  the node is frozen and put into recovery.
+
+  returns 0 on success (including when bans are disabled), -1 on error
+*/
+int32_t ctdb_control_set_ban_state(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+       struct ctdb_ban_time *bantime = (struct ctdb_ban_time *)indata.dptr;
+       struct ctdb_ban_time *saved;
+
+       DEBUG(DEBUG_INFO,("SET BAN STATE\n"));
+
+       /* the request is about some other node: just track its flag */
+       if (bantime->pnn != ctdb->pnn) {
+               if (bantime->pnn < 0 || bantime->pnn >= ctdb->num_nodes) {
+                       DEBUG(DEBUG_ERR,(__location__ " ERROR: Invalid ban request. PNN:%d is invalid. Max nodes %d\n", bantime->pnn, ctdb->num_nodes));
+                       return -1;
+               }
+
+               if (bantime->time == 0) {
+                       DEBUG(DEBUG_NOTICE,("unbanning node %d\n", bantime->pnn));
+                       ctdb->nodes[bantime->pnn]->flags &= ~NODE_FLAGS_BANNED;
+                       return 0;
+               }
+
+               DEBUG(DEBUG_NOTICE,("banning node %d\n", bantime->pnn));
+               if (ctdb->tunable.enable_bans == 0) {
+                       /* FIXME: This is bogus. We really should be
+                        * taking decision based on the tunables on
+                        * the banned node and not local node.
+                        */
+                       DEBUG(DEBUG_WARNING,("Bans are disabled - ignoring ban of node %u\n", bantime->pnn));
+                       return 0;
+               }
+
+               ctdb->nodes[bantime->pnn]->flags |= NODE_FLAGS_BANNED;
+               return 0;
+       }
+
+       /* the request is about us: any earlier ban state is replaced */
+       if (ctdb->banning_ctx != NULL) {
+               talloc_free(ctdb->banning_ctx);
+               ctdb->banning_ctx = NULL;
+       }
+
+       if (bantime->time == 0) {
+               DEBUG(DEBUG_ERR,("Unbanning this node\n"));
+               ctdb->nodes[bantime->pnn]->flags &= ~NODE_FLAGS_BANNED;
+               return 0;
+       }
+
+       if (ctdb->tunable.enable_bans == 0) {
+               DEBUG(DEBUG_ERR,("Bans are disabled - ignoring ban of node %u\n", bantime->pnn));
+               return 0;
+       }
+
+       /* keep a copy of the request; it also serves as the talloc parent
+          of the timed event that ends the ban */
+       saved = talloc(ctdb, struct ctdb_ban_time);
+       ctdb->banning_ctx = saved;
+       if (ctdb->banning_ctx == NULL) {
+               DEBUG(DEBUG_CRIT,(__location__ " ERROR Failed to allocate new banning state\n"));
+               return -1;
+       }
+       *saved = *bantime;
+
+       DEBUG(DEBUG_ERR,("Banning this node for %d seconds\n", bantime->time));
+       ctdb->nodes[bantime->pnn]->flags |= NODE_FLAGS_BANNED;
+
+       event_add_timed(ctdb->ev, ctdb->banning_ctx,
+                       timeval_current_ofs(bantime->time,0),
+                       ctdb_ban_node_event, ctdb);
+
+       ctdb_local_node_got_banned(ctdb);
+       return 0;
+}
+
+/*
+  control handler: report the ban state of the local node
+
+  outdata receives a struct ctdb_ban_time allocated as a talloc child of
+  outdata. While a ban is active this is a copy of the stored ban request;
+  otherwise it holds our own pnn with a time of 0.
+*/
+int32_t ctdb_control_get_ban_state(struct ctdb_context *ctdb, TDB_DATA *outdata)
+{
+       struct ctdb_ban_time *state;
+
+       state = talloc(outdata, struct ctdb_ban_time);
+       CTDB_NO_MEMORY(ctdb, state);
+
+       if (ctdb->banning_ctx == NULL) {
+               /* not banned */
+               state->pnn = ctdb->pnn;
+               state->time = 0;
+       } else {
+               *state = *(struct ctdb_ban_time *)(ctdb->banning_ctx);
+       }
+
+       outdata->dsize = sizeof(*state);
+       outdata->dptr  = (uint8_t *)state;
+
+       return 0;
+}
+
+/*
+  Ban the local node for the configured recovery ban period by feeding a
+  ban request for our own pnn to the regular ban control handler.
+*/
+void ctdb_ban_self(struct ctdb_context *ctdb)
+{
+       struct ctdb_ban_time self_ban;
+       TDB_DATA request;
+
+       self_ban.time = ctdb->tunable.recovery_ban_period;
+       self_ban.pnn  = ctdb->pnn;
+
+       request.dptr  = (uint8_t *)&self_ban;
+       request.dsize = sizeof(self_ban);
+
+       ctdb_control_set_ban_state(ctdb, request);
+}
diff --git a/ctdb/server/ctdb_call.c b/ctdb/server/ctdb_call.c
new file mode 100644 (file)
index 0000000..017bb81
--- /dev/null
@@ -0,0 +1,1738 @@
+/* 
+   ctdb_call protocol code
+
+   Copyright (C) Andrew Tridgell  2006
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+/*
+  see http://wiki.samba.org/index.php/Samba_%26_Clustering for
+  protocol design and packet details
+*/
+#include "includes.h"
+#include "tdb.h"
+#include "lib/util/dlinklist.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "../include/ctdb_private.h"
+#include "../common/rb_tree.h"
+
+/*
+  state kept for a sticky record
+*/
+struct ctdb_sticky_record {
+       struct ctdb_context *ctdb;
+       /* database the record lives in (its name is used in log messages) */
+       struct ctdb_db_context *ctdb_db;
+       /* non-NULL while the record is pinned down; it is the talloc parent
+          of the timed event that releases the pin again */
+       TDB_CONTEXT *pindown;
+};
+
+/*
+  look up an attached database by its db id
+  returns NULL if no database in the list has that id
+ */
+ struct ctdb_db_context *find_ctdb_db(struct ctdb_context *ctdb, uint32_t id)
+{
+       struct ctdb_db_context *cur = ctdb->db_list;
+
+       /* walk the list until it ends or the id matches */
+       while (cur != NULL && cur->db_id != id) {
+               cur = cur->next;
+       }
+       return cur;
+}
+
+/*
+  a variant of the input packet handler that takes the ctdb context as an
+  untyped pointer, so it can be used as a callback in lock requeue
+*/
+static void ctdb_call_input_pkt(void *p, struct ctdb_req_header *hdr)
+{
+       ctdb_input_pkt(talloc_get_type(p, struct ctdb_context), hdr);
+}
+
+
+/*
+  send an error reply for the request described by hdr
+
+  The printf-style message is formatted and sent, together with status,
+  back to the node the request came from. Nothing is sent while the
+  transport is down.
+*/
+static void ctdb_send_error(struct ctdb_context *ctdb, 
+                           struct ctdb_req_header *hdr, uint32_t status,
+                           const char *fmt, ...) PRINTF_ATTRIBUTE(4,5);
+static void ctdb_send_error(struct ctdb_context *ctdb, 
+                           struct ctdb_req_header *hdr, uint32_t status,
+                           const char *fmt, ...)
+{
+       struct ctdb_reply_error *reply;
+       va_list args;
+       char *text;
+       int hdr_len, text_len;
+
+       if (ctdb->methods == NULL) {
+               DEBUG(DEBUG_INFO,(__location__ " Failed to send error. Transport is DOWN\n"));
+               return;
+       }
+
+       va_start(args, fmt);
+       text = talloc_vasprintf(ctdb, fmt, args);
+       if (text == NULL) {
+               ctdb_fatal(ctdb, "Unable to allocate error in ctdb_send_error\n");
+       }
+       va_end(args);
+
+       /* the message travels including its terminating NUL */
+       hdr_len  = offsetof(struct ctdb_reply_error, msg);
+       text_len = strlen(text)+1;
+
+       /* the packet is allocated with the message string as its memory
+          context, so the talloc_free() at the end presumably releases
+          both -- TODO confirm against ctdb_transport_allocate */
+       reply = ctdb_transport_allocate(ctdb, text, CTDB_REPLY_ERROR, hdr_len + text_len,
+                                       struct ctdb_reply_error);
+       CTDB_NO_MEMORY_FATAL(ctdb, reply);
+
+       reply->hdr.destnode  = hdr->srcnode;
+       reply->hdr.reqid     = hdr->reqid;
+       reply->status        = status;
+       reply->msglen        = text_len;
+       memcpy(&reply->msg[0], text, text_len);
+
+       ctdb_queue_packet(ctdb, &reply->hdr);
+
+       talloc_free(text);
+}
+
+
+/**
+ * forward a call request towards the node that can serve it
+ *
+ * A client asks its local ctdb for a record with a CTDB_REQ_CALL
+ * packet (ctdb_request_call). A node that is not the record's
+ * DMASTER sends the packet on to the record's LMASTER, and the
+ * LMASTER in turn sends it on to the current DMASTER. This works
+ * because a node that migrates a record away stores the new DMASTER
+ * in its own copy of the record.
+ *
+ * So: if we are not the LMASTER the packet goes to the LMASTER,
+ * otherwise it goes to the DMASTER recorded in the local header.
+ */
+static void ctdb_call_send_redirect(struct ctdb_context *ctdb,
+                                   struct ctdb_db_context *ctdb_db,
+                                   TDB_DATA key,
+                                   struct ctdb_req_call *c, 
+                                   struct ctdb_ltdb_header *header)
+{
+       uint32_t lmaster = ctdb_lmaster(ctdb, &key);
+
+       c->hdr.destnode = (ctdb->pnn == lmaster) ? header->dmaster : lmaster;
+       c->hopcount++;
+
+       /* warn about requests that keep bouncing between nodes; only the
+          last few hops of every hundred are logged */
+       if (c->hopcount%100 > 95) {
+               DEBUG(DEBUG_WARNING,("High hopcount %d dbid:%s "
+                       "key:0x%08x reqid=%08x pnn:%d src:%d lmaster:%d "
+                       "header->dmaster:%d dst:%d\n",
+                       c->hopcount, ctdb_db->db_name, ctdb_hash(&key),
+                       c->hdr.reqid, ctdb->pnn, c->hdr.srcnode, lmaster,
+                       header->dmaster, c->hdr.destnode));
+       }
+
+       ctdb_queue_packet(ctdb, &c->hdr);
+}
+
+
+/*
+  send a dmaster reply
+
+  caller must have the chainlock before calling this routine. Caller must be
+  the lmaster
+
+  The record is first stored locally with new_dmaster as its dmaster, then
+  a CTDB_REPLY_DMASTER packet carrying the key, the data and the record
+  flags is queued to new_dmaster. A failed store or a transport that is
+  down is reported through ctdb_fatal().
+*/
+static void ctdb_send_dmaster_reply(struct ctdb_db_context *ctdb_db,
+                                   struct ctdb_ltdb_header *header,
+                                   TDB_DATA key, TDB_DATA data,
+                                   uint32_t new_dmaster,
+                                   uint32_t reqid)
+{
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+       struct ctdb_reply_dmaster *r;
+       int ret, len;
+       TALLOC_CTX *tmp_ctx;
+
+       /* only the lmaster of the key may hand out the dmaster role */
+       if (ctdb->pnn != ctdb_lmaster(ctdb, &key)) {
+               DEBUG(DEBUG_ALERT,(__location__ " Caller is not lmaster!\n"));
+               return;
+       }
+
+       /* record the new dmaster locally before telling the other node */
+       header->dmaster = new_dmaster;
+       ret = ctdb_ltdb_store(ctdb_db, key, header, data);
+       if (ret != 0) {
+               ctdb_fatal(ctdb, "ctdb_send_dmaster_reply unable to update dmaster");
+               return;
+       }
+
+       if (ctdb->methods == NULL) {
+               ctdb_fatal(ctdb, "ctdb_send_dmaster_reply cant update dmaster since transport is down");
+               return;
+       }
+
+       /* put the packet on a temporary context, allowing us to safely free
+          it below even if ctdb_reply_dmaster() has freed it already */
+       tmp_ctx = talloc_new(ctdb);
+
+       /* send the CTDB_REPLY_DMASTER */
+       /* payload is: key bytes, data bytes, then a uint32_t with the record flags */
+       len = offsetof(struct ctdb_reply_dmaster, data) + key.dsize + data.dsize + sizeof(uint32_t);
+       r = ctdb_transport_allocate(ctdb, tmp_ctx, CTDB_REPLY_DMASTER, len,
+                                   struct ctdb_reply_dmaster);
+       CTDB_NO_MEMORY_FATAL(ctdb, r);
+
+       r->hdr.destnode  = new_dmaster;
+       r->hdr.reqid     = reqid;
+       r->rsn           = header->rsn;
+       r->keylen        = key.dsize;
+       r->datalen       = data.dsize;
+       r->db_id         = ctdb_db->db_id;
+       memcpy(&r->data[0], key.dptr, key.dsize);
+       memcpy(&r->data[key.dsize], data.dptr, data.dsize);
+       memcpy(&r->data[key.dsize+data.dsize], &header->flags, sizeof(uint32_t));
+
+       ctdb_queue_packet(ctdb, &r->hdr);
+
+       talloc_free(tmp_ctx);
+}
+
+/*
+  send a dmaster request (give another node the dmaster for a record)
+
+  This is always sent to the lmaster, which ensures that the lmaster
+  always knows who the dmaster is. The lmaster will then send a
+  CTDB_REPLY_DMASTER to the new dmaster
+
+  The new dmaster is the node the call request c came from. If we are
+  the lmaster ourselves no request is sent; the dmaster reply is
+  produced directly instead.
+*/
+static void ctdb_call_send_dmaster(struct ctdb_db_context *ctdb_db, 
+                                  struct ctdb_req_call *c, 
+                                  struct ctdb_ltdb_header *header,
+                                  TDB_DATA *key, TDB_DATA *data)
+{
+       struct ctdb_req_dmaster *r;
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+       int len;
+       uint32_t lmaster = ctdb_lmaster(ctdb, key);
+
+       if (ctdb->methods == NULL) {
+               ctdb_fatal(ctdb, "Failed ctdb_call_send_dmaster since transport is down");
+               return;
+       }
+
+       /* flag records that are handed over with a non-empty data part */
+       if (data->dsize != 0) {
+               header->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
+       }
+
+       /* we are the lmaster: answer the requesting node directly */
+       if (lmaster == ctdb->pnn) {
+               ctdb_send_dmaster_reply(ctdb_db, header, *key, *data, 
+                                       c->hdr.srcnode, c->hdr.reqid);
+               return;
+       }
+       
+       /* payload is: key bytes, data bytes, then a uint32_t with the record flags */
+       len = offsetof(struct ctdb_req_dmaster, data) + key->dsize + data->dsize
+                       + sizeof(uint32_t);
+       r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REQ_DMASTER, len, 
+                                   struct ctdb_req_dmaster);
+       CTDB_NO_MEMORY_FATAL(ctdb, r);
+       r->hdr.destnode  = lmaster;
+       r->hdr.reqid     = c->hdr.reqid;
+       r->db_id         = c->db_id;
+       r->rsn           = header->rsn;
+       r->dmaster       = c->hdr.srcnode;
+       r->keylen        = key->dsize;
+       r->datalen       = data->dsize;
+       memcpy(&r->data[0], key->dptr, key->dsize);
+       memcpy(&r->data[key->dsize], data->dptr, data->dsize);
+       memcpy(&r->data[key->dsize + data->dsize], &header->flags, sizeof(uint32_t));
+
+       /* store the new dmaster in our copy of the record before the
+          request is queued */
+       header->dmaster = c->hdr.srcnode;
+       if (ctdb_ltdb_store(ctdb_db, *key, header, *data) != 0) {
+               ctdb_fatal(ctdb, "Failed to store record in ctdb_call_send_dmaster");
+       }
+       
+       ctdb_queue_packet(ctdb, &r->hdr);
+
+       talloc_free(r);
+}
+
+/*
+  timed event: the pindown period of a sticky record is over, so release
+  the pin by freeing the pindown context
+*/
+static void ctdb_sticky_pindown_timeout(struct event_context *ev, struct timed_event *te,
+                                      struct timeval t, void *private_data)
+{
+       struct ctdb_sticky_record *sr;
+
+       sr = talloc_get_type(private_data, struct ctdb_sticky_record);
+
+       DEBUG(DEBUG_ERR,("Pindown timeout db:%s  unstick record\n", sr->ctdb_db->db_name));
+
+       if (sr->pindown == NULL) {
+               return;
+       }
+       talloc_free(sr->pindown);
+       sr->pindown = NULL;
+}
+
+/*
+  start a pindown period for the record identified by key, provided the
+  record is currently registered in ctdb_db->sticky_records.
+
+  returns 0 when the record is not sticky, when it is already pinned
+  down, or when the pindown was set up.
+  returns -1 when an allocation fails or the pindown timer cannot be
+  armed; in the latter case the pindown context is released again so
+  the record is not left pinned down without a timeout.
+*/
+static int
+ctdb_set_sticky_pindown(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       uint32_t *k;
+       struct ctdb_sticky_record *sr;
+
+       /* build the tree lookup key: k[0] holds the number of 32-bit
+          words (itself included), followed by the key bytes which are
+          zero padded up to a 4 byte boundary */
+       k = talloc_zero_size(tmp_ctx, ((key.dsize + 3) & 0xfffffffc) + 4);
+       if (k == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate key for sticky record\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       k[0] = (key.dsize + 3) / 4 + 1;
+       memcpy(&k[1], key.dptr, key.dsize);
+
+       sr = trbt_lookuparray32(ctdb_db->sticky_records, k[0], &k[0]);
+
+       /* the lookup key is not needed any more */
+       talloc_free(tmp_ctx);
+
+       if (sr == NULL) {
+               /* not a sticky record, nothing to pin down */
+               return 0;
+       }
+
+       if (sr->pindown == NULL) {
+               DEBUG(DEBUG_ERR,("Pinning down record in %s for %d ms\n", ctdb_db->db_name, ctdb->tunable.sticky_pindown));
+               sr->pindown = talloc_new(sr);
+               if (sr->pindown == NULL) {
+                       DEBUG(DEBUG_ERR,("Failed to allocate pindown context for sticky record\n"));
+                       return -1;
+               }
+
+               /* sticky_pindown is in milliseconds. Split it into
+                  seconds and microseconds; taking the remainder before
+                  scaling keeps the multiplication from overflowing */
+               if (event_add_timed(ctdb->ev, sr->pindown,
+                                   timeval_current_ofs(ctdb->tunable.sticky_pindown / 1000,
+                                                       (ctdb->tunable.sticky_pindown % 1000) * 1000),
+                                   ctdb_sticky_pindown_timeout, sr) == NULL) {
+                       /* without the timer nothing would ever end the
+                          pindown, so undo it */
+                       DEBUG(DEBUG_ERR,("Failed to set up pindown timeout for sticky record\n"));
+                       talloc_free(sr->pindown);
+                       sr->pindown = NULL;
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+/*
+  called when a CTDB_REPLY_DMASTER packet comes in, or when the lmaster
+  gets a CTDB_REQ_DMASTER for itself. We become the dmaster.
+
+  The record is stored locally with this node as dmaster, using the
+  given rsn and record_flags. If the request id in hdr matches a pending
+  call for the same key, the call is then run locally and its completion
+  callback is invoked.
+
+  must be called with the chainlock held. This function releases the
+  chainlock on every path.
+*/
+static void ctdb_become_dmaster(struct ctdb_db_context *ctdb_db,
+                               struct ctdb_req_header *hdr,
+                               TDB_DATA key, TDB_DATA data,
+                               uint64_t rsn, uint32_t record_flags)
+{
+       struct ctdb_call_state *state;
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+       struct ctdb_ltdb_header header;
+       int ret;
+
+       DEBUG(DEBUG_DEBUG,("pnn %u dmaster response %08x\n", ctdb->pnn, ctdb_hash(&key)));
+
+       ZERO_STRUCT(header);
+       header.rsn = rsn;
+       header.dmaster = ctdb->pnn;
+       header.flags = record_flags;
+
+       state = ctdb_reqid_find(ctdb, hdr->reqid, struct ctdb_call_state);
+
+       if (state != NULL &&
+           (state->call->flags & CTDB_CALL_FLAG_VACUUM_MIGRATION)) {
+               /*
+                * We temporarily add the VACUUM_MIGRATED flag to
+                * the record flags, so that ctdb_ltdb_store can
+                * decide whether the record should be stored or
+                * deleted.
+                */
+               header.flags |= CTDB_REC_FLAG_VACUUM_MIGRATED;
+       }
+
+       if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
+               ctdb_fatal(ctdb, "ctdb_reply_dmaster store failed\n");
+               goto unlock;
+       }
+
+       /* we just became DMASTER and this database is "sticky",
+          see if the record is flagged as "hot" and set up a pin-down
+          context to stop migrations for a little while if so
+       */
+       if (ctdb_db->sticky) {
+               ctdb_set_sticky_pindown(ctdb, ctdb_db, key);
+       }
+
+       if (state == NULL) {
+               DEBUG(DEBUG_ERR,("pnn %u Invalid reqid %u in ctdb_become_dmaster from node %u\n",
+                        ctdb->pnn, hdr->reqid, hdr->srcnode));
+               goto unlock;
+       }
+
+       if (key.dsize != state->call->key.dsize || memcmp(key.dptr, state->call->key.dptr, key.dsize)) {
+               DEBUG(DEBUG_ERR, ("Got bogus DMASTER packet reqid:%u from node %u. Key does not match key held in matching idr.\n", hdr->reqid, hdr->srcnode));
+               goto unlock;
+       }
+
+       if (hdr->reqid != state->reqid) {
+               /* we found a record  but it was the wrong one */
+               DEBUG(DEBUG_ERR, ("Dropped orphan in ctdb_become_dmaster with reqid:%u from node %u\n", hdr->reqid, hdr->srcnode));
+               goto unlock;
+       }
+
+       /* the pending call can now be answered from the local record */
+       ctdb_call_local(ctdb_db, state->call, &header, state, &data, true);
+
+       ret = ctdb_ltdb_unlock(ctdb_db, state->call->key);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+       }
+
+       state->state = CTDB_CALL_DONE;
+       if (state->async.fn) {
+               state->async.fn(state);
+       }
+       return;
+
+unlock:
+       /* common exit for all paths that give up before running the call */
+       ret = ctdb_ltdb_unlock(ctdb_db, key);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+       }
+}
+
+
+
+/*
+  called when a CTDB_REQ_DMASTER packet comes in
+
+  this comes into the lmaster for a record when the current dmaster
+  wants to give up the dmaster role and give it to someone else
+
+  the packet payload is the key, followed by the record data and, when
+  the packet is long enough to hold them, the 32-bit record flags.
+  If the new dmaster is this node we become dmaster directly, otherwise
+  a dmaster reply is sent to the new dmaster.
+*/
+void ctdb_request_dmaster(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+       struct ctdb_req_dmaster *c = (struct ctdb_req_dmaster *)hdr;
+       TDB_DATA key, data, data2;
+       struct ctdb_ltdb_header header;
+       struct ctdb_db_context *ctdb_db;
+       uint32_t record_flags = 0;
+       size_t len;
+       int ret;
+
+       key.dptr = c->data;
+       key.dsize = c->keylen;
+       data.dptr = c->data + c->keylen;
+       data.dsize = c->datalen;
+       len = offsetof(struct ctdb_req_dmaster, data) + key.dsize + data.dsize
+                       + sizeof(uint32_t);
+       if (len <= c->hdr.length) {
+               /* the flags sit right behind key+data, an offset that is
+                  not necessarily 4 byte aligned, so copy them out
+                  rather than dereferencing a uint32_t pointer */
+               memcpy(&record_flags, &c->data[c->keylen + c->datalen],
+                      sizeof(record_flags));
+       }
+
+       ctdb_db = find_ctdb_db(ctdb, c->db_id);
+       if (!ctdb_db) {
+               ctdb_send_error(ctdb, hdr, -1,
+                               "Unknown database in request. db_id==0x%08x",
+                               c->db_id);
+               return;
+       }
+       
+       /* fetch the current record */
+       ret = ctdb_ltdb_lock_fetch_requeue(ctdb_db, key, &header, hdr, &data2,
+                                          ctdb_call_input_pkt, ctdb, false);
+       if (ret == -1) {
+               ctdb_fatal(ctdb, "ctdb_req_dmaster failed to fetch record");
+               return;
+       }
+       if (ret == -2) {
+               DEBUG(DEBUG_INFO,(__location__ " deferring ctdb_request_dmaster\n"));
+               return;
+       }
+
+       if (ctdb_lmaster(ctdb, &key) != ctdb->pnn) {
+               DEBUG(DEBUG_ALERT,("pnn %u dmaster request to non-lmaster lmaster=%u gen=%u curgen=%u\n",
+                        ctdb->pnn, ctdb_lmaster(ctdb, &key), 
+                        hdr->generation, ctdb->vnn_map->generation));
+               ctdb_fatal(ctdb, "ctdb_req_dmaster to non-lmaster");
+       }
+
+       DEBUG(DEBUG_DEBUG,("pnn %u dmaster request on %08x for %u from %u\n", 
+                ctdb->pnn, ctdb_hash(&key), c->dmaster, c->hdr.srcnode));
+
+       /* its a protocol error if the sending node is not the current dmaster */
+       if (header.dmaster != hdr->srcnode) {
+               uint32_t keyval = 0;
+
+               /* first 4 bytes of the key, for the log message only */
+               if (key.dsize >= 4) {
+                       memcpy(&keyval, key.dptr, sizeof(keyval));
+               }
+
+               DEBUG(DEBUG_ALERT,("pnn %u dmaster request for new-dmaster %u from non-master %u real-dmaster=%u key %08x dbid 0x%08x gen=%u curgen=%u c->rsn=%llu header.rsn=%llu reqid=%u keyval=0x%08x\n",
+                        ctdb->pnn, c->dmaster, hdr->srcnode, header.dmaster, ctdb_hash(&key),
+                        ctdb_db->db_id, hdr->generation, ctdb->vnn_map->generation,
+                        (unsigned long long)c->rsn, (unsigned long long)header.rsn, c->hdr.reqid,
+                        keyval));
+               if (header.rsn != 0 || header.dmaster != ctdb->pnn) {
+                       DEBUG(DEBUG_ERR,("ctdb_req_dmaster from non-master. Force a recovery.\n"));
+
+                       ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+                       ret = ctdb_ltdb_unlock(ctdb_db, key);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+                       }
+                       return;
+               }
+       }
+
+       if (header.rsn > c->rsn) {
+               DEBUG(DEBUG_ALERT,("pnn %u dmaster request with older RSN new-dmaster %u from %u real-dmaster=%u key %08x dbid 0x%08x gen=%u curgen=%u c->rsn=%llu header.rsn=%llu reqid=%u\n",
+                        ctdb->pnn, c->dmaster, hdr->srcnode, header.dmaster, ctdb_hash(&key),
+                        ctdb_db->db_id, hdr->generation, ctdb->vnn_map->generation,
+                        (unsigned long long)c->rsn, (unsigned long long)header.rsn, c->hdr.reqid));
+       }
+
+       /* use the rsn from the sending node */
+       header.rsn = c->rsn;
+
+       /* store the record flags from the sending node */
+       header.flags = record_flags;
+
+       /* check if the new dmaster is the lmaster, in which case we
+          skip the dmaster reply */
+       if (c->dmaster == ctdb->pnn) {
+               /* ctdb_become_dmaster releases the chainlock itself */
+               ctdb_become_dmaster(ctdb_db, hdr, key, data, c->rsn, record_flags);
+       } else {
+               ctdb_send_dmaster_reply(ctdb_db, &header, key, data, c->dmaster, hdr->reqid);
+
+               ret = ctdb_ltdb_unlock(ctdb_db, key);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+               }
+       }
+}
+
+/*
+  timed-event handler for the end of a record's sticky period: frees the
+  struct ctdb_sticky_record that was passed as private_data.
+*/
+static void ctdb_sticky_record_timeout(struct event_context *ev,
+                                      struct timed_event *te,
+                                      struct timeval t,
+                                      void *private_data)
+{
+       struct ctdb_sticky_record *expired;
+
+       expired = talloc_get_type(private_data, struct ctdb_sticky_record);
+       talloc_free(expired);
+}
+
+/*
+  insert callback used when adding a sticky record to the tree.
+
+  parm is the new entry, data is the entry already stored under the same
+  key (or NULL). An existing entry is freed; the new one is always
+  returned.
+*/
+static void *ctdb_make_sticky_record_callback(void *parm, void *data)
+{
+       if (data == NULL) {
+               return parm;
+       }
+
+       DEBUG(DEBUG_ERR,("Already have sticky record registered. Free old %p and create new %p\n", data, parm));
+       talloc_free(data);
+
+       return parm;
+}
+
+/*
+  register the record identified by key as sticky for
+  ctdb->tunable.sticky_duration seconds.
+
+  returns 0 when the record is already sticky or was registered,
+  -1 when an allocation fails or the expiry timer cannot be armed.
+*/
+static int
+ctdb_make_record_sticky(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       uint32_t *k;
+       struct ctdb_sticky_record *sr;
+
+       /* build the tree key: k[0] holds the number of 32-bit words
+          (itself included), followed by the key bytes which are zero
+          padded up to a 4 byte boundary */
+       k = talloc_zero_size(tmp_ctx, ((key.dsize + 3) & 0xfffffffc) + 4);
+       if (k == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate key for sticky record\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       k[0] = (key.dsize + 3) / 4 + 1;
+       memcpy(&k[1], key.dptr, key.dsize);
+
+       sr = trbt_lookuparray32(ctdb_db->sticky_records, k[0], &k[0]);
+       if (sr != NULL) {
+               /* already sticky */
+               talloc_free(tmp_ctx);
+               return 0;
+       }
+
+       sr = talloc(ctdb_db->sticky_records, struct ctdb_sticky_record);
+       if (sr == NULL) {
+               talloc_free(tmp_ctx);
+               DEBUG(DEBUG_ERR,("Failed to allocate sticky record structure\n"));
+               return -1;
+       }
+
+       sr->ctdb    = ctdb;
+       sr->ctdb_db = ctdb_db;
+       sr->pindown = NULL;
+
+       DEBUG(DEBUG_ERR,("Make record sticky for %d seconds in db %s key:0x%08x.\n",
+                        ctdb->tunable.sticky_duration,
+                        ctdb_db->db_name, ctdb_hash(&key)));
+
+       /* arm the expiry timer before the record becomes visible in the
+          tree, so a record can never be registered without a timeout */
+       if (event_add_timed(ctdb->ev, sr, timeval_current_ofs(ctdb->tunable.sticky_duration, 0), ctdb_sticky_record_timeout, sr) == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to set up timeout for sticky record\n"));
+               talloc_free(sr);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       trbt_insertarray32_callback(ctdb_db->sticky_records, k[0], &k[0], ctdb_make_sticky_record_callback, sr);
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+struct pinned_down_requeue_handle {    /* private data for pinned_down_requeue() */
+       struct ctdb_context *ctdb;      /* context the packet is re-injected into */
+       struct ctdb_req_header *hdr;    /* deferred packet; made a talloc child of the handle */
+};
+
+struct pinned_down_deferred_call {     /* one request deferred while its record is pinned down */
+       struct ctdb_context *ctdb;      /* context used when the request is requeued */
+       struct ctdb_req_header *hdr;    /* the deferred request packet */
+};
+
+/*
+  timed-event handler that feeds a previously deferred packet back into
+  ctdb_call_input_pkt().
+
+  private_data is a struct pinned_down_requeue_handle. The packet is
+  reparented to the ctdb context before it is processed, then the handle
+  is freed.
+*/
+static void pinned_down_requeue(struct event_context *ev, struct timed_event *te,
+                      struct timeval t, void *private_data)
+{
+       struct pinned_down_requeue_handle *requeue;
+       struct ctdb_req_header *pkt;
+       struct ctdb_context *ctdb;
+
+       requeue = talloc_get_type(private_data, struct pinned_down_requeue_handle);
+       ctdb = requeue->ctdb;
+       pkt  = requeue->hdr;
+
+       /* take the packet away from the handle so that freeing the
+          handle below does not free the packet as well */
+       talloc_steal(ctdb, pkt);
+       ctdb_call_input_pkt(ctdb, pkt);
+
+       talloc_free(requeue);
+}
+
+/*
+  talloc destructor for a deferred pinned down request.
+
+  Runs when the deferred call is freed and schedules the deferred packet
+  to be processed again via pinned_down_requeue() from a zero-delay
+  timed event.
+
+  Always returns 0 so the free proceeds. If the requeue cannot be set up
+  the packet is dropped: it is freed together with its talloc parent.
+*/
+static int pinned_down_destructor(struct pinned_down_deferred_call *pinned_down)
+{
+       struct ctdb_context *ctdb = pinned_down->ctdb;
+       struct pinned_down_requeue_handle *handle;
+
+       handle = talloc(ctdb, struct pinned_down_requeue_handle);
+       if (handle == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate requeue handle for deferred pinned down request\n"));
+               return 0;
+       }
+
+       handle->ctdb = ctdb;
+       handle->hdr  = pinned_down->hdr;
+
+       /* move the packet under the handle so it outlives pinned_down */
+       talloc_steal(handle, handle->hdr);
+
+       if (event_add_timed(ctdb->ev, handle, timeval_zero(), pinned_down_requeue, handle) == NULL) {
+               /* nothing would ever requeue or free the packet, so
+                  release handle and packet now */
+               DEBUG(DEBUG_ERR,("Failed to schedule requeue of deferred pinned down request\n"));
+               talloc_free(handle);
+       }
+
+       return 0;
+}
+
+/*
+  defer the request in hdr if the record identified by key is currently
+  pinned down.
+
+  returns 0 when the request was deferred; hdr is then owned by the
+  deferred call, which hangs off the record's pindown context and
+  requeues the packet from its destructor.
+  returns -1 when the record is not sticky, is not pinned down, or an
+  allocation failed; hdr is left untouched in that case.
+*/
+static int
+ctdb_defer_pinned_down_request(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_req_header *hdr)
+{
+       TALLOC_CTX *scratch = talloc_new(NULL);
+       struct pinned_down_deferred_call *deferred;
+       struct ctdb_sticky_record *record;
+       uint32_t *lookup;
+
+       /* tree key: word count (itself included) followed by the key
+          bytes, zero padded to a multiple of 4 bytes */
+       lookup = talloc_zero_size(scratch, ((key.dsize + 3) & 0xfffffffc) + 4);
+       if (lookup == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate key for sticky record\n"));
+               talloc_free(scratch);
+               return -1;
+       }
+
+       lookup[0] = (key.dsize + 3) / 4 + 1;
+       memcpy(&lookup[1], key.dptr, key.dsize);
+
+       record = trbt_lookuparray32(ctdb_db->sticky_records, lookup[0], &lookup[0]);
+       talloc_free(scratch);
+
+       /* only records that are sticky AND pinned down get deferred */
+       if (record == NULL || record->pindown == NULL) {
+               return -1;
+       }
+
+       deferred = talloc(record->pindown, struct pinned_down_deferred_call);
+       if (deferred == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate structure for deferred pinned down request\n"));
+               return -1;
+       }
+
+       deferred->ctdb = ctdb;
+       deferred->hdr  = hdr;
+
+       talloc_set_destructor(deferred, pinned_down_destructor);
+       talloc_steal(deferred, hdr);
+
+       return 0;
+}
+
+/*
+  record key in the per-database hot key statistics if its hopcount is
+  larger than the smallest count currently tracked.
+
+  The table holds up to MAX_HOT_KEYS entries and the entry with the
+  smallest count is kept at index 0. A known key only has its count
+  raised; an unknown key takes the next free slot or, when the table is
+  full, replaces the entry at index 0.
+*/
+static void
+ctdb_update_db_stat_hot_keys(struct ctdb_db_context *ctdb_db, TDB_DATA key, int hopcount)
+{
+       int i, id;
+       void *keydup;
+
+       /* smallest value is always at index 0 */
+       if (hopcount <= ctdb_db->statistics.hot_keys[0].count) {
+               return;
+       }
+
+       /* see if we already know this key */
+       for (i = 0; i < MAX_HOT_KEYS; i++) {
+               if (key.dsize != ctdb_db->statistics.hot_keys[i].key.dsize) {
+                       continue;
+               }
+               if (memcmp(key.dptr, ctdb_db->statistics.hot_keys[i].key.dptr, key.dsize)) {
+                       continue;
+               }
+               /* found an entry for this key */
+               if (hopcount <= ctdb_db->statistics.hot_keys[i].count) {
+                       return;
+               }
+               ctdb_db->statistics.hot_keys[i].count = hopcount;
+               goto sort_keys;
+       }
+
+       /* copy the key before touching the table, so a failed
+          allocation cannot leave a slot with a NULL pointer and a
+          non-zero size */
+       keydup = talloc_memdup(ctdb_db, key.dptr, key.dsize);
+       if (keydup == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate hot key for database=%s\n",
+                                ctdb_db->db_name));
+               return;
+       }
+
+       if (ctdb_db->statistics.num_hot_keys < MAX_HOT_KEYS) {
+               id = ctdb_db->statistics.num_hot_keys;
+               ctdb_db->statistics.num_hot_keys++;
+       } else {
+               /* table is full: evict the entry with the smallest count */
+               id = 0;
+       }
+
+       if (ctdb_db->statistics.hot_keys[id].key.dptr != NULL) {
+               talloc_free(ctdb_db->statistics.hot_keys[id].key.dptr);
+       }
+       ctdb_db->statistics.hot_keys[id].key.dsize = key.dsize;
+       ctdb_db->statistics.hot_keys[id].key.dptr  = keydup;
+       ctdb_db->statistics.hot_keys[id].count = hopcount;
+       DEBUG(DEBUG_NOTICE,("Updated hot key database=%s key=0x%08x id=%d hop_count=%d\n",
+                           ctdb_db->db_name, ctdb_hash(&key), id, hopcount));
+
+sort_keys:
+       /* one pass that moves the smallest non-zero count to index 0;
+          slots with a count of 0 are unused and skipped */
+       for (i = 1; i < MAX_HOT_KEYS; i++) {
+               if (ctdb_db->statistics.hot_keys[i].count == 0) {
+                       continue;
+               }
+               if (ctdb_db->statistics.hot_keys[i].count < ctdb_db->statistics.hot_keys[0].count) {
+                       hopcount = ctdb_db->statistics.hot_keys[i].count;
+                       ctdb_db->statistics.hot_keys[i].count = ctdb_db->statistics.hot_keys[0].count;
+                       ctdb_db->statistics.hot_keys[0].count = hopcount;
+
+                       key = ctdb_db->statistics.hot_keys[i].key;
+                       ctdb_db->statistics.hot_keys[i].key = ctdb_db->statistics.hot_keys[0].key;
+                       ctdb_db->statistics.hot_keys[0].key = key;
+               }
+       }
+}
+
+/*
+  called when a CTDB_REQ_CALL packet comes in
+
+  Depending on the state of the record the request is, in this order:
+   - deferred, while the record is pinned down or a read-only revoke
+     is in progress
+   - redirected, when we are not the dmaster and hold no delegations
+   - answered with a read-only delegation
+   - answered by migrating the record to the calling node
+   - answered by running the call locally and sending a CTDB_REPLY_CALL
+
+  The record chainlock taken by ctdb_ltdb_lock_fetch_requeue() is
+  released on every path out of this function.
+*/
+void ctdb_request_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+       struct ctdb_req_call *c = (struct ctdb_req_call *)hdr;
+       TDB_DATA data;
+       struct ctdb_reply_call *r;
+       int ret, len;
+       struct ctdb_ltdb_header header;
+       struct ctdb_call *call;
+       struct ctdb_db_context *ctdb_db;
+       int tmp_count, bucket;
+
+       if (ctdb->methods == NULL) {
+               DEBUG(DEBUG_INFO,(__location__ " Failed ctdb_request_call. Transport is DOWN\n"));
+               return;
+       }
+
+
+       ctdb_db = find_ctdb_db(ctdb, c->db_id);
+       if (!ctdb_db) {
+               ctdb_send_error(ctdb, hdr, -1,
+                               "Unknown database in request. db_id==0x%08x",
+                               c->db_id);
+               return;
+       }
+
+       call = talloc(hdr, struct ctdb_call);
+       CTDB_NO_MEMORY_FATAL(ctdb, call);
+
+       /* key and call data point straight into the request packet */
+       call->call_id  = c->callid;
+       call->key.dptr = c->data;
+       call->key.dsize = c->keylen;
+       call->call_data.dptr = c->data + c->keylen;
+       call->call_data.dsize = c->calldatalen;
+       call->reply_data.dptr  = NULL;
+       call->reply_data.dsize = 0;
+
+
+       /* If this record is pinned down we should defer the
+          request until the pindown times out
+       */
+       if (ctdb_db->sticky) {
+               if (ctdb_defer_pinned_down_request(ctdb, ctdb_db, call->key, hdr) == 0) {
+                       DEBUG(DEBUG_WARNING,
+                             ("Defer request for pinned down record in %s\n", ctdb_db->db_name));
+                       talloc_free(call);
+                       return;
+               }
+       }
+
+
+       /* determine if we are the dmaster for this key. This also
+          fetches the record data (if any), thus avoiding a 2nd fetch of the data 
+          if the call will be answered locally */
+
+       ret = ctdb_ltdb_lock_fetch_requeue(ctdb_db, call->key, &header, hdr, &data,
+                                          ctdb_call_input_pkt, ctdb, false);
+       if (ret == -1) {
+               ctdb_send_error(ctdb, hdr, ret, "ltdb fetch failed in ctdb_request_call");
+               talloc_free(call);
+               return;
+       }
+       if (ret == -2) {
+               DEBUG(DEBUG_INFO,(__location__ " deferred ctdb_request_call\n"));
+               talloc_free(call);
+               return;
+       }
+
+       /* Dont do READONLY if we dont have a tracking database */
+       if ((c->flags & CTDB_WANT_READONLY) && !ctdb_db->readonly) {
+               c->flags &= ~CTDB_WANT_READONLY;
+       }
+
+       if (header.flags & CTDB_REC_RO_REVOKE_COMPLETE) {
+               header.flags &= ~CTDB_REC_RO_FLAGS;
+               CTDB_INCREMENT_STAT(ctdb, total_ro_revokes);
+               CTDB_INCREMENT_DB_STAT(ctdb_db, db_ro_revokes);
+               if (ctdb_ltdb_store(ctdb_db, call->key, &header, data) != 0) {
+                       ctdb_fatal(ctdb, "Failed to write header with cleared REVOKE flag");
+               }
+               /* and clear out the tracking data */
+               if (tdb_delete(ctdb_db->rottdb, call->key) != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to clear out trackingdb record\n"));
+               }
+       }
+
+       /* if we are revoking, we must defer all other calls until the revoke
+        * had completed.
+        */
+       if (header.flags & CTDB_REC_RO_REVOKING_READONLY) {
+               talloc_free(data.dptr);
+               ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+               }
+
+               if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, call->key, hdr, ctdb_call_input_pkt, ctdb) != 0) {
+                       ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
+               }
+               talloc_free(call);
+               return;
+       }
+
+       /*
+        * If we are not the dmaster and are not hosting any delegations,
+        * then we redirect the request to the node than can answer it
+        * (the lmaster or the dmaster).
+        */
+       if ((header.dmaster != ctdb->pnn) 
+           && (!(header.flags & CTDB_REC_RO_HAVE_DELEGATIONS)) ) {
+               talloc_free(data.dptr);
+               ctdb_call_send_redirect(ctdb, ctdb_db, call->key, c, &header);
+
+               ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+               }
+               talloc_free(call);
+               return;
+       }
+
+       /* a request that does not want a read-only copy, for a record
+        * that has read-only copies handed out: start revoking them and
+        * defer this request until the revoke has completed
+        */
+       if ( (!(c->flags & CTDB_WANT_READONLY))
+       && (header.flags & (CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY)) ) {
+               header.flags   |= CTDB_REC_RO_REVOKING_READONLY;
+               if (ctdb_ltdb_store(ctdb_db, call->key, &header, data) != 0) {
+                       ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set");
+               }
+               ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+               }
+
+               if (ctdb_start_revoke_ro_record(ctdb, ctdb_db, call->key, &header, data) != 0) {
+                       ctdb_fatal(ctdb, "Failed to start record revoke");
+               }
+               talloc_free(data.dptr);
+
+               if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, call->key, hdr, ctdb_call_input_pkt, ctdb) != 0) {
+                       ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
+               }
+               talloc_free(call);
+
+               return;
+       }               
+
+       /* If this is the first request for delegation. bump rsn and set
+        * the delegations flag
+        */
+       if ((c->flags & CTDB_WANT_READONLY)
+       &&  (c->callid == CTDB_FETCH_WITH_HEADER_FUNC)
+       &&  (!(header.flags & CTDB_REC_RO_HAVE_DELEGATIONS))) {
+               header.rsn     += 3;
+               header.flags   |= CTDB_REC_RO_HAVE_DELEGATIONS;
+               if (ctdb_ltdb_store(ctdb_db, call->key, &header, data) != 0) {
+                       ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set");
+               }
+       }
+
+       /* hand out a read-only delegation: remember the requesting node
+        * in the tracking database and reply with header + data
+        */
+       if ((c->flags & CTDB_WANT_READONLY) 
+       &&  (call->call_id == CTDB_FETCH_WITH_HEADER_FUNC)) {
+               TDB_DATA tdata;
+
+               tdata = tdb_fetch(ctdb_db->rottdb, call->key);
+               if (ctdb_trackingdb_add_pnn(ctdb, &tdata, c->hdr.srcnode) != 0) {
+                       ctdb_fatal(ctdb, "Failed to add node to trackingdb");
+               }
+               if (tdb_store(ctdb_db->rottdb, call->key, tdata, TDB_REPLACE) != 0) {
+                       ctdb_fatal(ctdb, "Failed to store trackingdb data");
+               }
+               free(tdata.dptr);
+
+               ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+               }
+
+               len = offsetof(struct ctdb_reply_call, data) + data.dsize + sizeof(struct ctdb_ltdb_header);
+               r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REPLY_CALL, len, 
+                                           struct ctdb_reply_call);
+               CTDB_NO_MEMORY_FATAL(ctdb, r);
+               r->hdr.destnode  = c->hdr.srcnode;
+               r->hdr.reqid     = c->hdr.reqid;
+               r->status        = 0;
+               r->datalen       = data.dsize + sizeof(struct ctdb_ltdb_header);
+
+               /* the header sent to the requester differs from the one
+                  stored locally: lower rsn, HAVE_READONLY instead of
+                  HAVE_DELEGATIONS */
+               header.rsn      -= 2;
+               header.flags   |= CTDB_REC_RO_HAVE_READONLY;
+               header.flags   &= ~CTDB_REC_RO_HAVE_DELEGATIONS;
+               memcpy(&r->data[0], &header, sizeof(struct ctdb_ltdb_header));
+
+               if (data.dsize) {
+                       memcpy(&r->data[sizeof(struct ctdb_ltdb_header)], data.dptr, data.dsize);
+               }
+
+               ctdb_queue_packet(ctdb, &r->hdr);
+               CTDB_INCREMENT_STAT(ctdb, total_ro_delegations);
+               CTDB_INCREMENT_DB_STAT(ctdb_db, db_ro_delegations);
+
+               talloc_free(r);
+               talloc_free(call);
+               return;
+       }
+
+       /* hop count statistics: each bucket covers two more bits of
+          hopcount, the last bucket takes everything above */
+       CTDB_UPDATE_STAT(ctdb, max_hop_count, c->hopcount);
+       tmp_count = c->hopcount;
+       bucket = 0;
+       while (tmp_count) {
+               tmp_count >>= 2;
+               bucket++;
+       }
+       if (bucket >= MAX_COUNT_BUCKETS) {
+               bucket = MAX_COUNT_BUCKETS - 1;
+       }
+       CTDB_INCREMENT_STAT(ctdb, hop_count_bucket[bucket]);
+       CTDB_INCREMENT_DB_STAT(ctdb_db, hop_count_bucket[bucket]);
+       ctdb_update_db_stat_hot_keys(ctdb_db, call->key, c->hopcount);
+
+       /* If this database supports sticky records, then check if the
+          hopcount is big. If it is it means the record is hot and we
+          should make it sticky.
+       */
+       if (ctdb_db->sticky && c->hopcount >= ctdb->tunable.hopcount_make_sticky) {
+               ctdb_make_record_sticky(ctdb, ctdb_db, call->key);
+       }
+
+
+       /* Try if possible to migrate the record off to the caller node.
+        * From the clients perspective a fetch of the data is just as 
+        * expensive as a migration.
+        */
+       if (c->hdr.srcnode != ctdb->pnn) {
+               if (ctdb_db->persistent_state) {
+                       /* the key is binary data without a terminating
+                          NUL, so log its hash rather than printing it
+                          as a string */
+                       DEBUG(DEBUG_INFO, (__location__ " refusing migration"
+                             " of key %08x while transaction is active\n",
+                             ctdb_hash(&(call->key))));
+               } else {
+                       DEBUG(DEBUG_DEBUG,("pnn %u starting migration of %08x to %u\n",
+                                ctdb->pnn, ctdb_hash(&(call->key)), c->hdr.srcnode));
+                       ctdb_call_send_dmaster(ctdb_db, c, &header, &(call->key), &data);
+               }
+
+               /* both branches still hold the chainlock and the fetched
+                  record data, release them before returning */
+               talloc_free(data.dptr);
+
+               ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+               }
+               talloc_free(call);
+               return;
+       }
+
+       /* the request came from this node: run the call locally */
+       ret = ctdb_call_local(ctdb_db, call, &header, hdr, &data, true);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_call_local failed\n"));
+               call->status = -1;
+       }
+
+       ret = ctdb_ltdb_unlock(ctdb_db, call->key);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+       }
+
+       len = offsetof(struct ctdb_reply_call, data) + call->reply_data.dsize;
+       r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REPLY_CALL, len, 
+                                   struct ctdb_reply_call);
+       CTDB_NO_MEMORY_FATAL(ctdb, r);
+       r->hdr.destnode  = hdr->srcnode;
+       r->hdr.reqid     = hdr->reqid;
+       r->status        = call->status;
+       r->datalen       = call->reply_data.dsize;
+       if (call->reply_data.dsize) {
+               memcpy(&r->data[0], call->reply_data.dptr, call->reply_data.dsize);
+       }
+
+       ctdb_queue_packet(ctdb, &r->hdr);
+
+       talloc_free(r);
+       talloc_free(call);
+}
+
+/**
+ * called when a CTDB_REPLY_CALL packet comes in
+ *
+ * This packet comes in response to a CTDB_REQ_CALL request packet. It
+ * contains any reply data from the call
+ *
+ * For a reply to CTDB_FETCH_WITH_HEADER_FUNC whose record header has
+ * CTDB_REC_RO_HAVE_READONLY set, the record is also stored locally,
+ * provided its rsn is newer than that of the local copy.
+ *
+ * The packet is reparented to the call state, which is then marked
+ * CTDB_CALL_DONE and has its completion callback invoked.
+ */
+void ctdb_reply_call(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+       struct ctdb_reply_call *c = (struct ctdb_reply_call *)hdr;
+       struct ctdb_call_state *state;
+
+       state = ctdb_reqid_find(ctdb, hdr->reqid, struct ctdb_call_state);
+       if (state == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " reqid %u not found\n", hdr->reqid));
+               return;
+       }
+
+       if (hdr->reqid != state->reqid) {
+               /* we found a record  but it was the wrong one */
+               DEBUG(DEBUG_ERR, ("Dropped orphaned call reply with reqid:%u\n",hdr->reqid));
+               return;
+       }
+
+
+       /* read only delegation processing */
+       /* If we got a FETCH_WITH_HEADER we should check if this is a ro
+        * delegation since we may need to update the record header
+        */
+       if (state->c->callid == CTDB_FETCH_WITH_HEADER_FUNC) {
+               struct ctdb_db_context *ctdb_db = state->ctdb_db;
+               struct ctdb_ltdb_header *header = (struct ctdb_ltdb_header *)&c->data[0];
+               struct ctdb_ltdb_header oldheader;
+               TDB_DATA key, data, olddata;
+               int ret;
+
+               /* make sure the reply really carries a record header
+                * before any field of it is looked at
+                */
+               if (c->datalen < sizeof(struct ctdb_ltdb_header)) {
+                       DEBUG(DEBUG_ERR,(__location__ " Got FETCH_WITH_HEADER reply with too little data: %d bytes\n", c->datalen));
+                       goto finished_ro;
+               }
+
+               if (!(header->flags & CTDB_REC_RO_HAVE_READONLY)) {
+                       goto finished_ro;
+               }
+
+               key.dsize = state->c->keylen;
+               key.dptr  = state->c->data;
+               ret = ctdb_ltdb_lock_requeue(ctdb_db, key, hdr,
+                                    ctdb_call_input_pkt, ctdb, false);
+               if (ret == -2) {
+                       /* presumably the packet was requeued and will be
+                        * processed again once the lock is available
+                        */
+                       return;
+               }
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to get lock in ctdb_reply_call\n"));
+                       return;
+               }
+
+               ret = ctdb_ltdb_fetch(ctdb_db, key, &oldheader, state, &olddata);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, ("Failed to fetch old record in ctdb_reply_call\n"));
+                       ctdb_ltdb_unlock(ctdb_db, key);
+                       goto finished_ro;
+               }                       
+
+               /* only replace the local copy with a newer record */
+               if (header->rsn <= oldheader.rsn) {
+                       ctdb_ltdb_unlock(ctdb_db, key);
+                       goto finished_ro;
+               }
+
+               /* the record data follows the header in the reply */
+               data.dsize = c->datalen - sizeof(struct ctdb_ltdb_header);
+               data.dptr  = &c->data[sizeof(struct ctdb_ltdb_header)];
+               ret = ctdb_ltdb_store(ctdb_db, key, header, data);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, ("Failed to store new record in ctdb_reply_call\n"));
+                       ctdb_ltdb_unlock(ctdb_db, key);
+                       goto finished_ro;
+               }                       
+
+               ctdb_ltdb_unlock(ctdb_db, key);
+       }
+finished_ro:
+
+       state->call->reply_data.dptr = c->data;
+       state->call->reply_data.dsize = c->datalen;
+       state->call->status = c->status;
+
+       /* reply_data points into the packet, so keep the packet alive
+        * for as long as the call state
+        */
+       talloc_steal(state, c);
+
+       state->state = CTDB_CALL_DONE;
+       if (state->async.fn) {
+               state->async.fn(state);
+       }
+}
+
+
+/**
+ * called when a CTDB_REPLY_DMASTER packet comes in
+ *
+ * This packet comes in from the lmaster in response to a CTDB_REQ_CALL
+ * request packet. It means that the current dmaster wants to give us
+ * the dmaster role.
+ *
+ * Payload layout as read by this function: c->data holds the key
+ * (c->keylen bytes), followed by the record data (c->datalen bytes),
+ * optionally followed by a uint32_t of record flags. The flags are only
+ * read when hdr.length is large enough to contain them; otherwise
+ * record_flags stays 0.
+ *
+ * NOTE(review): keylen/datalen themselves are not checked against
+ * hdr.length here - presumably validated before dispatch; confirm.
+ */
+void ctdb_reply_dmaster(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+       struct ctdb_reply_dmaster *c = (struct ctdb_reply_dmaster *)hdr;
+       struct ctdb_db_context *ctdb_db;
+       TDB_DATA key, data;
+       uint32_t record_flags = 0;
+       size_t len;
+       int ret;
+
+       ctdb_db = find_ctdb_db(ctdb, c->db_id);
+       if (ctdb_db == NULL) {
+               DEBUG(DEBUG_ERR,("Unknown db_id 0x%x in ctdb_reply_dmaster\n", c->db_id));
+               return;
+       }
+       
+       /* key and data are views into the packet, not copies */
+       key.dptr = c->data;
+       key.dsize = c->keylen;
+       data.dptr = &c->data[key.dsize];
+       data.dsize = c->datalen;
+       /* total packet size needed for the trailing flags word to be present */
+       len = offsetof(struct ctdb_reply_dmaster, data) + key.dsize + data.dsize
+               + sizeof(uint32_t);
+       if (len <= c->hdr.length) {
+               record_flags = *(uint32_t *)&c->data[c->keylen + c->datalen];
+       }
+
+       /*
+        * A return of -2 is treated as "nothing more to do now" and is not
+        * logged - presumably the packet was requeued for
+        * ctdb_call_input_pkt once the lock is available; confirm against
+        * ctdb_ltdb_lock_requeue.
+        */
+       ret = ctdb_ltdb_lock_requeue(ctdb_db, key, hdr,
+                                    ctdb_call_input_pkt, ctdb, false);
+       if (ret == -2) {
+               return;
+       }
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to get lock in ctdb_reply_dmaster\n"));
+               return;
+       }
+
+       /*
+        * NOTE(review): the record lock is not released in this function;
+        * presumably ctdb_become_dmaster unlocks it - confirm.
+        */
+       ctdb_become_dmaster(ctdb_db, hdr, key, data, c->rsn, record_flags);
+}
+
+
+/*
+  handle an incoming CTDB_REPLY_ERROR packet
+
+  Finds the pending call registered under the reqid in the header, takes
+  ownership of the packet, marks the call as failed and invokes the
+  call's completion callback when one is set.
+*/
+void ctdb_reply_error(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+       struct ctdb_reply_error *reply = (struct ctdb_reply_error *)hdr;
+       struct ctdb_call_state *call_state;
+
+       call_state = ctdb_reqid_find(ctdb, hdr->reqid, struct ctdb_call_state);
+       if (call_state == NULL) {
+               DEBUG(DEBUG_ERR,("pnn %u Invalid reqid %u in ctdb_reply_error\n",
+                        ctdb->pnn, hdr->reqid));
+               return;
+       }
+
+       /* the lookup gave us a state, but it belongs to a different request */
+       if (call_state->reqid != hdr->reqid) {
+               DEBUG(DEBUG_ERR, ("Dropped orphaned error reply with reqid:%u\n",hdr->reqid));
+               return;
+       }
+
+       /* errmsg will point into the packet, so the state must own it */
+       talloc_steal(call_state, reply);
+
+       call_state->state  = CTDB_CALL_ERROR;
+       call_state->errmsg = (char *)reply->msg;
+
+       if (call_state->async.fn == NULL) {
+               return;
+       }
+       call_state->async.fn(call_state);
+}
+
+
+/*
+  talloc destructor for a ctdb_call_state: unlink it from the
+  context's pending_calls list and release its reqid
+*/
+static int ctdb_call_destructor(struct ctdb_call_state *state)
+{
+       struct ctdb_context *ctdb = state->ctdb_db->ctdb;
+
+       DLIST_REMOVE(ctdb->pending_calls, state);
+       ctdb_reqid_remove(ctdb, state->reqid);
+
+       /* 0 lets the free proceed */
+       return 0;
+}
+
+
+/*
+  resend a single pending ctdb_call after a reconfigure event
+
+  The request gets a fresh reqid and the current vnn_map generation and
+  is queued to this node, from where it is redirected as needed.
+*/
+static void ctdb_call_resend(struct ctdb_call_state *state)
+{
+       struct ctdb_context *ctdb = state->ctdb_db->ctdb;
+       struct ctdb_req_call *pkt = state->c;
+
+       state->generation = ctdb->vnn_map->generation;
+
+       /* a new reqid, so a late reply to the old one does not match this call */
+       ctdb_reqid_remove(ctdb, state->reqid);
+       state->reqid = ctdb_reqid_new(ctdb, state);
+
+       /* stamp the packet: new reqid, new generation, addressed to ourselves */
+       pkt->hdr.reqid      = state->reqid;
+       pkt->hdr.generation = state->generation;
+       pkt->hdr.destnode   = ctdb->pnn;
+
+       ctdb_queue_packet(ctdb, &pkt->hdr);
+       DEBUG(DEBUG_NOTICE,("resent ctdb_call\n"));
+}
+
+/*
+  walk the pending_calls list and resend every call on it
+  (used on recovery)
+ */
+void ctdb_call_resend_all(struct ctdb_context *ctdb)
+{
+       struct ctdb_call_state *cur = ctdb->pending_calls;
+
+       while (cur != NULL) {
+               /* read the successor before handing the entry to the resend */
+               struct ctdb_call_state *following = cur->next;
+
+               ctdb_call_resend(cur);
+               cur = following;
+       }
+}
+
+/*
+  timed event handler for a locally processed call: runs the call's
+  async.fn, if the caller installed one after the send
+*/
+static void call_local_trigger(struct event_context *ev, struct timed_event *te,
+                      struct timeval t, void *private_data)
+{
+       struct ctdb_call_state *call_state;
+
+       call_state = talloc_get_type(private_data, struct ctdb_call_state);
+       if (call_state->async.fn == NULL) {
+               return;
+       }
+       call_state->async.fn(call_state);
+}
+
+
+/*
+  construct an event driven local ctdb_call
+
+  this is used so that locally processed ctdb_call requests are processed
+  in an event driven manner
+
+  The call itself is executed immediately through ctdb_call_local(); only
+  the completion callback is deferred, via a zero-timeout timed event, so
+  the caller can set async.fn on the returned state before it fires.
+
+  data->dptr is re-parented onto the new state with talloc_steal().
+
+  Returns the new call state. CTDB_NO_MEMORY_NULL is expected to return
+  NULL from this function when an allocation fails.
+*/
+struct ctdb_call_state *ctdb_call_local_send(struct ctdb_db_context *ctdb_db, 
+                                            struct ctdb_call *call,
+                                            struct ctdb_ltdb_header *header,
+                                            TDB_DATA *data)
+{
+       struct ctdb_call_state *state;
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+       int ret;
+
+       state = talloc_zero(ctdb_db, struct ctdb_call_state);
+       CTDB_NO_MEMORY_NULL(ctdb, state);
+
+       /* the record data now lives as long as the call state */
+       talloc_steal(state, data->dptr);
+
+       /* a local call is complete as soon as ctdb_call_local() has run */
+       state->state = CTDB_CALL_DONE;
+       state->call  = talloc(state, struct ctdb_call);
+       /*
+        * NOTE(review): if this allocation fails, state (which already owns
+        * data->dptr) is not freed before returning - looks like a leak on
+        * the out-of-memory path; confirm.
+        */
+       CTDB_NO_MEMORY_NULL(ctdb, state->call);
+       *(state->call) = *call;
+       state->ctdb_db = ctdb_db;
+
+       /* a failure here is only logged; the state is still returned */
+       ret = ctdb_call_local(ctdb_db, state->call, header, state, data, true);
+       if (ret != 0) {
+               DEBUG(DEBUG_DEBUG,("ctdb_call_local() failed, ignoring return code %d\n", ret));
+       }
+
+       /* the event is a talloc child of state; its result is not checked */
+       event_add_timed(ctdb->ev, state, timeval_zero(), call_local_trigger, state);
+
+       return state;
+}
+
+
+/*
+  make a remote ctdb call - async send. Called in daemon context.
+
+  This constructs a ctdb_call request and queues it for processing.
+  This call never blocks.
+
+  The request packet carries the key followed by the call data. The
+  state's private copy of the call has its key and call_data pointers
+  redirected into that packet, so they stay valid for the life of the
+  state. The state is added to ctdb->pending_calls.
+
+  Returns the new call state, or NULL when the transport is down.
+  CTDB_NO_MEMORY_NULL is expected to return NULL on allocation failure.
+*/
+struct ctdb_call_state *ctdb_daemon_call_send_remote(struct ctdb_db_context *ctdb_db, 
+                                                    struct ctdb_call *call, 
+                                                    struct ctdb_ltdb_header *header)
+{
+       uint32_t len;
+       struct ctdb_call_state *state;
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+
+       if (ctdb->methods == NULL) {
+               DEBUG(DEBUG_INFO,(__location__ " Failed send packet. Transport is down\n"));
+               return NULL;
+       }
+
+       state = talloc_zero(ctdb_db, struct ctdb_call_state);
+       CTDB_NO_MEMORY_NULL(ctdb, state);
+       state->call = talloc(state, struct ctdb_call);
+       /*
+        * NOTE(review): on this and the later allocation failure the state
+        * is not freed before returning - looks like a leak on the
+        * out-of-memory path; confirm.
+        */
+       CTDB_NO_MEMORY_NULL(ctdb, state->call);
+
+       state->reqid = ctdb_reqid_new(ctdb, state);
+       state->ctdb_db = ctdb_db;
+       /* from here on, freeing state unlinks it and releases the reqid */
+       talloc_set_destructor(state, ctdb_call_destructor);
+
+       /* packet = fixed header + key bytes + call data bytes */
+       len = offsetof(struct ctdb_req_call, data) + call->key.dsize + call->call_data.dsize;
+       state->c = ctdb_transport_allocate(ctdb, state, CTDB_REQ_CALL, len, 
+                                          struct ctdb_req_call);
+       CTDB_NO_MEMORY_NULL(ctdb, state->c);
+       /* first hop is the node the header names as dmaster */
+       state->c->hdr.destnode  = header->dmaster;
+
+       /* this limits us to 16k outstanding messages - not unreasonable */
+       state->c->hdr.reqid     = state->reqid;
+       state->c->flags         = call->flags;
+       state->c->db_id         = ctdb_db->db_id;
+       state->c->callid        = call->call_id;
+       state->c->hopcount      = 0;
+       state->c->keylen        = call->key.dsize;
+       state->c->calldatalen   = call->call_data.dsize;
+       memcpy(&state->c->data[0], call->key.dptr, call->key.dsize);
+       memcpy(&state->c->data[call->key.dsize], 
+              call->call_data.dptr, call->call_data.dsize);
+       /* copy the call, then point its buffers at the packet's copies */
+       *(state->call)              = *call;
+       state->call->call_data.dptr = &state->c->data[call->key.dsize];
+       state->call->key.dptr       = &state->c->data[0];
+
+       state->state  = CTDB_CALL_WAIT;
+       state->generation = ctdb->vnn_map->generation;
+
+       DLIST_ADD(ctdb->pending_calls, state);
+
+       ctdb_queue_packet(ctdb, &state->c->hdr);
+
+       return state;
+}
+
+/*
+  make a remote ctdb call - async recv - called in daemon context
+
+  This is called when the program wants to wait for a ctdb_call to complete and get the
+  results. This call will block (running the event loop) unless the call has
+  already completed.
+
+  On success the reply data is duplicated onto 'call' (talloc parent: call),
+  call->status is filled in and 0 is returned. On failure the ctdb error
+  string is set and -1 is returned. In both cases 'state' is freed and must
+  not be used by the caller afterwards.
+*/
+int ctdb_daemon_call_recv(struct ctdb_call_state *state, struct ctdb_call *call)
+{
+       /* states below CTDB_CALL_DONE are still in flight */
+       while (state->state < CTDB_CALL_DONE) {
+               event_loop_once(state->ctdb_db->ctdb->ev);
+       }
+       if (state->state != CTDB_CALL_DONE) {
+               ctdb_set_error(state->ctdb_db->ctdb, "%s", state->errmsg);
+               talloc_free(state);
+               return -1;
+       }
+
+       if (state->call->reply_data.dsize) {
+               call->reply_data.dptr = talloc_memdup(call,
+                                                     state->call->reply_data.dptr,
+                                                     state->call->reply_data.dsize);
+               if (call->reply_data.dptr == NULL) {
+                       /*
+                        * do not hand back a NULL pointer together with a
+                        * non-zero length
+                        */
+                       call->reply_data.dsize = 0;
+                       ctdb_set_error(state->ctdb_db->ctdb, "%s",
+                                      "Out of memory copying reply data in ctdb_daemon_call_recv");
+                       talloc_free(state);
+                       return -1;
+               }
+               call->reply_data.dsize = state->call->reply_data.dsize;
+       } else {
+               call->reply_data.dptr = NULL;
+               call->reply_data.dsize = 0;
+       }
+       call->status = state->call->status;
+       talloc_free(state);
+       return 0;
+}
+
+
+/*
+   build a CTDB_REQ_KEEPALIVE packet addressed to destnode, queue it
+   and bump the keepalive_packets_sent statistic.
+   Does nothing (apart from logging) when the transport is down.
+*/
+void ctdb_send_keepalive(struct ctdb_context *ctdb, uint32_t destnode)
+{
+       struct ctdb_req_keepalive *r;
+
+       if (ctdb->methods == NULL) {
+               DEBUG(DEBUG_INFO,(__location__ " Failed to send keepalive. Transport is DOWN\n"));
+               return;
+       }
+
+       r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REQ_KEEPALIVE,
+                                   sizeof(*r),
+                                   struct ctdb_req_keepalive);
+       CTDB_NO_MEMORY_FATAL(ctdb, r);
+
+       /* keepalives carry no request id */
+       r->hdr.reqid     = 0;
+       r->hdr.destnode  = destnode;
+
+       CTDB_INCREMENT_STAT(ctdb, keepalive_packets_sent);
+
+       ctdb_queue_packet(ctdb, &r->hdr);
+
+       /* our reference to the packet is no longer needed once queued */
+       talloc_free(r);
+}
+
+
+
+/*
+  a request that arrived while its record was being revoked.
+  It is allocated as a talloc child of the revokechild_handle and owns
+  the packet; its destructor schedules the requeue.
+*/
+struct revokechild_deferred_call {
+       struct ctdb_context *ctdb;
+       struct ctdb_req_header *hdr;    /* the deferred packet */
+       deferred_requeue_fn fn;         /* called as fn(ctx, hdr) to reprocess it */
+       void *ctx;                      /* first argument passed to fn */
+};
+
+/*
+  tracks one active revoke child process; linked into
+  ctdb_db->revokechild_active while the child is running
+*/
+struct revokechild_handle {
+       struct revokechild_handle *next, *prev; /* revokechild_active list links */
+       struct ctdb_context *ctdb;
+       struct ctdb_db_context *ctdb_db;
+       struct fd_event *fde;   /* read event on fd[0] */
+       int status;             /* 0 initially, set to -1 when the child reports failure */
+       int fd[2];              /* pipe from child to parent; -1 when not open */
+       pid_t child;            /* pid returned by ctdb_fork() */
+       TDB_DATA key;           /* private copy of the key being revoked */
+};
+
+/*
+  carries a deferred packet from deferred_call_destructor() to the
+  timed event (deferred_call_requeue) that finally reprocesses it
+*/
+struct revokechild_requeue_handle {
+       struct ctdb_context *ctdb;
+       struct ctdb_req_header *hdr;    /* packet to requeue; owned by this handle */
+       deferred_requeue_fn fn;         /* called as fn(ctx, hdr) */
+       void *ctx;                      /* first argument passed to fn */
+};
+
+/*
+  timed event handler: hand the deferred packet back to its requeue
+  function, then release the handle
+*/
+static void deferred_call_requeue(struct event_context *ev, struct timed_event *te,
+                      struct timeval t, void *private_data)
+{
+       struct revokechild_requeue_handle *rh;
+
+       rh = talloc_get_type(private_data, struct revokechild_requeue_handle);
+
+       rh->fn(rh->ctx, rh->hdr);
+       talloc_free(rh);
+}
+
+/*
+  talloc destructor for a deferred call.
+
+  Runs when the deferred call is freed (it is a talloc child of the
+  revokechild_handle). Moves the packet onto a requeue handle and
+  schedules deferred_call_requeue() to reprocess it.
+
+  Always returns 0 so the free proceeds. If the requeue handle cannot be
+  allocated the request is dropped: the packet is still a child of the
+  deferred call and is released together with it.
+*/
+static int deferred_call_destructor(struct revokechild_deferred_call *deferred_call)
+{
+       struct ctdb_context *ctdb = deferred_call->ctdb;
+       struct revokechild_requeue_handle *requeue_handle = talloc(ctdb, struct revokechild_requeue_handle);
+       struct ctdb_req_call *c = (struct ctdb_req_call *)deferred_call->hdr;
+
+       if (requeue_handle == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate requeue handle for deferred call, dropping request\n"));
+               return 0;
+       }
+
+       requeue_handle->ctdb = ctdb;
+       requeue_handle->hdr  = deferred_call->hdr;
+       requeue_handle->fn   = deferred_call->fn;
+       requeue_handle->ctx  = deferred_call->ctx;
+       /* the packet must outlive the deferred call that is being freed */
+       talloc_steal(requeue_handle, requeue_handle->hdr);
+
+       /* when revoking, any READONLY requests have 1 second grace to let read/write finish first */
+       event_add_timed(ctdb->ev, requeue_handle, timeval_current_ofs(c->flags & CTDB_WANT_READONLY ? 1 : 0, 0), deferred_call_requeue, requeue_handle);
+
+       return 0;
+}
+
+
+/*
+  talloc destructor for a revokechild_handle: drop the fd event, close
+  whichever pipe ends are still open, kill the child and unlink the
+  handle from the database's revokechild_active list
+*/
+static int revokechild_destructor(struct revokechild_handle *rc)
+{
+       int i;
+
+       if (rc->fde != NULL) {
+               talloc_free(rc->fde);
+       }
+
+       /* -1 marks a pipe end that is not open */
+       for (i = 0; i < 2; i++) {
+               if (rc->fd[i] != -1) {
+                       close(rc->fd[i]);
+               }
+       }
+
+       ctdb_kill(rc->ctdb, rc->child, SIGKILL);
+
+       DLIST_REMOVE(rc->ctdb_db->revokechild_active, rc);
+       return 0;
+}
+
+/*
+  fd event handler for the pipe from the revoke child.
+
+  Reads the one-byte status written by the child. A short read or a
+  non-zero byte is logged and recorded as status -1. The handle is freed
+  in every case.
+*/
+static void revokechild_handler(struct event_context *ev, struct fd_event *fde,
+                            uint16_t flags, void *private_data)
+{
+       struct revokechild_handle *rc = talloc_get_type(private_data,
+                                                    struct revokechild_handle);
+       char c;
+       int ret;
+
+       ret = read(rc->fd[0], &c, 1);
+       if (ret != 1) {
+               DEBUG(DEBUG_ERR,("Failed to read status from revokechild. errno:%d\n", errno));
+               rc->status = -1;
+       } else if (c != 0) {
+               DEBUG(DEBUG_ERR,("revokechild returned failure. status:%d\n", c));
+               rc->status = -1;
+       }
+
+       talloc_free(rc);
+}
+
+/*
+  bookkeeping for one run of ctdb_revoke_all_delegations()
+*/
+struct ctdb_revoke_state {
+       struct ctdb_db_context *ctdb_db;
+       TDB_DATA key;                           /* record being revoked */
+       struct ctdb_ltdb_header *header;        /* header sent with each update-record control */
+       TDB_DATA data;                          /* data sent with each update-record control */
+       int count;                              /* update-record controls still outstanding */
+       int status;                             /* 0, or -1 once any step failed or timed out */
+       int finished;                           /* set to 1 to end the wait loop */
+};
+
+/*
+  completion callback for one update-record control sent during a revoke.
+
+  Collects the control result, records a failure in the revoke state and
+  decrements the outstanding count; the revoke is flagged finished when
+  the count reaches zero.
+*/
+static void update_record_cb(struct ctdb_client_control_state *state)
+{
+       struct ctdb_revoke_state *revoke_state;
+       int32_t res;
+       int ret;
+
+       if (state == NULL) {
+               return;
+       }
+
+       /* fetch our context before the control state is consumed by recv */
+       revoke_state = state->async.private_data;
+       state->async.fn = NULL;
+
+       ret = ctdb_control_recv(state->ctdb, state, state, NULL, &res, NULL);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,("Recv for revoke update record failed ret:%d res:%d\n", ret, res));
+               revoke_state->status = -1;
+       }
+
+       revoke_state->count -= 1;
+       if (revoke_state->count > 0) {
+               return;
+       }
+       revoke_state->finished = 1;
+}
+
+/*
+  tracking-db traverse callback: send an update-record control (5 second
+  timeout) to node pnn and count it as outstanding. A send failure marks
+  the whole revoke as failed.
+*/
+static void revoke_send_cb(struct ctdb_context *ctdb, uint32_t pnn, void *private_data)
+{
+       struct ctdb_revoke_state *revoke_state = private_data;
+       struct ctdb_client_control_state *ctl;
+
+       ctl = ctdb_ctrl_updaterecord_send(ctdb, revoke_state,
+                                         timeval_current_ofs(5,0), pnn,
+                                         revoke_state->ctdb_db,
+                                         revoke_state->key,
+                                         revoke_state->header,
+                                         revoke_state->data);
+       if (ctl == NULL) {
+               DEBUG(DEBUG_ERR,("Failure to send update record to revoke readonly delegation\n"));
+               revoke_state->status = -1;
+               return;
+       }
+
+       revoke_state->count++;
+
+       ctl->async.private_data = revoke_state;
+       ctl->async.fn           = update_record_cb;
+}
+
+/*
+  timed event handler: the revoke took too long, so mark it failed and
+  finished to end the wait loop
+*/
+static void ctdb_revoke_timeout_handler(struct event_context *ev, struct timed_event *te,
+                             struct timeval yt, void *private_data)
+{
+       struct ctdb_revoke_state *revoke_state = private_data;
+
+       DEBUG(DEBUG_ERR,("Timed out waiting for revoke to finish\n"));
+       revoke_state->status   = -1;
+       revoke_state->finished = 1;
+}
+
+/*
+  revoke every read-only delegation of a record (runs in the revoke child).
+
+  Sends an update-record control to each node found by traversing the
+  tracking data 'tdata', then runs the event loop until all controls have
+  completed or a 5 second timeout fires. If everything succeeded, the local
+  record is re-read under the chain lock, sanity checked (rsn and revoke
+  flags) and stored back with a bumped rsn and CTDB_REC_RO_REVOKE_COMPLETE
+  set.
+
+  Note that header->rsn is incremented as a side effect on the success path.
+
+  Returns 0 on success, -1 on any failure.
+*/
+static int ctdb_revoke_all_delegations(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA tdata, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data)
+{
+       struct ctdb_revoke_state *state = talloc_zero(ctdb, struct ctdb_revoke_state);
+       int status;
+
+       if (state == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate revoke state in revokechild\n"));
+               return -1;
+       }
+
+       state->ctdb_db = ctdb_db;
+       state->key     = key;
+       state->header  = header;
+       state->data    = data;
+       /* one update-record control per node recorded in the tracking data */
+       ctdb_trackingdb_traverse(ctdb, tdata, revoke_send_cb, state);
+
+       event_add_timed(ctdb->ev, state, timeval_current_ofs(5, 0), ctdb_revoke_timeout_handler, state);
+
+       /* finished is set by update_record_cb or by the timeout handler */
+       while (state->finished == 0) {
+               event_loop_once(ctdb->ev);
+       }
+
+       status = state->status;
+
+       if (status == 0) {
+               struct ctdb_ltdb_header new_header;
+               TDB_DATA new_data;
+
+               if (ctdb_ltdb_lock(ctdb_db, key) != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to chainlock the database in revokechild\n"));
+                       talloc_free(state);
+                       return -1;
+               }
+               if (ctdb_ltdb_fetch(ctdb_db, key, &new_header, state, &new_data) != 0) {
+                       ctdb_ltdb_unlock(ctdb_db, key);
+                       DEBUG(DEBUG_ERR,("Failed for fetch tdb record in revokechild\n"));
+                       talloc_free(state);
+                       return -1;
+               }
+               /* the stored record must not be newer than what we revoked */
+               header->rsn++;
+               if (new_header.rsn > header->rsn) {
+                       ctdb_ltdb_unlock(ctdb_db, key);
+                       DEBUG(DEBUG_ERR,("RSN too high in tdb record in revokechild\n"));
+                       talloc_free(state);
+                       return -1;
+               }
+               /* both REVOKING_READONLY and HAVE_DELEGATIONS must still be set */
+               if ( (new_header.flags & (CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_HAVE_DELEGATIONS)) != (CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_HAVE_DELEGATIONS) ) {
+                       ctdb_ltdb_unlock(ctdb_db, key);
+                       DEBUG(DEBUG_ERR,("Flags are wrong in tdb record in revokechild\n"));
+                       talloc_free(state);
+                       return -1;
+               }
+               new_header.rsn++;
+               new_header.flags |= CTDB_REC_RO_REVOKE_COMPLETE;
+               if (ctdb_ltdb_store(ctdb_db, key, &new_header, new_data) != 0) {
+                       ctdb_ltdb_unlock(ctdb_db, key);
+                       DEBUG(DEBUG_ERR,("Failed to write new record in revokechild\n"));
+                       talloc_free(state);
+                       return -1;
+               }
+               ctdb_ltdb_unlock(ctdb_db, key);
+       }
+
+       talloc_free(state);
+       return status;
+}
+
+
+/*
+  start revoking the read-only delegations of a record.
+
+  Forks a child process that performs the revoke via
+  ctdb_revoke_all_delegations() and reports a one-byte status (0 = success)
+  back over a pipe. The parent registers a revokechild_handle on
+  ctdb_db->revokechild_active and an fd event (revokechild_handler) that
+  fires when the child reports.
+
+  The passed header is modified: the RO revoking/delegation/readonly flags
+  are cleared, CTDB_REC_FLAG_MIGRATED_WITH_DATA is set and rsn is
+  decremented.
+
+  Returns 0 when the child was started and is being monitored, -1 on
+  failure.
+*/
+int ctdb_start_revoke_ro_record(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_ltdb_header *header, TDB_DATA data)
+{
+       TDB_DATA tdata;
+       struct revokechild_handle *rc;
+       pid_t parent = getpid();
+       int ret;
+
+       header->flags &= ~(CTDB_REC_RO_REVOKING_READONLY|CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY);
+       header->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
+       header->rsn   -= 1;
+
+       if ((rc = talloc_zero(ctdb_db, struct revokechild_handle)) == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate revokechild_handle\n"));
+               return -1;
+       }
+
+       /* move the malloc'ed tracking data from tdb_fetch onto rc */
+       tdata = tdb_fetch(ctdb_db->rottdb, key);
+       if (tdata.dsize > 0) {
+               uint8_t *tmp;
+
+               tmp = tdata.dptr;
+               tdata.dptr = talloc_memdup(rc, tdata.dptr, tdata.dsize);
+               free(tmp);
+               if (tdata.dptr == NULL) {
+                       DEBUG(DEBUG_ERR,("Failed to copy tracking data for revokechild_handle\n"));
+                       /* destructor is not installed yet, this is a plain free */
+                       talloc_free(rc);
+                       return -1;
+               }
+       }
+
+       rc->status    = 0;
+       rc->ctdb      = ctdb;
+       rc->ctdb_db   = ctdb_db;
+       rc->fd[0]     = -1;
+       rc->fd[1]     = -1;
+
+       /* from here on, freeing rc closes the pipe and kills the child */
+       talloc_set_destructor(rc, revokechild_destructor);
+
+       rc->key.dsize = key.dsize;
+       rc->key.dptr  = talloc_memdup(rc, key.dptr, key.dsize);
+       if (rc->key.dptr == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate key for revokechild_handle\n"));
+               talloc_free(rc);
+               return -1;
+       }
+
+       ret = pipe(rc->fd);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to create pipe for revokechild_handle\n"));
+               talloc_free(rc);
+               return -1;
+       }
+
+
+       rc->child = ctdb_fork(ctdb);
+       if (rc->child == (pid_t)-1) {
+               DEBUG(DEBUG_ERR,("Failed to fork child for revokechild\n"));
+               talloc_free(rc);
+               return -1;
+       }
+
+       if (rc->child == 0) {
+               /* child: do the revoke and report one status byte to the parent */
+               char c = 0;
+               close(rc->fd[0]);
+               debug_extra = talloc_asprintf(NULL, "revokechild-%s:", ctdb_db->db_name);
+
+               ctdb_set_process_name("ctdb_revokechild");
+               if (switch_from_server_to_client(ctdb, "revokechild-%s", ctdb_db->db_name) != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to switch from server to client for revokechild process\n"));
+                       c = 1;
+                       goto child_finished;
+               }
+
+               c = ctdb_revoke_all_delegations(ctdb, ctdb_db, tdata, key, header, data);
+
+child_finished:
+               write(rc->fd[1], &c, 1);
+               /* make sure we die when our parent dies */
+               while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
+                       sleep(5);
+               }
+               _exit(0);
+       }
+
+       /* parent: keep only the read end of the pipe */
+       close(rc->fd[1]);
+       rc->fd[1] = -1;
+       set_close_on_exec(rc->fd[0]);
+
+       /* This is an active revokechild child process */
+       DLIST_ADD_END(ctdb_db->revokechild_active, rc, NULL);
+
+       rc->fde = event_add_fd(ctdb->ev, rc, rc->fd[0],
+                                  EVENT_FD_READ, revokechild_handler,
+                                  (void *)rc);
+       if (rc->fde == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to set up fd event for revokechild process\n"));
+               /*
+                * the destructor kills the child and unlinks rc; rc must
+                * not be touched after this, and the caller must be told
+                */
+               talloc_free(rc);
+               return -1;
+       }
+       /*
+        * NOTE(review): with auto-close set, fd[0] looks like it is closed
+        * once when the fde is freed and again by revokechild_destructor -
+        * confirm whether the second close is intended.
+        */
+       tevent_fd_set_auto_close(rc->fde);
+
+       return 0;
+}
+
+/*
+  defer a request for a record that is currently being revoked.
+
+  Finds the active revoke child handling 'key' and attaches the packet to
+  it as a deferred call; the deferred call's destructor requeues the
+  packet when it is freed.
+
+  Returns 0 on success, -1 if no revoke is active for the key or the
+  deferred call cannot be allocated.
+*/
+int ctdb_add_revoke_deferred_call(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key, struct ctdb_req_header *hdr, deferred_requeue_fn fn, void *call_context)
+{
+       struct revokechild_handle *rc;
+       struct revokechild_deferred_call *deferred_call;
+
+       /* look for a handle with a non-empty key identical to ours */
+       for (rc = ctdb_db->revokechild_active; rc != NULL; rc = rc->next) {
+               if (rc->key.dsize != 0 &&
+                   rc->key.dsize == key.dsize &&
+                   memcmp(rc->key.dptr, key.dptr, key.dsize) == 0) {
+                       break;
+               }
+       }
+
+       if (rc == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to add deferred call to revoke list. revoke structure not found\n"));
+               return -1;
+       }
+
+       deferred_call = talloc(rc, struct revokechild_deferred_call);
+       if (deferred_call == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate deferred call structure for revoking record\n"));
+               return -1;
+       }
+
+       deferred_call->ctx  = call_context;
+       deferred_call->fn   = fn;
+       deferred_call->hdr  = hdr;
+       deferred_call->ctdb = ctdb;
+
+       /* the deferred call owns the packet until it is requeued */
+       talloc_steal(deferred_call, hdr);
+       talloc_set_destructor(deferred_call, deferred_call_destructor);
+
+       return 0;
+}
diff --git a/ctdb/server/ctdb_control.c b/ctdb/server/ctdb_control.c
new file mode 100644 (file)
index 0000000..99319ac
--- /dev/null
@@ -0,0 +1,895 @@
+/* 
+   ctdb_control protocol code
+
+   Copyright (C) Andrew Tridgell  2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "includes.h"
+#include "tdb.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "../include/ctdb_private.h"
+#include "lib/util/dlinklist.h"
+#include "db_wrap.h"
+
+
+/*
+  per-request state for a control.
+  NOTE(review): the code using this struct is outside this chunk; the
+  field notes below are read from the names and types only - confirm.
+*/
+struct ctdb_control_state {
+       struct ctdb_context *ctdb;
+       uint32_t reqid;                         /* presumably the request id of the control */
+       ctdb_control_callback_fn_t callback;    /* presumably run when the control completes */
+       void *private_data;                     /* presumably passed through to callback */
+       unsigned flags;
+};
+
+
+/*
+  dump talloc memory hierarchy, returning it as a blob to the client
+
+  The full talloc report is written to an anonymous temporary file and
+  then read back into a buffer allocated on 'outdata'.
+
+  Returns 0 on success with outdata->dptr/dsize filled in, -1 on failure.
+ */
+int32_t ctdb_dump_memory(struct ctdb_context *ctdb, TDB_DATA *outdata)
+{
+       /* dump to a file, then send the file as a blob */
+       FILE *f;
+       long fsize;
+       f = tmpfile();
+       if (f == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Unable to open tmpfile - %s\n", strerror(errno)));
+               return -1;
+       }
+       talloc_report_full(NULL, f);
+       /* the current file position is the size of the report */
+       fsize = ftell(f);
+       if (fsize < 0) {
+               /* do not use -1 as an allocation and read size */
+               DEBUG(DEBUG_ERR,(__location__ " Unable to get file size - %s\n", strerror(errno)));
+               fclose(f);
+               return -1;
+       }
+       rewind(f);
+       outdata->dptr = talloc_size(outdata, fsize);
+       if (outdata->dptr == NULL) {
+               fclose(f);
+               CTDB_NO_MEMORY(ctdb, outdata->dptr);
+       }
+       outdata->dsize = fread(outdata->dptr, 1, fsize, f);
+       fclose(f);
+       if (outdata->dsize != (size_t)fsize) {
+               DEBUG(DEBUG_ERR,(__location__ " Unable to read tmpfile\n"));
+               return -1;
+       }
+       return 0;
+}
+
+/*
+  log that a retired control was requested, naming the replacement
+  control when there is one. Always returns -1.
+ */
+static int32_t control_not_implemented(const char *unsupported,
+                                      const char *alternate)
+{
+       if (alternate != NULL) {
+               DEBUG(DEBUG_ERR,
+                     ("Control %s is not implemented any more, use %s instead\n",
+                      unsupported, alternate));
+               return -1;
+       }
+
+       DEBUG(DEBUG_ERR,
+             ("Control %s is not implemented any more\n",
+              unsupported));
+       return -1;
+}
+
+/*
+  process a control request
+ */
+static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb, 
+                                    struct ctdb_req_control *c,
+                                    TDB_DATA indata,
+                                    TDB_DATA *outdata, uint32_t srcnode,
+                                    const char **errormsg,
+                                    bool *async_reply)
+{
+       uint32_t opcode = c->opcode;
+       uint64_t srvid = c->srvid;
+       uint32_t client_id = c->client_id;
+
+       switch (opcode) {
+       case CTDB_CONTROL_PROCESS_EXISTS: {
+               CHECK_CONTROL_DATA_SIZE(sizeof(pid_t));
+               return ctdb_control_process_exists(ctdb, *(pid_t *)indata.dptr);
+       }
+
+       case CTDB_CONTROL_SET_DEBUG: {
+               CHECK_CONTROL_DATA_SIZE(sizeof(int32_t));
+               LogLevel = *(int32_t *)indata.dptr;
+               return 0;
+       }
+
+       case CTDB_CONTROL_GET_DEBUG: {
+               CHECK_CONTROL_DATA_SIZE(0);
+               outdata->dptr = (uint8_t *)&LogLevel;
+               outdata->dsize = sizeof(LogLevel);
+               return 0;
+       }
+
+       case CTDB_CONTROL_STATISTICS: {
+               int i;
+               CHECK_CONTROL_DATA_SIZE(0);
+               ctdb->statistics.memory_used = talloc_total_size(NULL);
+               ctdb->statistics.num_clients = ctdb->num_clients;
+               ctdb->statistics.frozen = 0;
+               for (i=1; i<= NUM_DB_PRIORITIES; i++) {
+                       if (ctdb->freeze_mode[i] == CTDB_FREEZE_FROZEN) {
+                               ctdb->statistics.frozen = 1;
+                       }
+               }
+               ctdb->statistics.recovering = (ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE);
+               ctdb->statistics.statistics_current_time = timeval_current();
+
+               outdata->dptr = (uint8_t *)&ctdb->statistics;
+               outdata->dsize = sizeof(ctdb->statistics);
+               return 0;
+       }
+
+       case CTDB_CONTROL_GET_ALL_TUNABLES: {
+               CHECK_CONTROL_DATA_SIZE(0);
+               outdata->dptr = (uint8_t *)&ctdb->tunable;
+               outdata->dsize = sizeof(ctdb->tunable);
+               return 0;
+       }
+
+       case CTDB_CONTROL_DUMP_MEMORY: {
+               CHECK_CONTROL_DATA_SIZE(0);
+               return ctdb_dump_memory(ctdb, outdata);
+       }
+
+       case CTDB_CONTROL_STATISTICS_RESET: {
+               CHECK_CONTROL_DATA_SIZE(0);
+               ZERO_STRUCT(ctdb->statistics);
+               ctdb->statistics.statistics_start_time = timeval_current();
+               return 0;
+       }
+
+       case CTDB_CONTROL_GETVNNMAP:
+               return ctdb_control_getvnnmap(ctdb, opcode, indata, outdata);
+
+       case CTDB_CONTROL_GET_DBMAP:
+               return ctdb_control_getdbmap(ctdb, opcode, indata, outdata);
+
+       case CTDB_CONTROL_GET_NODEMAPv4:
+               return ctdb_control_getnodemapv4(ctdb, opcode, indata, outdata);
+
+       case CTDB_CONTROL_GET_NODEMAP:
+               return ctdb_control_getnodemap(ctdb, opcode, indata, outdata);
+
+       case CTDB_CONTROL_RELOAD_NODES_FILE:
+               CHECK_CONTROL_DATA_SIZE(0);
+               return ctdb_control_reload_nodes_file(ctdb, opcode);
+
+       case CTDB_CONTROL_SET_DB_STICKY: {
+               uint32_t db_id;
+               struct ctdb_db_context *ctdb_db;
+
+               CHECK_CONTROL_DATA_SIZE(sizeof(db_id));
+               db_id = *(uint32_t *)indata.dptr;
+               ctdb_db = find_ctdb_db(ctdb, db_id);
+               if (ctdb_db == NULL) return -1;
+               return ctdb_set_db_sticky(ctdb, ctdb_db);
+       }
+
+       case CTDB_CONTROL_SETVNNMAP:
+               return ctdb_control_setvnnmap(ctdb, opcode, indata, outdata);
+
+       case CTDB_CONTROL_PULL_DB: 
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_control_pulldb));
+               return ctdb_control_pull_db(ctdb, indata, outdata);
+
+       case CTDB_CONTROL_SET_DMASTER: 
+               return control_not_implemented("SET_DMASTER", NULL);
+
+       case CTDB_CONTROL_PUSH_DB:
+               return ctdb_control_push_db(ctdb, indata);
+
+       case CTDB_CONTROL_GET_RECMODE: {
+               int i;
+               if (ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE) {
+                       return CTDB_RECOVERY_ACTIVE;
+               }                 
+               for (i=1; i<=NUM_DB_PRIORITIES; i++) {
+                       if (ctdb->freeze_mode[i] == CTDB_FREEZE_FROZEN) {
+                               return CTDB_RECOVERY_ACTIVE;
+                       }
+               }
+               return CTDB_RECOVERY_NORMAL;
+       }
+
+       case CTDB_CONTROL_SET_RECMASTER: {
+               return ctdb_control_set_recmaster(ctdb, opcode, indata);
+       }
+
+       case CTDB_CONTROL_GET_RECMASTER:
+               return ctdb->recovery_master;
+
+       case CTDB_CONTROL_GET_PID:
+               return getpid();
+
+       case CTDB_CONTROL_GET_PNN:
+               return ctdb->pnn;
+
+       case CTDB_CONTROL_PING:
+               CHECK_CONTROL_DATA_SIZE(0);
+               return ctdb->num_clients;
+
+       case CTDB_CONTROL_GET_RUNSTATE:
+               CHECK_CONTROL_DATA_SIZE(0);
+               outdata->dptr = (uint8_t *)&ctdb->runstate;
+               outdata->dsize = sizeof(uint32_t);
+               return 0;
+
+
+       case CTDB_CONTROL_SET_DB_READONLY: {
+               uint32_t db_id;
+               struct ctdb_db_context *ctdb_db;
+
+               CHECK_CONTROL_DATA_SIZE(sizeof(db_id));
+               db_id = *(uint32_t *)indata.dptr;
+               ctdb_db = find_ctdb_db(ctdb, db_id);
+               if (ctdb_db == NULL) return -1;
+               return ctdb_set_db_readonly(ctdb, ctdb_db);
+       }
+       case CTDB_CONTROL_GET_DBNAME: {
+               uint32_t db_id;
+               struct ctdb_db_context *ctdb_db;
+
+               CHECK_CONTROL_DATA_SIZE(sizeof(db_id));
+               db_id = *(uint32_t *)indata.dptr;
+               ctdb_db = find_ctdb_db(ctdb, db_id);
+               if (ctdb_db == NULL) return -1;
+               outdata->dptr = discard_const(ctdb_db->db_name);
+               outdata->dsize = strlen(ctdb_db->db_name)+1;
+               return 0;
+       }
+
+       case CTDB_CONTROL_GETDBPATH: {
+               uint32_t db_id;
+               struct ctdb_db_context *ctdb_db;
+
+               CHECK_CONTROL_DATA_SIZE(sizeof(db_id));
+               db_id = *(uint32_t *)indata.dptr;
+               ctdb_db = find_ctdb_db(ctdb, db_id);
+               if (ctdb_db == NULL) return -1;
+               outdata->dptr = discard_const(ctdb_db->db_path);
+               outdata->dsize = strlen(ctdb_db->db_path)+1;
+               return 0;
+       }
+
+       case CTDB_CONTROL_DB_ATTACH:
+         return ctdb_control_db_attach(ctdb, indata, outdata, srvid, false, client_id, c, async_reply);
+
+       case CTDB_CONTROL_DB_ATTACH_PERSISTENT:
+         return ctdb_control_db_attach(ctdb, indata, outdata, srvid, true, client_id, c, async_reply);
+
+       case CTDB_CONTROL_SET_CALL: {
+               struct ctdb_control_set_call *sc = 
+                       (struct ctdb_control_set_call *)indata.dptr;
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_control_set_call));
+               return ctdb_daemon_set_call(ctdb, sc->db_id, sc->fn, sc->id);
+       }
+
+       case CTDB_CONTROL_TRAVERSE_START:
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_traverse_start));
+               return ctdb_control_traverse_start(ctdb, indata, outdata, srcnode, client_id);
+
+       case CTDB_CONTROL_TRAVERSE_START_EXT:
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_traverse_start_ext));
+               return ctdb_control_traverse_start_ext(ctdb, indata, outdata, srcnode, client_id);
+
+       case CTDB_CONTROL_TRAVERSE_ALL:
+               return ctdb_control_traverse_all(ctdb, indata, outdata);
+
+       case CTDB_CONTROL_TRAVERSE_ALL_EXT:
+               return ctdb_control_traverse_all_ext(ctdb, indata, outdata);
+
+       case CTDB_CONTROL_TRAVERSE_DATA:
+               return ctdb_control_traverse_data(ctdb, indata, outdata);
+
+       case CTDB_CONTROL_TRAVERSE_KILL:
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_traverse_start));
+               return ctdb_control_traverse_kill(ctdb, indata, outdata, srcnode);
+
+       case CTDB_CONTROL_REGISTER_SRVID:
+               return daemon_register_message_handler(ctdb, client_id, srvid);
+
+       case CTDB_CONTROL_DEREGISTER_SRVID:
+               return daemon_deregister_message_handler(ctdb, client_id, srvid);
+
+       case CTDB_CONTROL_CHECK_SRVIDS:
+               return daemon_check_srvids(ctdb, indata, outdata);
+
+       case CTDB_CONTROL_ENABLE_SEQNUM:
+               CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
+               return ctdb_ltdb_enable_seqnum(ctdb, *(uint32_t *)indata.dptr);
+
+       case CTDB_CONTROL_UPDATE_SEQNUM:
+               CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));              
+               return ctdb_ltdb_update_seqnum(ctdb, *(uint32_t *)indata.dptr, srcnode);
+
+       case CTDB_CONTROL_FREEZE:
+               CHECK_CONTROL_DATA_SIZE(0);
+               return ctdb_control_freeze(ctdb, c, async_reply);
+
+       case CTDB_CONTROL_THAW:
+               CHECK_CONTROL_DATA_SIZE(0);
+               return ctdb_control_thaw(ctdb, (uint32_t)c->srvid);
+
+       case CTDB_CONTROL_SET_RECMODE:
+               CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));              
+               return ctdb_control_set_recmode(ctdb, c, indata, async_reply, errormsg);
+
+       case CTDB_CONTROL_GET_MONMODE: 
+               CHECK_CONTROL_DATA_SIZE(0);
+               return ctdb_monitoring_mode(ctdb);
+               
+       case CTDB_CONTROL_ENABLE_MONITOR: 
+               CHECK_CONTROL_DATA_SIZE(0);
+               ctdb_enable_monitoring(ctdb);
+               return 0;
+       
+       case CTDB_CONTROL_RUN_EVENTSCRIPTS: 
+               return ctdb_run_eventscripts(ctdb, c, indata, async_reply);
+
+       case CTDB_CONTROL_DISABLE_MONITOR: 
+               CHECK_CONTROL_DATA_SIZE(0);
+               ctdb_disable_monitoring(ctdb);
+               return 0;
+
+       case CTDB_CONTROL_SHUTDOWN:
+               DEBUG(DEBUG_NOTICE,("Received SHUTDOWN command.\n"));
+               ctdb_shutdown_sequence(ctdb, 0);
+               /* In case above returns due to duplicate shutdown */
+               return 0;
+
+       case CTDB_CONTROL_TAKEOVER_IPv4:
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_public_ipv4));
+               return ctdb_control_takeover_ipv4(ctdb, c, indata, async_reply);
+
+       case CTDB_CONTROL_TAKEOVER_IP:
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_public_ip));
+               return ctdb_control_takeover_ip(ctdb, c, indata, async_reply);
+
+       case CTDB_CONTROL_RELEASE_IPv4:
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_public_ipv4));
+               return ctdb_control_release_ipv4(ctdb, c, indata, async_reply);
+
+       case CTDB_CONTROL_RELEASE_IP:
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_public_ip));
+               return ctdb_control_release_ip(ctdb, c, indata, async_reply);
+
+       case CTDB_CONTROL_IPREALLOCATED:
+               CHECK_CONTROL_DATA_SIZE(0);
+               return ctdb_control_ipreallocated(ctdb, c, async_reply);
+
+       case CTDB_CONTROL_GET_PUBLIC_IPSv4:
+               CHECK_CONTROL_DATA_SIZE(0);
+               return ctdb_control_get_public_ipsv4(ctdb, c, outdata);
+
+       case CTDB_CONTROL_GET_PUBLIC_IPS:
+               CHECK_CONTROL_DATA_SIZE(0);
+               return ctdb_control_get_public_ips(ctdb, c, outdata);
+
+       case CTDB_CONTROL_TCP_CLIENT: 
+               return ctdb_control_tcp_client(ctdb, client_id, indata);
+
+       case CTDB_CONTROL_STARTUP: 
+               CHECK_CONTROL_DATA_SIZE(0);
+               return ctdb_control_startup(ctdb, srcnode);
+
+       case CTDB_CONTROL_TCP_ADD: 
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_tcp_connection));
+               return ctdb_control_tcp_add(ctdb, indata, false);
+
+       case CTDB_CONTROL_TCP_ADD_DELAYED_UPDATE: 
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_tcp_connection));
+               return ctdb_control_tcp_add(ctdb, indata, true);
+
+       case CTDB_CONTROL_TCP_REMOVE: 
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_tcp_connection));
+               return ctdb_control_tcp_remove(ctdb, indata);
+
+       case CTDB_CONTROL_SET_TUNABLE:
+               return ctdb_control_set_tunable(ctdb, indata);
+
+       case CTDB_CONTROL_GET_TUNABLE:
+               return ctdb_control_get_tunable(ctdb, indata, outdata);
+
+       case CTDB_CONTROL_LIST_TUNABLES:
+               return ctdb_control_list_tunables(ctdb, outdata);
+
+       case CTDB_CONTROL_MODIFY_FLAGS:
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_node_flag_change));
+               return ctdb_control_modflags(ctdb, indata);
+
+       case CTDB_CONTROL_KILL_TCP: 
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_control_killtcp));
+               return ctdb_control_kill_tcp(ctdb, indata);
+
+       case CTDB_CONTROL_GET_TCP_TICKLE_LIST:
+               CHECK_CONTROL_DATA_SIZE(sizeof(ctdb_sock_addr));
+               return ctdb_control_get_tcp_tickle_list(ctdb, indata, outdata);
+
+       case CTDB_CONTROL_SET_TCP_TICKLE_LIST:
+               /* data size is verified in the called function */
+               return ctdb_control_set_tcp_tickle_list(ctdb, indata);
+
+       case CTDB_CONTROL_REGISTER_SERVER_ID: 
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_server_id));
+               return ctdb_control_register_server_id(ctdb, client_id, indata);
+
+       case CTDB_CONTROL_UNREGISTER_SERVER_ID: 
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_server_id));
+               return ctdb_control_unregister_server_id(ctdb, indata);
+
+       case CTDB_CONTROL_CHECK_SERVER_ID: 
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_server_id));
+               return ctdb_control_check_server_id(ctdb, indata);
+
+       case CTDB_CONTROL_GET_SERVER_ID_LIST:
+               CHECK_CONTROL_DATA_SIZE(0);
+               return ctdb_control_get_server_id_list(ctdb, outdata);
+
+       case CTDB_CONTROL_PERSISTENT_STORE:
+               return control_not_implemented("PERSISTENT_STORE", NULL);
+
+       case CTDB_CONTROL_UPDATE_RECORD:
+               return ctdb_control_update_record(ctdb, c, indata, async_reply);
+
+       case CTDB_CONTROL_SEND_GRATIOUS_ARP:
+               return ctdb_control_send_gratious_arp(ctdb, indata);
+
+       case CTDB_CONTROL_TRANSACTION_START:
+               CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
+               return ctdb_control_transaction_start(ctdb, *(uint32_t *)indata.dptr);
+
+       case CTDB_CONTROL_TRANSACTION_COMMIT:
+               CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
+               return ctdb_control_transaction_commit(ctdb, *(uint32_t *)indata.dptr);
+
+       case CTDB_CONTROL_WIPE_DATABASE:
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_control_wipe_database));
+               return ctdb_control_wipe_database(ctdb, indata);
+
+       case CTDB_CONTROL_UPTIME:
+               return ctdb_control_uptime(ctdb, outdata);
+
+       case CTDB_CONTROL_START_RECOVERY:
+               return ctdb_control_start_recovery(ctdb, c, async_reply);
+
+       case CTDB_CONTROL_END_RECOVERY:
+               return ctdb_control_end_recovery(ctdb, c, async_reply);
+
+       case CTDB_CONTROL_TRY_DELETE_RECORDS:
+               return ctdb_control_try_delete_records(ctdb, indata, outdata);
+
+       case CTDB_CONTROL_ADD_PUBLIC_IP:
+               return ctdb_control_add_public_address(ctdb, indata);
+
+       case CTDB_CONTROL_DEL_PUBLIC_IP:
+               return ctdb_control_del_public_address(ctdb, indata);
+
+       case CTDB_CONTROL_GET_CAPABILITIES:
+               return ctdb_control_get_capabilities(ctdb, outdata);
+
+       case CTDB_CONTROL_START_PERSISTENT_UPDATE:
+               return ctdb_control_start_persistent_update(ctdb, c, indata);
+
+       case CTDB_CONTROL_CANCEL_PERSISTENT_UPDATE:
+               return ctdb_control_cancel_persistent_update(ctdb, c, indata);
+
+       case CTDB_CONTROL_TRANS2_COMMIT:
+       case CTDB_CONTROL_TRANS2_COMMIT_RETRY:
+               return control_not_implemented("TRANS2_COMMIT", "TRANS3_COMMIT");
+
+       case CTDB_CONTROL_TRANS2_ERROR:
+               return control_not_implemented("TRANS2_ERROR", NULL);
+
+       case CTDB_CONTROL_TRANS2_FINISHED:
+               return control_not_implemented("TRANS2_FINISHED", NULL);
+
+       case CTDB_CONTROL_TRANS2_ACTIVE:
+               return control_not_implemented("TRANS2_ACTIVE", NULL);
+
+       case CTDB_CONTROL_TRANS3_COMMIT:
+               return ctdb_control_trans3_commit(ctdb, c, indata, async_reply);
+
+       case CTDB_CONTROL_RECD_PING:
+               CHECK_CONTROL_DATA_SIZE(0);
+               return ctdb_control_recd_ping(ctdb);
+
+       case CTDB_CONTROL_GET_EVENT_SCRIPT_STATUS:
+               CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
+               return ctdb_control_get_event_script_status(ctdb, *(uint32_t *)indata.dptr, outdata);
+
+       case CTDB_CONTROL_RECD_RECLOCK_LATENCY:
+               CHECK_CONTROL_DATA_SIZE(sizeof(double));
+               CTDB_UPDATE_RECLOCK_LATENCY(ctdb, "recd reclock", reclock.recd, *((double *)indata.dptr));
+               return 0;
+       case CTDB_CONTROL_GET_RECLOCK_FILE:
+               CHECK_CONTROL_DATA_SIZE(0);
+               if (ctdb->recovery_lock_file != NULL) {
+                       outdata->dptr  = discard_const(ctdb->recovery_lock_file);
+                       outdata->dsize = strlen(ctdb->recovery_lock_file) + 1;
+               }
+               return 0;
+       case CTDB_CONTROL_SET_RECLOCK_FILE:
+               ctdb->tunable.verify_recovery_lock = 0;
+               if (ctdb->recovery_lock_file != NULL) {
+                       talloc_free(ctdb->recovery_lock_file);
+                       ctdb->recovery_lock_file = NULL;
+               }
+               if (indata.dsize > 0) {
+                       ctdb->recovery_lock_file = talloc_strdup(ctdb, discard_const(indata.dptr));
+                       ctdb->tunable.verify_recovery_lock = 1;
+               }
+               return 0;
+
+       case CTDB_CONTROL_STOP_NODE:
+               CHECK_CONTROL_DATA_SIZE(0);
+               return ctdb_control_stop_node(ctdb);
+
+       case CTDB_CONTROL_CONTINUE_NODE:
+               CHECK_CONTROL_DATA_SIZE(0);
+               return ctdb_control_continue_node(ctdb);
+
+       case CTDB_CONTROL_SET_NATGWSTATE: {
+               uint32_t natgwstate;
+
+               CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));              
+               natgwstate = *(uint32_t *)indata.dptr;
+               if (natgwstate == 0) {
+                       ctdb->capabilities &= ~CTDB_CAP_NATGW;
+               } else {
+                       ctdb->capabilities |= CTDB_CAP_NATGW;
+               }
+               return 0;
+       }
+
+       case CTDB_CONTROL_SET_LMASTERROLE: {
+               uint32_t lmasterrole;
+
+               CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));              
+               lmasterrole = *(uint32_t *)indata.dptr;
+               if (lmasterrole == 0) {
+                       ctdb->capabilities &= ~CTDB_CAP_LMASTER;
+               } else {
+                       ctdb->capabilities |= CTDB_CAP_LMASTER;
+               }
+               return 0;
+       }
+
+       case CTDB_CONTROL_SET_RECMASTERROLE: {
+               uint32_t recmasterrole;
+
+               CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));              
+               recmasterrole = *(uint32_t *)indata.dptr;
+               if (recmasterrole == 0) {
+                       ctdb->capabilities &= ~CTDB_CAP_RECMASTER;
+               } else {
+                       ctdb->capabilities |= CTDB_CAP_RECMASTER;
+               }
+               return 0;
+       }
+
+       case CTDB_CONTROL_ENABLE_SCRIPT:
+               return ctdb_control_enable_script(ctdb, indata);
+
+       case CTDB_CONTROL_DISABLE_SCRIPT:
+               return ctdb_control_disable_script(ctdb, indata);
+
+       case CTDB_CONTROL_SET_BAN_STATE:
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_ban_time));
+               return ctdb_control_set_ban_state(ctdb, indata);
+
+       case CTDB_CONTROL_GET_BAN_STATE:
+               CHECK_CONTROL_DATA_SIZE(0);
+               return ctdb_control_get_ban_state(ctdb, outdata);
+
+       case CTDB_CONTROL_SET_DB_PRIORITY:
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_db_priority));
+               return ctdb_control_set_db_priority(ctdb, indata);
+
+       case CTDB_CONTROL_GET_DB_PRIORITY: {
+               uint32_t db_id;
+               struct ctdb_db_context *ctdb_db;
+
+               CHECK_CONTROL_DATA_SIZE(sizeof(db_id));
+               db_id = *(uint32_t *)indata.dptr;
+               ctdb_db = find_ctdb_db(ctdb, db_id);
+               if (ctdb_db == NULL) return -1;
+               return ctdb_db->priority;
+       }
+
+       case CTDB_CONTROL_TRANSACTION_CANCEL:
+               CHECK_CONTROL_DATA_SIZE(0);
+               return ctdb_control_transaction_cancel(ctdb);
+
+       case CTDB_CONTROL_REGISTER_NOTIFY:
+               return ctdb_control_register_notify(ctdb, client_id, indata);
+
+       case CTDB_CONTROL_DEREGISTER_NOTIFY:
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_client_notify_deregister));
+               return ctdb_control_deregister_notify(ctdb, client_id, indata);
+
+       case CTDB_CONTROL_GET_LOG:
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_get_log_addr));
+               return ctdb_control_get_log(ctdb, indata);
+
+       case CTDB_CONTROL_CLEAR_LOG:
+               return ctdb_control_clear_log(ctdb);
+
+       case CTDB_CONTROL_GET_DB_SEQNUM:
+               CHECK_CONTROL_DATA_SIZE(sizeof(uint64_t));
+               return ctdb_control_get_db_seqnum(ctdb, indata, outdata);
+
+       case CTDB_CONTROL_DB_SET_HEALTHY:
+               CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
+               return ctdb_control_db_set_healthy(ctdb, indata);
+
+       case CTDB_CONTROL_DB_GET_HEALTH:
+               CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
+               return ctdb_control_db_get_health(ctdb, indata, outdata);
+
+       case CTDB_CONTROL_GET_PUBLIC_IP_INFO:
+               CHECK_CONTROL_DATA_SIZE(sizeof(ctdb_sock_addr));
+               return ctdb_control_get_public_ip_info(ctdb, c, indata, outdata);
+
+       case CTDB_CONTROL_GET_IFACES:
+               CHECK_CONTROL_DATA_SIZE(0);
+               return ctdb_control_get_ifaces(ctdb, c, outdata);
+
+       case CTDB_CONTROL_SET_IFACE_LINK_STATE:
+               CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_control_iface_info));
+               return ctdb_control_set_iface_link(ctdb, c, indata);
+
+       case CTDB_CONTROL_GET_STAT_HISTORY:
+               CHECK_CONTROL_DATA_SIZE(0);
+               return ctdb_control_get_stat_history(ctdb, c, outdata);
+
+       case CTDB_CONTROL_SCHEDULE_FOR_DELETION: {
+               struct ctdb_control_schedule_for_deletion *d;
+               size_t size = offsetof(struct ctdb_control_schedule_for_deletion, key);
+               CHECK_CONTROL_MIN_DATA_SIZE(size);
+               d = (struct ctdb_control_schedule_for_deletion *)indata.dptr;
+               size += d->keylen;
+               CHECK_CONTROL_DATA_SIZE(size);
+               return ctdb_control_schedule_for_deletion(ctdb, indata);
+       }
+       case CTDB_CONTROL_GET_DB_STATISTICS:
+               CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
+               return ctdb_control_get_db_statistics(ctdb, *(uint32_t *)indata.dptr, outdata);
+
+       case CTDB_CONTROL_RELOAD_PUBLIC_IPS:
+               CHECK_CONTROL_DATA_SIZE(0);
+               return ctdb_control_reload_public_ips(ctdb, c, async_reply);
+
+       case CTDB_CONTROL_RECEIVE_RECORDS:
+               return ctdb_control_receive_records(ctdb, indata, outdata);
+
+       default:
+               DEBUG(DEBUG_CRIT,(__location__ " Unknown CTDB control opcode %u\n", opcode));
+               return -1;
+       }
+}
+
+/*
+  send a reply for a ctdb control
+
+  ctdb     - daemon context used to allocate and queue the reply packet
+  c        - the request being answered; supplies the destination node,
+             the reqid and the opcode handed to the queueing call
+  outdata  - optional reply payload (may be NULL or have dsize == 0)
+  status   - result code stored in the reply
+  errormsg - optional error text (may be NULL); it is appended directly
+             after the payload WITHOUT a terminating NUL and its length
+             is recorded in the errorlen field
+
+  Nothing is sent when the request carries CTDB_CTRL_FLAG_NOREPLY, and
+  the reply is silently dropped (after logging) if no packet can be
+  allocated.
+ */
+void ctdb_request_control_reply(struct ctdb_context *ctdb, struct ctdb_req_control *c,
+                               TDB_DATA *outdata, int32_t status, const char *errormsg)
+{
+       struct ctdb_reply_control *r;
+       size_t datalen = 0;
+       size_t errorlen = 0;
+       size_t len;
+
+       /* some controls send no reply */
+       if (c->flags & CTDB_CTRL_FLAG_NOREPLY) {
+               return;
+       }
+
+       /* work out both variable-length parts once, up front */
+       if (outdata != NULL) {
+               datalen = outdata->dsize;
+       }
+       if (errormsg != NULL) {
+               errorlen = strlen(errormsg);
+       }
+
+       len = offsetof(struct ctdb_reply_control, data) + datalen + errorlen;
+       r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REPLY_CONTROL, len, struct ctdb_reply_control);
+       if (r == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Unable to allocate transport - OOM or transport is down\n"));
+               return;
+       }
+
+       r->hdr.destnode     = c->hdr.srcnode;
+       r->hdr.reqid        = c->hdr.reqid;
+       r->status           = status;
+       r->datalen          = datalen;
+       /* always set explicitly rather than relying on the allocator
+          having zeroed the packet */
+       r->errorlen         = errorlen;
+
+       if (datalen > 0) {
+               memcpy(&r->data[0], outdata->dptr, datalen);
+       }
+       if (errorlen > 0) {
+               /* the error text lives immediately behind the payload */
+               memcpy(&r->data[r->datalen], errormsg, errorlen);
+       }
+
+       ctdb_queue_packet_opcode(ctdb, &r->hdr, c->opcode);
+
+       talloc_free(r);
+}
+
+/*
+  called when a CTDB_REQ_CONTROL packet comes in
+
+  The payload embedded in the packet (c->data / c->datalen) is handed to
+  ctdb_control_dispatch() together with a zeroed TDB_DATA, allocated as a
+  talloc child of the packet, that the handler may fill in as reply data.
+  Unless the handler sets async_reply, the reply is sent from here.
+*/
+void ctdb_request_control(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+       struct ctdb_req_control *c = (struct ctdb_req_control *)hdr;
+       TDB_DATA data, *outdata;
+       int32_t status;
+       bool async_reply = false;
+       const char *errormsg = NULL;
+
+       data.dptr = &c->data[0];
+       data.dsize = c->datalen;
+
+       outdata = talloc_zero(c, TDB_DATA);
+       if (outdata == NULL) {
+               /* handlers dereference outdata unconditionally, so the
+                  dispatcher must not be entered without it; tell the
+                  sender that the control failed instead */
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory allocating control reply data\n"));
+               ctdb_request_control_reply(ctdb, c, NULL, -1, "Out of memory");
+               return;
+       }
+
+       status = ctdb_control_dispatch(ctdb, c, data, outdata, hdr->srcnode, 
+                                      &errormsg, &async_reply);
+
+       /* an asynchronous handler sends its own reply later */
+       if (!async_reply) {
+               ctdb_request_control_reply(ctdb, c, outdata, status, errormsg);
+       }
+}
+
+/*
+  called when a CTDB_REPLY_CONTROL packet comes in
+
+  Looks up the pending control by the reqid in the header and hands the
+  reply (status, payload, optional error text) to the callback stored in
+  that control's state. Replies with an unknown or mismatching reqid are
+  logged and dropped.
+*/
+void ctdb_reply_control(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+       struct ctdb_reply_control *reply = (struct ctdb_reply_control *)hdr;
+       struct ctdb_control_state *pending;
+       const char *errtext = NULL;
+       TDB_DATA payload;
+
+       pending = ctdb_reqid_find(ctdb, hdr->reqid, struct ctdb_control_state);
+       if (pending == NULL) {
+               DEBUG(DEBUG_ERR,("pnn %u Invalid reqid %u in ctdb_reply_control\n",
+                        ctdb->pnn, hdr->reqid));
+               return;
+       }
+
+       /* the lookup gave us a record, but not the one this reply is for */
+       if (pending->reqid != hdr->reqid) {
+               DEBUG(DEBUG_ERR, ("Dropped orphaned control reply with reqid:%u\n", hdr->reqid));
+               return;
+       }
+
+       /* the error text, if any, sits directly behind the payload and is
+          copied into a string owned by the pending state */
+       if (reply->errorlen != 0) {
+               errtext = talloc_strndup(pending,
+                                        (char *)&reply->data[reply->datalen],
+                                        reply->errorlen);
+       }
+
+       /* hang the state off the packet so that it is released together
+          with the packet */
+       talloc_steal(hdr, pending);
+
+       payload.dsize = reply->datalen;
+       payload.dptr = &reply->data[0];
+
+       pending->callback(ctdb, reply->status, payload, errtext, pending->private_data);
+}
+
+/*
+  talloc destructor for a pending control: gives the reqid that was
+  allocated for the state back via ctdb_reqid_remove(). Always returns 0.
+ */
+static int ctdb_control_destructor(struct ctdb_control_state *state)
+{
+       struct ctdb_context *ctdb = state->ctdb;
+
+       ctdb_reqid_remove(ctdb, state->reqid);
+
+       return 0;
+}
+
+/*
+  handle a timeout of a control
+
+  Timer callback: private_data is the struct ctdb_control_state of the
+  control that got no reply in time. The control's callback is invoked
+  with status -1 and a fixed error string, after which the state is
+  freed.
+ */
+static void ctdb_control_timeout(struct event_context *ev, struct timed_event *te, 
+                      struct timeval t, void *private_data)
+{
+       struct ctdb_control_state *pending;
+       TALLOC_CTX *scratch;
+
+       pending = talloc_get_type(private_data, struct ctdb_control_state);
+       scratch = talloc_new(ev);
+
+       CTDB_INCREMENT_STAT(pending->ctdb, timeouts.control);
+
+       /* park the state on a scratch context for the duration of the
+          callback; freeing the scratch context afterwards frees the state
+          (presumably so the callback may free private_data safely) */
+       talloc_steal(scratch, pending);
+
+       pending->callback(pending->ctdb, -1, tdb_null,
+                         "ctdb_control timed out",
+                         pending->private_data);
+
+       talloc_free(scratch);
+}
+
+
+/*
+  send a control message to a node
+
+  ctdb         - daemon context
+  destnode     - target pnn, or one of CTDB_BROADCAST_VNNMAP,
+                 CTDB_BROADCAST_ALL, CTDB_BROADCAST_CONNECTED
+  srvid, opcode, client_id, flags
+               - copied verbatim into the request packet
+  data         - payload, copied into the request packet
+  callback     - invoked with the reply, or with status -1 if the
+                 destination is invalid/disconnected or the control times
+                 out; not used when flags has CTDB_CTRL_FLAG_NOREPLY
+  private_data - passed through to callback; if non-NULL it is also the
+                 talloc parent of the pending-control state, so freeing it
+                 discards any reply
+
+  Returns -1 when the transport is down, when a broadcast is attempted
+  without CTDB_CTRL_FLAG_NOREPLY, or on allocation failure (via
+  CTDB_NO_MEMORY). Returns 0 otherwise - including the case where the
+  destination is invalid or disconnected, which is reported through the
+  callback instead.
+ */
+int ctdb_daemon_send_control(struct ctdb_context *ctdb, uint32_t destnode,
+                            uint64_t srvid, uint32_t opcode, uint32_t client_id,
+                            uint32_t flags,
+                            TDB_DATA data,
+                            ctdb_control_callback_fn_t callback,
+                            void *private_data)
+{
+       struct ctdb_req_control *c;
+       struct ctdb_control_state *state;
+       size_t len;
+       bool is_broadcast;
+
+       if (ctdb->methods == NULL) {
+               DEBUG(DEBUG_INFO,(__location__ " Failed to send control. Transport is DOWN\n"));
+               return -1;
+       }
+
+       is_broadcast = (destnode == CTDB_BROADCAST_VNNMAP) ||
+                      (destnode == CTDB_BROADCAST_ALL) ||
+                      (destnode == CTDB_BROADCAST_CONNECTED);
+
+       /* a broadcast has no single reply to wait for */
+       if (is_broadcast && !(flags & CTDB_CTRL_FLAG_NOREPLY)) {
+               DEBUG(DEBUG_CRIT,("Attempt to broadcast control without NOREPLY\n"));
+               return -1;
+       }
+
+       /* a unicast to an invalid or disconnected node fails right away;
+          the failure is delivered through the callback, not the return
+          value */
+       if (!is_broadcast &&
+           (!ctdb_validate_pnn(ctdb, destnode) || 
+            (ctdb->nodes[destnode]->flags & NODE_FLAGS_DISCONNECTED))) {
+               if (!(flags & CTDB_CTRL_FLAG_NOREPLY)) {
+                       callback(ctdb, -1, tdb_null, "ctdb_control to disconnected node", private_data);
+               }
+               return 0;
+       }
+
+       /* the state is made a child of private_data if possible. This means any reply
+          will be discarded if the private_data goes away */
+       state = talloc(private_data?private_data:ctdb, struct ctdb_control_state);
+       CTDB_NO_MEMORY(ctdb, state);
+
+       state->reqid = ctdb_reqid_new(ctdb, state);
+       state->callback = callback;
+       state->private_data = private_data;
+       state->ctdb = ctdb;
+       state->flags = flags;
+
+       /* the destructor releases the reqid again */
+       talloc_set_destructor(state, ctdb_control_destructor);
+
+       len = offsetof(struct ctdb_req_control, data) + data.dsize;
+       c = ctdb_transport_allocate(ctdb, state, CTDB_REQ_CONTROL, len, 
+                                   struct ctdb_req_control);
+       if (c == NULL) {
+               /* do not leave the half-set-up state (and its reqid)
+                  behind under private_data/ctdb: nothing would ever
+                  complete or time it out */
+               talloc_free(state);
+               CTDB_NO_MEMORY(ctdb, c);
+       }
+       talloc_set_name_const(c, "ctdb_req_control packet");
+
+       c->hdr.destnode     = destnode;
+       c->hdr.reqid        = state->reqid;
+       c->opcode           = opcode;
+       c->client_id        = client_id;
+       c->flags            = flags;
+       c->srvid            = srvid;
+       c->datalen          = data.dsize;
+       if (data.dsize) {
+               memcpy(&c->data[0], data.dptr, data.dsize);
+       }
+
+       ctdb_queue_packet(ctdb, &c->hdr);
+
+       if (flags & CTDB_CTRL_FLAG_NOREPLY) {
+               /* no reply will come: drop the state, and with it the
+                  packet, which is its talloc child */
+               talloc_free(state);
+               return 0;
+       }
+
+       /* a control_timeout tunable of 0 means no timer is set up */
+       if (ctdb->tunable.control_timeout) {
+               event_add_timed(ctdb->ev, state, 
+                               timeval_current_ofs(ctdb->tunable.control_timeout, 0), 
+                               ctdb_control_timeout, state);
+       }
+
+       talloc_free(c);
+       return 0;
+}
diff --git a/ctdb/server/ctdb_daemon.c b/ctdb/server/ctdb_daemon.c
new file mode 100644 (file)
index 0000000..cbe6b23
--- /dev/null
@@ -0,0 +1,1761 @@
+/* 
+   ctdb daemon code
+
+   Copyright (C) Andrew Tridgell  2006
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "db_wrap.h"
+#include "tdb.h"
+#include "lib/util/dlinklist.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "../include/ctdb_version.h"
+#include "../include/ctdb_client.h"
+#include "../include/ctdb_private.h"
+#include "../common/rb_tree.h"
+#include <sys/socket.h>
+
+/*
+  list node tying a client process id to its struct ctdb_client.
+  next/prev are the list links; entries are unlinked from
+  ctdb->client_pids by ctdb_clientpid_destructor().
+ */
+struct ctdb_client_pid_list {
+       struct ctdb_client_pid_list *next, *prev;
+       struct ctdb_context *ctdb;
+       pid_t pid;
+       struct ctdb_client *client;
+};
+
+/* NOTE(review): presumably the path of the daemon pid file; it is not
+   assigned anywhere in the visible part of this file - confirm against
+   its users */
+const char *ctdbd_pidfile = NULL;
+
+/* forward declaration, needed by the requeue/deferral helpers below */
+static void daemon_incoming_packet(void *, struct ctdb_req_header *);
+
+/*
+  log a shutdown notice. If debug_extra holds a non-empty string it is
+  included in the message; otherwise the plain daemon message is logged
+  followed by a one second pause.
+ */
+static void print_exit_message(void)
+{
+       if (debug_extra == NULL || debug_extra[0] == '\0') {
+               DEBUG(DEBUG_NOTICE,("CTDB daemon shutting down\n"));
+
+               /* Give pending log messages a second to be flushed */
+               sleep(1);
+               return;
+       }
+
+       DEBUG(DEBUG_NOTICE,("CTDB %s shutting down\n", debug_extra));
+}
+
+
+
+/*
+  timer callback: re-arms itself to fire again one second from now, but
+  only when running in the process whose pid equals ctdbd_pid.
+  private_data is the struct ctdb_context.
+ */
+static void ctdb_time_tick(struct event_context *ev, struct timed_event *te,
+                          struct timeval t, void *private_data)
+{
+       struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+
+       if (getpid() == ctdbd_pid) {
+               event_add_timed(ctdb->ev, ctdb,
+                               timeval_current_ofs(1, 0),
+                               ctdb_time_tick, ctdb);
+       }
+}
+
+/* Arms the first one-second timer for ctdb_time_tick(), which then keeps
+ * re-arming itself. The resulting dummy event once per second makes
+ * detection of hangs more reliable.
+ */
+static void ctdb_start_time_tickd(struct ctdb_context *ctdb)
+{
+       struct timeval first_tick = timeval_current_ofs(1, 0);
+
+       event_add_timed(ctdb->ev, ctdb, first_tick, ctdb_time_tick, ctdb);
+}
+
+/*
+  start all the periodic background activities of the daemon, in the
+  order listed below
+ */
+static void ctdb_start_periodic_events(struct ctdb_context *ctdb)
+{
+       /* start monitoring for connected/disconnected nodes */
+       ctdb_start_keepalive(ctdb);
+
+       /* start monitoring for node health */
+       ctdb_start_monitoring(ctdb);
+
+       /* start periodic update of tcp tickle lists */
+               ctdb_start_tcp_tickle_update(ctdb);
+
+       /* start listening for recovery daemon pings */
+       ctdb_control_recd_ping(ctdb);
+
+       /* start listening to timer ticks */
+       ctdb_start_time_tickd(ctdb);
+}
+
+/*
+  install SIG_IGN as the disposition of signum, so that the signal is
+  ignored from now on. Despite the name, the process signal mask is not
+  changed; signum is only added to the sa_mask of the new action.
+  The result of sigaction() is not checked.
+ */
+static void block_signal(int signum)
+{
+       struct sigaction ignore_action;
+
+       memset(&ignore_action, 0, sizeof(ignore_action));
+
+       sigemptyset(&ignore_action.sa_mask);
+       sigaddset(&ignore_action.sa_mask, signum);
+       ignore_action.sa_handler = SIG_IGN;
+
+       sigaction(signum, &ignore_action, NULL);
+}
+
+
+/*
+  send a packet to a client
+
+  Counts the packet in the client_packets_sent statistic and queues
+  hdr->length bytes starting at hdr on the client's queue.
+
+  CTDB_REQ_MESSAGE packets are subject to a queue depth limit: when the
+  client's queue is already longer than the max_queue_depth_drop_msg
+  tunable the client itself is freed and -1 is returned; the caller
+  must not touch 'client' afterwards.
+
+  In all other cases the result of ctdb_queue_send() is returned.
+ */
+static int daemon_queue_send(struct ctdb_client *client, struct ctdb_req_header *hdr)
+{
+       CTDB_INCREMENT_STAT(client->ctdb, client_packets_sent);
+       if (hdr->operation == CTDB_REQ_MESSAGE) {
+               if (ctdb_queue_length(client->queue) > client->ctdb->tunable.max_queue_depth_drop_msg) {
+                       DEBUG(DEBUG_ERR,("CTDB_REQ_MESSAGE queue full - killing client connection.\n"));
+                       /* drops the whole client connection, not just this packet */
+                       talloc_free(client);
+                       return -1;
+               }
+       }
+       return ctdb_queue_send(client->queue, (uint8_t *)hdr, hdr->length);
+}
+
+/*
+  message handler for when we are in daemon mode. This redirects the message
+  to the right client
+
+  private_data is the struct ctdb_client the handler was registered for
+  (see daemon_register_message_handler()). The payload is copied into a
+  freshly allocated CTDB_REQ_MESSAGE packet, the packet is queued to that
+  client and the temporary packet is released again.
+ */
+static void daemon_message_handler(struct ctdb_context *ctdb, uint64_t srvid, 
+                                   TDB_DATA data, void *private_data)
+{
+       struct ctdb_client *client = talloc_get_type(private_data, struct ctdb_client);
+       struct ctdb_req_message *r;
+       int len;
+
+       /* construct a message to send to the client containing the data */
+       len = offsetof(struct ctdb_req_message, data) + data.dsize;
+       r = ctdbd_allocate_pkt(ctdb, ctdb, CTDB_REQ_MESSAGE, 
+                              len, struct ctdb_req_message);
+       CTDB_NO_MEMORY_VOID(ctdb, r);
+
+       talloc_set_name_const(r, "req_message packet");
+
+       r->srvid         = srvid;
+       r->datalen       = data.dsize;
+       /* an empty message may come with a NULL dptr, and memcpy() must not
+          be given a NULL source even for a zero length copy */
+       if (data.dsize) {
+               memcpy(&r->data[0], data.dptr, data.dsize);
+       }
+
+       /* the result is not inspected: daemon_queue_send() may have freed
+          the client, but r was allocated with ctdb (not the client) as its
+          context argument, so it is still released here */
+       daemon_queue_send(client, &r->hdr);
+
+       talloc_free(r);
+}
+
+/*
+  called when the ctdb daemon received a request from a client to
+  register a srvid
+
+  Looks the client up by client_id and registers daemon_message_handler()
+  for srvid with the client as its private data. Returns -1 for an
+  unknown client_id, otherwise the result of
+  ctdb_register_message_handler().
+ */
+int daemon_register_message_handler(struct ctdb_context *ctdb, uint32_t client_id, uint64_t srvid)
+{
+       struct ctdb_client *client;
+       int res;
+
+       client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
+       if (client == NULL) {
+               DEBUG(DEBUG_ERR,("Bad client_id in daemon_request_register_message_handler\n"));
+               return -1;
+       }
+
+       res = ctdb_register_message_handler(ctdb, client, srvid, daemon_message_handler, client);
+       if (res == 0) {
+               DEBUG(DEBUG_INFO,(__location__ " Registered message handler for srvid=%llu\n", 
+                        (unsigned long long)srvid));
+               return res;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " Failed to register handler %llu in daemon\n", 
+                (unsigned long long)srvid));
+       return res;
+}
+
+/*
+  called when the ctdb daemon received a request from a client to
+  remove a srvid
+
+  Returns -1 for an unknown client_id, otherwise the result of
+  ctdb_deregister_message_handler().
+ */
+int daemon_deregister_message_handler(struct ctdb_context *ctdb, uint32_t client_id, uint64_t srvid)
+{
+       struct ctdb_client *client;
+
+       client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
+       if (client != NULL) {
+               return ctdb_deregister_message_handler(ctdb, srvid, client);
+       }
+
+       DEBUG(DEBUG_ERR,("Bad client_id in daemon_request_deregister_message_handler\n"));
+       return -1;
+}
+
+/*
+  report which of the given srvids pass ctdb_check_message_handler()
+
+  indata is an array of uint64_t srvids. On success 0 is returned and
+  outdata describes a zero-initialised bitmap, allocated as a talloc
+  child of outdata, with one bit per srvid: bit (i%8) of byte (i/8) is
+  set when srvid number i was accepted.
+  Returns -1 when indata is not a whole number of uint64_t values or
+  when the bitmap cannot be allocated.
+ */
+int daemon_check_srvids(struct ctdb_context *ctdb, TDB_DATA indata,
+                       TDB_DATA *outdata)
+{
+       uint64_t *srvids;
+       uint8_t *bitmap;
+       int idx, count;
+
+       if ((indata.dsize % sizeof(uint64_t)) != 0) {
+               DEBUG(DEBUG_ERR, ("Bad indata in daemon_check_srvids, "
+                                 "size=%d\n", (int)indata.dsize));
+               return -1;
+       }
+
+       srvids = (uint64_t *)indata.dptr;
+       count = indata.dsize / sizeof(uint64_t);
+
+       /* one bit per srvid, rounded up to whole bytes */
+       bitmap = talloc_zero_array(outdata, uint8_t, (count+7)/8);
+       if (bitmap == NULL) {
+               DEBUG(DEBUG_ERR, ("talloc failed in daemon_check_srvids\n"));
+               return -1;
+       }
+
+       for (idx = 0; idx < count; idx++) {
+               if (!ctdb_check_message_handler(ctdb, srvids[idx])) {
+                       continue;
+               }
+               bitmap[idx/8] |= (1 << (idx%8));
+       }
+
+       outdata->dsize = talloc_get_size(bitmap);
+       outdata->dptr = (uint8_t *)bitmap;
+       return 0;
+}
+
+/*
+  destroy a ctdb_client
+
+  Runs the takeover hook, releases the client's reqid and decrements the
+  client count. Recovery mode is activated when the client still has
+  persistent updates in flight, or when client->db_id resolves to a
+  database (treated below as an active transaction commit).
+  Always returns 0.
+  NOTE(review): presumably installed as the talloc destructor of the
+  client when the connection is accepted - confirm in ctdb_accept_client.
+*/
+static int ctdb_client_destructor(struct ctdb_client *client)
+{
+       struct ctdb_db_context *ctdb_db;
+
+       ctdb_takeover_client_destructor_hook(client);
+       ctdb_reqid_remove(client->ctdb, client->client_id);
+       client->ctdb->num_clients--;
+
+       if (client->num_persistent_updates != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Client disconnecting with %u persistent updates in flight. Starting recovery\n", client->num_persistent_updates));
+               client->ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+       }
+       /* a db_id that resolves to a database is taken to mean that this
+          client has a transaction commit in progress on it */
+       ctdb_db = find_ctdb_db(client->ctdb, client->db_id);
+       if (ctdb_db) {
+               DEBUG(DEBUG_ERR, (__location__ " client exit while transaction "
+                                 "commit active. Forcing recovery.\n"));
+               client->ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+
+               /*
+                * trans3 transaction state:
+                *
+                * The destructor sets the pointer to NULL.
+                */
+               talloc_free(ctdb_db->persistent_state);
+       }
+
+       return 0;
+}
+
+
+/*
+  called when the ctdb daemon received a ctdb request message
+  from a local client over the unix domain socket
+
+  A destination of CTDB_CURRENT_NODE is rewritten to our own pnn.
+  Messages addressed to this node are handed to ctdb_request_message(),
+  everything else is forwarded with ctdb_daemon_send_message().
+ */
+static void daemon_request_message_from_client(struct ctdb_client *client, 
+                                              struct ctdb_req_message *c)
+{
+       struct ctdb_context *ctdb = client->ctdb;
+       TDB_DATA payload;
+
+       if (c->hdr.destnode == CTDB_CURRENT_NODE) {
+               c->hdr.destnode = ctdb_get_pnn(ctdb);
+       }
+
+       if (c->hdr.destnode == ctdb_get_pnn(ctdb)) {
+               /* local delivery: maybe it is for another client on this node */
+               ctdb_request_message(ctdb, (struct ctdb_req_header *)c);
+               return;
+       }
+
+       /* remote delivery */
+       payload.dptr = &c->data[0];
+       payload.dsize = c->datalen;
+       if (ctdb_daemon_send_message(ctdb, c->hdr.destnode,
+                                    c->srvid, payload) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to send message to remote node %u\n",
+                        c->hdr.destnode));
+       }
+}
+
+
+/*
+  per-call state kept by the daemon while a call issued by a local client
+  is in progress; filled in by daemon_request_call_from_client() and
+  consumed by daemon_call_from_client_callback()
+ */
+struct daemon_call_state {
+       struct ctdb_client *client;     /* client that issued the call */
+       uint32_t reqid;                 /* reqid of the client's request, echoed in the reply */
+       struct ctdb_call *call;
+       struct timeval start_time;      /* used for the call_latency statistics */
+
+       /* readonly request ? */
+       uint32_t readonly_fetch;
+       /* call id the client originally asked for; only set for readonly requests */
+       uint32_t client_callid;
+};
+
+/* 
+   complete a call from a client 
+
+   Async completion callback installed by daemon_request_call_from_client().
+   Collects the call result, builds a CTDB_REPLY_CALL packet carrying the
+   client's reqid and queues it to the client.
+
+   NOTE(review): the two early error returns leave dstate allocated (it is
+   a child of client by then), so it lives until the client goes away.
+*/
+static void daemon_call_from_client_callback(struct ctdb_call_state *state)
+{
+       struct daemon_call_state *dstate = talloc_get_type(state->async.private_data, 
+                                                          struct daemon_call_state);
+       struct ctdb_reply_call *r;
+       int res;
+       uint32_t length;
+       struct ctdb_client *client = dstate->client;
+       struct ctdb_db_context *ctdb_db = state->ctdb_db;
+
+       /* reparent dstate (and the call under it) to the client */
+       talloc_steal(client, dstate);
+       talloc_steal(dstate, dstate->call);
+
+       res = ctdb_daemon_call_recv(state, dstate->call);
+       if (res != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " ctdbd_call_recv() returned error\n"));
+               CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
+
+               CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 1", call_latency, dstate->start_time);
+               return;
+       }
+
+       length = offsetof(struct ctdb_reply_call, data) + dstate->call->reply_data.dsize;
+       /* If the client asked for readonly FETCH, we remapped this to 
+          FETCH_WITH_HEADER when calling the daemon. So we must
+          strip the extra header off the reply data before passing
+          it back to the client.
+          NOTE(review): this assumes reply_data.dsize is at least
+          sizeof(struct ctdb_ltdb_header); that is not checked here.
+       */
+       if (dstate->readonly_fetch
+       && dstate->client_callid == CTDB_FETCH_FUNC) {
+               length -= sizeof(struct ctdb_ltdb_header);
+       }
+
+       r = ctdbd_allocate_pkt(client->ctdb, dstate, CTDB_REPLY_CALL, 
+                              length, struct ctdb_reply_call);
+       if (r == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " Failed to allocate reply_call in ctdb daemon\n"));
+               CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
+               CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 2", call_latency, dstate->start_time);
+               return;
+       }
+       r->hdr.reqid        = dstate->reqid;
+       r->status           = dstate->call->status;
+
+       if (dstate->readonly_fetch
+       && dstate->client_callid == CTDB_FETCH_FUNC) {
+               /* client only asked for a FETCH so we must strip off
+                  the extra ctdb_ltdb header
+               */
+               r->datalen          = dstate->call->reply_data.dsize - sizeof(struct ctdb_ltdb_header);
+               memcpy(&r->data[0], dstate->call->reply_data.dptr + sizeof(struct ctdb_ltdb_header), r->datalen);
+       } else {
+               r->datalen          = dstate->call->reply_data.dsize;
+               memcpy(&r->data[0], dstate->call->reply_data.dptr, r->datalen);
+       }
+
+       res = daemon_queue_send(client, &r->hdr);
+       if (res == -1) {
+               /* client is dead - return immediately.
+                  daemon_queue_send() frees the client when it drops the
+                  connection, so neither client nor dstate may be used here */
+               return;
+       }
+       if (res != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Failed to queue packet from daemon to client\n"));
+       }
+       CTDB_UPDATE_LATENCY(client->ctdb, ctdb_db, "call_from_client_cb 3", call_latency, dstate->start_time);
+       CTDB_DECREMENT_STAT(client->ctdb, pending_calls);
+       /* also releases the call and the reply packet, which hang off dstate */
+       talloc_free(dstate);
+}
+
+/*
+  identifies a client by id rather than by pointer, so that a packet
+  which is processed later can detect that the client has gone away
+  (see daemon_incoming_packet_wrap())
+ */
+struct ctdb_daemon_packet_wrap {
+       struct ctdb_context *ctdb;
+       uint32_t client_id;
+};
+
+/*
+  a wrapper to catch disconnected clients
+
+  p must be a struct ctdb_daemon_packet_wrap. The client is looked up by
+  the id stored in it, the wrapper is freed, and the packet is passed to
+  daemon_incoming_packet() only if the client still exists.
+ */
+static void daemon_incoming_packet_wrap(void *p, struct ctdb_req_header *hdr)
+{
+       struct ctdb_daemon_packet_wrap *wrap;
+       struct ctdb_client *client;
+       uint32_t client_id;
+
+       wrap = talloc_get_type(p, struct ctdb_daemon_packet_wrap);
+       if (wrap == NULL) {
+               DEBUG(DEBUG_CRIT,(__location__ " Bad packet type '%s'\n", talloc_get_name(p)));
+               return;
+       }
+
+       /* the wrapper is no longer needed once the lookup is done */
+       client_id = wrap->client_id;
+       client = ctdb_reqid_find(wrap->ctdb, client_id, struct ctdb_client);
+       talloc_free(wrap);
+
+       if (client == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Packet for disconnected client %u\n",
+                        client_id));
+               return;
+       }
+
+       /* process it */
+       daemon_incoming_packet(client, hdr);
+}
+
+/* one deferred client request, waiting on a ctdb_deferred_fetch_queue */
+struct ctdb_deferred_fetch_call {
+       struct ctdb_deferred_fetch_call *next, *prev;
+       struct ctdb_req_call *c;                /* the deferred request packet */
+       struct ctdb_daemon_packet_wrap *w;      /* identifies the issuing client by id */
+};
+
+/* list of the calls that are deferred behind one in-flight fetch */
+struct ctdb_deferred_fetch_queue {
+       struct ctdb_deferred_fetch_call *deferred_calls;
+};
+
+/* private data of the timed event that reprocesses one deferred call */
+struct ctdb_deferred_requeue {
+       struct ctdb_deferred_fetch_call *dfc;
+       struct ctdb_client *client;
+};
+
+/* called from a timer event and starts reprocessing the deferred call.
+   private_data is the struct ctdb_deferred_requeue set up by
+   deferred_fetch_queue_destructor().
+*/
+static void reprocess_deferred_call(struct event_context *ev, struct timed_event *te, 
+                                      struct timeval t, void *private_data)
+{
+       struct ctdb_deferred_requeue *dfr = (struct ctdb_deferred_requeue *)private_data;
+       struct ctdb_client *client = dfr->client;
+
+       /* move the packet out from under dfr before it is processed, so that
+          freeing dfr below does not take the packet with it */
+       talloc_steal(client, dfr->dfc->c);
+       daemon_incoming_packet(client, (struct ctdb_req_header *)dfr->dfc->c);
+       talloc_free(dfr);
+}
+
+/* the deferral context is destroyed either after a timeout or when the initial
+   fetch-lock has finished.
+   at this stage, immediately start reprocessing the queued up deferred
+   calls so they get reprocessed immediately (and since we are dmaster at
+   this stage, trigger the waiting smbd processes to pick up and acquire the
+   record right away).
+
+   Every queued call whose client still exists is scheduled as a
+   zero-delay timed event running reprocess_deferred_call(). Always
+   returns 0.
+*/
+static int deferred_fetch_queue_destructor(struct ctdb_deferred_fetch_queue *dfq)
+{
+
+       /* need to reprocess the packets from the queue explicitly instead of
+          just using a normal destructor since we want, need, to
+          call the clients in the same order as the requests queued up
+       */
+       while (dfq->deferred_calls != NULL) {
+               struct ctdb_client *client;
+               struct ctdb_deferred_fetch_call *dfc = dfq->deferred_calls;
+               struct ctdb_deferred_requeue *dfr;
+
+               DLIST_REMOVE(dfq->deferred_calls, dfc);
+
+               client = ctdb_reqid_find(dfc->w->ctdb, dfc->w->client_id, struct ctdb_client);
+               if (client == NULL) {
+                       DEBUG(DEBUG_ERR,(__location__ " Packet for disconnected client %u\n",
+                                dfc->w->client_id));
+                       /* dfc is not reparented, so it is dropped with dfq */
+                       continue;
+               }
+
+               /* process it by pushing it back onto the eventloop */
+               dfr = talloc(client, struct ctdb_deferred_requeue);
+               if (dfr == NULL) {
+                       DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch requeue structure\n"));
+                       continue;
+               }
+
+               /* dfc (with its packet and wrap) now hangs off dfr, which
+                  hangs off the client */
+               dfr->dfc    = talloc_steal(dfr, dfc);
+               dfr->client = client;
+
+               event_add_timed(dfc->w->ctdb->ev, client, timeval_zero(), reprocess_deferred_call, dfr);
+       }
+
+       return 0;
+}
+
+/* insert the new deferral context into the rb tree.
+   there should never be a pre-existing context here, but check for it,
+   warn and destroy the previous context if there is already a deferral context
+   for this key.
+
+   parm is the new context, data the one currently stored for the key (or
+   NULL). Returns parm.
+*/
+static void *insert_dfq_callback(void *parm, void *data)
+{
+        if (data) {
+               DEBUG(DEBUG_ERR,("Already have DFQ registered. Free old %p and create new %p\n", data, parm));
+                talloc_free(data);
+        }
+        return parm;
+}
+
+/* if the original fetch-lock did not complete within a reasonable time,
+   free the deferral context (passed as private_data); its destructor then
+   causes all deferred requests to be re-inserted into the event system.
+*/
+static void dfq_timeout(struct event_context *ev, struct timed_event *te, 
+                                 struct timeval t, void *private_data)
+{
+       talloc_free(private_data);
+}
+
+/* This function is used in the local daemon to register a KEY in a database
+   for being "fetched"
+   While the remote fetch is in-flight, any further attempts to re-fetch the
+   same record will be deferred until the fetch completes.
+
+   The deferral queue is allocated as a child of 'call', so it is destroyed
+   (and the deferred calls are requeued by its destructor) when the call
+   is freed or when the 30 second timer below fires.
+   Returns 0 on success, -1 on allocation failure.
+*/
+static int setup_deferred_fetch_locks(struct ctdb_db_context *ctdb_db, struct ctdb_call *call)
+{
+       uint32_t *k;
+       struct ctdb_deferred_fetch_queue *dfq;
+
+       /* tree key: one length word followed by the record key, zero padded
+          to a multiple of 4 bytes */
+       k = talloc_zero_size(call, ((call->key.dsize + 3) & 0xfffffffc) + 4);
+       if (k == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch\n"));
+               return -1;
+       }
+
+       /* k[0] is the total number of 32 bit words, including itself */
+       k[0] = (call->key.dsize + 3) / 4 + 1;
+       memcpy(&k[1], call->key.dptr, call->key.dsize);
+
+       dfq  = talloc(call, struct ctdb_deferred_fetch_queue);
+       if (dfq == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch queue structure\n"));
+               talloc_free(k);
+               return -1;
+       }
+       dfq->deferred_calls = NULL;
+
+       trbt_insertarray32_callback(ctdb_db->deferred_fetch, k[0], &k[0], insert_dfq_callback, dfq);
+
+       talloc_set_destructor(dfq, deferred_fetch_queue_destructor);
+
+       /* if the fetch has not completed in 30 seconds, just tear it all down
+          and let it try again as the events are reissued */
+       event_add_timed(ctdb_db->ctdb->ev, dfq, timeval_current_ofs(30, 0), dfq_timeout, dfq);
+
+       talloc_free(k);
+       return 0;
+}
+
+/* check if this is a duplicate request to a fetch already in-flight
+   if it is, make this call deferred to be reprocessed later when
+   the in-flight fetch completes.
+
+   Returns 0 when the call was queued; the packet 'c' is then owned by the
+   deferral entry. Returns -1 when no fetch is in flight for the key or
+   when an allocation failed, in which case the caller carries on with
+   the call itself.
+*/
+static int requeue_duplicate_fetch(struct ctdb_db_context *ctdb_db, struct ctdb_client *client, TDB_DATA key, struct ctdb_req_call *c)
+{
+       uint32_t *k;
+       struct ctdb_deferred_fetch_queue *dfq;
+       struct ctdb_deferred_fetch_call *dfc;
+
+       /* build the same tree key as setup_deferred_fetch_locks(): a length
+          word followed by the zero padded record key */
+       k = talloc_zero_size(c, ((key.dsize + 3) & 0xfffffffc) + 4);
+       if (k == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate key for deferred fetch\n"));
+               return -1;
+       }
+
+       k[0] = (key.dsize + 3) / 4 + 1;
+       memcpy(&k[1], key.dptr, key.dsize);
+
+       dfq = trbt_lookuparray32(ctdb_db->deferred_fetch, k[0], &k[0]);
+       if (dfq == NULL) {
+               /* nothing in flight for this key */
+               talloc_free(k);
+               return -1;
+       }
+
+
+       talloc_free(k);
+
+       dfc = talloc(dfq, struct ctdb_deferred_fetch_call);
+       if (dfc == NULL) {
+               DEBUG(DEBUG_ERR, ("Failed to allocate deferred fetch call structure\n"));
+               return -1;
+       }
+
+       dfc->w = talloc(dfc, struct ctdb_daemon_packet_wrap);
+       if (dfc->w == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate deferred fetch daemon packet wrap structure\n"));
+               talloc_free(dfc);
+               return -1;
+       }
+
+       /* take ownership of the packet and remember the client by id, in
+          case it disconnects while the call is deferred */
+       dfc->c = talloc_steal(dfc, c);
+       dfc->w->ctdb = ctdb_db->ctdb;
+       dfc->w->client_id = client->client_id;
+
+       /* appended at the end so calls are replayed in arrival order */
+       DLIST_ADD_END(dfq->deferred_calls, dfc, NULL);
+
+       return 0;
+}
+
+
+/*
+  this is called when the ctdb daemon received a ctdb request call
+  from a local client over the unix domain socket
+
+  The record is locked and fetched, the request is deferred when it
+  duplicates an in-flight fetch or when readonly delegations are being
+  revoked, and otherwise the call is dispatched locally (we are dmaster)
+  or to the remote dmaster. The reply is sent to the client by
+  daemon_call_from_client_callback().
+
+  pending_calls accounting: the counter is raised once on entry and is
+  lowered again on every path that leaves this function without handing
+  the call to daemon_call_from_client_callback(). That includes the
+  deferral paths, because a deferred packet re-enters this function
+  through daemon_incoming_packet() and is counted again at that point.
+ */
+static void daemon_request_call_from_client(struct ctdb_client *client, 
+                                           struct ctdb_req_call *c)
+{
+       struct ctdb_call_state *state;
+       struct ctdb_db_context *ctdb_db;
+       struct daemon_call_state *dstate;
+       struct ctdb_call *call;
+       struct ctdb_ltdb_header header;
+       TDB_DATA key, data;
+       int ret;
+       struct ctdb_context *ctdb = client->ctdb;
+       struct ctdb_daemon_packet_wrap *w;
+
+       CTDB_INCREMENT_STAT(ctdb, total_calls);
+       CTDB_INCREMENT_STAT(ctdb, pending_calls);
+
+       ctdb_db = find_ctdb_db(client->ctdb, c->db_id);
+       if (!ctdb_db) {
+               DEBUG(DEBUG_ERR, (__location__ " Unknown database in request. db_id==0x%08x\n",
+                         c->db_id));
+               CTDB_DECREMENT_STAT(ctdb, pending_calls);
+               return;
+       }
+
+       if (ctdb_db->unhealthy_reason) {
+               /*
+                * this is just a warning, as the tdb should be empty anyway,
+                * and only persistent databases can be unhealthy, which doesn't
+                * use this code path
+                */
+               DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in daemon_request_call_from_client(): %s\n",
+                                    ctdb_db->db_name, ctdb_db->unhealthy_reason));
+       }
+
+       key.dptr = c->data;
+       key.dsize = c->keylen;
+
+       /* the wrap identifies the client by id, so that a requeued packet
+          can detect a client that disconnected in the meantime */
+       w = talloc(ctdb, struct ctdb_daemon_packet_wrap);
+       CTDB_NO_MEMORY_VOID(ctdb, w);   
+
+       w->ctdb = ctdb;
+       w->client_id = client->client_id;
+
+       ret = ctdb_ltdb_lock_fetch_requeue(ctdb_db, key, &header, 
+                                          (struct ctdb_req_header *)c, &data,
+                                          daemon_incoming_packet_wrap, w, true);
+       if (ret == -2) {
+               /* will retry later */
+               CTDB_DECREMENT_STAT(ctdb, pending_calls);
+               return;
+       }
+
+       talloc_free(w);
+
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Unable to fetch record\n"));
+               CTDB_DECREMENT_STAT(ctdb, pending_calls);
+               return;
+       }
+
+
+       /* check if this fetch request is a duplicate for a
+          request we already have in flight. If so defer it until
+          the first request completes.
+       */
+       if (ctdb->tunable.fetch_collapse == 1) {
+               if (requeue_duplicate_fetch(ctdb_db, client, key, c) == 0) {
+                       ret = ctdb_ltdb_unlock(ctdb_db, key);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+                       }
+                       /* deferred: counted again when it is reprocessed */
+                       CTDB_DECREMENT_STAT(ctdb, pending_calls);
+                       return;
+               }
+       }
+
+       /* Dont do READONLY if we dont have a tracking database */
+       if ((c->flags & CTDB_WANT_READONLY) && !ctdb_db->readonly) {
+               c->flags &= ~CTDB_WANT_READONLY;
+       }
+
+       if (header.flags & CTDB_REC_RO_REVOKE_COMPLETE) {
+               header.flags &= ~CTDB_REC_RO_FLAGS;
+               CTDB_INCREMENT_STAT(ctdb, total_ro_revokes);
+               CTDB_INCREMENT_DB_STAT(ctdb_db, db_ro_revokes);
+               if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
+                       ctdb_fatal(ctdb, "Failed to write header with cleared REVOKE flag");
+               }
+               /* and clear out the tracking data */
+               if (tdb_delete(ctdb_db->rottdb, key) != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to clear out trackingdb record\n"));
+               }
+       }
+
+       /* if we are revoking, we must defer all other calls until the revoke
+        * has completed.
+        */
+       if (header.flags & CTDB_REC_RO_REVOKING_READONLY) {
+               talloc_free(data.dptr);
+               ret = ctdb_ltdb_unlock(ctdb_db, key);
+
+               if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, key, (struct ctdb_req_header *)c, daemon_incoming_packet, client) != 0) {
+                       ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
+               }
+               /* deferred: counted again when it is reprocessed */
+               CTDB_DECREMENT_STAT(ctdb, pending_calls);
+               return;
+       }
+
+       /* we are dmaster, the client wants a normal (read/write) access and
+          there are readonly delegations out: start revoking them and defer
+          this call until the revoke is done */
+       if ((header.dmaster == ctdb->pnn)
+       && (!(c->flags & CTDB_WANT_READONLY))
+       && (header.flags & (CTDB_REC_RO_HAVE_DELEGATIONS|CTDB_REC_RO_HAVE_READONLY)) ) {
+               header.flags   |= CTDB_REC_RO_REVOKING_READONLY;
+               if (ctdb_ltdb_store(ctdb_db, key, &header, data) != 0) {
+                       ctdb_fatal(ctdb, "Failed to store record with HAVE_DELEGATIONS set");
+               }
+               ret = ctdb_ltdb_unlock(ctdb_db, key);
+
+               if (ctdb_start_revoke_ro_record(ctdb, ctdb_db, key, &header, data) != 0) {
+                       ctdb_fatal(ctdb, "Failed to start record revoke");
+               }
+               talloc_free(data.dptr);
+
+               if (ctdb_add_revoke_deferred_call(ctdb, ctdb_db, key, (struct ctdb_req_header *)c, daemon_incoming_packet, client) != 0) {
+                       ctdb_fatal(ctdb, "Failed to add deferred call for revoke child");
+               }
+
+               /* deferred: counted again when it is reprocessed */
+               CTDB_DECREMENT_STAT(ctdb, pending_calls);
+               return;
+       }               
+
+       dstate = talloc(client, struct daemon_call_state);
+       if (dstate == NULL) {
+               ret = ctdb_ltdb_unlock(ctdb_db, key);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+               }
+
+               DEBUG(DEBUG_ERR,(__location__ " Unable to allocate dstate\n"));
+               CTDB_DECREMENT_STAT(ctdb, pending_calls);
+               return;
+       }
+       dstate->start_time = timeval_current();
+       dstate->client = client;
+       dstate->reqid  = c->hdr.reqid;
+       /* the fetched record data now lives as long as dstate */
+       talloc_steal(dstate, data.dptr);
+
+       call = dstate->call = talloc_zero(dstate, struct ctdb_call);
+       if (call == NULL) {
+               ret = ctdb_ltdb_unlock(ctdb_db, key);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+               }
+
+               DEBUG(DEBUG_ERR,(__location__ " Unable to allocate call\n"));
+               CTDB_DECREMENT_STAT(ctdb, pending_calls);
+               CTDB_UPDATE_LATENCY(ctdb, ctdb_db, "call_from_client 1", call_latency, dstate->start_time);
+               return;
+       }
+
+       dstate->readonly_fetch = 0;
+       call->call_id = c->callid;
+       call->key = key;
+       /* the call data follows the key inside the request packet */
+       call->call_data.dptr = c->data + c->keylen;
+       call->call_data.dsize = c->calldatalen;
+       call->flags = c->flags;
+
+       if (c->flags & CTDB_WANT_READONLY) {
+               /* client wants readonly record, so translate this into a 
+                  fetch with header. remember what the client asked for
+                  so we can remap the reply back to the proper format for
+                  the client in the reply
+                */
+               dstate->client_callid = call->call_id;
+               call->call_id = CTDB_FETCH_WITH_HEADER_FUNC;
+               dstate->readonly_fetch = 1;
+       }
+
+       if (header.dmaster == ctdb->pnn) {
+               state = ctdb_call_local_send(ctdb_db, call, &header, &data);
+       } else {
+               state = ctdb_daemon_call_send_remote(ctdb_db, call, &header);
+               if (ctdb->tunable.fetch_collapse == 1) {
+                       /* This request triggered a remote fetch-lock.
+                          set up a deferral for this key so any additional
+                          fetch-locks are deferred until the current one
+                          finishes.
+                        */
+                       setup_deferred_fetch_locks(ctdb_db, call);
+               }
+       }
+
+       ret = ctdb_ltdb_unlock(ctdb_db, key);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", ret));
+       }
+
+       if (state == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Unable to setup call send\n"));
+               CTDB_DECREMENT_STAT(ctdb, pending_calls);
+               CTDB_UPDATE_LATENCY(ctdb, ctdb_db, "call_from_client 2", call_latency, dstate->start_time);
+               return;
+       }
+       /* dstate hangs off the call state, which hangs off the client: a
+          disconnecting client takes the in-flight call with it */
+       talloc_steal(state, dstate);
+       talloc_steal(client, state);
+
+       state->async.fn = daemon_call_from_client_callback;
+       state->async.private_data = dstate;
+}
+
+
+static void daemon_request_control_from_client(struct ctdb_client *client, 
+                                              struct ctdb_req_control *c);
+
+/* data contains a packet from the client
+
+   p is the struct ctdb_client the packet belongs to. The packet is
+   checked for the ctdb magic and version and then dispatched on
+   hdr->operation to the call, message or control handler. The packet is
+   freed on return unless a handler has stolen it.
+*/
+static void daemon_incoming_packet(void *p, struct ctdb_req_header *hdr)
+{
+       struct ctdb_client *client = talloc_get_type(p, struct ctdb_client);
+       TALLOC_CTX *tmp_ctx;
+       struct ctdb_context *ctdb = client->ctdb;
+
+       /* place the packet as a child of a tmp_ctx. We then use
+          talloc_free() below to free it. If any of the calls want
+          to keep it, then they will steal it somewhere else, and the
+          talloc_free() will be a no-op */
+       tmp_ctx = talloc_new(client);
+       talloc_steal(tmp_ctx, hdr);
+
+       if (hdr->ctdb_magic != CTDB_MAGIC) {
+               ctdb_set_error(client->ctdb, "Non CTDB packet rejected in daemon\n");
+               goto done;
+       }
+
+       if (hdr->ctdb_version != CTDB_VERSION) {
+               ctdb_set_error(client->ctdb, "Bad CTDB version 0x%x rejected in daemon\n", hdr->ctdb_version);
+               goto done;
+       }
+
+       switch (hdr->operation) {
+       case CTDB_REQ_CALL:
+               CTDB_INCREMENT_STAT(ctdb, client.req_call);
+               daemon_request_call_from_client(client, (struct ctdb_req_call *)hdr);
+               break;
+
+       case CTDB_REQ_MESSAGE:
+               CTDB_INCREMENT_STAT(ctdb, client.req_message);
+               daemon_request_message_from_client(client, (struct ctdb_req_message *)hdr);
+               break;
+
+       case CTDB_REQ_CONTROL:
+               CTDB_INCREMENT_STAT(ctdb, client.req_control);
+               daemon_request_control_from_client(client, (struct ctdb_req_control *)hdr);
+               break;
+
+       default:
+               /* unknown operations are logged and the packet is dropped */
+               DEBUG(DEBUG_CRIT,(__location__ " daemon: unrecognized operation %u\n",
+                        hdr->operation));
+       }
+
+done:
+       talloc_free(tmp_ctx);
+}
+
+/*
+  called when the daemon gets an incoming packet
+
+  args is the struct ctdb_client the data was read for. A count of 0
+  frees the client. Otherwise the packet is validated (minimum size,
+  length field, magic, version) and handed to daemon_incoming_packet().
+
+  NOTE(review): the validation failures below return without freeing
+  'data', while the comment at the end says that freeing it is the job
+  of the incoming packet function - confirm who owns the buffer on these
+  paths.
+ */
+static void ctdb_daemon_read_cb(uint8_t *data, size_t cnt, void *args)
+{
+       struct ctdb_client *client = talloc_get_type(args, struct ctdb_client);
+       struct ctdb_req_header *hdr;
+
+       if (cnt == 0) {
+               /* NOTE(review): presumably signals that the client went away */
+               talloc_free(client);
+               return;
+       }
+
+       CTDB_INCREMENT_STAT(client->ctdb, client_packets_recv);
+
+       /* the header must be complete before any of its fields are read */
+       if (cnt < sizeof(*hdr)) {
+               ctdb_set_error(client->ctdb, "Bad packet length %u in daemon\n", 
+                              (unsigned)cnt);
+               return;
+       }
+       hdr = (struct ctdb_req_header *)data;
+       if (cnt != hdr->length) {
+               ctdb_set_error(client->ctdb, "Bad header length %u expected %u\n in daemon", 
+                              (unsigned)hdr->length, (unsigned)cnt);
+               return;
+       }
+
+       if (hdr->ctdb_magic != CTDB_MAGIC) {
+               ctdb_set_error(client->ctdb, "Non CTDB packet rejected\n");
+               return;
+       }
+
+       if (hdr->ctdb_version != CTDB_VERSION) {
+               ctdb_set_error(client->ctdb, "Bad CTDB version 0x%x rejected in daemon\n", hdr->ctdb_version);
+               return;
+       }
+
+       DEBUG(DEBUG_DEBUG,(__location__ " client request %u of type %u length %u from "
+                "node %u to %u\n", hdr->reqid, hdr->operation, hdr->length,
+                hdr->srcnode, hdr->destnode));
+
+       /* it is the responsibility of the incoming packet function to free 'data' */
+       daemon_incoming_packet(client, hdr);
+}
+
+
+/*
+  talloc destructor for a client pid entry: take the entry off the
+  context's client_pids list, unless that list is already empty
+ */
+static int ctdb_clientpid_destructor(struct ctdb_client_pid_list *client_pid)
+{
+       struct ctdb_context *ctdb = client_pid->ctdb;
+
+       if (ctdb->client_pids == NULL) {
+               return 0;
+       }
+
+       DLIST_REMOVE(ctdb->client_pids, client_pid);
+       return 0;
+}
+
+
+/*
+  fd event handler for the daemon's listening unix domain socket
+
+  Accepts one pending connection, allocates the per-client state
+  (struct ctdb_client plus its entry on ctdb->client_pids), and sets up
+  a packet queue on the new fd that feeds ctdb_daemon_read_cb().
+
+  private_data is the struct ctdb_context that owns the socket.
+ */
+static void ctdb_accept_client(struct event_context *ev, struct fd_event *fde,
+                        uint16_t flags, void *private_data)
+{
+       struct sockaddr_un addr;
+       socklen_t len;
+       int fd;
+       struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+       struct ctdb_client *client;
+       struct ctdb_client_pid_list *client_pid;
+       pid_t peer_pid = 0;
+
+       memset(&addr, 0, sizeof(addr));
+       len = sizeof(addr);
+       fd = accept(ctdb->daemon.sd, (struct sockaddr *)&addr, &len);
+       if (fd == -1) {
+               return;
+       }
+
+       set_nonblocking(fd);
+       set_close_on_exec(fd);
+
+       DEBUG(DEBUG_DEBUG,(__location__ " Created SOCKET FD:%d to connected child\n", fd));
+
+       client = talloc_zero(ctdb, struct ctdb_client);
+       if (client == NULL) {
+               /* without client state the connection cannot be served;
+                  drop it rather than dereference NULL below */
+               DEBUG(DEBUG_ERR,("Failed to allocate client structure\n"));
+               close(fd);
+               return;
+       }
+
+       /* peer_pid stays 0 if the peer's pid cannot be determined */
+       if (ctdb_get_peer_pid(fd, &peer_pid) == 0) {
+               DEBUG(DEBUG_INFO,("Connected client with pid:%u\n", (unsigned)peer_pid));
+       }
+
+       client->ctdb = ctdb;
+       client->fd = fd;
+       client->client_id = ctdb_reqid_new(ctdb, client);
+       client->pid = peer_pid;
+
+       client_pid = talloc(client, struct ctdb_client_pid_list);
+       if (client_pid == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate client pid structure\n"));
+               close(fd);
+               talloc_free(client);
+               return;
+       }
+       client_pid->ctdb   = ctdb;
+       client_pid->pid    = peer_pid;
+       client_pid->client = client;
+
+       DLIST_ADD(ctdb->client_pids, client_pid);
+
+       client->queue = ctdb_queue_setup(ctdb, client, fd, CTDB_DS_ALIGNMENT,
+                                        ctdb_daemon_read_cb, client,
+                                        "client-%u", client->pid);
+
+       /* destructors are installed last, so the early-exit paths above
+          free plain memory only */
+       talloc_set_destructor(client, ctdb_client_destructor);
+       talloc_set_destructor(client_pid, ctdb_clientpid_destructor);
+       ctdb->num_clients++;
+}
+
+
+
+/*
+  create a unix domain socket, bind it to ctdb->daemon.name and start
+  listening on it
+
+  The socket is stored in ctdb->daemon.sd.  Returns 0 on success.  On
+  failure returns -1; if the socket had been created it is closed and
+  ctdb->daemon.sd is reset to -1.
+*/
+static int ux_socket_bind(struct ctdb_context *ctdb)
+{
+       struct sockaddr_un addr;
+
+       ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0);
+       if (ctdb->daemon.sd == -1) {
+               return -1;
+       }
+
+       /* sun_path must hold the whole name plus its terminating NUL.
+          Refuse an over-long name instead of truncating it: a truncated
+          path would no longer match the name used by unlink(), chown()
+          and chmod() below. */
+       if (strlen(ctdb->daemon.name) >= sizeof(addr.sun_path)) {
+               DEBUG(DEBUG_CRIT,("ctdb socket name '%s' is too long\n",
+                                 ctdb->daemon.name));
+               goto failed;
+       }
+
+       memset(&addr, 0, sizeof(addr));
+       addr.sun_family = AF_UNIX;
+       strncpy(addr.sun_path, ctdb->daemon.name, sizeof(addr.sun_path)-1);
+
+       /* First check if an old ctdbd might be running */
+       if (connect(ctdb->daemon.sd,
+                   (struct sockaddr *)&addr, sizeof(addr)) == 0) {
+               DEBUG(DEBUG_CRIT,
+                     ("Something is already listening on ctdb socket '%s'\n",
+                      ctdb->daemon.name));
+               goto failed;
+       }
+
+       /* Remove any old socket */
+       unlink(ctdb->daemon.name);
+
+       set_close_on_exec(ctdb->daemon.sd);
+       set_nonblocking(ctdb->daemon.sd);
+
+       if (bind(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
+               DEBUG(DEBUG_CRIT,("Unable to bind on ctdb socket '%s'\n", ctdb->daemon.name));
+               goto failed;
+       }
+
+       /* restrict the socket to the daemon's effective user */
+       if (chown(ctdb->daemon.name, geteuid(), getegid()) != 0 ||
+           chmod(ctdb->daemon.name, 0700) != 0) {
+               DEBUG(DEBUG_CRIT,("Unable to secure ctdb socket '%s'\n", ctdb->daemon.name));
+               goto failed;
+       }
+
+
+       if (listen(ctdb->daemon.sd, 100) != 0) {
+               DEBUG(DEBUG_CRIT,("Unable to listen on ctdb socket '%s'\n", ctdb->daemon.name));
+               goto failed;
+       }
+
+       return 0;
+
+failed:
+       close(ctdb->daemon.sd);
+       ctdb->daemon.sd = -1;
+       return -1;
+}
+
+/*
+  set the initial flags of the local node: clear DISCONNECTED, then
+  apply the start-as-disabled / start-as-stopped configuration
+ */
+static void initialise_node_flags (struct ctdb_context *ctdb)
+{
+       struct ctdb_node *self;
+
+       if (ctdb->pnn == -1) {
+               ctdb_fatal(ctdb, "PNN is set to -1 (unknown value)");
+       }
+
+       self = ctdb->nodes[ctdb->pnn];
+       self->flags &= ~NODE_FLAGS_DISCONNECTED;
+
+       /* configured to come up DISABLED? */
+       if (ctdb->start_as_disabled != 0) {
+               DEBUG(DEBUG_INFO, ("This node is configured to start in DISABLED state\n"));
+               self->flags |= NODE_FLAGS_DISABLED;
+       }
+
+       /* configured to come up STOPPED? */
+       if (ctdb->start_as_stopped != 0) {
+               DEBUG(DEBUG_INFO, ("This node is configured to start in STOPPED state\n"));
+               self->flags |= NODE_FLAGS_STOPPED;
+       }
+}
+
+/*
+  completion callback for the 'setup' event script
+
+  status is the result of the setup event; a non-zero status is handed
+  to ctdb_die().  Otherwise the "setup" notification is run, the run
+  state moves to FIRST_RECOVERY, a STARTUP control is broadcast, and
+  the recovery daemon and periodic events are started.
+
+  private_data is not used.
+ */
+static void ctdb_setup_event_callback(struct ctdb_context *ctdb, int status,
+                                     void *private_data)
+{
+       if (status != 0) {
+               ctdb_die(ctdb, "Failed to run setup event");
+       }
+       ctdb_run_notification_script(ctdb, "setup");
+
+       ctdb_set_runstate(ctdb, CTDB_RUNSTATE_FIRST_RECOVERY);
+
+       /* tell all other nodes we've just started up */
+       ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL,
+                                0, CTDB_CONTROL_STARTUP, 0,
+                                CTDB_CTRL_FLAG_NOREPLY,
+                                tdb_null, NULL, NULL);
+
+       /* Start the recovery daemon */
+       if (ctdb_start_recoverd(ctdb) != 0) {
+               DEBUG(DEBUG_ALERT,("Failed to start recovery daemon\n"));
+               exit(11);
+       }
+
+       ctdb_start_periodic_events(ctdb);
+}
+
+/* time of the last BEFORE_WAIT / AFTER_WAIT trace point; both start
+   out zero, which ctdb_tevent_trace() treats as "not seen yet" */
+static struct timeval tevent_before_wait_ts;
+static struct timeval tevent_after_wait_ts;
+
+/*
+  tevent trace callback used to spot a stalled main loop
+
+  At BEFORE_WAIT it logs if more than 3 seconds have passed since the
+  last AFTER_WAIT (handling events took too long).  At AFTER_WAIT it
+  logs if more than 3 seconds have passed since the last BEFORE_WAIT
+  (no event arrived for too long).  Only the process whose pid equals
+  ctdbd_pid does any of this; other processes return immediately.
+
+  private_data is not used.
+ */
+static void ctdb_tevent_trace(enum tevent_trace_point tp,
+                             void *private_data)
+{
+       struct timeval diff;
+       struct timeval now;
+
+       if (getpid() != ctdbd_pid) {
+               return;
+       }
+
+       now = timeval_current();
+
+       switch (tp) {
+       case TEVENT_TRACE_BEFORE_WAIT:
+               if (!timeval_is_zero(&tevent_after_wait_ts)) {
+                       diff = timeval_until(&tevent_after_wait_ts, &now);
+                       if (diff.tv_sec > 3) {
+                               /* tv_sec is a time_t, which need not be
+                                  a long: cast to match %ld */
+                               DEBUG(DEBUG_ERR,
+                                     ("Handling event took %ld seconds!\n",
+                                      (long)diff.tv_sec));
+                       }
+               }
+               tevent_before_wait_ts = now;
+               break;
+
+       case TEVENT_TRACE_AFTER_WAIT:
+               if (!timeval_is_zero(&tevent_before_wait_ts)) {
+                       diff = timeval_until(&tevent_before_wait_ts, &now);
+                       if (diff.tv_sec > 3) {
+                               DEBUG(DEBUG_CRIT,
+                                     ("No event for %ld seconds!\n",
+                                      (long)diff.tv_sec));
+                       }
+               }
+               tevent_after_wait_ts = now;
+               break;
+
+       default:
+               /* Do nothing for future tevent trace points */ ;
+       }
+}
+
+/*
+  remove the daemon's PID file, if one is configured and the caller is
+  not a child process; the outcome is logged either way
+ */
+static void ctdb_remove_pidfile(void)
+{
+       if (ctdbd_pidfile == NULL || ctdb_is_child_process()) {
+               return;
+       }
+
+       if (unlink(ctdbd_pidfile) != 0) {
+               DEBUG(DEBUG_WARNING, ("Failed to Remove PID file %s\n",
+                                     ctdbd_pidfile));
+               return;
+       }
+
+       DEBUG(DEBUG_NOTICE, ("Removed PID file %s\n",
+                            ctdbd_pidfile));
+}
+
+/*
+  write 'pid' to the configured PID file and register its removal with
+  atexit(); does nothing when no PID file is configured and exits with
+  status 11 when the file cannot be opened
+ */
+static void ctdb_create_pidfile(pid_t pid)
+{
+       FILE *pidfp;
+
+       if (ctdbd_pidfile == NULL) {
+               return;
+       }
+
+       pidfp = fopen(ctdbd_pidfile, "w");
+       if (pidfp == NULL) {
+               DEBUG(DEBUG_ALERT,
+                     ("Failed to open PID file %s\n", ctdbd_pidfile));
+               exit(11);
+       }
+
+       fprintf(pidfp, "%d\n", pid);
+       fclose(pidfp);
+
+       DEBUG(DEBUG_NOTICE, ("Created PID file %s\n", ctdbd_pidfile));
+       atexit(ctdb_remove_pidfile);
+}
+
+/*
+  start the protocol going as a daemon
+
+  ctdb       - the daemon context to run
+  do_fork    - if true, fork(); the parent returns 0 straight away and
+               the child carries on as the daemon
+  use_syslog - if true, start_syslog_daemon() is called
+
+  Returns 0 in the parent when do_fork is set, and -1 if the transport
+  init function reports failure.  Otherwise the function ends by
+  entering the event loop, and calls exit(1) should that loop return.
+*/
+int ctdb_start_daemon(struct ctdb_context *ctdb, bool do_fork, bool use_syslog)
+{
+       int res, ret = -1;
+       struct fd_event *fde;
+       const char *domain_socket_name;
+
+       /* create a unix domain stream socket to listen to */
+       res = ux_socket_bind(ctdb);
+       if (res!=0) {
+               DEBUG(DEBUG_ALERT,("Cannot continue.  Exiting!\n"));
+               exit(10);
+       }
+
+       /* NOTE(review): fork() returning -1 is also non-zero, so a failed
+          fork takes the "parent" branch and reports success */
+       if (do_fork && fork()) {
+               return 0;
+       }
+
+       tdb_reopen_all(false);
+
+       if (do_fork) {
+               setsid();
+               /* re-point stdin at /dev/null; open() must hand back
+                  descriptor 0, which was just closed */
+               close(0);
+               if (open("/dev/null", O_RDONLY) != 0) {
+                       DEBUG(DEBUG_ALERT,(__location__ " Failed to setup stdin on /dev/null\n"));
+                       exit(11);
+               }
+       }
+       block_signal(SIGPIPE);
+
+       ctdbd_pid = getpid();
+       ctdb->ctdbd_pid = ctdbd_pid;
+       DEBUG(DEBUG_ERR, ("Starting CTDBD (Version %s) as PID: %u\n",
+                         CTDB_VERSION_STRING, ctdbd_pid));
+       ctdb_create_pidfile(ctdb->ctdbd_pid);
+
+       /* Make sure we log something when the daemon terminates.
+        * This must be the first exit handler to run (so the last to
+        * be registered).
+        */
+       atexit(print_exit_message);
+
+       if (ctdb->do_setsched) {
+               /* try to set us up as realtime */
+               ctdb_set_scheduler(ctdb);
+       }
+
+       /* ensure the socket is deleted on exit of the daemon */
+       /* NOTE(review): the name is only duplicated here and is not used
+          again in this function - confirm where the socket is actually
+          unlinked */
+       domain_socket_name = talloc_strdup(talloc_autofree_context(), ctdb->daemon.name);
+       if (domain_socket_name == NULL) {
+               DEBUG(DEBUG_ALERT,(__location__ " talloc_strdup failed.\n"));
+               exit(12);
+       }
+
+       ctdb->ev = event_context_init(NULL);
+       tevent_loop_allow_nesting(ctdb->ev);
+       tevent_set_trace_callback(ctdb->ev, ctdb_tevent_trace, NULL);
+       ret = ctdb_init_tevent_logging(ctdb);
+       if (ret != 0) {
+               DEBUG(DEBUG_ALERT,("Failed to initialize TEVENT logging\n"));
+               exit(1);
+       }
+
+       /* set up a handler to pick up sigchld */
+       if (ctdb_init_sigchld(ctdb) == NULL) {
+               DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD\n"));
+               exit(1);
+       }
+
+       ctdb_set_child_logging(ctdb);
+       if (use_syslog) {
+               if (start_syslog_daemon(ctdb)) {
+                       DEBUG(DEBUG_CRIT, ("Failed to start syslog daemon\n"));
+                       exit(10);
+               }
+       }
+
+       /* initialize statistics collection */
+       ctdb_statistics_init(ctdb);
+
+       /* force initial recovery for election */
+       ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+
+       ctdb_set_runstate(ctdb, CTDB_RUNSTATE_INIT);
+       ret = ctdb_event_script(ctdb, CTDB_EVENT_INIT);
+       if (ret != 0) {
+               ctdb_die(ctdb, "Failed to run init event\n");
+       }
+       ctdb_run_notification_script(ctdb, "init");
+
+       /* pick the transport by name.  An unrecognised name leaves ret
+          as set by the init event above (0, assuming ctdb_die() does
+          not return) and is caught by the ctdb->methods check below */
+       if (strcmp(ctdb->transport, "tcp") == 0) {
+               int ctdb_tcp_init(struct ctdb_context *);
+               ret = ctdb_tcp_init(ctdb);
+       }
+#ifdef USE_INFINIBAND
+       if (strcmp(ctdb->transport, "ib") == 0) {
+               int ctdb_ibw_init(struct ctdb_context *);
+               ret = ctdb_ibw_init(ctdb);
+       }
+#endif
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to initialise transport '%s'\n", ctdb->transport));
+               return -1;
+       }
+
+       if (ctdb->methods == NULL) {
+               DEBUG(DEBUG_ALERT,(__location__ " Can not initialize transport. ctdb->methods is NULL\n"));
+               ctdb_fatal(ctdb, "transport is unavailable. can not initialize.");
+       }
+
+       /* initialise the transport  */
+       if (ctdb->methods->initialise(ctdb) != 0) {
+               ctdb_fatal(ctdb, "transport failed to initialise");
+       }
+
+       initialise_node_flags(ctdb);
+
+       if (ctdb->public_addresses_file) {
+               ret = ctdb_set_public_addresses(ctdb, true);
+               if (ret == -1) {
+                       DEBUG(DEBUG_ALERT,("Unable to setup public address list\n"));
+                       exit(1);
+               }
+               if (ctdb->do_checkpublicip) {
+                       ctdb_start_monitoring_interfaces(ctdb);
+               }
+       }
+
+
+       /* attach to existing databases */
+       if (ctdb_attach_databases(ctdb) != 0) {
+               ctdb_fatal(ctdb, "Failed to attach to databases\n");
+       }
+
+       /* start frozen, then let the first election sort things out */
+       if (!ctdb_blocking_freeze(ctdb)) {
+               ctdb_fatal(ctdb, "Failed to get initial freeze\n");
+       }
+
+       /* now start accepting clients, only can do this once frozen */
+       fde = event_add_fd(ctdb->ev, ctdb, ctdb->daemon.sd, 
+                          EVENT_FD_READ,
+                          ctdb_accept_client, ctdb);
+       if (fde == NULL) {
+               ctdb_fatal(ctdb, "Failed to add daemon socket to event loop");
+       }
+       tevent_fd_set_auto_close(fde);
+
+       /* release any IPs we hold from previous runs of the daemon */
+       if (ctdb->tunable.disable_ip_failover == 0) {
+               ctdb_release_all_ips(ctdb);
+       }
+
+       /* Start the transport */
+       if (ctdb->methods->start(ctdb) != 0) {
+               DEBUG(DEBUG_ALERT,("transport failed to start!\n"));
+               ctdb_fatal(ctdb, "transport failed to start");
+       }
+
+       /* Recovery daemon and timed events are started from the
+        * callback, only after the setup event completes
+        * successfully.
+        */
+       ctdb_set_runstate(ctdb, CTDB_RUNSTATE_SETUP);
+       ret = ctdb_event_script_callback(ctdb,
+                                        ctdb,
+                                        ctdb_setup_event_callback,
+                                        ctdb,
+                                        false,
+                                        CTDB_EVENT_SETUP,
+                                        "%s",
+                                        "");
+       if (ret != 0) {
+               DEBUG(DEBUG_CRIT,("Failed to set up 'setup' event\n"));
+               exit(1);
+       }
+
+       ctdb_lockdown_memory(ctdb);
+         
+       /* go into a wait loop to allow other nodes to complete */
+       event_loop_wait(ctdb->ev);
+
+       DEBUG(DEBUG_CRIT,("event_loop_wait() returned. this should not happen\n"));
+       exit(1);
+}
+
+/*
+  allocate and pre-fill a packet for daemon<->daemon communication
+
+  The packet is obtained from the transport's allocate_pkt() method
+  with room for max(length, slength) bytes rounded up to
+  CTDB_DS_ALIGNMENT.  The first slength bytes are zeroed and the
+  common header fields are filled in.  'type' becomes the talloc name
+  of the packet.
+
+  Returns NULL if no transport is set up or the allocation fails.
+ */
+struct ctdb_req_header *_ctdb_transport_allocate(struct ctdb_context *ctdb,
+                                                TALLOC_CTX *mem_ctx, 
+                                                enum ctdb_operation operation, 
+                                                size_t length, size_t slength,
+                                                const char *type)
+{
+       struct ctdb_req_header *pkt;
+       int alloc_size;
+
+       /* never allocate less than the fixed part of the structure */
+       if (slength > length) {
+               length = slength;
+       }
+       alloc_size = (length+(CTDB_DS_ALIGNMENT-1)) & ~(CTDB_DS_ALIGNMENT-1);
+
+       if (ctdb->methods == NULL) {
+               DEBUG(DEBUG_INFO,(__location__ " Unable to allocate transport packet for operation %u of length %u. Transport is DOWN.\n",
+                        operation, (unsigned)length));
+               return NULL;
+       }
+
+       pkt = (struct ctdb_req_header *)ctdb->methods->allocate_pkt(mem_ctx, alloc_size);
+       if (pkt == NULL) {
+               DEBUG(DEBUG_ERR,("Unable to allocate transport packet for operation %u of length %u\n",
+                        operation, (unsigned)length));
+               return NULL;
+       }
+
+       talloc_set_name_const(pkt, type);
+       memset(pkt, 0, slength);
+
+       /* fields common to every packet */
+       pkt->ctdb_magic   = CTDB_MAGIC;
+       pkt->ctdb_version = CTDB_VERSION;
+       pkt->operation    = operation;
+       pkt->length       = length;
+       pkt->srcnode      = ctdb->pnn;
+       pkt->generation   = ctdb->vnn_map->generation;
+
+       return pkt;
+}
+
+/*
+  state of one control request that a local client sent through the
+  daemon; lives until the reply has been queued back to the client
+ */
+struct daemon_control_state {
+       /* links for the destination node's pending_controls list */
+       struct daemon_control_state *next, *prev;
+       /* the client that issued the control and receives the reply */
+       struct ctdb_client *client;
+       /* the original request packet, owned by this state */
+       struct ctdb_req_control *c;
+       /* request id from the client's packet, echoed in the reply */
+       uint32_t reqid;
+       /* destination node, or NULL if the destination is not a valid pnn */
+       struct ctdb_node *node;
+};
+
+/*
+  callback when a control reply comes in
+
+  Builds a CTDB_REPLY_CONTROL packet carrying 'status', the reply
+  'data' and, if present, 'errormsg' (appended after the data, without
+  a terminating NUL), and queues it to the client recorded in the
+  state.  private_data is the struct daemon_control_state of the
+  request; it is freed here unless queueing the reply returns -1.
+ */
+static void daemon_control_callback(struct ctdb_context *ctdb,
+                                   int32_t status, TDB_DATA data,
+                                   const char *errormsg,
+                                   void *private_data)
+{
+       struct daemon_control_state *state = talloc_get_type(private_data,
+                                                            struct daemon_control_state);
+       struct ctdb_client *client = state->client;
+       struct ctdb_reply_control *r;
+       size_t len;
+       int ret;
+
+       /* construct a message to send to the client containing the data */
+       len = offsetof(struct ctdb_reply_control, data) + data.dsize;
+       if (errormsg) {
+               len += strlen(errormsg);
+       }
+       r = ctdbd_allocate_pkt(ctdb, state, CTDB_REPLY_CONTROL, len,
+                              struct ctdb_reply_control);
+       CTDB_NO_MEMORY_VOID(ctdb, r);
+
+       r->hdr.reqid     = state->reqid;
+       r->status        = status;
+       r->datalen       = data.dsize;
+       r->errorlen = 0;
+       /* callers such as ctdb_daemon_cancel_controls() pass tdb_null;
+          memcpy() must not be given a NULL source even for 0 bytes */
+       if (data.dsize > 0) {
+               memcpy(&r->data[0], data.dptr, data.dsize);
+       }
+       if (errormsg) {
+               r->errorlen = strlen(errormsg);
+               memcpy(&r->data[r->datalen], errormsg, r->errorlen);
+       }
+
+       ret = daemon_queue_send(client, &r->hdr);
+       if (ret != -1) {
+               talloc_free(state);
+       }
+}
+
+/*
+  fail every control still pending for a node that has disconnected:
+  each one is taken off the node's list and answered with status -1
+  and the error text "node is disconnected"
+ */
+void ctdb_daemon_cancel_controls(struct ctdb_context *ctdb, struct ctdb_node *node)
+{
+       for (;;) {
+               struct daemon_control_state *pending = node->pending_controls;
+
+               if (pending == NULL) {
+                       break;
+               }
+
+               DLIST_REMOVE(node->pending_controls, pending);
+               daemon_control_callback(ctdb, (uint32_t)-1, tdb_null, 
+                                       "node is disconnected", pending);
+       }
+}
+
+/*
+  talloc destructor for a daemon_control_state: if the state is tied
+  to a destination node, unlink it from that node's pending list
+ */
+static int daemon_control_destructor(struct daemon_control_state *state)
+{
+       struct ctdb_node *dest = state->node;
+
+       if (dest != NULL) {
+               DLIST_REMOVE(dest->pending_controls, state);
+       }
+
+       return 0;
+}
+
+/*
+  this is called when the ctdb daemon received a ctdb request control
+  from a local client over the unix domain socket
+
+  A daemon_control_state is created for the request (taking ownership
+  of the packet 'c') and the control is passed to
+  ctdb_daemon_send_control() with daemon_control_callback() as the
+  reply handler.  For CTDB_CTRL_FLAG_NOREPLY requests the state is
+  parked on a temporary context and freed before returning.
+ */
+static void daemon_request_control_from_client(struct ctdb_client *client,
+                                              struct ctdb_req_control *c)
+{
+       TDB_DATA data;
+       int res;
+       struct daemon_control_state *state;
+       TALLOC_CTX *tmp_ctx;
+
+       if (c->hdr.destnode == CTDB_CURRENT_NODE) {
+               c->hdr.destnode = client->ctdb->pnn;
+       }
+
+       state = talloc(client, struct daemon_control_state);
+       CTDB_NO_MEMORY_VOID(client->ctdb, state);
+
+       /* created only after 'state' exists, so that the early return
+          above cannot leave a stray temporary context on the client */
+       tmp_ctx = talloc_new(client);
+       if (tmp_ctx == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory allocating temporary context\n"));
+               talloc_free(state);
+               return;
+       }
+
+       state->client = client;
+       state->c = talloc_steal(state, c);
+       state->reqid = c->hdr.reqid;
+       if (ctdb_validate_pnn(client->ctdb, c->hdr.destnode)) {
+               state->node = client->ctdb->nodes[c->hdr.destnode];
+               DLIST_ADD(state->node->pending_controls, state);
+       } else {
+               state->node = NULL;
+       }
+
+       talloc_set_destructor(state, daemon_control_destructor);
+
+       /* no reply will come back to free the state, so tie its
+          lifetime to this function */
+       if (c->flags & CTDB_CTRL_FLAG_NOREPLY) {
+               talloc_steal(tmp_ctx, state);
+       }
+
+       data.dptr = &c->data[0];
+       data.dsize = c->datalen;
+       res = ctdb_daemon_send_control(client->ctdb, c->hdr.destnode,
+                                      c->srvid, c->opcode, client->client_id,
+                                      c->flags,
+                                      data, daemon_control_callback,
+                                      state);
+       if (res != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to send control to remote node %u\n",
+                        c->hdr.destnode));
+       }
+
+       talloc_free(tmp_ctx);
+}
+
+/*
+  register a call function
+
+  Adds 'fn' under call id 'id' to the database identified by db_id.
+  Returns 0 on success, -1 if the database is not known or memory for
+  the registration cannot be allocated.
+*/
+int ctdb_daemon_set_call(struct ctdb_context *ctdb, uint32_t db_id,
+                        ctdb_fn_t fn, int id)
+{
+       struct ctdb_registered_call *call;
+       struct ctdb_db_context *ctdb_db;
+
+       ctdb_db = find_ctdb_db(ctdb, db_id);
+       if (ctdb_db == NULL) {
+               return -1;
+       }
+
+       call = talloc(ctdb_db, struct ctdb_registered_call);
+       if (call == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to allocate registered call\n"));
+               return -1;
+       }
+       call->fn = fn;
+       call->id = id;
+
+       DLIST_ADD(ctdb_db->calls, call);
+       return 0;
+}
+
+
+
+/*
+  this local messaging handler is ugly, but is needed to prevent
+  recursion in ctdb_send_message() when the destination node is the
+  same as the source node
+
+  One instance holds a message addressed to the local node until the
+  event loop runs ctdb_local_message_trigger() for it.
+ */
+struct ctdb_local_message {
+       /* context the message is dispatched on */
+       struct ctdb_context *ctdb;
+       /* service id the message is addressed to */
+       uint64_t srvid;
+       /* private copy of the payload, allocated as a child of this struct */
+       TDB_DATA data;
+};
+
+/*
+  timed event handler that delivers a queued local message: dispatch
+  it, log a failure, and free the message in either case
+ */
+static void ctdb_local_message_trigger(struct event_context *ev, struct timed_event *te, 
+                                      struct timeval t, void *private_data)
+{
+       struct ctdb_local_message *msg;
+
+       msg = talloc_get_type(private_data, struct ctdb_local_message);
+
+       if (ctdb_dispatch_message(msg->ctdb, msg->srvid, msg->data) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Failed to dispatch message for srvid=%llu\n", 
+                         (unsigned long long)msg->srvid));
+       }
+
+       talloc_free(msg);
+}
+
+/*
+  queue a message addressed to the local node
+
+  The payload is copied and delivery is deferred to a zero-timeout
+  timed event, so the dispatch does not happen from inside the
+  caller.  Returns 0 once the event is scheduled, -1 if the copy or
+  the event cannot be set up.
+ */
+static int ctdb_local_message(struct ctdb_context *ctdb, uint64_t srvid, TDB_DATA data)
+{
+       struct ctdb_local_message *m;
+       m = talloc(ctdb, struct ctdb_local_message);
+       CTDB_NO_MEMORY(ctdb, m);
+
+       m->ctdb = ctdb;
+       m->srvid = srvid;
+       m->data  = data;
+       m->data.dptr = talloc_memdup(m, m->data.dptr, m->data.dsize);
+       if (m->data.dptr == NULL) {
+               talloc_free(m);
+               return -1;
+       }
+
+       /* this needs to be done as an event to prevent recursion */
+       if (event_add_timed(ctdb->ev, m, timeval_zero(),
+                           ctdb_local_message_trigger, m) == NULL) {
+               /* without the event nothing would ever deliver or
+                  free the message */
+               DEBUG(DEBUG_ERR,(__location__ " Failed to schedule local message for srvid=%llu\n",
+                                (unsigned long long)srvid));
+               talloc_free(m);
+               return -1;
+       }
+       return 0;
+}
+
+/*
+  send a ctdb message to node 'pnn' for service 'srvid'
+
+  Messages for the local node go through ctdb_local_message(); all
+  others are copied into a CTDB_REQ_MESSAGE packet and queued on the
+  transport.  Returns -1 when no transport is set up, otherwise the
+  result of the local path or 0 once the packet has been queued.
+*/
+int ctdb_daemon_send_message(struct ctdb_context *ctdb, uint32_t pnn,
+                            uint64_t srvid, TDB_DATA data)
+{
+       int pkt_len;
+       struct ctdb_req_message *r;
+
+       if (ctdb->methods == NULL) {
+               DEBUG(DEBUG_INFO,(__location__ " Failed to send message. Transport is DOWN\n"));
+               return -1;
+       }
+
+       /* a message to ourselves never touches the transport */
+       if (ctdb->pnn == pnn) {
+               return ctdb_local_message(ctdb, srvid, data);
+       }
+
+       pkt_len = offsetof(struct ctdb_req_message, data) + data.dsize;
+       r = ctdb_transport_allocate(ctdb, ctdb, CTDB_REQ_MESSAGE, pkt_len,
+                                   struct ctdb_req_message);
+       CTDB_NO_MEMORY(ctdb, r);
+
+       r->datalen       = data.dsize;
+       r->srvid         = srvid;
+       r->hdr.destnode  = pnn;
+       memcpy(&r->data[0], data.dptr, data.dsize);
+
+       ctdb_queue_packet(ctdb, &r->hdr);
+       talloc_free(r);
+
+       return 0;
+}
+
+
+
+/*
+  one notification registered by a client; the registered message is
+  sent from the destructor when the entry is freed
+ */
+struct ctdb_client_notify_list {
+       /* links for the owning client's notify list */
+       struct ctdb_client_notify_list *next, *prev;
+       /* context used to send the notification */
+       struct ctdb_context *ctdb;
+       /* service id the notification is sent to */
+       uint64_t srvid;
+       /* payload of the notification message */
+       TDB_DATA data;
+};
+
+
+/*
+  talloc destructor for a registered client notification: broadcast
+  the stored message to all connected nodes.  A send failure is only
+  logged; the destructor always lets the free go ahead.
+ */
+static int ctdb_client_notify_destructor(struct ctdb_client_notify_list *nl)
+{
+       DEBUG(DEBUG_ERR,("Sending client notify message for srvid:%llu\n", (unsigned long long)nl->srvid));
+
+       if (ctdb_daemon_send_message(nl->ctdb, CTDB_BROADCAST_CONNECTED,
+                                    (unsigned long long)nl->srvid, nl->data) != 0) {
+               DEBUG(DEBUG_ERR,("Failed to send client notify message\n"));
+       }
+
+       return 0;
+}
+
+/*
+  control handler: register a notification for a local client
+
+  indata holds a struct ctdb_client_notify_register (srvid, len, then
+  len bytes of notify_data).  The payload is copied onto a new entry
+  in the client's notify list; the message is sent when that entry is
+  destroyed.  Returns 0 on success and -1 if the data is malformed,
+  the client is unknown, the srvid is already registered for this
+  client, or memory runs out.
+ */
+int32_t ctdb_control_register_notify(struct ctdb_context *ctdb, uint32_t client_id, TDB_DATA indata)
+{
+       struct ctdb_client_notify_register *notify = (struct ctdb_client_notify_register *)indata.dptr;
+       struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
+       struct ctdb_client_notify_list *nl;
+
+       /* validate the size before touching any field of 'notify' */
+       if (indata.dsize < offsetof(struct ctdb_client_notify_register, notify_data)) {
+               DEBUG(DEBUG_ERR,(__location__ " Too little data in control : %d\n", (int)indata.dsize));
+               return -1;
+       }
+
+       /* compare by subtraction: dsize is known to cover the fixed
+          part, so this cannot wrap the way len + offset could */
+       if (indata.dsize - offsetof(struct ctdb_client_notify_register, notify_data) != notify->len) {
+               DEBUG(DEBUG_ERR,(__location__ " Wrong amount of data in control. Got %d, expected %d\n", (int)indata.dsize, (int)(notify->len + offsetof(struct ctdb_client_notify_register, notify_data))));
+               return -1;
+       }
+
+       DEBUG(DEBUG_INFO,("Register srvid %llu for client %u\n", (unsigned long long)notify->srvid, (unsigned)client_id));
+
+       if (client == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Could not find client parent structure. You can not send this control to a remote node\n"));
+               return -1;
+       }
+
+       for(nl=client->notify; nl; nl=nl->next) {
+               if (nl->srvid == notify->srvid) {
+                       break;
+               }
+       }
+       if (nl != NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Notification for srvid:%llu already exists for this client\n", (unsigned long long)notify->srvid));
+               return -1;
+       }
+
+       nl = talloc(client, struct ctdb_client_notify_list);
+       CTDB_NO_MEMORY(ctdb, nl);
+       nl->ctdb       = ctdb;
+       nl->srvid      = notify->srvid;
+       nl->data.dsize = notify->len;
+       nl->data.dptr  = talloc_size(nl, nl->data.dsize);
+       if (nl->data.dptr == NULL) {
+               /* the destructor is not installed yet, so freeing the
+                  half-built entry does not send a notification */
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory copying notify data\n"));
+               talloc_free(nl);
+               return -1;
+       }
+       memcpy(nl->data.dptr, notify->notify_data, nl->data.dsize);
+
+       DLIST_ADD(client->notify, nl);
+       talloc_set_destructor(nl, ctdb_client_notify_destructor);
+
+       return 0;
+}
+
+/*
+  control handler: remove a notification registered by a local client
+
+  indata holds a struct ctdb_client_notify_deregister naming the
+  srvid.  The matching entry is taken off the client's notify list and
+  freed with its destructor cleared, so no notification is sent.
+  Returns 0 on success and -1 if the data is too short, the client is
+  unknown or no such registration exists.
+ */
+int32_t ctdb_control_deregister_notify(struct ctdb_context *ctdb, uint32_t client_id, TDB_DATA indata)
+{
+       struct ctdb_client_notify_deregister *notify = (struct ctdb_client_notify_deregister *)indata.dptr;
+       struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
+       struct ctdb_client_notify_list *nl;
+
+       /* validate the size before touching any field of 'notify' */
+       if (indata.dsize < sizeof(struct ctdb_client_notify_deregister)) {
+               DEBUG(DEBUG_ERR,(__location__ " Too little data in control : %d\n", (int)indata.dsize));
+               return -1;
+       }
+
+       DEBUG(DEBUG_INFO,("Deregister srvid %llu for client %u\n", (unsigned long long)notify->srvid, (unsigned)client_id));
+
+       if (client == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Could not find client parent structure. You can not send this control to a remote node\n"));
+               return -1;
+       }
+
+       for(nl=client->notify; nl; nl=nl->next) {
+               if (nl->srvid == notify->srvid) {
+                       break;
+               }
+       }
+       if (nl == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " No notification for srvid:%llu found for this client\n", (unsigned long long)notify->srvid));
+               return -1;
+       }
+
+       DLIST_REMOVE(client->notify, nl);
+       /* clear the destructor first: a deliberate deregistration must
+          not fire the notification */
+       talloc_set_destructor(nl, NULL);
+       talloc_free(nl);
+
+       return 0;
+}
+
+/*
+  look up the client connection recorded for process 'pid'
+
+  Walks ctdb->client_pids and returns the client of the first entry
+  with that pid, or NULL if there is none.
+ */
+struct ctdb_client *ctdb_find_client_by_pid(struct ctdb_context *ctdb, pid_t pid)
+{
+       struct ctdb_client_pid_list *entry = ctdb->client_pids;
+
+       while (entry != NULL) {
+               if (entry->pid == pid) {
+                       return entry->client;
+               }
+               entry = entry->next;
+       }
+
+       return NULL;
+}
+
+
+/* This control is used by samba when probing whether a process (of a
+   samba daemon) exists on the node.
+   Samba does this when it wants to check whether a sub-record in one
+   of the databases is still valid, or is stale and can be removed.
+   If this node is banned or stopped, we drop the client connection
+   belonging to that pid (if there is one) and tell the caller that
+   the process does not exist.
+   This allows sub-records registered by samba processes on banned and
+   stopped nodes to be forcefully recalled.
+   On any other node the answer is the result of kill(pid, 0).
+*/
+int32_t ctdb_control_process_exists(struct ctdb_context *ctdb, pid_t pid)
+{
+       struct ctdb_client *client;
+
+       if (!(ctdb->nodes[ctdb->pnn]->flags & (NODE_FLAGS_BANNED|NODE_FLAGS_STOPPED))) {
+               return kill(pid, 0);
+       }
+
+       client = ctdb_find_client_by_pid(ctdb, pid);
+       if (client != NULL) {
+               DEBUG(DEBUG_NOTICE,(__location__ " Killing client with pid:%d on banned/stopped node\n", (int)pid));
+               talloc_free(client);
+       }
+
+       return -1;
+}
+
+/*
+  run the daemon's shutdown sequence and exit with 'exit_code'
+
+  If the run state is already SHUTDOWN the call is logged and returns
+  without doing anything; this is the only path that returns.
+  Otherwise the run state is set to SHUTDOWN, the recovery daemon,
+  keepalives and monitoring are stopped, all IPs are released, the
+  shutdown event script is run and the transport (if any) is shut
+  down before calling exit().
+ */
+void ctdb_shutdown_sequence(struct ctdb_context *ctdb, int exit_code)
+{
+       /* guard against re-entry while a shutdown is already under way */
+       if (ctdb->runstate == CTDB_RUNSTATE_SHUTDOWN) {
+               DEBUG(DEBUG_NOTICE,("Already shutting down so will not proceed.\n"));
+               return;
+       }
+
+       DEBUG(DEBUG_NOTICE,("Shutdown sequence commencing.\n"));
+       ctdb_set_runstate(ctdb, CTDB_RUNSTATE_SHUTDOWN);
+       ctdb_stop_recoverd(ctdb);
+       ctdb_stop_keepalive(ctdb);
+       ctdb_stop_monitoring(ctdb);
+       ctdb_release_all_ips(ctdb);
+       /* the result of the shutdown event is not checked */
+       ctdb_event_script(ctdb, CTDB_EVENT_SHUTDOWN);
+       if (ctdb->methods != NULL) {
+               ctdb->methods->shutdown(ctdb);
+       }
+
+       DEBUG(DEBUG_NOTICE,("Shutdown sequence complete, exiting.\n"));
+       exit(exit_code);
+}
diff --git a/ctdb/server/ctdb_freeze.c b/ctdb/server/ctdb_freeze.c
new file mode 100644 (file)
index 0000000..fee44d4
--- /dev/null
@@ -0,0 +1,485 @@
+/* 
+   ctdb freeze handling
+
+   Copyright (C) Andrew Tridgell  2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "includes.h"
+#include "tdb.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "../include/ctdb_private.h"
+#include "lib/util/dlinklist.h"
+#include "db_wrap.h"
+#include "../common/rb_tree.h"
+
+/*
+  a list of control requests waiting for a freeze lock child to get
+  the database locks
+
+  A waiter is created by ctdb_control_freeze() as a talloc child of the
+  freeze handle. The reply to the control is sent from the waiter's
+  destructor, so freeing a waiter (directly or by freeing the handle)
+  answers the request with the current value of 'status'.
+ */
+struct ctdb_freeze_waiter {
+       /* list linkage used by the DLIST_* macros */
+       struct ctdb_freeze_waiter *next, *prev;
+       struct ctdb_context *ctdb;
+       /* the control being answered; talloc_steal'ed onto the waiter */
+       struct ctdb_req_control *c;
+       /* database priority the freeze was requested for */
+       uint32_t priority;
+       /* status put in the reply: starts at -1, set to 0 once frozen */
+       int32_t status;
+};
+
+/* a handle to a freeze lock child process
+
+   One handle exists per database priority while a freeze is pending or
+   in place (ctdb->freeze_handles[priority]). Freeing the handle runs
+   ctdb_freeze_handle_destructor(), which undoes the freeze.
+*/
+struct ctdb_freeze_handle {
+       struct ctdb_context *ctdb;
+       /* database priority this handle freezes */
+       uint32_t priority;
+       /* lock request returned by ctdb_lock_alldb_prio() */
+       struct lock_request *lreq;
+       /* freeze controls waiting for the locks to be obtained */
+       struct ctdb_freeze_waiter *waiters;
+};
+
+/*
+  talloc destructor for a freeze handle
+
+  Cancels any recovery transaction still open on the databases of this
+  handle's priority, resets the freeze bookkeeping for that priority and
+  releases the lock request (and its lock context) that belongs to the
+  handle. Always returns 0 so the free goes ahead.
+ */
+static int ctdb_freeze_handle_destructor(struct ctdb_freeze_handle *h)
+{
+       struct ctdb_context *ctdb = h->ctdb;
+       struct ctdb_db_context *db;
+
+       DEBUG(DEBUG_ERR,("Release freeze handler for prio %u\n", h->priority));
+
+       if (ctdb->freeze_transaction_started) {
+               /* abort transactions, but only on databases of our priority */
+               for (db = ctdb->db_list; db != NULL; db = db->next) {
+                       if (db->priority == h->priority) {
+                               tdb_add_flags(db->ltdb->tdb, TDB_NOLOCK);
+                               if (tdb_transaction_cancel(db->ltdb->tdb) != 0) {
+                                       DEBUG(DEBUG_ERR,(__location__ " Failed to cancel transaction for db '%s'\n",
+                                                db->db_name));
+                               }
+                               tdb_remove_flags(db->ltdb->tdb, TDB_NOLOCK);
+                       }
+               }
+               ctdb->freeze_transaction_started = false;
+       }
+
+       /* this priority is no longer frozen and no longer has a handle */
+       ctdb->freeze_handles[h->priority] = NULL;
+       ctdb->freeze_mode[h->priority]    = CTDB_FREEZE_NONE;
+
+       ctdb_lock_free_request_context(h->lreq);
+       return 0;
+}
+
+/*
+  called when the child writes its status to us
+
+  This is the callback registered with ctdb_lock_alldb_prio() by
+  ctdb_start_freeze(). private_data is the freeze handle, 'locked' tells
+  whether the locks were obtained.
+
+  - called while already FROZEN: treated as the lock child having died,
+    the handle is freed (which unfreezes this priority)
+  - locked == false: the handle is freed; its waiters are talloc children
+    created with status -1
+  - otherwise the priority is marked FROZEN and every waiter is released
+    with status 0
+ */
+static void ctdb_freeze_lock_handler(void *private_data, bool locked)
+{
+       struct ctdb_freeze_handle *h = talloc_get_type_abort(private_data,
+                                                            struct ctdb_freeze_handle);
+       struct ctdb_freeze_waiter *w;
+
+       if (h->ctdb->freeze_mode[h->priority] == CTDB_FREEZE_FROZEN) {
+               DEBUG(DEBUG_INFO,("freeze child died - unfreezing\n"));
+               talloc_free(h);
+               return;
+       }
+
+       if (!locked) {
+               DEBUG(DEBUG_ERR,("Failed to get locks in ctdb_freeze_child\n"));
+               /* we didn't get the locks - destroy the handle */
+               talloc_free(h);
+               return;
+       }
+
+       h->ctdb->freeze_mode[h->priority] = CTDB_FREEZE_FROZEN;
+
+       /* notify the waiters */
+       if (h != h->ctdb->freeze_handles[h->priority]) {
+               DEBUG(DEBUG_ERR,("lockwait finished but h is not linked\n"));
+       }
+       /* freeing a waiter sends its reply from the waiter destructor */
+       while ((w = h->waiters)) {
+               w->status = 0;
+               DLIST_REMOVE(h->waiters, w);
+               talloc_free(w);
+       }
+}
+
+/*
+  destroy a waiter for a freeze mode change
+
+  talloc destructor: sends the reply for the stored control request using
+  the waiter's current status, then returns 0 so the free proceeds.
+ */
+static int ctdb_freeze_waiter_destructor(struct ctdb_freeze_waiter *w)
+{
+       ctdb_request_control_reply(w->ctdb, w->c, NULL, w->status, NULL);
+       return 0;
+}
+
+/*
+  start the freeze process for a certain priority
+
+  priority must be in 1..NUM_DB_PRIORITIES, anything else is reported
+  through ctdb_fatal(). Nothing is done if the priority is already frozen.
+  Otherwise vacuuming is stopped and, unless a freeze handle already exists
+  for this priority, a handle is created, an all-db lock request for the
+  priority is started and the freeze mode is set to PENDING. Completion is
+  reported asynchronously to ctdb_freeze_lock_handler().
+ */
+void ctdb_start_freeze(struct ctdb_context *ctdb, uint32_t priority)
+{
+       struct ctdb_freeze_handle *h;
+
+       if ((priority < 1) || (priority > NUM_DB_PRIORITIES)) {
+               DEBUG(DEBUG_ERR,(__location__ " Invalid db priority : %u\n", priority));
+               ctdb_fatal(ctdb, "Internal error");
+       }
+
+       if (ctdb->freeze_mode[priority] == CTDB_FREEZE_FROZEN) {
+               /* we're already frozen */
+               return;
+       }
+
+       DEBUG(DEBUG_ERR, ("Freeze priority %u\n", priority));
+
+       /* Stop any vacuuming going on: we don't want to wait. */
+       ctdb_stop_vacuuming(ctdb);
+
+       /* if there isn't a freeze lock child then create one */
+       if (ctdb->freeze_handles[priority] == NULL) {
+               h = talloc_zero(ctdb, struct ctdb_freeze_handle);
+               CTDB_NO_MEMORY_FATAL(ctdb, h);
+               h->ctdb = ctdb;
+               h->priority = priority;
+               talloc_set_destructor(h, ctdb_freeze_handle_destructor);
+
+               /* auto_mark is false: the handler does not get the locks marked for it */
+               h->lreq = ctdb_lock_alldb_prio(ctdb, priority, false, ctdb_freeze_lock_handler, h);
+               CTDB_NO_MEMORY_FATAL(ctdb, h->lreq);
+               ctdb->freeze_handles[priority] = h;
+               ctdb->freeze_mode[priority] = CTDB_FREEZE_PENDING;
+       }
+}
+
+/*
+  freeze the databases
+
+  Control handler. The priority to freeze is taken from c->srvid; 0 is
+  remapped to priority 1.
+
+  Returns -1 for an invalid priority, if no freeze handle exists after
+  starting the freeze, or (via CTDB_NO_MEMORY) if the waiter cannot be
+  allocated. Returns 0 immediately if the priority is already frozen.
+  Otherwise a waiter is queued on the freeze handle, *async_reply is set
+  to true and 0 is returned; the real reply is sent when the waiter is
+  freed.
+ */
+int32_t ctdb_control_freeze(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
+{
+       struct ctdb_freeze_waiter *w;
+       uint32_t priority;
+
+       priority = (uint32_t)c->srvid;
+
+       if (priority == 0) {
+               DEBUG(DEBUG_ERR,("Freeze priority 0 requested, remapping to priority 1\n"));
+               priority = 1;
+       }
+
+       /* after the remap above only the upper bound can still fail */
+       if ((priority < 1) || (priority > NUM_DB_PRIORITIES)) {
+               DEBUG(DEBUG_ERR,(__location__ " Invalid db priority : %u\n", priority));
+               return -1;
+       }
+
+       if (ctdb->freeze_mode[priority] == CTDB_FREEZE_FROZEN) {
+               DEBUG(DEBUG_ERR, ("Freeze priority %u\n", priority));
+               /* we're already frozen */
+               return 0;
+       }
+
+       ctdb_start_freeze(ctdb, priority);
+
+       /* add ourselves to list of waiters */
+       if (ctdb->freeze_handles[priority] == NULL) {
+               DEBUG(DEBUG_ERR,("No freeze lock handle when adding a waiter\n"));
+               return -1;
+       }
+
+       /* the waiter is a child of the handle: freeing the handle replies too */
+       w = talloc(ctdb->freeze_handles[priority], struct ctdb_freeze_waiter);
+       CTDB_NO_MEMORY(ctdb, w);
+       w->ctdb     = ctdb;
+       w->c        = talloc_steal(w, c);
+       w->priority = priority;
+       /* failure status unless the lock handler sets it to 0 */
+       w->status   = -1;
+       talloc_set_destructor(w, ctdb_freeze_waiter_destructor);
+       DLIST_ADD(ctdb->freeze_handles[priority]->waiters, w);
+
+       /* we won't reply till later */
+       *async_reply = true;
+       return 0;
+}
+
+
+/*
+  Freeze every database priority in ascending order and wait for each one
+  to leave the PENDING state before moving on; used during daemon startup.
+  The event loop is run while waiting. Always returns true.
+ */
+bool ctdb_blocking_freeze(struct ctdb_context *ctdb)
+{
+       int prio = 1;
+
+       while (prio <= NUM_DB_PRIORITIES) {
+               ctdb_start_freeze(ctdb, prio);
+
+               /* keep dispatching events until the freeze is resolved */
+               for (;;) {
+                       if (ctdb->freeze_mode[prio] != CTDB_FREEZE_PENDING) {
+                               break;
+                       }
+                       event_loop_once(ctdb->ev);
+               }
+
+               prio++;
+       }
+
+       return true;
+}
+
+
+/*
+  thaw one database priority
+
+  If a recovery transaction is marked as started, it is cancelled on every
+  attached database (not only those of this priority) and the flag is
+  cleared. The freeze handle of the priority, if any, is then freed; its
+  destructor resets the freeze mode and releases the lock request.
+ */
+static void thaw_priority(struct ctdb_context *ctdb, uint32_t priority)
+{
+       DEBUG(DEBUG_ERR,("Thawing priority %u\n", priority));
+
+       /* cancel any pending transactions */
+       if (ctdb->freeze_transaction_started) {
+               struct ctdb_db_context *ctdb_db;
+
+               for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
+                       tdb_add_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK);
+                       if (tdb_transaction_cancel(ctdb_db->ltdb->tdb) != 0) {
+                               DEBUG(DEBUG_ERR,(__location__ " Failed to cancel transaction for db '%s'\n",
+                                        ctdb_db->db_name));
+                       }
+                       tdb_remove_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK);
+               }
+       }
+       ctdb->freeze_transaction_started = false;
+
+#if 0
+       /* this hack can be used to get a copy of the databases at the end of a recovery */
+       system("mkdir -p /var/ctdb.saved; /usr/bin/rsync --delete -a /var/ctdb/ /var/ctdb.saved/$$ 2>&1 > /dev/null");
+#endif
+
+#if 0
+       /* and this one for local testing */
+       system("mkdir -p test.db.saved; /usr/bin/rsync --delete -a test.db/ test.db.saved/$$ 2>&1 > /dev/null");
+#endif
+
+       /* the handle destructor also clears freeze_handles[priority] */
+       if (ctdb->freeze_handles[priority] != NULL) {
+               talloc_free(ctdb->freeze_handles[priority]);
+               ctdb->freeze_handles[priority] = NULL;
+       }
+}
+
+/*
+  Thaw the databases of one priority, or of all priorities when
+  priority is 0, then resend all pending calls.
+
+  Returns -1 if priority is larger than NUM_DB_PRIORITIES, 0 otherwise.
+ */
+int32_t ctdb_control_thaw(struct ctdb_context *ctdb, uint32_t priority)
+{
+       uint32_t first = priority;
+       uint32_t last = priority;
+       uint32_t p;
+
+       if (priority > NUM_DB_PRIORITIES) {
+               DEBUG(DEBUG_ERR,(__location__ " Invalid db priority : %u\n", priority));
+               return -1;
+       }
+
+       /* priority 0 is the wildcard: cover the whole range */
+       if (priority == 0) {
+               first = 1;
+               last = NUM_DB_PRIORITIES;
+       }
+
+       for (p = first; p <= last; p++) {
+               thaw_priority(ctdb, p);
+       }
+
+       ctdb_call_resend_all(ctdb);
+       return 0;
+}
+
+
+/*
+  start a transaction on all databases - used for recovery
+
+  Requires every priority to be FROZEN, otherwise -1 is returned. For each
+  database a transaction left over from an earlier start (if the started
+  flag is set) is cancelled first, then a new transaction is started. All
+  tdb calls are made with TDB_NOLOCK set on the database.
+
+  On success the started flag is set, 'id' is remembered as the current
+  transaction id and 0 is returned. If starting a transaction fails, -1 is
+  returned at once; transactions already started on earlier databases in
+  the list are not rolled back here.
+ */
+int32_t ctdb_control_transaction_start(struct ctdb_context *ctdb, uint32_t id)
+{
+       struct ctdb_db_context *ctdb_db;
+       int i;
+
+       for (i=1;i<=NUM_DB_PRIORITIES; i++) {
+               if (ctdb->freeze_mode[i] != CTDB_FREEZE_FROZEN) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed transaction_start while not frozen\n"));
+                       return -1;
+               }
+       }
+
+       for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
+               int ret;
+
+               tdb_add_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK);
+
+               if (ctdb->freeze_transaction_started) {
+                       if (tdb_transaction_cancel(ctdb_db->ltdb->tdb) != 0) {
+                               DEBUG(DEBUG_ERR,(__location__ " Failed to cancel transaction for db '%s'\n",
+                                        ctdb_db->db_name));
+                               /* not a fatal error */
+                       }
+               }
+
+               ret = tdb_transaction_start(ctdb_db->ltdb->tdb);
+
+               tdb_remove_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK);
+
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to start transaction for db '%s'\n",
+                                ctdb_db->db_name));
+                       return -1;
+               }
+       }
+
+       ctdb->freeze_transaction_started = true;
+       ctdb->freeze_transaction_id = id;
+
+       return 0;
+}
+
+/*
+  cancel a transaction for all databases - used for recovery
+
+  Cancels the tdb transaction on every attached database with TDB_NOLOCK
+  set. A failed cancel is logged and otherwise ignored. The started flag
+  is cleared and 0 is always returned.
+ */
+int32_t ctdb_control_transaction_cancel(struct ctdb_context *ctdb)
+{
+       struct ctdb_db_context *ctdb_db;
+
+       DEBUG(DEBUG_ERR,(__location__ " recovery transaction cancelled called\n"));
+
+       for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
+               tdb_add_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK);
+
+               if (tdb_transaction_cancel(ctdb_db->ltdb->tdb) != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to cancel transaction for db '%s'\n",  ctdb_db->db_name));
+                       /* not a fatal error */
+               }
+
+               tdb_remove_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK);
+       }
+
+       ctdb->freeze_transaction_started = false;
+
+       return 0;
+}
+
+/*
+  commit transactions on all databases
+
+  Preconditions, each failing with -1: every priority is FROZEN, a
+  recovery transaction has been started, and 'id' matches the id given to
+  ctdb_control_transaction_start().
+
+  Each database is committed (with TDB_NOLOCK set) and its persistent
+  health is updated using the number of nodes whose flags are 0. If either
+  step fails for a database, the transactions on all databases are
+  cancelled, the started flag is cleared and -1 is returned.
+
+  On success the started flag and the transaction id are reset and 0 is
+  returned.
+ */
+int32_t ctdb_control_transaction_commit(struct ctdb_context *ctdb, uint32_t id)
+{
+       struct ctdb_db_context *ctdb_db;
+       int i;
+       int healthy_nodes = 0;
+
+       for (i=1;i<=NUM_DB_PRIORITIES; i++) {
+               if (ctdb->freeze_mode[i] != CTDB_FREEZE_FROZEN) {
+                       /* name the operation that actually failed */
+                       DEBUG(DEBUG_ERR,(__location__ " Failed transaction_commit while not frozen\n"));
+                       return -1;
+               }
+       }
+
+       if (!ctdb->freeze_transaction_started) {
+               DEBUG(DEBUG_ERR,(__location__ " transaction not started\n"));
+               return -1;
+       }
+
+       if (id != ctdb->freeze_transaction_id) {
+               DEBUG(DEBUG_ERR,(__location__ " incorrect transaction id 0x%x in commit\n", id));
+               return -1;
+       }
+
+       /* a node counts as healthy only when it has no flags set at all */
+       DEBUG(DEBUG_DEBUG,(__location__ " num_nodes[%d]\n", ctdb->num_nodes));
+       for (i=0; i < ctdb->num_nodes; i++) {
+               DEBUG(DEBUG_DEBUG,(__location__ " node[%d].flags[0x%X]\n",
+                                  i, ctdb->nodes[i]->flags));
+               if (ctdb->nodes[i]->flags == 0) {
+                       healthy_nodes++;
+               }
+       }
+       DEBUG(DEBUG_INFO,(__location__ " healthy_nodes[%d]\n", healthy_nodes));
+
+       for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
+               int ret;
+
+               tdb_add_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK);
+               ret = tdb_transaction_commit(ctdb_db->ltdb->tdb);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to commit transaction for db '%s'. Cancel all transactions and resetting transaction_started to false.\n",
+                                ctdb_db->db_name));
+                       /* TDB_NOLOCK is cleared again by the loop under 'fail' */
+                       goto fail;
+               }
+               tdb_remove_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK);
+
+               ret = ctdb_update_persistent_health(ctdb, ctdb_db, NULL, healthy_nodes);
+               if (ret != 0) {
+                       DEBUG(DEBUG_CRIT,(__location__ " Failed to update persistent health for db '%s'. "
+                                        "Cancel all remaining transactions and resetting transaction_started to false.\n",
+                                        ctdb_db->db_name));
+                       goto fail;
+               }
+       }
+
+       ctdb->freeze_transaction_started = false;
+       ctdb->freeze_transaction_id = 0;
+
+       return 0;
+
+fail:
+       /* cancel any pending transactions */
+       for (ctdb_db=ctdb->db_list;ctdb_db;ctdb_db=ctdb_db->next) {
+               tdb_add_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK);
+               if (tdb_transaction_cancel(ctdb_db->ltdb->tdb) != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to cancel transaction for db '%s'\n",
+                                ctdb_db->db_name));
+               }
+               tdb_remove_flags(ctdb_db->ltdb->tdb, TDB_NOLOCK);
+       }
+       ctdb->freeze_transaction_started = false;
+
+       return -1;
+}
+
+/*
+  wipe a database - only possible when in a frozen transaction
+
+  indata carries a struct ctdb_control_wipe_database naming the database
+  and the recovery transaction id.
+  NOTE(review): indata.dsize is not checked here - presumably the control
+  dispatcher validates the size before calling; confirm.
+
+  Returns -1 if the database is unknown, its priority is not FROZEN, no
+  recovery transaction is started, the transaction id does not match, the
+  wipe fails, or the delete queue cannot be re-created. Returns 0 on
+  success. For non-persistent databases the delete queue is discarded and
+  replaced by an empty one.
+ */
+int32_t ctdb_control_wipe_database(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+       struct ctdb_control_wipe_database w = *(struct ctdb_control_wipe_database *)indata.dptr;
+       struct ctdb_db_context *ctdb_db;
+
+       ctdb_db = find_ctdb_db(ctdb, w.db_id);
+       if (!ctdb_db) {
+               DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%x\n", w.db_id));
+               return -1;
+       }
+
+       if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
+               /* name the operation that actually failed */
+               DEBUG(DEBUG_ERR,(__location__ " Failed wipe_database while not frozen\n"));
+               return -1;
+       }
+
+       if (!ctdb->freeze_transaction_started) {
+               DEBUG(DEBUG_ERR,(__location__ " transaction not started\n"));
+               return -1;
+       }
+
+       if (w.transaction_id != ctdb->freeze_transaction_id) {
+               DEBUG(DEBUG_ERR,(__location__ " incorrect transaction id 0x%x in wipe_database\n", w.transaction_id));
+               return -1;
+       }
+
+       if (tdb_wipe_all(ctdb_db->ltdb->tdb) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to wipe database for db '%s'\n",
+                        ctdb_db->db_name));
+               return -1;
+       }
+
+       /* the queued deletions refer to records that no longer exist */
+       if (!ctdb_db->persistent) {
+               talloc_free(ctdb_db->delete_queue);
+               ctdb_db->delete_queue = trbt_create(ctdb_db, 0);
+               if (ctdb_db->delete_queue == NULL) {
+                       DEBUG(DEBUG_ERR, (__location__ " Failed to re-create "
+                                         "the vacuum tree.\n"));
+                       return -1;
+               }
+       }
+
+       return 0;
+}
diff --git a/ctdb/server/ctdb_keepalive.c b/ctdb/server/ctdb_keepalive.c
new file mode 100644 (file)
index 0000000..5c95eb0
--- /dev/null
@@ -0,0 +1,107 @@
+/* 
+   monitoring links to all other nodes to detect dead nodes
+
+
+   Copyright (C) Ronnie Sahlberg 2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "../include/ctdb_private.h"
+
+
+/*
+  see if any nodes are dead
+
+  Timed event handler. For every other node that is neither deleted nor
+  disconnected, the count of consecutive intervals without received
+  traffic (dead_count) is updated from rx_cnt and rx_cnt is reset. When
+  dead_count reaches the keepalive_limit tunable the node is declared
+  dead; in all cases a keepalive is sent to it. A disconnected node that
+  has received traffic is marked connected again. The handler re-arms
+  itself to run again after keepalive_interval seconds.
+ */
+static void ctdb_check_for_dead_nodes(struct event_context *ev, struct timed_event *te, 
+                                     struct timeval t, void *private_data)
+{
+       struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+       int i;
+
+       /* send a keepalive to all other nodes, unless they are deleted
+          or disconnected */
+       for (i=0;i<ctdb->num_nodes;i++) {
+               struct ctdb_node *node = ctdb->nodes[i];
+
+               if (node->flags & NODE_FLAGS_DELETED) {
+                       continue;
+               }
+
+               /* skip ourselves */
+               if (node->pnn == ctdb->pnn) {
+                       continue;
+               }
+               
+               if (node->flags & NODE_FLAGS_DISCONNECTED) {
+                       /* it might have come alive again */
+                       if (node->rx_cnt != 0) {
+                               ctdb_node_connected(node);
+                       }
+                       continue;
+               }
+
+
+               /* nothing received since the last check counts as a miss */
+               if (node->rx_cnt == 0) {
+                       node->dead_count++;
+               } else {
+                       node->dead_count = 0;
+               }
+
+               node->rx_cnt = 0;
+
+               if (node->dead_count >= ctdb->tunable.keepalive_limit) {
+                       DEBUG(DEBUG_NOTICE,("dead count reached for node %u\n", node->pnn));
+                       ctdb_node_dead(node);
+                       ctdb_send_keepalive(ctdb, node->pnn);
+                       /* maybe tell the transport layer to kill the
+                          sockets as well?
+                       */
+                       continue;
+               }
+               
+               DEBUG(DEBUG_DEBUG,("sending keepalive to %u\n", node->pnn));
+               ctdb_send_keepalive(ctdb, node->pnn);
+
+               node->tx_cnt = 0;
+       }
+       
+       /* schedule the next check */
+       event_add_timed(ctdb->ev, ctdb->keepalive_ctx,
+                       timeval_current_ofs(ctdb->tunable.keepalive_interval, 0), 
+                       ctdb_check_for_dead_nodes, ctdb);
+}
+
+
+/*
+  start keepalive monitoring
+
+  Creates the talloc context that owns the keepalive timer and schedules
+  the first run of ctdb_check_for_dead_nodes() after keepalive_interval
+  seconds. Allocation failures are handled by CTDB_NO_MEMORY_FATAL.
+ */
+void ctdb_start_keepalive(struct ctdb_context *ctdb)
+{
+       struct timed_event *te;
+
+       ctdb->keepalive_ctx = talloc_new(ctdb);
+       CTDB_NO_MEMORY_FATAL(ctdb, ctdb->keepalive_ctx);
+
+       te = event_add_timed(ctdb->ev, ctdb->keepalive_ctx,
+                            timeval_current_ofs(ctdb->tunable.keepalive_interval, 0), 
+                            ctdb_check_for_dead_nodes, ctdb);
+       CTDB_NO_MEMORY_FATAL(ctdb, te);
+
+       DEBUG(DEBUG_NOTICE,("Keepalive monitoring has been started\n"));
+}
+
+/*
+  stop keepalive monitoring
+
+  Frees the keepalive talloc context (the keepalive timer events are
+  allocated on it) and clears the pointer.
+ */
+void ctdb_stop_keepalive(struct ctdb_context *ctdb)
+{
+       talloc_free(ctdb->keepalive_ctx);
+       ctdb->keepalive_ctx = NULL;
+}
+
diff --git a/ctdb/server/ctdb_lock.c b/ctdb/server/ctdb_lock.c
new file mode 100644 (file)
index 0000000..fc437b0
--- /dev/null
@@ -0,0 +1,1034 @@
+/*
+   ctdb lock handling
+   provide API to do non-blocking locks for single or all databases
+
+   Copyright (C) Amitay Isaacs  2012
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "includes.h"
+#include "include/ctdb_private.h"
+#include "include/ctdb_protocol.h"
+#include "tevent.h"
+#include "tdb.h"
+#include "db_wrap.h"
+#include "system/filesys.h"
+#include "lib/util/dlinklist.h"
+
+/*
+ * Non-blocking Locking API
+ *
+ * 1. Create a child process to do blocking locks.
+ * 2. Once the locks are obtained, signal parent process via fd.
+ * 3. Invoke registered callback routine with locking status.
+ * 4. If the child process cannot get locks within certain time,
+ *    diagnose using /proc/locks and log warning message
+ *
+ * ctdb_lock_record()      - get a lock on a record
+ * ctdb_lock_db()          - get a lock on a DB
+ * ctdb_lock_alldb_prio()  - get a lock on all DBs with given priority
+ * ctdb_lock_alldb()       - get a lock on all DBs
+ *
+ *  auto_mark              - whether to mark/unmark DBs in before/after callback
+ */
+
+/* FIXME: Add a tunable max_lock_processes_per_db */
+/* NOTE(review): presumably the cap on concurrent lock child processes per
+ * database - the code using it is not in this part of the file; confirm. */
+#define MAX_LOCK_PROCESSES_PER_DB              (100)
+
+/*
+ * Kind of lock a lock_context asks for. The values are used as indexes
+ * into lock_type_str[], so both must be kept in the same order.
+ */
+enum lock_type {
+       LOCK_RECORD,            /* a single record (chain) of one database */
+       LOCK_DB,                /* one whole database */
+       LOCK_ALLDB_PRIO,        /* all databases of one priority */
+       LOCK_ALLDB,             /* all databases */
+};
+
+/*
+ * Printable names for enum lock_type, indexed by the enum value.
+ * Each lock type has its own distinct name.
+ */
+static const char * const lock_type_str[] = {
+       "lock_record",
+       "lock_db",
+       "lock_alldb_prio",
+       "lock_alldb",
+};
+
+struct lock_request;
+
+/* lock_context is the common part for a lock request */
+struct lock_context {
+       /* linkage for the ctdb->lock_pending / ctdb->lock_current lists */
+       struct lock_context *next, *prev;
+       enum lock_type type;
+       struct ctdb_context *ctdb;
+       /* database concerned; used for LOCK_RECORD and LOCK_DB */
+       struct ctdb_db_context *ctdb_db;
+       /* record key; used for LOCK_RECORD */
+       TDB_DATA key;
+       /* database priority; used for LOCK_ALLDB_PRIO */
+       uint32_t priority;
+       /* mark the locks as held around the callbacks and unmark afterwards */
+       bool auto_mark;
+       /* callbacks waiting on this lock */
+       struct lock_request *req_queue;
+       /* pid of the locking child; > 0 means the context is on lock_current */
+       pid_t child;
+       /* NOTE(review): the fields below are not used in this part of the
+        * file; presumably the child's status pipe, its fd/timeout events,
+        * a helper for diagnosing blocked locks and the request start time
+        * - confirm against the scheduling code. */
+       int fd[2];
+       struct tevent_fd *tfd;
+       struct tevent_timer *ttimer;
+       pid_t block_child;
+       int block_fd[2];
+       struct timeval start_time;
+};
+
+/* lock_request is the client specific part for a lock request */
+struct lock_request {
+       /* linkage for lock_context.req_queue */
+       struct lock_request *next, *prev;
+       /* the lock context this request is queued on */
+       struct lock_context *lctx;
+       /* called with private_data and whether the lock was obtained */
+       void (*callback)(void *, bool);
+       void *private_data;
+};
+
+
+/*
+ * Compatibility with samba 3.6.x (and older), which does not set a
+ * database priority and therefore leaves every database at priority 1.
+ *
+ * When the samba3_hack tunable is enabled, databases whose name contains
+ * one of the substrings below are reported as "later" databases, i.e.
+ * ones to be handled after the other priority 1 databases. With the
+ * tunable set to 0 no database is a later database.
+ */
+static bool later_db(struct ctdb_context *ctdb, const char *name)
+{
+       static const char * const late_names[] = {
+               "brlock",
+               "g_lock",
+               "notify_onelevel",
+               "serverid",
+               "xattr_tdb",
+       };
+       size_t i;
+
+       if (ctdb->tunable.samba3_hack == 0) {
+               return false;
+       }
+
+       for (i = 0; i < sizeof(late_names) / sizeof(late_names[0]); i++) {
+               if (strstr(name, late_names[i]) != NULL) {
+                       return true;
+               }
+       }
+
+       return false;
+}
+
+/*
+ * Per-database callback used by ctdb_db_iterator(). It receives the
+ * database, the priority being iterated and the caller's private data;
+ * a non-zero return value stops the iteration.
+ */
+typedef int (*db_handler_t)(struct ctdb_db_context *ctdb_db,
+                           uint32_t priority,
+                           void *private_data);
+
+/*
+ * Call handler for the databases selected by priority.
+ *
+ * First pass: every database of the given priority that is not a
+ * "later" database (see later_db()). Second pass, only when priority is
+ * 1: every later database, whatever its own priority is.
+ *
+ * Returns 0 if all handler calls returned 0, -1 as soon as one does not.
+ */
+static int ctdb_db_iterator(struct ctdb_context *ctdb, uint32_t priority,
+                           db_handler_t handler, void *private_data)
+{
+       struct ctdb_db_context *db;
+
+       for (db = ctdb->db_list; db != NULL; db = db->next) {
+               if (db->priority == priority && !later_db(ctdb, db->db_name)) {
+                       if (handler(db, priority, private_data) != 0) {
+                               return -1;
+                       }
+               }
+       }
+
+       /* the later databases are only dealt with as part of priority 1 */
+       if (priority == 1) {
+               for (db = ctdb->db_list; db != NULL; db = db->next) {
+                       if (later_db(ctdb, db->db_name) &&
+                           handler(db, priority, private_data) != 0) {
+                               return -1;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+
+/*
+ * lock all databases - mark only
+ *
+ * db_handler_t callback: marks the transaction write lock and then the
+ * all-record lock of one database. Returns 0 on success and -1 if either
+ * mark call fails; a transaction lock mark that succeeded is not undone
+ * when the second call fails. private_data is unused.
+ */
+static int db_lock_mark_handler(struct ctdb_db_context *ctdb_db, uint32_t priority,
+                               void *private_data)
+{
+       /* local prototype: the function is declared here by hand */
+       int tdb_transaction_write_lock_mark(struct tdb_context *);
+
+       DEBUG(DEBUG_INFO, ("marking locked database %s, priority:%u\n",
+                          ctdb_db->db_name, priority));
+
+       if (tdb_transaction_write_lock_mark(ctdb_db->ltdb->tdb) != 0) {
+               DEBUG(DEBUG_ERR, ("Failed to mark (transaction lock) database %s\n",
+                                 ctdb_db->db_name));
+               return -1;
+       }
+
+       if (tdb_lockall_mark(ctdb_db->ltdb->tdb) != 0) {
+               DEBUG(DEBUG_ERR, ("Failed to mark (all lock) database %s\n",
+                                 ctdb_db->db_name));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * Mark the locks of all databases of one priority.
+ *
+ * Returns -1 if that priority is not FROZEN, otherwise the result of
+ * iterating db_lock_mark_handler() over the databases (0 or -1).
+ * priority is used as an index into freeze_mode[] without a range check.
+ */
+int ctdb_lockall_mark_prio(struct ctdb_context *ctdb, uint32_t priority)
+{
+       /*
+        * This function is only used by the main daemon during recovery.
+        * At this stage, the databases have already been locked, by a
+        * dedicated child process. The freeze_mode variable is used to track
+        * whether the actual locks are held by the child process or not.
+        */
+
+       if (ctdb->freeze_mode[priority] != CTDB_FREEZE_FROZEN) {
+               DEBUG(DEBUG_ERR, ("Attempt to mark all databases locked when not frozen\n"));
+               return -1;
+       }
+
+       return ctdb_db_iterator(ctdb, priority, db_lock_mark_handler, NULL);
+}
+
+/*
+ * Mark the locks of all databases, one priority after the other in
+ * ascending order. No check of the freeze mode is made here.
+ *
+ * Returns 0 on success, -1 as soon as marking fails for a database.
+ */
+static int ctdb_lockall_mark(struct ctdb_context *ctdb)
+{
+       uint32_t prio = 1;
+
+       while (prio <= NUM_DB_PRIORITIES) {
+               int rc = ctdb_db_iterator(ctdb, prio, db_lock_mark_handler, NULL);
+
+               if (rc != 0) {
+                       return -1;
+               }
+               prio++;
+       }
+
+       return 0;
+}
+
+
+/*
+ * lock all databases - unmark only
+ *
+ * db_handler_t callback: unmarks the transaction write lock and then the
+ * all-record lock of one database. Returns 0 on success and -1 if either
+ * unmark call fails. private_data is unused.
+ */
+static int db_lock_unmark_handler(struct ctdb_db_context *ctdb_db, uint32_t priority,
+                                 void *private_data)
+{
+       /* local prototype: the function is declared here by hand */
+       int tdb_transaction_write_lock_unmark(struct tdb_context *);
+
+       DEBUG(DEBUG_INFO, ("unmarking locked database %s, priority:%u\n",
+                          ctdb_db->db_name, priority));
+
+       if (tdb_transaction_write_lock_unmark(ctdb_db->ltdb->tdb) != 0) {
+               DEBUG(DEBUG_ERR, ("Failed to unmark (transaction lock) database %s\n",
+                                 ctdb_db->db_name));
+               return -1;
+       }
+
+       if (tdb_lockall_unmark(ctdb_db->ltdb->tdb) != 0) {
+               DEBUG(DEBUG_ERR, ("Failed to unmark (all lock) database %s\n",
+                                 ctdb_db->db_name));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * Unmark the locks of all databases of one priority.
+ *
+ * Returns -1 if that priority is not FROZEN, otherwise the result of
+ * iterating db_lock_unmark_handler() over the databases (0 or -1).
+ * priority is used as an index into freeze_mode[] without a range check.
+ */
+int ctdb_lockall_unmark_prio(struct ctdb_context *ctdb, uint32_t priority)
+{
+       /*
+        * This function is only used by the main daemon during recovery.
+        * At this stage, the databases have already been locked, by a
+        * dedicated child process. The freeze_mode variable is used to track
+        * whether the actual locks are held by the child process or not.
+        */
+
+       if (ctdb->freeze_mode[priority] != CTDB_FREEZE_FROZEN) {
+               DEBUG(DEBUG_ERR, ("Attempt to unmark all databases locked when not frozen\n"));
+               return -1;
+       }
+
+       return ctdb_db_iterator(ctdb, priority, db_lock_unmark_handler, NULL);
+}
+
+/*
+ * Unmark the locks of all databases, walking the priorities in
+ * descending order (the reverse of ctdb_lockall_mark()).
+ *
+ * Returns 0 on success, -1 as soon as unmarking fails for a database.
+ */
+static int ctdb_lockall_unmark(struct ctdb_context *ctdb)
+{
+       uint32_t priority;
+
+       /*
+        * priority is unsigned, so the loop has to stop at 1: a test for
+        * ">= 0" is always true and the counter would wrap around after 0
+        * instead of terminating. Valid priorities start at 1.
+        */
+       for (priority=NUM_DB_PRIORITIES; priority>0; priority--) {
+               if (ctdb_db_iterator(ctdb, priority, db_lock_unmark_handler, NULL) != 0) {
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+
+static void ctdb_lock_schedule(struct ctdb_context *ctdb);
+
+/*
+ * Destructor to kill the child locking process
+ *
+ * If a child has been started (child > 0) it is sent SIGKILL and the
+ * context is taken off the list of current locks; otherwise the context
+ * is taken off the list of pending locks. The matching counters and
+ * statistics are decremented (the per-database ones only for record and
+ * single-database locks) and the scheduler is run again.
+ */
+static int ctdb_lock_context_destructor(struct lock_context *lock_ctx)
+{
+       if (lock_ctx->child > 0) {
+               ctdb_kill(lock_ctx->ctdb, lock_ctx->child, SIGKILL);
+               DLIST_REMOVE(lock_ctx->ctdb->lock_current, lock_ctx);
+               lock_ctx->ctdb->lock_num_current--;
+               CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_current);
+               if (lock_ctx->type == LOCK_RECORD || lock_ctx->type == LOCK_DB) {
+                       CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_current);
+               }
+       } else {
+               DLIST_REMOVE(lock_ctx->ctdb->lock_pending, lock_ctx);
+               lock_ctx->ctdb->lock_num_pending--;
+               CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_pending);
+               if (lock_ctx->type == LOCK_RECORD || lock_ctx->type == LOCK_DB) {
+                       CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_pending);
+               }
+       }
+
+       /* run the scheduler again now that this context is off its list */
+       ctdb_lock_schedule(lock_ctx->ctdb);
+
+       return 0;
+}
+
+
+/*
+ * Destructor to remove lock request
+ *
+ * Unlinks the request from the request queue of its lock context so the
+ * callback is not invoked for a freed request. Always returns 0.
+ */
+static int ctdb_lock_request_destructor(struct lock_request *lock_request)
+{
+       DLIST_REMOVE(lock_request->lctx->req_queue, lock_request);
+       return 0;
+}
+
+
+/*
+ * Free a lock request together with the lock context it belongs to.
+ *
+ * lock_req must not be NULL: it is dereferenced to find the context.
+ */
+void ctdb_lock_free_request_context(struct lock_request *lock_req)
+{
+       struct lock_context *lock_ctx;
+
+       /* fetch the context first: lock_req is gone after the first free */
+       lock_ctx = lock_req->lctx;
+       talloc_free(lock_req);
+       talloc_free(lock_ctx);
+}
+
+
+/*
+ * Process all the callbacks waiting for lock
+ *
+ * If lock has failed, callback is executed with locked=false
+ *
+ * When auto_mark is set and the lock was obtained, the locks are marked
+ * as held before the callbacks run and unmarked again afterwards.
+ *
+ * A callback is allowed to free its request, and for requests without
+ * auto_mark it may free the lock context as well (the freeze code does
+ * this through ctdb_lock_free_request_context()). The auto_mark flag is
+ * therefore read once up front and lock_ctx is not dereferenced after
+ * the callbacks unless auto_mark is set.
+ */
+static void process_callbacks(struct lock_context *lock_ctx, bool locked)
+{
+       struct lock_request *request, *next;
+       /* snapshot: lock_ctx may be freed by a callback when !auto_mark */
+       bool auto_mark = lock_ctx->auto_mark;
+
+       if (auto_mark && locked) {
+               switch (lock_ctx->type) {
+               case LOCK_RECORD:
+                       tdb_chainlock_mark(lock_ctx->ctdb_db->ltdb->tdb, lock_ctx->key);
+                       break;
+
+               case LOCK_DB:
+                       tdb_lockall_mark(lock_ctx->ctdb_db->ltdb->tdb);
+                       break;
+
+               case LOCK_ALLDB_PRIO:
+                       ctdb_lockall_mark_prio(lock_ctx->ctdb, lock_ctx->priority);
+                       break;
+
+               case LOCK_ALLDB:
+                       ctdb_lockall_mark(lock_ctx->ctdb);
+                       break;
+               }
+       }
+
+       /* Iterate through all callbacks */
+       request = lock_ctx->req_queue;
+       while (request) {
+               if (auto_mark) {
+                       /* Reset the destructor, so request is not removed from the list */
+                       talloc_set_destructor(request, NULL);
+               }
+
+               /* In case, callback frees the request, store next */
+               next = request->next;
+               request->callback(request->private_data, locked);
+               request = next;
+       }
+
+       if (auto_mark && locked) {
+               switch (lock_ctx->type) {
+               case LOCK_RECORD:
+                       tdb_chainlock_unmark(lock_ctx->ctdb_db->ltdb->tdb, lock_ctx->key);
+                       break;
+
+               case LOCK_DB:
+                       tdb_lockall_unmark(lock_ctx->ctdb_db->ltdb->tdb);
+                       break;
+
+               case LOCK_ALLDB_PRIO:
+                       ctdb_lockall_unmark_prio(lock_ctx->ctdb, lock_ctx->priority);
+                       break;
+
+               case LOCK_ALLDB:
+                       ctdb_lockall_unmark(lock_ctx->ctdb);
+                       break;
+               }
+       }
+}
+
+
+/*
+ * Map an elapsed time t (in seconds) to a latency bucket index 0..10.
+ *
+ * Bucket i is the first one whose exclusive upper bound is greater than t:
+ * 1ms, 10ms, 100ms, 1s, 2s, 4s, 8s, 16s, 32s, 64s.  Anything that is not
+ * below 64s lands in bucket 10.
+ */
+static int lock_bucket_id(double t)
+{
+       double ms = 1.e-3, s = 1;
+       const double upper[] = {
+               1*ms, 10*ms, 100*ms,
+               1*s, 2*s, 4*s, 8*s, 16*s, 32*s, 64*s,
+       };
+       const int nbounds = (int)(sizeof(upper) / sizeof(upper[0]));
+       int id;
+
+       for (id = 0; id < nbounds; id++) {
+               if (t < upper[id]) {
+                       break;
+               }
+       }
+
+       /* id == nbounds (10) when no bound was greater than t */
+       return id;
+}
+
+/*
+ * Callback routine when the required locks are obtained.
+ * Called from parent context
+ *
+ * Fires when the read end of the pipe to the lock helper becomes readable.
+ * The helper reports one status byte: 0 means the lock was obtained, any
+ * other value means it was not.  If no byte can be read at all, the lock
+ * is treated as failed.
+ *
+ * private_data is the struct lock_context the event was registered for.
+ */
+static void ctdb_lock_handler(struct tevent_context *ev,
+                           struct tevent_fd *tfd,
+                           uint16_t flags,
+                           void *private_data)
+{
+       struct lock_context *lock_ctx;
+       TALLOC_CTX *tmp_ctx = NULL;
+       char c;
+       bool locked;
+       double t;
+       int id;
+
+       lock_ctx = talloc_get_type_abort(private_data, struct lock_context);
+
+       /* cancel the timeout event */
+       if (lock_ctx->ttimer) {
+               TALLOC_FREE(lock_ctx->ttimer);
+       }
+
+       t = timeval_elapsed(&lock_ctx->start_time);
+       id = lock_bucket_id(t);
+
+       if (lock_ctx->auto_mark) {
+               /* Reparent the context onto a temporary parent so that
+                * freeing tmp_ctx at the end of this function also frees
+                * the lock context once the callbacks have run. */
+               tmp_ctx = talloc_new(ev);
+               talloc_steal(tmp_ctx, lock_ctx);
+       }
+
+       /* Read the status from the child process.  If nothing can be read
+        * (read error, or EOF because the child went away without
+        * reporting), c is never written, so report a failed lock rather
+        * than testing an uninitialised byte. */
+       if (read(lock_ctx->fd[0], &c, 1) != 1) {
+               locked = false;
+       } else {
+               locked = (c == 0 ? true : false);
+       }
+
+       /* Update statistics */
+       CTDB_DECREMENT_STAT(lock_ctx->ctdb, locks.num_pending);
+       CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.num_calls);
+       if (lock_ctx->ctdb_db) {
+               CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_pending);
+               CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_calls);
+       }
+
+       if (locked) {
+               if (lock_ctx->ctdb_db) {
+                       CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.num_current);
+                       CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.buckets[id]);
+                       CTDB_UPDATE_LATENCY(lock_ctx->ctdb, lock_ctx->ctdb_db,
+                                           lock_type_str[lock_ctx->type], locks.latency,
+                                           lock_ctx->start_time);
+
+                       CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_current);
+                       CTDB_UPDATE_DB_LATENCY(lock_ctx->ctdb_db, lock_type_str[lock_ctx->type], locks.latency, t);
+                       CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.buckets[id]);
+               }
+       } else {
+               CTDB_INCREMENT_STAT(lock_ctx->ctdb, locks.num_failed);
+               if (lock_ctx->ctdb_db) {
+                       CTDB_INCREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_failed);
+               }
+       }
+
+       process_callbacks(lock_ctx, locked);
+
+       if (lock_ctx->auto_mark) {
+               talloc_free(tmp_ctx);
+       }
+}
+
+
+/*
+ * Callback routine when required locks are not obtained within timeout
+ * Called from parent context
+ *
+ * Logs how long the lock has been outstanding, launches the lock debugging
+ * script in a child process and re-arms the timer for another 10 seconds.
+ *
+ * The script path is resolved once and cached: $CTDB_DEBUG_LOCKS if set,
+ * otherwise "$CTDB_BASE/debug_locks.sh".
+ *
+ * private_data is the struct lock_context the timer was registered for.
+ */
+static void ctdb_lock_timeout_handler(struct tevent_context *ev,
+                                   struct tevent_timer *ttimer,
+                                   struct timeval current_time,
+                                   void *private_data)
+{
+       static const char * debug_locks = NULL;
+       struct lock_context *lock_ctx;
+       struct ctdb_context *ctdb;
+       pid_t pid;
+
+       lock_ctx = talloc_get_type_abort(private_data, struct lock_context);
+       ctdb = lock_ctx->ctdb;
+
+       if (lock_ctx->type == LOCK_RECORD || lock_ctx->type == LOCK_DB) {
+               DEBUG(DEBUG_WARNING,
+                     ("Unable to get %s lock on database %s for %.0lf seconds\n",
+                      (lock_ctx->type == LOCK_RECORD ? "RECORD" : "DB"),
+                      lock_ctx->ctdb_db->db_name,
+                      timeval_elapsed(&lock_ctx->start_time)));
+       } else {
+               DEBUG(DEBUG_WARNING,
+                     ("Unable to get ALLDB locks for %.0lf seconds\n",
+                      timeval_elapsed(&lock_ctx->start_time)));
+       }
+
+       /* Fire a child process to find the blocking process. */
+       if (debug_locks == NULL) {
+               debug_locks = getenv("CTDB_DEBUG_LOCKS");
+               if (debug_locks == NULL) {
+                       /* NOTE(review): CTDB_BASE is assumed to be set in
+                        * the daemon's environment - confirm. */
+                       debug_locks = talloc_asprintf(ctdb,
+                                                     "%s/debug_locks.sh",
+                                                     getenv("CTDB_BASE"));
+               }
+       }
+       if (debug_locks != NULL) {
+               pid = fork();
+               if (pid == 0) {
+                       execl(debug_locks, debug_locks, NULL);
+                       /* execl() only returns on failure.  The child must
+                        * not fall through and carry on running the
+                        * daemon's code as a second copy of it. */
+                       _exit(1);
+               } else if (pid == (pid_t)-1) {
+                       DEBUG(DEBUG_WARNING,
+                             (__location__
+                              " Failed to fork lock debugging helper\n"));
+               }
+       } else {
+               DEBUG(DEBUG_WARNING,
+                     (__location__
+                      " Unable to setup lock debugging - no memory?\n"));
+       }
+
+       /* reset the timeout timer */
+       // talloc_free(lock_ctx->ttimer);
+       lock_ctx->ttimer = tevent_add_timer(ctdb->ev,
+                                           lock_ctx,
+                                           timeval_current_ofs(10, 0),
+                                           ctdb_lock_timeout_handler,
+                                           (void *)lock_ctx);
+}
+
+
+/*
+ * ctdb_db_iterator() callback: bump the int counter passed via private_data
+ * once per invocation.  The database and priority are not examined.
+ */
+static int db_count_handler(struct ctdb_db_context *ctdb_db, uint32_t priority,
+                           void *private_data)
+{
+       int *counter = private_data;
+
+       *counter += 1;
+       return 0;
+}
+
+/*
+ * Cursor used by db_name_handler() to append database paths to a
+ * pre-allocated array of strings.
+ */
+struct db_namelist {
+       char **names;   /* array being filled; also the talloc parent of the copied strings */
+       int n;          /* index of the next slot in names[] to write */
+};
+
+/*
+ * ctdb_db_iterator() callback: store a copy of the database path in the next
+ * free slot of the db_namelist passed via private_data and advance the slot
+ * index.  A failed copy leaves NULL in the slot; the index still advances.
+ */
+static int db_name_handler(struct ctdb_db_context *ctdb_db, uint32_t priority,
+                          void *private_data)
+{
+       struct db_namelist *nl = private_data;
+       char *path_copy = talloc_strdup(nl->names, ctdb_db->db_path);
+
+       nl->names[nl->n] = path_copy;
+       nl->n += 1;
+
+       return 0;
+}
+
+/*
+ * Build the NULL-terminated argument vector for the lock helper process.
+ *
+ * Layout:
+ *   args[0]   "ctdb_lock_helper"
+ *   args[1]   pid of the calling process
+ *   args[2]   fd number passed in by the caller
+ *   args[3]   "RECORD" or "DB"
+ *   args[4..] RECORD: database path, then the hex-encoded key (or the
+ *                     literal "NULL" for an empty key)
+ *             DB:     one database path per database to lock
+ *
+ * The array is allocated on mem_ctx and every string on the array.
+ * Returns NULL if any allocation fails.
+ */
+static char **lock_helper_args(TALLOC_CTX *mem_ctx, struct lock_context *lock_ctx, int fd)
+{
+       struct ctdb_context *ctdb = lock_ctx->ctdb;
+       char **args = NULL;
+       int nargs, i;
+       int priority;
+       struct db_namelist list;
+
+       /* First pass: work out how many arguments are needed */
+       switch (lock_ctx->type) {
+       case LOCK_RECORD:
+               nargs = 6;
+               break;
+
+       case LOCK_DB:
+               nargs = 5;
+               break;
+
+       case LOCK_ALLDB_PRIO:
+               nargs = 4;
+               ctdb_db_iterator(ctdb, lock_ctx->priority, db_count_handler, &nargs);
+               break;
+
+       case LOCK_ALLDB:
+               nargs = 4;
+               /* Priorities run from 1 to NUM_DB_PRIORITIES inclusive
+                * (ctdb_lock_alldb_prio() accepts NUM_DB_PRIORITIES), so
+                * the bound has to be inclusive or the databases of the
+                * highest priority would never be handed to the helper. */
+               for (priority=1; priority<=NUM_DB_PRIORITIES; priority++) {
+                       ctdb_db_iterator(ctdb, priority, db_count_handler, &nargs);
+               }
+               break;
+       }
+
+       /* Add extra argument for null termination */
+       nargs++;
+
+       args = talloc_array(mem_ctx, char *, nargs);
+       if (args == NULL) {
+               return NULL;
+       }
+
+       args[0] = talloc_strdup(args, "ctdb_lock_helper");
+       args[1] = talloc_asprintf(args, "%d", getpid());
+       args[2] = talloc_asprintf(args, "%d", fd);
+
+       /* Second pass: fill in the type specific arguments */
+       switch (lock_ctx->type) {
+       case LOCK_RECORD:
+               args[3] = talloc_strdup(args, "RECORD");
+               args[4] = talloc_strdup(args, lock_ctx->ctdb_db->db_path);
+               if (lock_ctx->key.dsize == 0) {
+                       args[5] = talloc_strdup(args, "NULL");
+               } else {
+                       args[5] = hex_encode_talloc(args, lock_ctx->key.dptr, lock_ctx->key.dsize);
+               }
+               break;
+
+       case LOCK_DB:
+               args[3] = talloc_strdup(args, "DB");
+               args[4] = talloc_strdup(args, lock_ctx->ctdb_db->db_path);
+               break;
+
+       case LOCK_ALLDB_PRIO:
+               args[3] = talloc_strdup(args, "DB");
+               list.names = args;
+               list.n = 4;
+               ctdb_db_iterator(ctdb, lock_ctx->priority, db_name_handler, &list);
+               break;
+
+       case LOCK_ALLDB:
+               args[3] = talloc_strdup(args, "DB");
+               list.names = args;
+               list.n = 4;
+               /* Must cover exactly the priorities counted above */
+               for (priority=1; priority<=NUM_DB_PRIORITIES; priority++) {
+                       ctdb_db_iterator(ctdb, priority, db_name_handler, &list);
+               }
+               break;
+       }
+
+       /* Make sure last argument is NULL */
+       args[nargs-1] = NULL;
+
+       /* Any NULL before the terminator means an allocation failed */
+       for (i=0; i<nargs-1; i++) {
+               if (args[i] == NULL) {
+                       talloc_free(args);
+                       return NULL;
+               }
+       }
+
+       return args;
+}
+
+
+/*
+ * Look up a lock context in lock_list.
+ *
+ * Only contexts of the requested type are considered.  What else has to
+ * match depends on the type:
+ *   LOCK_RECORD      same database and same key
+ *   LOCK_DB          same database
+ *   LOCK_ALLDB_PRIO  same priority
+ *   LOCK_ALLDB       nothing further
+ *
+ * Returns the first matching context, or NULL if there is none.
+ */
+static struct lock_context *find_lock_context(struct lock_context *lock_list,
+                                             struct ctdb_db_context *ctdb_db,
+                                             TDB_DATA key,
+                                             uint32_t priority,
+                                             enum lock_type type)
+{
+       struct lock_context *cur;
+
+       for (cur = lock_list; cur != NULL; cur = cur->next) {
+               bool match = false;
+
+               if (cur->type != type) {
+                       continue;
+               }
+
+               switch (cur->type) {
+               case LOCK_RECORD:
+                       match = (ctdb_db == cur->ctdb_db &&
+                                key.dsize == cur->key.dsize &&
+                                memcmp(key.dptr, cur->key.dptr, key.dsize) == 0);
+                       break;
+
+               case LOCK_DB:
+                       match = (ctdb_db == cur->ctdb_db);
+                       break;
+
+               case LOCK_ALLDB_PRIO:
+                       match = (priority == cur->priority);
+                       break;
+
+               case LOCK_ALLDB:
+                       match = true;
+                       break;
+               }
+
+               if (match) {
+                       return cur;
+               }
+       }
+
+       /* Did not find the lock context we are searching for */
+       return NULL;
+}
+
+
+/*
+ * Schedule a new lock child process
+ * Set up callback handler and timeout handler
+ *
+ * Picks the first pending lock context that has at least one request and
+ * for which no matching context is already in ctdb->lock_current, starts
+ * the lock helper for it and moves it from the pending to the current
+ * list.  Pending contexts without requests are freed on the way.
+ *
+ * Does nothing if the limit on concurrently running lock processes has
+ * been reached or nothing is pending.  On any setup failure the context
+ * stays on the pending list.
+ */
+static void ctdb_lock_schedule(struct ctdb_context *ctdb)
+{
+       struct lock_context *lock_ctx, *next_ctx, *active_ctx;
+       int ret;
+       TALLOC_CTX *tmp_ctx;
+       const char *helper = BINDIR "/ctdb_lock_helper";
+       static const char *prog = NULL;
+       char **args;
+
+       /* Resolve the helper path once; $CTDB_LOCK_HELPER overrides the
+        * built-in default */
+       if (prog == NULL) {
+               const char *t;
+
+               t = getenv("CTDB_LOCK_HELPER");
+               if (t != NULL) {
+                       prog = talloc_strdup(ctdb, t);
+               } else {
+                       prog = talloc_strdup(ctdb, helper);
+               }
+               CTDB_NO_MEMORY_VOID(ctdb, prog);
+       }
+
+       if (ctdb->lock_num_current >= MAX_LOCK_PROCESSES_PER_DB) {
+               return;
+       }
+
+       if (ctdb->lock_pending == NULL) {
+               return;
+       }
+
+       /* Find a lock context with requests */
+       lock_ctx = ctdb->lock_pending;
+       while (lock_ctx != NULL) {
+               /* Saved up front: lock_ctx may be freed below */
+               next_ctx = lock_ctx->next;
+               if (! lock_ctx->req_queue) {
+                       DEBUG(DEBUG_INFO, ("Removing lock context without lock requests\n"));
+                       DLIST_REMOVE(ctdb->lock_pending, lock_ctx);
+                       ctdb->lock_num_pending--;
+                       CTDB_DECREMENT_STAT(ctdb, locks.num_pending);
+                       if (lock_ctx->ctdb_db) {
+                               CTDB_DECREMENT_DB_STAT(lock_ctx->ctdb_db, locks.num_pending);
+                       }
+                       talloc_free(lock_ctx);
+               } else {
+                       active_ctx = find_lock_context(ctdb->lock_current, lock_ctx->ctdb_db,
+                                                      lock_ctx->key, lock_ctx->priority,
+                                                      lock_ctx->type);
+                       if (active_ctx == NULL) {
+                               /* Found a lock context with lock requests */
+                               break;
+                       }
+
+                       /* There is already a child waiting for the
+                        * same key.  So don't schedule another child
+                        * just yet.
+                        */
+               }
+               lock_ctx = next_ctx;
+       }
+
+       if (lock_ctx == NULL) {
+               return;
+       }
+
+       lock_ctx->child = -1;
+       ret = pipe(lock_ctx->fd);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Failed to create pipe in ctdb_lock_schedule\n"));
+               return;
+       }
+
+       /* Only the read end is marked close-on-exec; the number of the
+        * write end is handed to the helper on its command line */
+       set_close_on_exec(lock_ctx->fd[0]);
+
+       /* Create data for child process */
+       tmp_ctx = talloc_new(lock_ctx);
+       if (tmp_ctx == NULL) {
+               DEBUG(DEBUG_ERR, ("Failed to allocate memory for helper args\n"));
+               close(lock_ctx->fd[0]);
+               close(lock_ctx->fd[1]);
+               return;
+       }
+
+       /* Create arguments for lock helper */
+       args = lock_helper_args(tmp_ctx, lock_ctx, lock_ctx->fd[1]);
+       if (args == NULL) {
+               DEBUG(DEBUG_ERR, ("Failed to create lock helper args\n"));
+               close(lock_ctx->fd[0]);
+               close(lock_ctx->fd[1]);
+               talloc_free(tmp_ctx);
+               return;
+       }
+
+       lock_ctx->child = ctdb_fork(ctdb);
+
+       if (lock_ctx->child == (pid_t)-1) {
+               DEBUG(DEBUG_ERR, ("Failed to create a child in ctdb_lock_schedule\n"));
+               close(lock_ctx->fd[0]);
+               close(lock_ctx->fd[1]);
+               talloc_free(tmp_ctx);
+               return;
+       }
+
+
+       /* Child process */
+       if (lock_ctx->child == 0) {
+               ret = execv(prog, args);
+               if (ret < 0) {
+                       DEBUG(DEBUG_ERR, ("Failed to execute helper %s (%d, %s)\n",
+                                         prog, errno, strerror(errno)));
+               }
+               /* Only reached when execv() failed */
+               _exit(1);
+       }
+
+       /* Parent process */
+       close(lock_ctx->fd[1]);
+
+       talloc_set_destructor(lock_ctx, ctdb_lock_context_destructor);
+
+       talloc_free(tmp_ctx);
+
+       /* Set up timeout handler */
+       lock_ctx->ttimer = tevent_add_timer(ctdb->ev,
+                                           lock_ctx,
+                                           timeval_current_ofs(10, 0),
+                                           ctdb_lock_timeout_handler,
+                                           (void *)lock_ctx);
+       if (lock_ctx->ttimer == NULL) {
+               /* Undo the fork: kill the helper and drop the destructor
+                * installed above */
+               ctdb_kill(ctdb, lock_ctx->child, SIGKILL);
+               lock_ctx->child = -1;
+               talloc_set_destructor(lock_ctx, NULL);
+               close(lock_ctx->fd[0]);
+               return;
+       }
+
+       /* Set up callback */
+       lock_ctx->tfd = tevent_add_fd(ctdb->ev,
+                                     lock_ctx,
+                                     lock_ctx->fd[0],
+                                     EVENT_FD_READ,
+                                     ctdb_lock_handler,
+                                     (void *)lock_ctx);
+       if (lock_ctx->tfd == NULL) {
+               TALLOC_FREE(lock_ctx->ttimer);
+               ctdb_kill(ctdb, lock_ctx->child, SIGKILL);
+               lock_ctx->child = -1;
+               talloc_set_destructor(lock_ctx, NULL);
+               close(lock_ctx->fd[0]);
+               return;
+       }
+       tevent_fd_set_auto_close(lock_ctx->tfd);
+
+       /* Move the context from pending to current */
+       DLIST_REMOVE(ctdb->lock_pending, lock_ctx);
+       ctdb->lock_num_pending--;
+       DLIST_ADD_END(ctdb->lock_current, lock_ctx, NULL);
+       ctdb->lock_num_current++;
+}
+
+
+/*
+ * Lock record / db depending on type
+ *
+ * Queues a lock request and kicks the scheduler.  If a pending lock
+ * context for the same target already exists, the request is attached to
+ * it; otherwise a new context is created and put on the pending list.
+ *
+ * callback is invoked with private_data and the lock result; it must not
+ * be NULL.  key is copied, so the caller's buffer need not stay valid.
+ *
+ * Returns the new request, or NULL if no callback was given or an
+ * allocation failed.
+ */
+static struct lock_request *ctdb_lock_internal(struct ctdb_context *ctdb,
+                                              struct ctdb_db_context *ctdb_db,
+                                              TDB_DATA key,
+                                              uint32_t priority,
+                                              void (*callback)(void *, bool),
+                                              void *private_data,
+                                              enum lock_type type,
+                                              bool auto_mark)
+{
+       struct lock_context *lock_ctx;
+       struct lock_request *request;
+
+       if (callback == NULL) {
+               DEBUG(DEBUG_WARNING, ("No callback function specified, not locking\n"));
+               return NULL;
+       }
+
+       /* get a context for this key - search only the pending contexts,
+        * current contexts might in the middle of processing callbacks */
+       lock_ctx = find_lock_context(ctdb->lock_pending, ctdb_db, key, priority, type);
+
+       /* No existing context, create one */
+       if (lock_ctx == NULL) {
+               lock_ctx = talloc_zero(ctdb, struct lock_context);
+               if (lock_ctx == NULL) {
+                       DEBUG(DEBUG_ERR, ("Failed to create a new lock context\n"));
+                       return NULL;
+               }
+
+               lock_ctx->type = type;
+               lock_ctx->ctdb = ctdb;
+               lock_ctx->ctdb_db = ctdb_db;
+               lock_ctx->key.dsize = key.dsize;
+               if (key.dsize > 0) {
+                       lock_ctx->key.dptr = talloc_memdup(lock_ctx, key.dptr, key.dsize);
+                       if (lock_ctx->key.dptr == NULL) {
+                               /* A NULL key with a non-zero size would be
+                                * dereferenced when the key is compared or
+                                * hex-encoded.  The context is not on any
+                                * list yet, so it can simply be freed. */
+                               DEBUG(DEBUG_ERR, ("Failed to copy key for new lock context\n"));
+                               talloc_free(lock_ctx);
+                               return NULL;
+                       }
+               } else {
+                       lock_ctx->key.dptr = NULL;
+               }
+               lock_ctx->priority = priority;
+               lock_ctx->auto_mark = auto_mark;
+
+               lock_ctx->child = -1;
+               lock_ctx->block_child = -1;
+
+               DLIST_ADD_END(ctdb->lock_pending, lock_ctx, NULL);
+               ctdb->lock_num_pending++;
+               CTDB_INCREMENT_STAT(ctdb, locks.num_pending);
+               if (ctdb_db) {
+                       CTDB_INCREMENT_DB_STAT(ctdb_db, locks.num_pending);
+               }
+
+               /* Start the timer when we activate the context */
+               lock_ctx->start_time = timeval_current();
+       }
+
+       /* The request is a talloc child of its context */
+       if ((request = talloc_zero(lock_ctx, struct lock_request)) == NULL) {
+               return NULL;
+       }
+
+       request->lctx = lock_ctx;
+       request->callback = callback;
+       request->private_data = private_data;
+
+       talloc_set_destructor(request, ctdb_lock_request_destructor);
+       DLIST_ADD_END(lock_ctx->req_queue, request, NULL);
+
+       ctdb_lock_schedule(ctdb);
+
+       return request;
+}
+
+
+/*
+ * Request a lock on the record identified by key in database ctdb_db.
+ *
+ * Thin wrapper around ctdb_lock_internal() with type LOCK_RECORD and
+ * priority 0; its return value is passed through unchanged.
+ */
+struct lock_request *ctdb_lock_record(struct ctdb_db_context *ctdb_db,
+                                     TDB_DATA key,
+                                     bool auto_mark,
+                                     void (*callback)(void *, bool),
+                                     void *private_data)
+{
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+
+       return ctdb_lock_internal(ctdb, ctdb_db, key, 0,
+                                 callback, private_data,
+                                 LOCK_RECORD, auto_mark);
+}
+
+
+/*
+ * Request a lock on the whole database ctdb_db.
+ *
+ * Thin wrapper around ctdb_lock_internal() with type LOCK_DB, an empty key
+ * and priority 0; its return value is passed through unchanged.
+ */
+struct lock_request *ctdb_lock_db(struct ctdb_db_context *ctdb_db,
+                                 bool auto_mark,
+                                 void (*callback)(void *, bool),
+                                 void *private_data)
+{
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+
+       return ctdb_lock_internal(ctdb, ctdb_db, tdb_null, 0,
+                                 callback, private_data,
+                                 LOCK_DB, auto_mark);
+}
+
+
+/*
+ * obtain locks on all databases of specified priority
+ *
+ * priority must be in the range 1..NUM_DB_PRIORITIES; anything else is
+ * logged and rejected with a NULL return.  Otherwise this is a thin wrapper
+ * around ctdb_lock_internal() with type LOCK_ALLDB_PRIO.
+ */
+struct lock_request *ctdb_lock_alldb_prio(struct ctdb_context *ctdb,
+                                         uint32_t priority,
+                                         bool auto_mark,
+                                         void (*callback)(void *, bool),
+                                         void *private_data)
+{
+       /* priority is unsigned, so a "< 0" test can never fire; the lowest
+        * valid priority is 1 (the per-priority loops in this file start
+        * at 1), so 0 has to be rejected explicitly. */
+       if (priority < 1 || priority > NUM_DB_PRIORITIES) {
+               DEBUG(DEBUG_ERR, ("Invalid db priority: %u\n", priority));
+               return NULL;
+       }
+
+       return ctdb_lock_internal(ctdb,
+                                 NULL,
+                                 tdb_null,
+                                 priority,
+                                 callback,
+                                 private_data,
+                                 LOCK_ALLDB_PRIO,
+                                 auto_mark);
+}
+
+
+/*
+ * Request locks on all databases.
+ *
+ * Thin wrapper around ctdb_lock_internal() with type LOCK_ALLDB, no
+ * database, an empty key and priority 0; its return value is passed
+ * through unchanged.
+ */
+struct lock_request *ctdb_lock_alldb(struct ctdb_context *ctdb,
+                                    bool auto_mark,
+                                    void (*callback)(void *, bool),
+                                    void *private_data)
+{
+       return ctdb_lock_internal(ctdb, NULL, tdb_null, 0,
+                                 callback, private_data,
+                                 LOCK_ALLDB, auto_mark);
+}
+
diff --git a/ctdb/server/ctdb_lock_helper.c b/ctdb/server/ctdb_lock_helper.c
new file mode 100644 (file)
index 0000000..d8a1d24
--- /dev/null
@@ -0,0 +1,148 @@
+/*
+   ctdb lock helper
+
+   Copyright (C) Amitay Isaacs  2013
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "tdb.h"
+#include "system/filesys.h"
+#include "../include/ctdb_private.h"
+
+static char *progname = NULL;
+
+/*
+ * Write the one-byte result to fd.  A result of 1 additionally terminates
+ * the process with exit status 1; for any other value the function returns.
+ */
+static void send_result(int fd, char result)
+{
+       write(fd, &result, 1);
+
+       if (result != 1) {
+               return;
+       }
+       exit(1);
+}
+
+
+/*
+ * Print the two supported command line forms to stderr.
+ */
+static void usage(void)
+{
+       FILE *out = stderr;
+
+       fprintf(out, "\n");
+       fprintf(out, "Usage: %s <ctdbd-pid> <output-fd> RECORD <db-path> <db-key>\n",
+               progname);
+       fprintf(out, "       %s <ctdbd-pid> <output-fd> DB <db1-path> [<db2-path> ...]\n",
+               progname);
+}
+
+
+/*
+ * Open the database at dbpath and take the chain lock for one record.
+ *
+ * dbkey is the record key as a hex string, or the literal "NULL" for an
+ * empty key.
+ *
+ * Returns 0 once the lock is held, 1 (after printing a message to stderr)
+ * if the key cannot be decoded, the database cannot be opened or the lock
+ * cannot be obtained.  The database is left open on success.
+ */
+static int lock_record(const char *dbpath, const char *dbkey)
+{
+       TDB_DATA key;
+       struct tdb_context *tdb;
+
+       /* Convert hex key to key */
+       if (strcmp(dbkey, "NULL") == 0) {
+               key.dptr = NULL;
+               key.dsize = 0;
+       } else {
+               key.dptr = hex_decode_talloc(NULL, dbkey, &key.dsize);
+               if (key.dptr == NULL) {
+                       /* Without a decoded key, key.dsize cannot be
+                        * trusted either; do not lock on garbage. */
+                       fprintf(stderr, "%s: Error decoding key '%s'\n",
+                               progname, dbkey);
+                       return 1;
+               }
+       }
+
+       tdb = tdb_open(dbpath, 0, TDB_DEFAULT, O_RDWR, 0600);
+       if (tdb == NULL) {
+               fprintf(stderr, "%s: Error opening database %s\n", progname, dbpath);
+               return 1;
+       }
+
+       if (tdb_chainlock(tdb, key) < 0) {
+               fprintf(stderr, "%s: Error getting record lock (%s)\n",
+                       progname, tdb_errorstr(tdb));
+               return 1;
+       }
+
+       return 0;
+
+}
+
+
+/*
+ * Open the database at dbpath and take the lock on the whole database.
+ *
+ * Returns 0 once the lock is held, 1 (after printing a message to stderr)
+ * if the database cannot be opened or locked.  The database is left open.
+ */
+static int lock_db(const char *dbpath)
+{
+       struct tdb_context *db = tdb_open(dbpath, 0, TDB_DEFAULT, O_RDWR, 0600);
+
+       if (db == NULL) {
+               fprintf(stderr, "%s: Error opening database %s\n", progname, dbpath);
+               return 1;
+       }
+
+       if (tdb_lockall(db) >= 0) {
+               return 0;
+       }
+
+       fprintf(stderr, "%s: Error getting db lock (%s)\n",
+               progname, tdb_errorstr(db));
+       return 1;
+}
+
+
+/*
+ * Lock helper entry point.
+ *
+ *   argv[1]  pid of the process to watch
+ *   argv[2]  fd number the result byte is written to
+ *   argv[3]  "RECORD" or "DB"
+ *   argv[4+] database path(s), plus the key for RECORD
+ *
+ * Takes the requested lock(s), reports the result through send_result()
+ * and then polls every 5 seconds until the watched pid no longer exists.
+ */
+int main(int argc, char *argv[])
+{
+       const char *lock_type;
+       char result = 0;
+       int parent_pid;
+       int out_fd;
+       int want_record, want_db;
+
+       progname = argv[0];
+
+       if (argc < 4) {
+               usage();
+               exit(1);
+       }
+
+       parent_pid = atoi(argv[1]);
+       out_fd = atoi(argv[2]);
+       lock_type = argv[3];
+
+       want_record = (strcmp(lock_type, "RECORD") == 0);
+       want_db = (strcmp(lock_type, "DB") == 0);
+
+       if (!want_record && !want_db) {
+               fprintf(stderr, "%s: Invalid lock-type '%s'\n", progname, lock_type);
+               usage();
+               exit(1);
+       }
+
+       if (want_record) {
+               if (argc != 6) {
+                       fprintf(stderr, "%s: Invalid number of arguments (%d)\n",
+                               progname, argc);
+                       usage();
+                       exit(1);
+               }
+               result = lock_record(argv[4], argv[5]);
+       } else {
+               int i;
+
+               /* No databases given means nothing to lock; stop at the
+                * first database that cannot be locked */
+               for (i = 4; i < argc && result == 0; i++) {
+                       result = lock_db(argv[i]);
+               }
+       }
+
+       send_result(out_fd, result);
+
+       /* Stay alive until the watched process is gone (ESRCH) */
+       for (;;) {
+               if (kill(parent_pid, 0) != 0 && errno == ESRCH) {
+                       break;
+               }
+               sleep(5);
+       }
+       return 0;
+}
diff --git a/ctdb/server/ctdb_logging.c b/ctdb/server/ctdb_logging.c
new file mode 100644 (file)
index 0000000..218186e
--- /dev/null
@@ -0,0 +1,626 @@
+/* 
+   ctdb logging code
+
+   Copyright (C) Andrew Tridgell  2008
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "../include/ctdb_client.h"
+#include "../include/ctdb_private.h"
+#include "system/syslog.h"
+#include "system/time.h"
+#include "system/filesys.h"
+
+struct syslog_message {
+       uint32_t level;         /* priority handed to syslog() by the receiver */
+       uint32_t len;           /* strlen() of message, set by ctdb_syslog_log() */
+       char message[1];        /* NUL-terminated text; sender allocates past the struct end */
+};
+
+
+struct ctdb_syslog_state {
+       int syslog_fd;          /* UDP socket the syslog child recv()s messages on */
+       int fd[2];              /* pipe() pair shared between main daemon and syslog child */
+};
+
+static int syslogd_is_started = 0;
+
+
+/* called when a message datagram arrives on the syslog socket
+ * this is for the syslog daemon, we can not use DEBUG here
+ *
+ * Reads one struct syslog_message from state->syslog_fd and forwards its
+ * text to syslog() at the level stored in the message.  Failed reads and
+ * datagrams shorter than the message struct are silently dropped.
+ */
+static void ctdb_syslog_handler(struct event_context *ev, struct fd_event *fde,
+                                     uint16_t flags, void *p)
+{
+       struct ctdb_syslog_state *state = talloc_get_type(p, struct ctdb_syslog_state);
+
+       ssize_t count;
+       char str[65536];
+       struct syslog_message *msg;
+
+       if (state == NULL) {
+               return;
+       }
+
+       /* keep one byte spare so the text can always be NUL-terminated */
+       count = recv(state->syslog_fd, str, sizeof(str) - 1, 0);
+
+       /*
+        * Test for failure before the size comparison: comparing a negative
+        * count against sizeof() would promote it to a huge unsigned value
+        * and let the error slip through.
+        */
+       if (count < 0 || (size_t)count < sizeof(struct syslog_message)) {
+               return;
+       }
+
+       /* never trust the sender to have terminated the string */
+       str[count] = '\0';
+       msg = (struct syslog_message *)str;
+
+       syslog(msg->level, "%s", msg->message);
+}
+
+
+/* event callback for the pipe shared with the main daemon: once it fires
+ * the syslog child announces its shutdown and exits.
+ * this is for the syslog daemon, we can not use DEBUG here
+ */
+static void ctdb_syslog_terminate_handler(struct event_context *ev, struct fd_event *fde,
+                                     uint16_t flags, void *p)
+{
+       const int self = (int)getpid();
+
+       syslog(LOG_ERR, "Shutting down SYSLOG daemon with pid:%d", self);
+       _exit(0);
+}
+
+
+
+/*
+ * Fork the dedicated syslog child process.
+ * this is for the syslog daemon, we can not use DEBUG here
+ *
+ * Parent: waits on a startup pipe until the child reports that its socket
+ * is bound.  Returns 0 when the child signalled readiness, -1 when a pipe
+ * or the fork could not be created or the child went away before
+ * signalling.
+ *
+ * Child: binds a UDP socket on the loopback address, port CTDB_PORT,
+ * forwards every datagram to syslog() and never returns; it leaves via
+ * _exit().
+ */
+int start_syslog_daemon(struct ctdb_context *ctdb)
+{
+       struct sockaddr_in syslog_sin;
+       struct ctdb_syslog_state *state;
+       struct tevent_fd *fde;
+       int startup_fd[2];
+       int ret;
+
+       state = talloc(ctdb, struct ctdb_syslog_state);
+       CTDB_NO_MEMORY(ctdb, state);
+
+       if (pipe(state->fd) != 0) {
+               printf("Failed to create syslog pipe\n");
+               talloc_free(state);
+               return -1;
+       }
+
+       if (pipe(startup_fd) != 0) {
+               printf("Failed to create syslog startup pipe\n");
+               close(state->fd[0]);
+               close(state->fd[1]);
+               talloc_free(state);
+               return -1;
+       }
+
+       ctdb->syslogd_pid = ctdb_fork(ctdb);
+       if (ctdb->syslogd_pid == (pid_t)-1) {
+               printf("Failed to create syslog child process\n");
+               close(state->fd[0]);
+               close(state->fd[1]);
+               close(startup_fd[0]);
+               close(startup_fd[1]);
+               talloc_free(state);
+               return -1;
+       }
+
+       if (ctdb->syslogd_pid != 0) {
+               /* parent */
+               ssize_t n;
+               int dummy;
+
+               DEBUG(DEBUG_ERR,("Starting SYSLOG child process with pid:%d\n", (int)ctdb->syslogd_pid));
+
+               close(state->fd[1]);
+               set_close_on_exec(state->fd[0]);
+
+               close(startup_fd[1]);
+               n = read(startup_fd[0], &dummy, sizeof(dummy));
+               close(startup_fd[0]);
+
+               /*
+                * Check for a read error explicitly: a bare
+                * "n < sizeof(dummy)" compares unsigned, so -1 would
+                * look like success.  A short read or EOF means the
+                * child died before it was ready.
+                */
+               if (n < 0 || (size_t)n < sizeof(dummy)) {
+                       return -1;
+               }
+
+               syslogd_is_started = 1;
+               return 0;
+       }
+
+       /* child from here on - it must never return into the caller */
+       debug_extra = talloc_asprintf(NULL, "syslogd:");
+       talloc_free(ctdb->ev);
+       ctdb->ev = event_context_init(NULL);
+
+       syslog(LOG_ERR, "Starting SYSLOG daemon with pid:%d", (int)getpid());
+       ctdb_set_process_name("ctdb_syslogd");
+
+       close(state->fd[0]);
+       close(startup_fd[0]);
+       set_close_on_exec(state->fd[1]);
+       set_close_on_exec(startup_fd[1]);
+       fde = event_add_fd(ctdb->ev, state, state->fd[1], EVENT_FD_READ,
+                    ctdb_syslog_terminate_handler, state);
+       tevent_fd_set_auto_close(fde);
+
+       state->syslog_fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
+       if (state->syslog_fd == -1) {
+               printf("Failed to create syslog socket\n");
+               close(startup_fd[1]);
+               /*
+                * Exit instead of returning: returning here would let the
+                * forked child carry on executing the caller's code.  The
+                * parent notices the failure through EOF on the startup pipe.
+                */
+               _exit(10);
+       }
+
+       set_close_on_exec(state->syslog_fd);
+
+       /* zero first so padding / platform-specific members are defined */
+       memset(&syslog_sin, 0, sizeof(syslog_sin));
+       syslog_sin.sin_family = AF_INET;
+       syslog_sin.sin_port   = htons(CTDB_PORT);
+       syslog_sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+
+       if (bind(state->syslog_fd, (struct sockaddr *)&syslog_sin,
+                sizeof(syslog_sin)) == -1)
+       {
+               printf("syslog daemon failed to bind to socket. errno:%d(%s)\n", errno, strerror(errno));
+               close(startup_fd[1]);
+               _exit(10);
+       }
+
+
+       fde = event_add_fd(ctdb->ev, state, state->syslog_fd, EVENT_FD_READ,
+                    ctdb_syslog_handler, state);
+       tevent_fd_set_auto_close(fde);
+
+       /* Tell parent that we're up */
+       ret = 0;
+       write(startup_fd[1], &ret, sizeof(ret));
+       close(startup_fd[1]);
+
+       event_loop_wait(ctdb->ev);
+
+       /* this should not happen */
+       _exit(10);
+}
+
+struct ctdb_log_state {
+       struct ctdb_context *ctdb;      /* owning context, set when the state is created */
+       const char *prefix;             /* optional tag prepended to each line by write_to_log() */
+       int fd, pfd;                    /* fd: where ctdb_logfile_log() writes; pfd: pipe end read by ctdb_log_handler() */
+       char buf[1024];                 /* bytes read from pfd that have not been logged yet */
+       uint16_t buf_used;              /* number of valid bytes at the start of buf */
+       bool use_syslog;                /* set by ctdb_set_logfile() when syslog logging is selected */
+       void (*logfn)(const char *, uint16_t, void *);  /* optional per-line callback invoked by write_to_log() */
+       void *logfn_private;            /* passed as last argument to logfn */
+};
+
+/* we need this global to keep the DEBUG() syntax */
+static struct ctdb_log_state *log_state;
+
+/*
+  syslog logging function
+
+  Formats the message, prefixes it with debug_extra and maps
+  this_log_level onto a syslog priority.  Until the syslog child is
+  running the text goes straight to syslog(); afterwards it is sent as a
+  struct syslog_message datagram to the loopback address on CTDB_PORT.
+ */
+static void ctdb_syslog_log(const char *format, va_list ap)
+{
+       struct sockaddr_in syslog_sin;
+       struct syslog_message *msg;
+       size_t extra_len, text_len;
+       char *s = NULL;
+       int syslog_fd;
+       int level;
+       int len, ret;
+
+       ret = vasprintf(&s, format, ap);
+       if (ret == -1) {
+               return;
+       }
+
+       /* anything that is not a known level is logged as LOG_DEBUG */
+       switch (this_log_level) {
+       case DEBUG_EMERG:       level = LOG_EMERG;      break;
+       case DEBUG_ALERT:       level = LOG_ALERT;      break;
+       case DEBUG_CRIT:        level = LOG_CRIT;       break;
+       case DEBUG_ERR:         level = LOG_ERR;        break;
+       case DEBUG_WARNING:     level = LOG_WARNING;    break;
+       case DEBUG_NOTICE:      level = LOG_NOTICE;     break;
+       case DEBUG_INFO:        level = LOG_INFO;       break;
+       default:                level = LOG_DEBUG;      break;
+       }
+
+       extra_len = strlen(debug_extra);
+       text_len = strlen(s);
+
+       /* header + prefix + text + terminating NUL */
+       len = offsetof(struct syslog_message, message) + extra_len + text_len + 1;
+       msg = malloc(len);
+       if (msg == NULL) {
+               free(s);
+               return;
+       }
+       msg->level = level;
+       msg->len   = extra_len + text_len;
+       memcpy(msg->message, debug_extra, extra_len);
+       memcpy(msg->message + extra_len, s, text_len + 1);
+
+       if (syslogd_is_started == 0) {
+               syslog(msg->level, "%s", msg->message);
+               goto done;
+       }
+
+       syslog_fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
+       if (syslog_fd == -1) {
+               printf("Failed to create syslog socket\n");
+               goto done;
+       }
+
+       syslog_sin.sin_family = AF_INET;
+       syslog_sin.sin_port   = htons(CTDB_PORT);
+       syslog_sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+
+       /* the result is deliberately ignored: a failure cannot be logged */
+       ret = sendto(syslog_fd, msg, len, 0,
+                    (struct sockaddr *)&syslog_sin,
+                    sizeof(syslog_sin));
+
+       close(syslog_fd);
+
+done:
+       free(s);
+       free(msg);
+}
+
+
+/*
+  log file logging function
+
+  Writes "<date> <time>.<usec> [<debug_extra><pid>]: <message>" to
+  log_state->fd.  If formatting fails, a fixed error string is written
+  instead.
+ */
+static void ctdb_logfile_log(const char *format, va_list ap)
+{
+       const char *errstr;
+       struct timeval now;
+       struct tm *tm;
+       char stamp[100];
+       char *text = NULL;
+       char *line = NULL;
+       int ret;
+
+       ret = vasprintf(&text, format, ap);
+       if (ret == -1) {
+               errstr = "vasprintf failed\n";
+               write(log_state->fd, errstr, strlen(errstr));
+               return;
+       }
+
+       now = timeval_current();
+       tm = localtime(&now.tv_sec);
+
+       strftime(stamp, sizeof(stamp)-1, "%Y/%m/%d %H:%M:%S", tm);
+
+       ret = asprintf(&line, "%s.%06u [%s%5u]: %s",
+                      stamp, (unsigned)now.tv_usec,
+                      debug_extra, (unsigned)getpid(), text);
+       free(text);
+       if (ret == -1) {
+               errstr = "asprintf failed\n";
+               write(log_state->fd, errstr, strlen(errstr));
+               return;
+       }
+       if (line == NULL) {
+               return;
+       }
+
+       write(log_state->fd, line, strlen(line));
+       free(line);
+}
+
+/*
+  append formatted text to the log file without any timestamp/pid header
+ */
+static void ctdb_logfile_log_add(const char *format, va_list ap)
+{
+       char *text = NULL;
+
+       if (vasprintf(&text, format, ap) == -1) {
+               const char *errstr = "vasprintf failed\n";
+
+               write(log_state->fd, errstr, strlen(errstr));
+               return;
+       }
+
+       if (text == NULL) {
+               return;
+       }
+
+       write(log_state->fd, text, strlen(text));
+       free(text);
+}
+
+
+
+/*
+  choose the logfile location
+
+  use_syslog selects the syslog backend.  Otherwise a NULL logfile or "-"
+  logs to stdout (stderr is redirected onto stdout as well) and any other
+  name is opened in append mode.  Setup failures abort(); the function
+  always returns 0.
+*/
+int ctdb_set_logfile(struct ctdb_context *ctdb, const char *logfile, bool use_syslog)
+{
+       struct ctdb_log_state *state;
+
+       state = talloc_zero(ctdb, struct ctdb_log_state);
+       ctdb->log = state;
+       if (state == NULL) {
+               printf("talloc_zero failed\n");
+               abort();
+       }
+
+       state->ctdb = ctdb;
+       log_state = state;
+
+       if (use_syslog) {
+               do_debug_v = ctdb_syslog_log;
+               do_debug_add_v = ctdb_syslog_log;
+               state->use_syslog = true;
+               return 0;
+       }
+
+       /* both remaining variants use the file based writers */
+       do_debug_v = ctdb_logfile_log;
+       do_debug_add_v = ctdb_logfile_log_add;
+
+       if (logfile == NULL || strcmp(logfile, "-") == 0) {
+               state->fd = 1;
+               /* also catch stderr of subcommands to stdout */
+               if (dup2(1, 2) == -1) {
+                       printf("dup2 failed: %s\n", strerror(errno));
+                       abort();
+               }
+               return 0;
+       }
+
+       state->fd = open(logfile, O_WRONLY|O_APPEND|O_CREAT, 0666);
+       if (state->fd == -1) {
+               printf("Failed to open logfile %s\n", logfile);
+               abort();
+       }
+
+       return 0;
+}
+
+/*
+ * Emit one line of child output through do_debug(), prefixed with
+ * log->prefix when one is set, and hand the same bytes to the optional
+ * log->logfn callback.  Nothing happens unless script_log_level is
+ * within LogLevel.
+ *
+ * Note that do_debug always uses the global log state.
+ */
+static void write_to_log(struct ctdb_log_state *log,
+                        const char *buf, unsigned int len)
+{
+       if (script_log_level > LogLevel) {
+               return;
+       }
+
+       if (log != NULL && log->prefix != NULL) {
+               do_debug("%s: %*.*s\n", log->prefix, len, len, buf);
+       } else {
+               do_debug("%*.*s\n", len, len, buf);
+       }
+
+       /*
+        * log it in the eventsystem as well.  The branch above allows
+        * log == NULL, so guard the dereference here too, and pass on the
+        * buffer we were actually given rather than assuming it is log->buf.
+        */
+       if (log != NULL && log->logfn != NULL) {
+               log->logfn(buf, len, log->logfn_private);
+       }
+}
+
+/*
+  called when log data comes in from a child process
+
+  Appends what it can read from log->pfd to log->buf and logs every
+  complete line (a trailing \r is dropped).  On EOF the state is freed
+  unless it is the global log state.
+ */
+static void ctdb_log_handler(struct event_context *ev, struct fd_event *fde,
+                            uint16_t flags, void *private)
+{
+       struct ctdb_log_state *log = talloc_get_type(private, struct ctdb_log_state);
+       char *eol;
+       int nread;
+
+       if ((flags & EVENT_FD_READ) == 0) {
+               return;
+       }
+
+       nread = read(log->pfd, &log->buf[log->buf_used],
+                    sizeof(log->buf) - log->buf_used);
+       if (nread == 0) {
+               /* writer closed the pipe */
+               if (log != log_state) {
+                       talloc_free(log);
+               }
+               return;
+       }
+       if (nread > 0) {
+               log->buf_used += nread;
+       }
+
+       this_log_level = script_log_level;
+
+       for (;;) {
+               int consumed;
+               int linelen;
+
+               if (log->buf_used == 0) {
+                       break;
+               }
+               eol = memchr(log->buf, '\n', log->buf_used);
+               if (eol == NULL) {
+                       break;
+               }
+
+               /* consumed includes the newline, linelen does not */
+               consumed = (eol - log->buf)+1;
+               linelen = consumed - 1;
+
+               /* swallow \r from child processes */
+               if (linelen > 0 && log->buf[linelen-1] == '\r') {
+                       linelen--;
+               }
+
+               write_to_log(log, log->buf, linelen);
+               memmove(log->buf, eol+1, sizeof(log->buf) - consumed);
+               log->buf_used -= consumed;
+       }
+
+       /* a completely full buffer without any newline has to be dumped
+          out straight away, there is no room left to wait for the rest */
+       if (log->buf_used == sizeof(log->buf)) {
+               write_to_log(log, log->buf, log->buf_used);
+               log->buf_used = 0;
+       }
+}
+
+/*
+  talloc destructor: log whatever is still buffered, always returns 0
+ */
+static int log_context_destructor(struct ctdb_log_state *log)
+{
+       if (log->buf_used == 0) {
+               return 0;
+       }
+
+       /* Flush buffer in case it wasn't \n-terminated. */
+       this_log_level = script_log_level;
+       write_to_log(log, log->buf, log->buf_used);
+
+       return 0;
+}
+
+/*
+   fork(), redirecting child output to logging and specified callback.
+
+   In the child, stdout and stderr are pointed at the write end of a new
+   pipe.  In the parent, the read end is watched by ctdb_log_handler.
+   The fork result is stored in *pid.  Returns the log state in both
+   processes, or NULL if the pipe or the fork failed.
+*/
+struct ctdb_log_state *ctdb_fork_with_logging(TALLOC_CTX *mem_ctx,
+                                             struct ctdb_context *ctdb,
+                                             const char *log_prefix,
+                                             void (*logfn)(const char *, uint16_t, void *),
+                                             void *logfn_private, pid_t *pid)
+{
+       struct ctdb_log_state *log;
+       struct tevent_fd *fde;
+       int pipefd[2];
+
+       log = talloc_zero(mem_ctx, struct ctdb_log_state);
+       CTDB_NO_MEMORY_NULL(ctdb, log);
+       log->ctdb = ctdb;
+       log->prefix = log_prefix;
+       log->logfn = logfn;
+       log->logfn_private = (void *)logfn_private;
+
+       if (pipe(pipefd) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to setup for child logging pipe\n"));
+               talloc_free(log);
+               return NULL;
+       }
+
+       *pid = ctdb_fork(ctdb);
+
+       if (*pid == 0) {
+               /* child: route stdout and stderr into the pipe */
+               close(STDOUT_FILENO);
+               close(STDERR_FILENO);
+               dup2(pipefd[1], STDOUT_FILENO);
+               dup2(pipefd[1], STDERR_FILENO);
+               close(pipefd[0]);
+               close(pipefd[1]);
+               return log;
+       }
+
+       /* parent (or failed fork): the write end is not needed here */
+       close(pipefd[1]);
+
+       if (*pid < 0) {
+               DEBUG(DEBUG_ERR, (__location__ " fork failed for child process\n"));
+               close(pipefd[0]);
+               talloc_free(log);
+               return NULL;
+       }
+
+       log->pfd = pipefd[0];
+       set_close_on_exec(log->pfd);
+       talloc_set_destructor(log, log_context_destructor);
+       fde = event_add_fd(ctdb->ev, log, log->pfd,
+                          EVENT_FD_READ, ctdb_log_handler, log);
+       tevent_fd_set_auto_close(fde);
+
+       return log;
+}
+
+/*
+  setup for logging of child process stdout
+
+  Replaces this process' stdout and stderr with the write end of a pipe
+  and registers ctdb_log_handler on the read end.  Nothing is done when
+  logging already goes to stdout.
+
+  Returns 0 on success (or when nothing had to be done), -1 on failure.
+  On failure every descriptor opened here is closed again.
+*/
+int ctdb_set_child_logging(struct ctdb_context *ctdb)
+{
+       int p[2];
+       int old_stdout, old_stderr;
+       struct tevent_fd *fde;
+
+       if (ctdb->log->fd == STDOUT_FILENO) {
+               /* not needed for stdout logging */
+               return 0;
+       }
+
+       /* setup a pipe to catch IO from subprocesses */
+       if (pipe(p) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to setup for child logging pipe\n"));
+               return -1;
+       }
+
+       /* We'll fail if stderr/stdout not already open; it's simpler. */
+       old_stdout = dup(STDOUT_FILENO);
+       old_stderr = dup(STDERR_FILENO);
+       if (old_stdout < 0 || old_stderr < 0) {
+               DEBUG(DEBUG_ERR, ("Failed to dup stdout/stderr for child logging\n"));
+               /* release whatever was obtained so far, including the pipe */
+               if (old_stdout >= 0) {
+                       close(old_stdout);
+               }
+               if (old_stderr >= 0) {
+                       close(old_stderr);
+               }
+               close(p[0]);
+               close(p[1]);
+               return -1;
+       }
+       if (dup2(p[1], STDOUT_FILENO) < 0 || dup2(p[1], STDERR_FILENO) < 0) {
+               /* put the original descriptors back before reporting */
+               int saved_errno = errno;
+               dup2(old_stdout, STDOUT_FILENO);
+               dup2(old_stderr, STDERR_FILENO);
+               close(old_stdout);
+               close(old_stderr);
+               close(p[0]);
+               close(p[1]);
+               errno = saved_errno;
+
+               printf(__location__ " dup2 failed: %s\n",
+                       strerror(errno));
+               return -1;
+       }
+       close(p[1]);
+       close(old_stdout);
+       close(old_stderr);
+
+       fde = event_add_fd(ctdb->ev, ctdb->log, p[0],
+                          EVENT_FD_READ, ctdb_log_handler, ctdb->log);
+       tevent_fd_set_auto_close(fde);
+
+       ctdb->log->pfd = p[0];
+
+       DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for logging\n", p[0]));
+
+       return 0;
+}
+
+
+/*
+ * log handler that catches logging from TEVENT
+ *
+ * Translates the tevent level to a ctdb debug level (FATAL and anything
+ * unrecognised become DEBUG_CRIT) and forwards the message to
+ * do_debug_v() when that level is within LogLevel.
+ */
+static void ctdb_tevent_logging(void *private_data,
+                               enum tevent_debug_level level,
+                               const char *fmt,
+                               va_list ap)
+{
+       enum debug_level lvl;
+
+       if (level == TEVENT_DEBUG_ERROR) {
+               lvl = DEBUG_ERR;
+       } else if (level == TEVENT_DEBUG_WARNING) {
+               lvl = DEBUG_WARNING;
+       } else if (level == TEVENT_DEBUG_TRACE) {
+               lvl = DEBUG_DEBUG;
+       } else {
+               /* TEVENT_DEBUG_FATAL and any unknown value */
+               lvl = DEBUG_CRIT;
+       }
+
+       if (lvl > LogLevel) {
+               return;
+       }
+
+       this_log_level = lvl;
+       do_debug_v(fmt, ap);
+}
+
+/*
+ * install ctdb_tevent_logging as debug handler of ctdb->ev and pass on
+ * the result of tevent_set_debug()
+ */
+int ctdb_init_tevent_logging(struct ctdb_context *ctdb)
+{
+       return tevent_set_debug(ctdb->ev, ctdb_tevent_logging, ctdb);
+}
+
+
+       
diff --git a/ctdb/server/ctdb_ltdb_server.c b/ctdb/server/ctdb_ltdb_server.c
new file mode 100644 (file)
index 0000000..234ecac
--- /dev/null
@@ -0,0 +1,1513 @@
+/* 
+   ctdb ltdb code - server side
+
+   Copyright (C) Andrew Tridgell  2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "tdb.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/dir.h"
+#include "system/time.h"
+#include "../include/ctdb_private.h"
+#include "../common/rb_tree.h"
+#include "db_wrap.h"
+#include "lib/util/dlinklist.h"
+#include <ctype.h>
+
+#define PERSISTENT_HEALTH_TDB "persistent_health.tdb"
+
+/**
+ * write a record to a normal database
+ *
+ * This is the server-variant of the ctdb_ltdb_store function.
+ * It contains logic to determine whether a record should be
+ * stored or deleted. It also sends SCHEDULE_FOR_DELETION
+ * controls to the local ctdb daemon if appropriate.
+ *
+ * Note that header is modified: the rsn may be incremented and the
+ * VACUUM_MIGRATED and AUTOMATIC flags are always cleared.
+ *
+ * Returns the result of the underlying tdb_store()/tdb_delete() call.
+ */
+static int ctdb_ltdb_store_server(struct ctdb_db_context *ctdb_db,
+                                 TDB_DATA key,
+                                 struct ctdb_ltdb_header *header,
+                                 TDB_DATA data)
+{
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+       TDB_DATA rec;
+       int ret;
+       bool seqnum_suppressed = false;
+       bool keep = false;
+       bool schedule_for_deletion = false;
+       bool remove_from_delete_queue = false;
+       uint32_t lmaster;
+
+       if (ctdb->flags & CTDB_FLAG_TORTURE) {
+               struct ctdb_ltdb_header *h2;
+               rec = tdb_fetch(ctdb_db->ltdb->tdb, key);
+               h2 = (struct ctdb_ltdb_header *)rec.dptr;
+               /*
+                * Compare against the size of the header itself, not of
+                * the pointer, before looking inside the stored record.
+                */
+               if (rec.dptr && rec.dsize >= sizeof(*h2) && h2->rsn > header->rsn) {
+                       DEBUG(DEBUG_CRIT,("RSN regression! %llu %llu\n",
+                                (unsigned long long)h2->rsn, (unsigned long long)header->rsn));
+               }
+               if (rec.dptr) free(rec.dptr);
+       }
+
+       if (ctdb->vnn_map == NULL) {
+               /*
+                * Called from a client: always store the record
+                * Also don't call ctdb_lmaster since it uses the vnn_map!
+                */
+               keep = true;
+               goto store;
+       }
+
+       lmaster = ctdb_lmaster(ctdb_db->ctdb, &key);
+
+       /*
+        * If we migrate an empty record off to another node
+        * and the record has not been migrated with data,
+        * delete the record instead of storing the empty record.
+        */
+       if (data.dsize != 0) {
+               keep = true;
+       } else if (header->flags & CTDB_REC_RO_FLAGS) {
+               keep = true;
+       } else if (ctdb_db->persistent) {
+               keep = true;
+       } else if (header->flags & CTDB_REC_FLAG_AUTOMATIC) {
+               /*
+                * The record is not created by the client but
+                * automatically by the ctdb_ltdb_fetch logic that
+                * creates a record with an initial header in the
+                * ltdb before trying to migrate the record from
+                * the current lmaster. Keep it instead of trying
+                * to delete the non-existing record...
+                */
+               keep = true;
+               schedule_for_deletion = true;
+       } else if (header->flags & CTDB_REC_FLAG_MIGRATED_WITH_DATA) {
+               keep = true;
+       } else if (ctdb_db->ctdb->pnn == lmaster) {
+               /*
+                * If we are lmaster, then we usually keep the record.
+                * But if we retrieve the dmaster role by a VACUUM_MIGRATE
+                * and the record is empty and has never been migrated
+                * with data, then we should delete it instead of storing it.
+                * This is part of the vacuuming process.
+                *
+                * The reason that we usually need to store even empty records
+                * on the lmaster is that a client operating directly on the
+                * lmaster (== dmaster) expects the local copy of the record to
+                * exist after successful ctdb migrate call. If the record does
+                * not exist, the client goes into a migrate loop and eventually
+                * fails. So storing the empty record makes sure that we do not
+                * need to change the client code.
+                */
+               if (!(header->flags & CTDB_REC_FLAG_VACUUM_MIGRATED)) {
+                       keep = true;
+               } else if (ctdb_db->ctdb->pnn != header->dmaster) {
+                       keep = true;
+               }
+       } else if (ctdb_db->ctdb->pnn == header->dmaster) {
+               keep = true;
+       }
+
+       if (keep) {
+               if (!ctdb_db->persistent &&
+                   (ctdb_db->ctdb->pnn == header->dmaster) &&
+                   !(header->flags & CTDB_REC_RO_FLAGS))
+               {
+                       header->rsn++;
+
+                       if (data.dsize == 0) {
+                               schedule_for_deletion = true;
+                       }
+               }
+               remove_from_delete_queue = !schedule_for_deletion;
+       }
+
+store:
+       /*
+        * The VACUUM_MIGRATED flag is only set temporarily for
+        * the above logic when the record was retrieved by a
+        * VACUUM_MIGRATE call and should not be stored in the
+        * database.
+        *
+        * The VACUUM_MIGRATE call is triggered by a vacuum fetch,
+        * and there are two cases in which the corresponding record
+        * is stored in the local database:
+        * 1. The record has been migrated with data in the past
+        *    (the MIGRATED_WITH_DATA record flag is set).
+        * 2. The record has been filled with data again since it
+        *    had been submitted in the VACUUM_FETCH message to the
+        *    lmaster.
+        * For such records it is important to not store the
+        * VACUUM_MIGRATED flag in the database.
+        */
+       header->flags &= ~CTDB_REC_FLAG_VACUUM_MIGRATED;
+
+       /*
+        * Similarly, clear the AUTOMATIC flag which should not enter
+        * the local database copy since this would require client
+        * modifications to clear the flag when the client stores
+        * the record.
+        */
+       header->flags &= ~CTDB_REC_FLAG_AUTOMATIC;
+
+       rec.dsize = sizeof(*header) + data.dsize;
+       rec.dptr = talloc_size(ctdb, rec.dsize);
+       CTDB_NO_MEMORY(ctdb, rec.dptr);
+
+       memcpy(rec.dptr, header, sizeof(*header));
+       memcpy(rec.dptr + sizeof(*header), data.dptr, data.dsize);
+
+       /* Databases with seqnum updates enabled only get their seqnum
+          changes when/if we modify the data */
+       if (ctdb_db->seqnum_update != NULL) {
+               TDB_DATA old;
+               old = tdb_fetch(ctdb_db->ltdb->tdb, key);
+
+               /* only the payload after the header is compared */
+               if ( (old.dsize == rec.dsize)
+               && !memcmp(old.dptr+sizeof(struct ctdb_ltdb_header),
+                         rec.dptr+sizeof(struct ctdb_ltdb_header),
+                         rec.dsize-sizeof(struct ctdb_ltdb_header)) ) {
+                       tdb_remove_flags(ctdb_db->ltdb->tdb, TDB_SEQNUM);
+                       seqnum_suppressed = true;
+               }
+               if (old.dptr) free(old.dptr);
+       }
+
+       DEBUG(DEBUG_DEBUG, (__location__ " db[%s]: %s record: hash[0x%08x]\n",
+                           ctdb_db->db_name,
+                           keep?"storing":"deleting",
+                           ctdb_hash(&key)));
+
+       if (keep) {
+               ret = tdb_store(ctdb_db->ltdb->tdb, key, rec, TDB_REPLACE);
+       } else {
+               ret = tdb_delete(ctdb_db->ltdb->tdb, key);
+       }
+
+       if (ret != 0) {
+               int lvl = DEBUG_ERR;
+
+               /* deleting a record that does not exist is not worth an error */
+               if (keep == false &&
+                   tdb_error(ctdb_db->ltdb->tdb) == TDB_ERR_NOEXIST)
+               {
+                       lvl = DEBUG_DEBUG;
+               }
+
+               DEBUG(lvl, (__location__ " db[%s]: Failed to %s record: "
+                           "%d - %s\n",
+                           ctdb_db->db_name,
+                           keep?"store":"delete", ret,
+                           tdb_errorstr(ctdb_db->ltdb->tdb)));
+
+               schedule_for_deletion = false;
+               remove_from_delete_queue = false;
+       }
+       if (seqnum_suppressed) {
+               tdb_add_flags(ctdb_db->ltdb->tdb, TDB_SEQNUM);
+       }
+
+       talloc_free(rec.dptr);
+
+       if (schedule_for_deletion) {
+               int ret2;
+               ret2 = ctdb_local_schedule_for_deletion(ctdb_db, header, key);
+               if (ret2 != 0) {
+                       DEBUG(DEBUG_ERR, (__location__ " ctdb_local_schedule_for_deletion failed.\n"));
+               }
+       }
+
+       if (remove_from_delete_queue) {
+               ctdb_local_remove_from_delete_queue(ctdb_db, header, key);
+       }
+
+       return ret;
+}
+
+struct lock_fetch_state {
+       struct ctdb_context *ctdb;      /* context whose vnn_map generation is checked on retry */
+       void (*recv_pkt)(void *, struct ctdb_req_header *);     /* called with recv_context and hdr to re-submit the packet */
+       void *recv_context;             /* first argument for recv_pkt */
+       struct ctdb_req_header *hdr;    /* the deferred request packet */
+       uint32_t generation;            /* vnn_map generation recorded when the request was deferred */
+       bool ignore_generation;         /* when true the generation check in the callback is skipped */
+};
+
+/*
+  called when we should retry the operation
+
+  Re-submits the deferred packet via state->recv_pkt.  Unless
+  ignore_generation is set, a packet queued under a different vnn_map
+  generation is dropped instead.
+ */
+static void lock_fetch_callback(void *p, bool locked)
+{
+       struct lock_fetch_state *state = talloc_get_type(p, struct lock_fetch_state);
+
+       if (!state->ignore_generation) {
+               if (state->generation != state->ctdb->vnn_map->generation) {
+                       DEBUG(DEBUG_NOTICE,("Discarding previous generation lockwait packet\n"));
+                       talloc_free(state->hdr);
+                       return;
+               }
+       }
+
+       state->recv_pkt(state->recv_context, state->hdr);
+       DEBUG(DEBUG_INFO,(__location__ " PACKET REQUEUED\n"));
+}
+
+
+/*
+  do a non-blocking ltdb_lock, deferring this ctdb request until we
+  have the chainlock
+
+  It does the following:
+
+   1) tries to get the chainlock. If it succeeds, then it returns 0
+
+   2) if it fails to get a chainlock immediately then it sets up a
+   non-blocking chainlock via ctdb_lock_record, and when it gets the
+   chainlock it re-submits this ctdb request to the main packet
+   receive function.
+
+   This effectively queues all ctdb requests that cannot be
+   immediately satisfied until it can get the lock. This means that
+   the main ctdb daemon will not block waiting for a chainlock held by
+   a client
+
+   There are 3 possible return values:
+
+       0:    means that it got the lock immediately.
+      -1:    means that it failed to get the lock, and won't retry
+      -2:    means that it failed to get the lock immediately, but will retry
+ */
+int ctdb_ltdb_lock_requeue(struct ctdb_db_context *ctdb_db,
+                          TDB_DATA key, struct ctdb_req_header *hdr,
+                          void (*recv_pkt)(void *, struct ctdb_req_header *),
+                          void *recv_context, bool ignore_generation)
+{
+       int ret;
+       struct tdb_context *tdb = ctdb_db->ltdb->tdb;
+       struct lock_request *lreq;
+       struct lock_fetch_state *state;
+
+       ret = tdb_chainlock_nonblock(tdb, key);
+
+       if (ret != 0 &&
+           !(errno == EACCES || errno == EAGAIN || errno == EDEADLK)) {
+               /* a hard failure - don't try again */
+               return -1;
+       }
+
+       /*
+        * when torturing, ensure we test the contended path.
+        * Only give the lock back if we actually obtained it; unlocking
+        * a chain we never locked would be wrong.
+        */
+       if (ret == 0 &&
+           (ctdb_db->ctdb->flags & CTDB_FLAG_TORTURE) &&
+           random() % 5 == 0) {
+               tdb_chainunlock(tdb, key);
+               ret = -1;
+       }
+
+       /* first the non-contended path */
+       if (ret == 0) {
+               return 0;
+       }
+
+       state = talloc(hdr, struct lock_fetch_state);
+       if (state == NULL) {
+               /* the lock is not held at this point, so just give up */
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory for lock_fetch_state\n"));
+               return -1;
+       }
+       state->ctdb = ctdb_db->ctdb;
+       state->hdr = hdr;
+       state->recv_pkt = recv_pkt;
+       state->recv_context = recv_context;
+       state->generation = ctdb_db->ctdb->vnn_map->generation;
+       state->ignore_generation = ignore_generation;
+
+       /* now the contended path */
+       lreq = ctdb_lock_record(ctdb_db, key, true, lock_fetch_callback, state);
+       if (lreq == NULL) {
+               /* nobody will ever call back, so the state is of no use */
+               talloc_free(state);
+               return -1;
+       }
+
+       /* we need to move the packet off the temporary context in ctdb_input_pkt(),
+          so it won't be freed yet */
+       talloc_steal(state, hdr);
+
+       /* now tell the caller that we will retry asynchronously */
+       return -2;
+}
+
+/*
+  a variant of ctdb_ltdb_lock_requeue that also fetches the record
+
+  If the lock was obtained immediately, the record is fetched; if the
+  fetch fails, the lock is released again and the fetch error returned.
+  Any non-zero result of ctdb_ltdb_lock_requeue is passed through as is.
+ */
+int ctdb_ltdb_lock_fetch_requeue(struct ctdb_db_context *ctdb_db,
+                                TDB_DATA key, struct ctdb_ltdb_header *header,
+                                struct ctdb_req_header *hdr, TDB_DATA *data,
+                                void (*recv_pkt)(void *, struct ctdb_req_header *),
+                                void *recv_context, bool ignore_generation)
+{
+       int ret;
+       int uret;
+
+       ret = ctdb_ltdb_lock_requeue(ctdb_db, key, hdr, recv_pkt,
+                                    recv_context, ignore_generation);
+       if (ret != 0) {
+               return ret;
+       }
+
+       ret = ctdb_ltdb_fetch(ctdb_db, key, header, hdr, data);
+       if (ret == 0) {
+               return 0;
+       }
+
+       /* fetch failed: do not keep holding the lock */
+       uret = ctdb_ltdb_unlock(ctdb_db, key);
+       if (uret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_ltdb_unlock() failed with error %d\n", uret));
+       }
+
+       return ret;
+}
+
+
+/*
+  paranoid check to see if the db is empty.
+  Any non-zero result from tdb_traverse_read() is treated as fatal.
+ */
+static void ctdb_check_db_empty(struct ctdb_db_context *ctdb_db)
+{
+       int nrecords;
+
+       nrecords = tdb_traverse_read(ctdb_db->ltdb->tdb, NULL, NULL);
+       if (nrecords == 0) {
+               return;
+       }
+
+       DEBUG(DEBUG_ALERT,(__location__ " tdb '%s' not empty on attach! aborting\n",
+                ctdb_db->db_path));
+       ctdb_fatal(ctdb_db->ctdb, "database not empty on attach");
+}
+
+/*
+  reload ctdb_db->unhealthy_reason from the persistent health tdb.
+
+  The record is looked up under the database name (without the trailing
+  NUL). A non-empty record becomes the new unhealthy_reason; a missing
+  or empty record leaves unhealthy_reason as NULL.
+  Returns 0 on success. On allocation failure returns -1 and keeps the
+  previous unhealthy_reason.
+ */
+int ctdb_load_persistent_health(struct ctdb_context *ctdb,
+                               struct ctdb_db_context *ctdb_db)
+{
+       struct tdb_context *health_tdb = ctdb->db_persistent_health->tdb;
+       char *previous = ctdb_db->unhealthy_reason;
+       char *loaded = NULL;
+       TDB_DATA name_key;
+       TDB_DATA record;
+
+       name_key.dptr = discard_const_p(uint8_t, ctdb_db->db_name);
+       name_key.dsize = strlen(ctdb_db->db_name);
+
+       ctdb_db->unhealthy_reason = NULL;
+
+       record = tdb_fetch(health_tdb, name_key);
+       if (record.dsize > 0) {
+               loaded = talloc_strndup(ctdb_db,
+                                       (const char *)record.dptr,
+                                       record.dsize);
+       }
+
+       if (record.dsize > 0 && loaded == NULL) {
+               DEBUG(DEBUG_ALERT,(__location__ " talloc_strndup(%d) failed\n",
+                                  (int)record.dsize));
+               /* put the previous reason back before bailing out */
+               ctdb_db->unhealthy_reason = previous;
+               free(record.dptr);
+               return -1;
+       }
+
+       /* free(NULL) is a no-op, so no guard is needed here */
+       free(record.dptr);
+
+       talloc_free(previous);
+       ctdb_db->unhealthy_reason = loaded;
+       return 0;
+}
+
+int ctdb_update_persistent_health(struct ctdb_context *ctdb,
+                                 struct ctdb_db_context *ctdb_db,
+                                 const char *given_reason,/* NULL means healthy */
+                                 int num_healthy_nodes)
+{
+       struct tdb_context *tdb = ctdb->db_persistent_health->tdb;
+       int ret;
+       TDB_DATA key;
+       TDB_DATA val;
+       char *new_reason = NULL;
+       char *old_reason = NULL;
+
+       ret = tdb_transaction_start(tdb);
+       if (ret != 0) {
+               DEBUG(DEBUG_ALERT,(__location__ " tdb_transaction_start('%s') failed: %d - %s\n",
+                                  tdb_name(tdb), ret, tdb_errorstr(tdb)));
+               return -1;
+       }
+
+       ret = ctdb_load_persistent_health(ctdb, ctdb_db);
+       if (ret != 0) {
+               DEBUG(DEBUG_ALERT,(__location__ " ctdb_load_persistent_health('%s') failed: %d\n",
+                                  ctdb_db->db_name, ret));
+               return -1;
+       }
+       old_reason = ctdb_db->unhealthy_reason;
+
+       key.dptr = discard_const_p(uint8_t, ctdb_db->db_name);
+       key.dsize = strlen(ctdb_db->db_name);
+
+       if (given_reason) {
+               new_reason = talloc_strdup(ctdb_db, given_reason);
+               if (new_reason == NULL) {
+                       DEBUG(DEBUG_ALERT,(__location__ " talloc_strdup(%s) failed\n",
+                                         given_reason));
+                       return -1;
+               }
+       } else if (old_reason && num_healthy_nodes == 0) {
+               /*
+                * If the reason indicates ok, but there where no healthy nodes
+                * available, that it means, we have not recovered valid content
+                * of the db. So if there's an old reason, prefix it with
+                * "NO-HEALTHY-NODES - "
+                */
+               const char *prefix;
+
+#define _TMP_PREFIX "NO-HEALTHY-NODES - "
+               ret = strncmp(_TMP_PREFIX, old_reason, strlen(_TMP_PREFIX));
+               if (ret != 0) {
+                       prefix = _TMP_PREFIX;
+               } else {
+                       prefix = "";
+               }
+               new_reason = talloc_asprintf(ctdb_db, "%s%s",
+                                        prefix, old_reason);
+               if (new_reason == NULL) {
+                       DEBUG(DEBUG_ALERT,(__location__ " talloc_asprintf(%s%s) failed\n",
+                                         prefix, old_reason));
+                       return -1;
+               }
+#undef _TMP_PREFIX
+       }
+
+       if (new_reason) {
+               val.dptr = discard_const_p(uint8_t, new_reason);
+               val.dsize = strlen(new_reason);
+
+               ret = tdb_store(tdb, key, val, TDB_REPLACE);
+               if (ret != 0) {
+                       tdb_transaction_cancel(tdb);
+                       DEBUG(DEBUG_ALERT,(__location__ " tdb_store('%s', %s, %s) failed: %d - %s\n",
+                                          tdb_name(tdb), ctdb_db->db_name, new_reason,
+                                          ret, tdb_errorstr(tdb)));
+                       talloc_free(new_reason);
+                       return -1;
+               }
+               DEBUG(DEBUG_ALERT,("Updated db health for db(%s) to: %s\n",
+                                  ctdb_db->db_name, new_reason));
+       } else if (old_reason) {
+               ret = tdb_delete(tdb, key);
+               if (ret != 0) {
+                       tdb_transaction_cancel(tdb);
+                       DEBUG(DEBUG_ALERT,(__location__ " tdb_delete('%s', %s) failed: %d - %s\n",
+                                          tdb_name(tdb), ctdb_db->db_name,
+                                          ret, tdb_errorstr(tdb)));
+                       talloc_free(new_reason);
+                       return -1;
+               }
+               DEBUG(DEBUG_NOTICE,("Updated db health for db(%s): OK\n",
+                                  ctdb_db->db_name));
+       }
+
+       ret = tdb_transaction_commit(tdb);
+       if (ret != TDB_SUCCESS) {
+               DEBUG(DEBUG_ALERT,(__location__ " tdb_transaction_commit('%s') failed: %d - %s\n",
+                                  tdb_name(tdb), ret, tdb_errorstr(tdb)));
+               talloc_free(new_reason);
+               return -1;
+       }
+
+       talloc_free(old_reason);
+       ctdb_db->unhealthy_reason = new_reason;
+
+       return 0;
+}
+
+/*
+  move a corrupted persistent tdb out of the way.
+
+  The database is first marked unhealthy in the persistent health tdb
+  (with a reason naming the backup file), then db_path is renamed to
+  "<db_path>.corrupted.<UTC timestamp>.0Z".
+  Returns 0 on success, -1 on failure.
+ */
+static int ctdb_backup_corrupted_tdb(struct ctdb_context *ctdb,
+                                    struct ctdb_db_context *ctdb_db)
+{
+       time_t now = time(NULL);
+       char *new_path;
+       char *new_reason;
+       int ret;
+       struct tm *tm;
+
+       tm = gmtime(&now);
+       if (tm == NULL) {
+               /* gmtime() may return NULL; don't dereference it below */
+               DEBUG(DEBUG_CRIT,(__location__ " gmtime() failed\n"));
+               return -1;
+       }
+
+       /* formatted like: foo.tdb.0.corrupted.20091204160825.0Z */
+       new_path = talloc_asprintf(ctdb_db, "%s.corrupted."
+                                  "%04u%02u%02u%02u%02u%02u.0Z",
+                                  ctdb_db->db_path,
+                                  tm->tm_year+1900, tm->tm_mon+1,
+                                  tm->tm_mday, tm->tm_hour, tm->tm_min,
+                                  tm->tm_sec);
+       if (new_path == NULL) {
+               DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+               return -1;
+       }
+
+       new_reason = talloc_asprintf(ctdb_db,
+                                    "ERROR - Backup of corrupted TDB in '%s'",
+                                    new_path);
+       if (new_reason == NULL) {
+               DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+               talloc_free(new_path);
+               return -1;
+       }
+
+       /* record the reason before touching the file */
+       ret = ctdb_update_persistent_health(ctdb, ctdb_db, new_reason, 0);
+       talloc_free(new_reason);
+       if (ret != 0) {
+               DEBUG(DEBUG_CRIT,(__location__
+                                ": ctdb_update_persistent_health(%s) failed\n",
+                                ctdb_db->db_path));
+               talloc_free(new_path);
+               return -1;
+       }
+
+       ret = rename(ctdb_db->db_path, new_path);
+       if (ret != 0) {
+               DEBUG(DEBUG_CRIT,(__location__
+                                 ": ctdb_backup_corrupted_tdb(%s) rename to %s failed: %d - %s\n",
+                                 ctdb_db->db_path, new_path,
+                                 errno, strerror(errno)));
+               talloc_free(new_path);
+               return -1;
+       }
+
+       DEBUG(DEBUG_CRIT,(__location__
+                        ": ctdb_backup_corrupted_tdb(%s) renamed to %s\n",
+                        ctdb_db->db_path, new_path));
+       talloc_free(new_path);
+       return 0;
+}
+
+/*
+  reload the health record of every persistent database on ctdb->db_list
+  and log a summary.
+  Returns 0 if all of them are healthy, -1 if at least one is unhealthy
+  or if loading a health record failed.
+ */
+int ctdb_recheck_persistent_health(struct ctdb_context *ctdb)
+{
+       struct ctdb_db_context *db;
+       int healthy = 0;
+       int unhealthy = 0;
+
+       for (db = ctdb->db_list; db != NULL; db = db->next) {
+               /* only persistent databases have a health record */
+               if (!db->persistent) {
+                       continue;
+               }
+
+               if (ctdb_load_persistent_health(ctdb, db) != 0) {
+                       DEBUG(DEBUG_ALERT,(__location__
+                                          " load persistent health for '%s' failed\n",
+                                          db->db_path));
+                       return -1;
+               }
+
+               if (db->unhealthy_reason != NULL) {
+                       unhealthy++;
+                       DEBUG(DEBUG_ALERT,(__location__
+                                          " persistent db '%s' unhealthy: %s\n",
+                                          db->db_path,
+                                          db->unhealthy_reason));
+               } else {
+                       healthy++;
+                       DEBUG(DEBUG_INFO,(__location__
+                                  " persistent db '%s' healthy\n",
+                                  db->db_path));
+               }
+       }
+
+       DEBUG((unhealthy!=0)?DEBUG_ALERT:DEBUG_NOTICE,
+             ("ctdb_recheck_presistent_health: OK[%d] FAIL[%d]\n",
+              healthy, unhealthy));
+
+       return (unhealthy != 0) ? -1 : 0;
+}
+
+
+/*
+  control handler: mark a database as healthy.
+  indata carries the 32-bit database id. If the database was flagged
+  unhealthy before and the daemon is still in CTDB_RUNSTATE_STARTUP,
+  recovery mode is switched to active.
+  Returns 0 on success, -1 on failure.
+ */
+int32_t ctdb_control_db_set_healthy(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+       uint32_t db_id = *(uint32_t *)indata.dptr;
+       struct ctdb_db_context *ctdb_db;
+       bool was_unhealthy;
+
+       ctdb_db = find_ctdb_db(ctdb, db_id);
+       if (!ctdb_db) {
+               DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%x\n", db_id));
+               return -1;
+       }
+
+       /* sample the state before the update below clears it */
+       was_unhealthy = (ctdb_db->unhealthy_reason != NULL);
+
+       if (ctdb_update_persistent_health(ctdb, ctdb_db, NULL, 1) != 0) {
+               DEBUG(DEBUG_ERR,(__location__
+                                " ctdb_update_persistent_health(%s) failed\n",
+                                ctdb_db->db_name));
+               return -1;
+       }
+
+       if (!was_unhealthy || ctdb->runstate != CTDB_RUNSTATE_STARTUP) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR, (__location__ " db %s become healthy  - force recovery for startup\n",
+                         ctdb_db->db_name));
+       ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+
+       return 0;
+}
+
+/*
+  control handler: report the health of a database.
+  indata carries the 32-bit database id. On success outdata is either
+  tdb_null (healthy) or points at ctdb_db->unhealthy_reason, with the
+  terminating NUL included in dsize.
+  Returns 0 on success, -1 on failure.
+ */
+int32_t ctdb_control_db_get_health(struct ctdb_context *ctdb,
+                                  TDB_DATA indata,
+                                  TDB_DATA *outdata)
+{
+       uint32_t db_id = *(uint32_t *)indata.dptr;
+       struct ctdb_db_context *ctdb_db = find_ctdb_db(ctdb, db_id);
+       const char *reason;
+
+       if (!ctdb_db) {
+               DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%x\n", db_id));
+               return -1;
+       }
+
+       /* make sure we report what is currently stored in the health tdb */
+       if (ctdb_load_persistent_health(ctdb, ctdb_db) != 0) {
+               DEBUG(DEBUG_ERR,(__location__
+                                " ctdb_load_persistent_health(%s) failed\n",
+                                ctdb_db->db_name));
+               return -1;
+       }
+
+       *outdata = tdb_null;
+
+       reason = ctdb_db->unhealthy_reason;
+       if (reason == NULL) {
+               return 0;
+       }
+
+       outdata->dptr = (uint8_t *)ctdb_db->unhealthy_reason;
+       outdata->dsize = strlen(reason)+1;
+
+       return 0;
+}
+
+
+/*
+  enable the readonly property on a non-persistent database by opening
+  its "<db_path>.RO" tracking tdb.
+  Returns 0 on success (or if the property is already set), -1 on
+  failure or when called for a persistent database.
+ */
+int ctdb_set_db_readonly(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db)
+{
+       char *ropath;
+
+       if (ctdb_db->readonly) {
+               return 0;
+       }
+
+       if (ctdb_db->persistent) {
+               DEBUG(DEBUG_ERR,("Persistent databases do not support readonly property\n"));
+               return -1;
+       }
+
+       ropath = talloc_asprintf(ctdb_db, "%s.RO", ctdb_db->db_path);
+       if (ropath == NULL) {
+               DEBUG(DEBUG_CRIT,("Failed to asprintf the tracking database\n"));
+               return -1;
+       }
+       /*
+        * O_CREAT needs a usable mode: a mode of 0 would create the
+        * tracking file without any permission bits. Use owner
+        * read/write only.
+        */
+       ctdb_db->rottdb = tdb_open(ropath, 
+                             ctdb->tunable.database_hash_size, 
+                             TDB_NOLOCK|TDB_CLEAR_IF_FIRST|TDB_NOSYNC,
+                             O_CREAT|O_RDWR, 0600);
+       if (ctdb_db->rottdb == NULL) {
+               DEBUG(DEBUG_CRIT,("Failed to open/create the tracking database '%s'\n", ropath));
+               talloc_free(ropath);
+               return -1;
+       }
+
+       DEBUG(DEBUG_NOTICE,("OPENED tracking database : '%s'\n", ropath));
+
+       ctdb_db->readonly = true;
+
+       DEBUG(DEBUG_NOTICE, ("Readonly property set on DB %s\n", ctdb_db->db_name));
+
+       talloc_free(ropath);
+       return 0;
+}
+
+/*
+  attach to a database, handling both persistent and non-persistent databases
+  return 0 on success, -1 on failure
+
+  db_name:          database name; hashed (including the trailing NUL)
+                    to form the db_id
+  persistent:       selects the persistent db directory, TDB_DEFAULT
+                    flags and the health/tdb_check handling
+  unhealthy_reason: if non-NULL (persistent only), stored as the health
+                    record before attaching
+  jenkinshash:      adds TDB_INCOMPATIBLE_HASH to the open flags
+
+  For persistent databases a failed open or a failed tdb_check() is
+  retried once after moving the file aside with
+  ctdb_backup_corrupted_tdb(), provided remaining_tries allows it.
+ */
+static int ctdb_local_attach(struct ctdb_context *ctdb, const char *db_name,
+                            bool persistent, const char *unhealthy_reason,
+                            bool jenkinshash)
+{
+       struct ctdb_db_context *ctdb_db, *tmp_db;
+       int ret;
+       struct TDB_DATA key;
+       unsigned tdb_flags;
+       int mode = 0600;
+       int remaining_tries = 0;
+
+       ctdb_db = talloc_zero(ctdb, struct ctdb_db_context);
+       CTDB_NO_MEMORY(ctdb, ctdb_db);
+
+       ctdb_db->priority = 1;
+       ctdb_db->ctdb = ctdb;
+       ctdb_db->db_name = talloc_strdup(ctdb_db, db_name);
+       CTDB_NO_MEMORY(ctdb, ctdb_db->db_name);
+
+       key.dsize = strlen(db_name)+1;
+       key.dptr  = discard_const(db_name);
+       ctdb_db->db_id = ctdb_hash(&key);
+       ctdb_db->persistent = persistent;
+
+       if (!ctdb_db->persistent) {
+               ctdb_db->delete_queue = trbt_create(ctdb_db, 0);
+               if (ctdb_db->delete_queue == NULL) {
+                       CTDB_NO_MEMORY(ctdb, ctdb_db->delete_queue);
+               }
+
+               ctdb_db->ctdb_ltdb_store_fn = ctdb_ltdb_store_server;
+       }
+
+       /* check for hash collisions */
+       for (tmp_db=ctdb->db_list;tmp_db;tmp_db=tmp_db->next) {
+               if (tmp_db->db_id == ctdb_db->db_id) {
+                       DEBUG(DEBUG_CRIT,("db_id 0x%x hash collision. name1='%s' name2='%s'\n",
+                                tmp_db->db_id, db_name, tmp_db->db_name));
+                       talloc_free(ctdb_db);
+                       return -1;
+               }
+       }
+
+       if (persistent) {
+               if (unhealthy_reason) {
+                       ret = ctdb_update_persistent_health(ctdb, ctdb_db,
+                                                           unhealthy_reason, 0);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_ALERT,(__location__ " ctdb_update_persistent_health('%s','%s') failed: %d\n",
+                                                  ctdb_db->db_name, unhealthy_reason, ret));
+                               talloc_free(ctdb_db);
+                               return -1;
+                       }
+               }
+
+               /* one backup-and-retry is allowed, but never once running */
+               if (ctdb->max_persistent_check_errors > 0) {
+                       remaining_tries = 1;
+               }
+               if (ctdb->runstate == CTDB_RUNSTATE_RUNNING) {
+                       remaining_tries = 0;
+               }
+
+               ret = ctdb_load_persistent_health(ctdb, ctdb_db);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ALERT,(__location__ " ctdb_load_persistent_health('%s') failed: %d\n",
+                                  ctdb_db->db_name, ret));
+                       talloc_free(ctdb_db);
+                       return -1;
+               }
+       }
+
+       if (ctdb_db->unhealthy_reason && remaining_tries == 0) {
+               DEBUG(DEBUG_ALERT,(__location__ "ERROR: tdb %s is marked as unhealthy: %s\n",
+                                  ctdb_db->db_name, ctdb_db->unhealthy_reason));
+               talloc_free(ctdb_db);
+               return -1;
+       }
+
+       if (ctdb_db->unhealthy_reason) {
+               /* this is just a warning, but we want that in the log file! */
+               DEBUG(DEBUG_ALERT,(__location__ "Warning: tdb %s is marked as unhealthy: %s\n",
+                                  ctdb_db->db_name, ctdb_db->unhealthy_reason));
+       }
+
+       /* open the database */
+       ctdb_db->db_path = talloc_asprintf(ctdb_db, "%s/%s.%u", 
+                                          persistent?ctdb->db_directory_persistent:ctdb->db_directory, 
+                                          db_name, ctdb->pnn);
+       if (ctdb_db->db_path == NULL) {
+               /* db_path is used for every open/stat/rename below */
+               DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+               talloc_free(ctdb_db);
+               return -1;
+       }
+
+       tdb_flags = persistent? TDB_DEFAULT : TDB_CLEAR_IF_FIRST | TDB_NOSYNC;
+       if (ctdb->valgrinding) {
+               tdb_flags |= TDB_NOMMAP;
+       }
+       tdb_flags |= TDB_DISALLOW_NESTING;
+       if (jenkinshash) {
+               tdb_flags |= TDB_INCOMPATIBLE_HASH;
+       }
+
+again:
+       ctdb_db->ltdb = tdb_wrap_open(ctdb, ctdb_db->db_path, 
+                                     ctdb->tunable.database_hash_size, 
+                                     tdb_flags, 
+                                     O_CREAT|O_RDWR, mode);
+       if (ctdb_db->ltdb == NULL) {
+               struct stat st;
+               int saved_errno = errno;
+
+               if (!persistent) {
+                       DEBUG(DEBUG_CRIT,("Failed to open tdb '%s': %d - %s\n",
+                                         ctdb_db->db_path,
+                                         saved_errno,
+                                         strerror(saved_errno)));
+                       talloc_free(ctdb_db);
+                       return -1;
+               }
+
+               if (remaining_tries == 0) {
+                       DEBUG(DEBUG_CRIT,(__location__
+                                         "Failed to open persistent tdb '%s': %d - %s\n",
+                                         ctdb_db->db_path,
+                                         saved_errno,
+                                         strerror(saved_errno)));
+                       talloc_free(ctdb_db);
+                       return -1;
+               }
+
+               /* remember the file mode so the replacement gets the same one */
+               ret = stat(ctdb_db->db_path, &st);
+               if (ret != 0) {
+                       DEBUG(DEBUG_CRIT,(__location__
+                                         "Failed to open persistent tdb '%s': %d - %s\n",
+                                         ctdb_db->db_path,
+                                         saved_errno,
+                                         strerror(saved_errno)));
+                       talloc_free(ctdb_db);
+                       return -1;
+               }
+
+               ret = ctdb_backup_corrupted_tdb(ctdb, ctdb_db);
+               if (ret != 0) {
+                       DEBUG(DEBUG_CRIT,(__location__
+                                         "Failed to open persistent tdb '%s': %d - %s\n",
+                                         ctdb_db->db_path,
+                                         saved_errno,
+                                         strerror(saved_errno)));
+                       talloc_free(ctdb_db);
+                       return -1;
+               }
+
+               remaining_tries--;
+               mode = st.st_mode;
+               goto again;
+       }
+
+       if (!persistent) {
+               ctdb_check_db_empty(ctdb_db);
+       } else {
+               ret = tdb_check(ctdb_db->ltdb->tdb, NULL, NULL);
+               if (ret != 0) {
+                       int fd;
+                       struct stat st;
+
+                       DEBUG(DEBUG_CRIT,("tdb_check(%s) failed: %d - %s\n",
+                                         ctdb_db->db_path, ret,
+                                         tdb_errorstr(ctdb_db->ltdb->tdb)));
+                       if (remaining_tries == 0) {
+                               talloc_free(ctdb_db);
+                               return -1;
+                       }
+
+                       fd = tdb_fd(ctdb_db->ltdb->tdb);
+                       ret = fstat(fd, &st);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_CRIT,(__location__
+                                                 "Failed to fstat() persistent tdb '%s': %d - %s\n",
+                                                 ctdb_db->db_path,
+                                                 errno,
+                                                 strerror(errno)));
+                               talloc_free(ctdb_db);
+                               return -1;
+                       }
+
+                       /* close the TDB */
+                       talloc_free(ctdb_db->ltdb);
+                       ctdb_db->ltdb = NULL;
+
+                       ret = ctdb_backup_corrupted_tdb(ctdb, ctdb_db);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_CRIT,("Failed to backup corrupted tdb '%s'\n",
+                                                 ctdb_db->db_path));
+                               talloc_free(ctdb_db);
+                               return -1;
+                       }
+
+                       remaining_tries--;
+                       mode = st.st_mode;
+                       goto again;
+               }
+       }
+
+       /* set up a rb tree we can use to track which records we have a 
+          fetch-lock in-flight for so we can defer any additional calls
+          for the same record.
+        */
+       ctdb_db->deferred_fetch = trbt_create(ctdb_db, 0);
+       if (ctdb_db->deferred_fetch == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to create deferred fetch rb tree for ctdb database\n"));
+               talloc_free(ctdb_db);
+               return -1;
+       }
+
+       /*
+        * From here on ctdb_db is linked into ctdb->db_list, so every
+        * failure path below must unlink it again before freeing it,
+        * otherwise the list would keep a dangling pointer.
+        */
+       DLIST_ADD(ctdb->db_list, ctdb_db);
+
+       /* setting this can help some high churn databases */
+       tdb_set_max_dead(ctdb_db->ltdb->tdb, ctdb->tunable.database_max_dead);
+
+       /* 
+          all databases support the "null" function. we need this in
+          order to do forced migration of records
+       */
+       ret = ctdb_daemon_set_call(ctdb, ctdb_db->db_id, ctdb_null_func, CTDB_NULL_FUNC);
+       if (ret != 0) {
+               DEBUG(DEBUG_CRIT,("Failed to setup null function for '%s'\n", ctdb_db->db_name));
+               DLIST_REMOVE(ctdb->db_list, ctdb_db);
+               talloc_free(ctdb_db);
+               return -1;
+       }
+
+       /* 
+          all databases support the "fetch" function. we need this
+          for efficient Samba3 ctdb fetch
+       */
+       ret = ctdb_daemon_set_call(ctdb, ctdb_db->db_id, ctdb_fetch_func, CTDB_FETCH_FUNC);
+       if (ret != 0) {
+               DEBUG(DEBUG_CRIT,("Failed to setup fetch function for '%s'\n", ctdb_db->db_name));
+               DLIST_REMOVE(ctdb->db_list, ctdb_db);
+               talloc_free(ctdb_db);
+               return -1;
+       }
+
+       /* 
+          all databases support the "fetch_with_header" function. we need this
+          for efficient readonly record fetches
+       */
+       ret = ctdb_daemon_set_call(ctdb, ctdb_db->db_id, ctdb_fetch_with_header_func, CTDB_FETCH_WITH_HEADER_FUNC);
+       if (ret != 0) {
+               DEBUG(DEBUG_CRIT,("Failed to setup fetch_with_header function for '%s'\n", ctdb_db->db_name));
+               DLIST_REMOVE(ctdb->db_list, ctdb_db);
+               talloc_free(ctdb_db);
+               return -1;
+       }
+
+       ret = ctdb_vacuum_init(ctdb_db);
+       if (ret != 0) {
+               DEBUG(DEBUG_CRIT,("Failed to setup vacuuming for "
+                                 "database '%s'\n", ctdb_db->db_name));
+               DLIST_REMOVE(ctdb->db_list, ctdb_db);
+               talloc_free(ctdb_db);
+               return -1;
+       }
+
+
+       DEBUG(DEBUG_NOTICE,("Attached to database '%s' with flags 0x%x\n",
+                           ctdb_db->db_path, tdb_flags));
+
+       /* success */
+       return 0;
+}
+
+
+/*
+  state for one DB attach control whose processing was deferred;
+  entries are kept on the ctdb->deferred_attach list
+ */
+struct ctdb_deferred_attach_context {
+       /* list links used by DLIST_ADD/DLIST_REMOVE on ctdb->deferred_attach */
+       struct ctdb_deferred_attach_context *next, *prev;
+       /* daemon context whose deferred_attach list holds this entry */
+       struct ctdb_context *ctdb;
+       /* the deferred control packet (talloc-stolen onto this context) */
+       struct ctdb_req_control *c;
+};
+
+
+/*
+  talloc destructor: unlink a deferred attach entry from the
+  ctdb->deferred_attach list when it is freed
+ */
+static int ctdb_deferred_attach_destructor(struct ctdb_deferred_attach_context *da_ctx)
+{
+       struct ctdb_context *owner = da_ctx->ctdb;
+
+       DLIST_REMOVE(owner->deferred_attach, da_ctx);
+       return 0;
+}
+
+/*
+  timed event handler: the deferred attach was not processed in time,
+  so reply to the control with status -1 and drop the deferred state
+ */
+static void ctdb_deferred_attach_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data)
+{
+       struct ctdb_deferred_attach_context *deferred;
+
+       deferred = talloc_get_type(private_data, struct ctdb_deferred_attach_context);
+
+       ctdb_request_control_reply(deferred->ctdb, deferred->c, NULL, -1, NULL);
+       talloc_free(deferred);
+}
+
+/*
+  timed event handler: feed a deferred attach control back into normal
+  packet processing and release the deferred state
+ */
+static void ctdb_deferred_attach_callback(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data)
+{
+       struct ctdb_deferred_attach_context *deferred;
+
+       deferred = talloc_get_type(private_data, struct ctdb_deferred_attach_context);
+
+       /* This talloc-steals the packet ->c */
+       ctdb_input_pkt(deferred->ctdb, (struct ctdb_req_header *)deferred->c);
+       talloc_free(deferred);
+}
+
+/*
+  drain the ctdb->deferred_attach list: every entry is unlinked and a
+  timed event (timeval_current_ofs(1,0)) is scheduled that hands the
+  deferred control to ctdb_deferred_attach_callback().
+  Always returns 0.
+ */
+int ctdb_process_deferred_attach(struct ctdb_context *ctdb)
+{
+       for (;;) {
+               struct ctdb_deferred_attach_context *pending = ctdb->deferred_attach;
+
+               if (pending == NULL) {
+                       break;
+               }
+
+               /* the processing itself happens later, from the main event loop */
+               DLIST_REMOVE(ctdb->deferred_attach, pending);
+               event_add_timed(ctdb->ev, pending, timeval_current_ofs(1,0), ctdb_deferred_attach_callback, pending);
+       }
+
+       return 0;
+}
+
+/*
+  a client has asked to attach a new database
+
+  indata:      the database name (used as a C string)
+  outdata:     on success points at the db_id of the attached database
+  tdb_flags:   optional flags from the client; only TDB_NOSYNC and
+               TDB_INCOMPATIBLE_HASH are honoured
+  persistent:  whether a persistent database is requested
+  client_id:   0, or a reqid used to look up the local ctdb_client
+  c:           the control packet; stolen if the attach is deferred
+  async_reply: set to true when the attach was deferred and no reply
+               must be sent by the caller yet
+
+  returns 0 on success (or deferral), -1 on failure
+ */
+int32_t ctdb_control_db_attach(struct ctdb_context *ctdb, TDB_DATA indata,
+                              TDB_DATA *outdata, uint64_t tdb_flags, 
+                              bool persistent, uint32_t client_id,
+                              struct ctdb_req_control *c,
+                              bool *async_reply)
+{
+       /* NOTE(review): db_name is used as a NUL-terminated string below;
+          this function does not check indata.dsize or termination itself -
+          confirm the caller validates it */
+       const char *db_name = (const char *)indata.dptr;
+       struct ctdb_db_context *db;
+       struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
+       struct ctdb_client *client = NULL;
+
+       if (ctdb->tunable.allow_client_db_attach == 0) {
+               DEBUG(DEBUG_ERR, ("DB Attach to database %s denied by tunable "
+                                 "AllowClientDBAccess == 0\n", db_name));
+               return -1;
+       }
+
+       /* dont allow any local clients to attach while we are in recovery mode
+        * except for the recovery daemon.
+        * allow all attach from the network since these are always from remote
+        * recovery daemons.
+        */
+       if (client_id != 0) {
+               client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
+       }
+       if (client != NULL) {
+               /* If the node is inactive it is not part of the cluster
+                  and we should not allow clients to attach to any
+                  databases
+               */
+               if (node->flags & NODE_FLAGS_INACTIVE) {
+                       DEBUG(DEBUG_ERR,("DB Attach to database %s refused since node is inactive (flags=0x%x)\n", db_name, node->flags));
+                       return -1;
+               }
+
+               /* defer the attach: in recovery, not the recovery daemon,
+                  and the daemon has not reached the RUNNING runstate yet */
+               if (ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE &&
+                   client->pid != ctdb->recoverd_pid &&
+                   ctdb->runstate < CTDB_RUNSTATE_RUNNING) {
+                       struct ctdb_deferred_attach_context *da_ctx = talloc(client, struct ctdb_deferred_attach_context);
+
+                       if (da_ctx == NULL) {
+                               DEBUG(DEBUG_ERR,("DB Attach to database %s deferral for client with pid:%d failed due to OOM.\n", db_name, client->pid));
+                               return -1;
+                       }
+
+                       /* keep the control packet alive with the deferred state;
+                          the destructor unlinks the entry from the list again */
+                       da_ctx->ctdb = ctdb;
+                       da_ctx->c = talloc_steal(da_ctx, c);
+                       talloc_set_destructor(da_ctx, ctdb_deferred_attach_destructor);
+                       DLIST_ADD(ctdb->deferred_attach, da_ctx);
+
+                       /* fail the control via ctdb_deferred_attach_timeout if it
+                          is still deferred after deferred_attach_timeout seconds */
+                       event_add_timed(ctdb->ev, da_ctx, timeval_current_ofs(ctdb->tunable.deferred_attach_timeout, 0), ctdb_deferred_attach_timeout, da_ctx);
+
+                       DEBUG(DEBUG_ERR,("DB Attach to database %s deferred for client with pid:%d since node is in recovery mode.\n", db_name, client->pid));
+                       *async_reply = true;
+                       return 0;
+               }
+       }
+
+       /* the client can optionally pass additional tdb flags, but we
+          only allow a subset of those on the database in ctdb. Note
+          that tdb_flags is passed in via the (otherwise unused)
+          srvid to the attach control */
+       tdb_flags &= (TDB_NOSYNC|TDB_INCOMPATIBLE_HASH);
+
+       /* see if we already have this name */
+       db = ctdb_db_handle(ctdb, db_name);
+       if (db) {
+               /* an existing database cannot change its persistence */
+               if (db->persistent != persistent) {
+                       DEBUG(DEBUG_ERR, ("ERROR: DB Attach %spersistent to %spersistent "
+                                         "database %s\n", persistent ? "" : "non-",
+                                         db-> persistent ? "" : "non-", db_name));
+                       return -1;
+               }
+               outdata->dptr  = (uint8_t *)&db->db_id;
+               outdata->dsize = sizeof(db->db_id);
+               tdb_add_flags(db->ltdb->tdb, tdb_flags);
+               return 0;
+       }
+
+       /* not attached yet: create/open it locally */
+       if (ctdb_local_attach(ctdb, db_name, persistent, NULL, (tdb_flags&TDB_INCOMPATIBLE_HASH)?true:false) != 0) {
+               return -1;
+       }
+
+       db = ctdb_db_handle(ctdb, db_name);
+       if (!db) {
+               DEBUG(DEBUG_ERR,("Failed to find db handle for name '%s'\n", db_name));
+               return -1;
+       }
+
+       /* remember the flags the client has specified */
+       tdb_add_flags(db->ltdb->tdb, tdb_flags);
+
+       outdata->dptr  = (uint8_t *)&db->db_id;
+       outdata->dsize = sizeof(db->db_id);
+
+       /* Try to ensure it's locked in mem */
+       ctdb_lockdown_memory(ctdb);
+
+       /* tell all the other nodes about this database */
+       ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_ALL, tdb_flags,
+                                persistent?CTDB_CONTROL_DB_ATTACH_PERSISTENT:
+                                               CTDB_CONTROL_DB_ATTACH,
+                                0, CTDB_CTRL_FLAG_NOREPLY,
+                                indata, NULL, NULL);
+
+       /* success */
+       return 0;
+}
+
+
+/*
+  attach to all existing persistent databases of this node (files named "<name>.tdb.<pnn>"); returns 0 on success or if the directory cannot be opened, -1 on failure
+ */
+static int ctdb_attach_persistent(struct ctdb_context *ctdb,
+                                 const char *unhealthy_reason)
+{
+       DIR *d;
+       struct dirent *de;
+
+       /* open the persistent db directory and scan it for files */
+       d = opendir(ctdb->db_directory_persistent);
+       if (d == NULL) {
+               return 0;
+       }
+
+       while ((de=readdir(d))) {
+               char *p, *s, *q;
+               size_t len = strlen(de->d_name);
+               uint32_t node;
+               int invalid_name = 0;
+
+               s = talloc_strdup(ctdb, de->d_name);
+               if (s == NULL) {
+                       closedir(d);
+                       CTDB_NO_MEMORY(ctdb, s);
+               }
+
+               /* only accept names that contain ".tdb." */
+               p = strstr(s, ".tdb.");
+               if (len < 7 || p == NULL) {
+                       talloc_free(s);
+                       continue;
+               }
+
+               /* only accept names ending with .tdb. and any number of digits */
+               q = p+5;
+               while (*q != 0 && invalid_name == 0) {
+                       if (!isdigit((unsigned char)*q++)) { /* a negative char value is undefined for isdigit() */
+                               invalid_name = 1;
+                       }
+               }
+               if (invalid_name == 1 || sscanf(p+5, "%u", &node) != 1 || node != ctdb->pnn) {
+                       DEBUG(DEBUG_ERR,("Ignoring persistent database '%s'\n", de->d_name));
+                       talloc_free(s);
+                       continue;
+               }
+               p[4] = 0; /* cut off the ".<pnn>" suffix so that s is "<name>.tdb" */
+
+               if (ctdb_local_attach(ctdb, s, true, unhealthy_reason, 0) != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to attach to persistent database '%s'\n", de->d_name));
+                       closedir(d);
+                       talloc_free(s);
+                       return -1;
+               }
+
+               DEBUG(DEBUG_INFO,("Attached to persistent database %s\n", s));
+
+               talloc_free(s);
+       }
+       closedir(d);
+       return 0;
+}
+
+int ctdb_attach_databases(struct ctdb_context *ctdb)
+{
+       int ret;
+       char *persistent_health_path = NULL;
+       char *unhealthy_reason = NULL;
+       bool first_try = true; /* exactly one clear-and-retry is allowed */
+       /* Open and check the persistent-health tdb (clearing it once if that fails), then attach all persistent databases. Returns 0 on success, non-zero on failure. */
+       persistent_health_path = talloc_asprintf(ctdb, "%s/%s.%u",
+                                                ctdb->db_directory_state,
+                                                PERSISTENT_HEALTH_TDB,
+                                                ctdb->pnn);
+       if (persistent_health_path == NULL) {
+               DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+               return -1;
+       }
+
+again:
+       /* (re)open the health tdb; this label is reached a second time after a clear */
+       ctdb->db_persistent_health = tdb_wrap_open(ctdb, persistent_health_path,
+                                                  0, TDB_DISALLOW_NESTING,
+                                                  O_CREAT | O_RDWR, 0600);
+       if (ctdb->db_persistent_health == NULL) {
+               struct tdb_wrap *tdb;
+
+               if (!first_try) {
+                       DEBUG(DEBUG_CRIT,("Failed to open tdb '%s': %d - %s\n",
+                                         persistent_health_path,
+                                         errno,
+                                         strerror(errno)));
+                       talloc_free(persistent_health_path);
+                       talloc_free(unhealthy_reason);
+                       return -1;
+               }
+               first_try = false;
+
+               unhealthy_reason = talloc_asprintf(ctdb, "WARNING - '%s' %s - %s",
+                                                  persistent_health_path,
+                                                  "was cleared after a failure",
+                                                  "manual verification needed");
+               if (unhealthy_reason == NULL) {
+                       DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+                       talloc_free(persistent_health_path);
+                       return -1;
+               }
+
+               DEBUG(DEBUG_CRIT,("Failed to open tdb '%s' - retrying after CLEAR_IF_FIRST\n",
+                                 persistent_health_path));
+               tdb = tdb_wrap_open(ctdb, persistent_health_path,
+                                   0, TDB_CLEAR_IF_FIRST | TDB_DISALLOW_NESTING,
+                                   O_CREAT | O_RDWR, 0600);
+               if (tdb == NULL) { /* was "if (tdb)": the failure branch ran on success */
+                       DEBUG(DEBUG_CRIT,("Failed to open tdb '%s' - with CLEAR_IF_FIRST: %d - %s\n",
+                                         persistent_health_path,
+                                         errno,
+                                         strerror(errno)));
+                       talloc_free(persistent_health_path);
+                       talloc_free(unhealthy_reason);
+                       return -1;
+               }
+               /* the clearing open worked: drop that handle and reopen normally */
+               talloc_free(tdb);
+               goto again;
+       }
+       ret = tdb_check(ctdb->db_persistent_health->tdb, NULL, NULL);
+       if (ret != 0) {
+               struct tdb_wrap *tdb;
+
+               talloc_free(ctdb->db_persistent_health);
+               ctdb->db_persistent_health = NULL;
+
+               if (!first_try) {
+                       DEBUG(DEBUG_CRIT,("tdb_check('%s') failed\n",
+                                         persistent_health_path));
+                       talloc_free(persistent_health_path);
+                       talloc_free(unhealthy_reason);
+                       return -1;
+               }
+               first_try = false;
+
+               unhealthy_reason = talloc_asprintf(ctdb, "WARNING - '%s' %s - %s",
+                                                  persistent_health_path,
+                                                  "was cleared after a failure",
+                                                  "manual verification needed");
+               if (unhealthy_reason == NULL) {
+                       DEBUG(DEBUG_CRIT,(__location__ " talloc_asprintf() failed\n"));
+                       talloc_free(persistent_health_path);
+                       return -1;
+               }
+
+               DEBUG(DEBUG_CRIT,("tdb_check('%s') failed - retrying after CLEAR_IF_FIRST\n",
+                                 persistent_health_path));
+               tdb = tdb_wrap_open(ctdb, persistent_health_path,
+                                   0, TDB_CLEAR_IF_FIRST | TDB_DISALLOW_NESTING,
+                                   O_CREAT | O_RDWR, 0600);
+               if (tdb == NULL) { /* was "if (tdb)": the failure branch ran on success */
+                       DEBUG(DEBUG_CRIT,("Failed to open tdb '%s' - with CLEAR_IF_FIRST: %d - %s\n",
+                                         persistent_health_path,
+                                         errno,
+                                         strerror(errno)));
+                       talloc_free(persistent_health_path);
+                       talloc_free(unhealthy_reason);
+                       return -1;
+               }
+               /* the clearing open worked: drop that handle and reopen normally */
+               talloc_free(tdb);
+               goto again;
+       }
+       talloc_free(persistent_health_path);
+
+       ret = ctdb_attach_persistent(ctdb, unhealthy_reason);
+       talloc_free(unhealthy_reason);
+       if (ret != 0) {
+               return ret;
+       }
+
+       return 0;
+}
+
+/*
+  called when a broadcast seqnum update comes in
+ */
+int32_t ctdb_ltdb_update_seqnum(struct ctdb_context *ctdb, uint32_t db_id, uint32_t srcnode)
+{
+       struct ctdb_db_context *ctdb_db;
+       if (srcnode == ctdb->pnn) {
+               /* don't update ourselves! */
+               return 0;
+       }
+
+       ctdb_db = find_ctdb_db(ctdb, db_id);
+       if (!ctdb_db) {
+               DEBUG(DEBUG_ERR,("Unknown db_id 0x%x in ctdb_ltdb_update_seqnum\n", db_id));
+               return -1;
+       }
+
+       if (ctdb_db->unhealthy_reason) {
+               DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_ltdb_update_seqnum: %s\n",
+                                ctdb_db->db_name, ctdb_db->unhealthy_reason));
+               return -1;
+       }
+
+       tdb_increment_seqnum_nonblock(ctdb_db->ltdb->tdb);
+       ctdb_db->seqnum = tdb_get_seqnum(ctdb_db->ltdb->tdb);
+       return 0;
+}
+
+/*
+  timer to check for seqnum changes in a ltdb and propagate them; re-arms itself on every run
+ */
+static void ctdb_ltdb_seqnum_check(struct event_context *ev, struct timed_event *te, 
+                                  struct timeval t, void *p)
+{
+       struct ctdb_db_context *ctdb_db = talloc_get_type(p, struct ctdb_db_context);
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+       uint32_t new_seqnum = tdb_get_seqnum(ctdb_db->ltdb->tdb);
+       if (new_seqnum != ctdb_db->seqnum) {
+               /* something has changed - propagate it */
+               TDB_DATA data;
+               data.dptr = (uint8_t *)&ctdb_db->db_id; /* payload is just the db_id */
+               data.dsize = sizeof(uint32_t);
+               ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_VNNMAP, 0,
+                                        CTDB_CONTROL_UPDATE_SEQNUM, 0, CTDB_CTRL_FLAG_NOREPLY,
+                                        data, NULL, NULL);             
+       }
+       ctdb_db->seqnum = new_seqnum; /* remember what we last saw */
+
+       /* setup a new timer */
+       ctdb_db->seqnum_update =
+               event_add_timed(ctdb->ev, ctdb_db, 
+                               timeval_current_ofs(ctdb->tunable.seqnum_interval/1000, (ctdb->tunable.seqnum_interval%1000)*1000), /* interval split into whole and fractional parts; presumably ms -> (sec, usec) */
+                               ctdb_ltdb_seqnum_check, ctdb_db);
+}
+
+/*
+  enable seqnum handling on this db
+ */
+int32_t ctdb_ltdb_enable_seqnum(struct ctdb_context *ctdb, uint32_t db_id)
+{
+       struct ctdb_db_context *ctdb_db;
+       ctdb_db = find_ctdb_db(ctdb, db_id);
+       if (!ctdb_db) {
+               DEBUG(DEBUG_ERR,("Unknown db_id 0x%x in ctdb_ltdb_enable_seqnum\n", db_id));
+               return -1;
+       }
+
+       if (ctdb_db->seqnum_update == NULL) {
+               ctdb_db->seqnum_update =
+                       event_add_timed(ctdb->ev, ctdb_db, 
+                                       timeval_current_ofs(ctdb->tunable.seqnum_interval/1000, (ctdb->tunable.seqnum_interval%1000)*1000),
+                                       ctdb_ltdb_seqnum_check, ctdb_db);
+       }
+
+       tdb_enable_seqnum(ctdb_db->ltdb->tdb);
+       ctdb_db->seqnum = tdb_get_seqnum(ctdb_db->ltdb->tdb);
+       return 0;
+}
+
+int32_t ctdb_control_set_db_priority(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+       struct ctdb_db_priority *db_prio = (struct ctdb_db_priority *)indata.dptr;
+       struct ctdb_db_context *ctdb_db;
+
+       ctdb_db = find_ctdb_db(ctdb, db_prio->db_id);
+       if (!ctdb_db) {
+               DEBUG(DEBUG_ERR,("Unknown db_id 0x%x in ctdb_set_db_priority\n", db_prio->db_id));
+               return 0;
+       }
+
+       if ((db_prio->priority<1) || (db_prio->priority>NUM_DB_PRIORITIES)) {
+               DEBUG(DEBUG_ERR,("Trying to set invalid priority : %u\n", db_prio->priority));
+               return 0;
+       }
+
+       ctdb_db->priority = db_prio->priority;
+       DEBUG(DEBUG_INFO,("Setting DB priority to %u for db 0x%08x\n", db_prio->priority, db_prio->db_id));
+
+       return 0;
+}
+
+
+int ctdb_set_db_sticky(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db)
+{
+       if (ctdb_db->sticky) { /* already sticky - nothing to do */
+               return 0;
+       }
+       /* the sticky property is refused for persistent databases */
+       if (ctdb_db->persistent) {
+               DEBUG(DEBUG_ERR,("Trying to set persistent database with sticky property\n"));
+               return -1;
+       }
+
+       ctdb_db->sticky_records = trbt_create(ctdb_db, 0);
+       /* NOTE(review): the trbt_create() result is not checked before the db is marked sticky - confirm it cannot be NULL */
+       ctdb_db->sticky = true;
+
+       DEBUG(DEBUG_NOTICE,("set db sticky %s\n", ctdb_db->db_name));
+
+       return 0;
+}
+
+int32_t ctdb_control_get_db_statistics(struct ctdb_context *ctdb,
+                               uint32_t db_id,
+                               TDB_DATA *outdata)
+{
+       struct ctdb_db_context *ctdb_db;
+       struct ctdb_db_statistics *stats;
+       int i;
+       int len;
+       char *ptr;
+       /* Marshal the statistics of database db_id into outdata; returns 0 on success, -1 on unknown db or allocation failure */
+       ctdb_db = find_ctdb_db(ctdb, db_id);
+       if (!ctdb_db) {
+               DEBUG(DEBUG_ERR,("Unknown db_id 0x%x in get_db_statistics\n", db_id));
+               return -1;
+       }
+       /* buffer size: everything before hot_keys_wire plus the bytes of every hot key */
+       len = offsetof(struct ctdb_db_statistics, hot_keys_wire);
+       for (i = 0; i < MAX_HOT_KEYS; i++) {
+               len += ctdb_db->statistics.hot_keys[i].key.dsize;
+       }
+
+       stats = talloc_size(outdata, len);
+       if (stats == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate db statistics structure\n"));
+               return -1;
+       }
+       /* NOTE(review): this assignment copies sizeof(*stats) bytes into a len-byte buffer - confirm sizeof(*stats) can never exceed len (e.g. when all hot keys are empty) */
+       *stats = ctdb_db->statistics;
+
+       stats->num_hot_keys = MAX_HOT_KEYS;
+       /* append the raw key bytes back-to-back starting at hot_keys_wire */
+       ptr = &stats->hot_keys_wire[0];
+       for (i = 0; i < MAX_HOT_KEYS; i++) {
+               memcpy(ptr, ctdb_db->statistics.hot_keys[i].key.dptr,
+                      ctdb_db->statistics.hot_keys[i].key.dsize);
+               ptr += ctdb_db->statistics.hot_keys[i].key.dsize;
+       }
+
+       outdata->dptr  = (uint8_t *)stats;
+       outdata->dsize = len;
+
+       return 0;
+}
diff --git a/ctdb/server/ctdb_monitor.c b/ctdb/server/ctdb_monitor.c
new file mode 100644 (file)
index 0000000..acd68c8
--- /dev/null
@@ -0,0 +1,518 @@
+/* 
+   monitoring links to all other nodes to detect dead nodes
+
+
+   Copyright (C) Ronnie Sahlberg 2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "../include/ctdb_private.h"
+
+struct ctdb_monitor_state {
+       uint32_t monitoring_mode;
+       TALLOC_CTX *monitor_context;
+       uint32_t next_interval;
+};
+
+static void ctdb_check_health(struct event_context *ev, struct timed_event *te, 
+                             struct timeval t, void *private_data);
+
+/*
+  setup the notification script
+*/
+int ctdb_set_notification_script(struct ctdb_context *ctdb, const char *script)
+{
+       ctdb->notification_script = talloc_strdup(ctdb, script);
+       CTDB_NO_MEMORY(ctdb, ctdb->notification_script);
+       return 0;
+}
+
+static int ctdb_run_notification_script_child(struct ctdb_context *ctdb, const char *event)
+{
+       struct stat st;
+       int ret;
+       char *cmd;
+       /* Run "<notification_script> <event>" via system(); returns the command's exit status, or -1 if it could not be run */
+       if (stat(ctdb->notification_script, &st) != 0) {
+               DEBUG(DEBUG_ERR,("Could not stat notification script %s. Can not send notifications.\n", ctdb->notification_script));
+               return -1;
+       }
+       if (!(st.st_mode & S_IXUSR)) { /* only the owner-execute bit is tested */
+               DEBUG(DEBUG_ERR,("Notification script %s is not executable.\n", ctdb->notification_script));
+               return -1;
+       }
+
+       cmd = talloc_asprintf(ctdb, "%s %s\n", ctdb->notification_script, event);
+       CTDB_NO_MEMORY(ctdb, cmd);
+
+       ret = system(cmd);
+       /* if the system() call was successful, translate ret into the
+          return code from the command
+       */
+       if (ret != -1) {
+               ret = WEXITSTATUS(ret); /* NOTE(review): used without a WIFEXITED() check - confirm a signal-terminated script is acceptable here */
+       }
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Notification script \"%s\" failed with error %d\n", cmd, ret));
+       }
+
+       return ret;
+}
+
+void ctdb_run_notification_script(struct ctdb_context *ctdb, const char *event)
+{
+       pid_t child;
+
+       if (ctdb->notification_script == NULL) {
+               return;
+       }
+
+       child = ctdb_fork(ctdb);
+       if (child == (pid_t)-1) {
+               DEBUG(DEBUG_ERR,("Failed to fork() a notification child process\n"));
+               return;
+       }
+       if (child == 0) {
+               int ret;
+
+               ctdb_set_process_name("ctdb_notification");
+               debug_extra = talloc_asprintf(NULL, "notification-%s:", event);
+               ret = ctdb_run_notification_script_child(ctdb, event);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " Notification script failed\n"));
+               }
+               _exit(0);
+       }
+
+       return;
+}
+
+/*
+  called when a health monitoring event script finishes
+ */
+static void ctdb_health_callback(struct ctdb_context *ctdb, int status, void *p)
+{
+       struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
+       TDB_DATA data;
+       struct ctdb_node_flag_change c;
+       uint32_t next_interval;
+       int ret;
+       TDB_DATA rddata;
+       struct srvid_request rd;
+       const char *state_str = NULL;
+
+       c.pnn = ctdb->pnn;
+       c.old_flags = node->flags;
+
+       rd.pnn   = ctdb->pnn;
+       rd.srvid = CTDB_SRVID_TAKEOVER_RUN_RESPONSE;
+
+       rddata.dptr = (uint8_t *)&rd;
+       rddata.dsize = sizeof(rd);
+
+       if (status == -ECANCELED) {
+               DEBUG(DEBUG_ERR,("Monitoring event was cancelled\n"));
+               goto after_change_status;
+       }
+
+       if (status == -ETIME) {
+               ctdb->event_script_timeouts++;
+
+               if (ctdb->event_script_timeouts >= ctdb->tunable.script_timeout_count) {
+                       DEBUG(DEBUG_ERR, ("Maximum timeout count %u reached for eventscript. Making node unhealthy\n", ctdb->tunable.script_timeout_count));
+               } else {
+                       /* We pretend this is OK. */
+                       goto after_change_status;
+               }
+       }
+
+       if (status != 0 && !(node->flags & NODE_FLAGS_UNHEALTHY)) {
+               DEBUG(DEBUG_NOTICE,("monitor event failed - disabling node\n"));
+               node->flags |= NODE_FLAGS_UNHEALTHY;
+               ctdb->monitor->next_interval = 5;
+
+               ctdb_run_notification_script(ctdb, "unhealthy");
+       } else if (status == 0 && (node->flags & NODE_FLAGS_UNHEALTHY)) {
+               DEBUG(DEBUG_NOTICE,("monitor event OK - node re-enabled\n"));
+               node->flags &= ~NODE_FLAGS_UNHEALTHY;
+               ctdb->monitor->next_interval = 5;
+
+               ctdb_run_notification_script(ctdb, "healthy");
+       }
+
+after_change_status:
+       next_interval = ctdb->monitor->next_interval;
+
+       ctdb->monitor->next_interval *= 2;
+       if (ctdb->monitor->next_interval > ctdb->tunable.monitor_interval) {
+               ctdb->monitor->next_interval = ctdb->tunable.monitor_interval;
+       }
+
+       event_add_timed(ctdb->ev, ctdb->monitor->monitor_context, 
+                               timeval_current_ofs(next_interval, 0), 
+                               ctdb_check_health, ctdb);
+
+       if (c.old_flags == node->flags) {
+               return;
+       }
+
+       c.new_flags = node->flags;
+
+       data.dptr = (uint8_t *)&c;
+       data.dsize = sizeof(c);
+
+       /* ask the recovery daemon to push these changes out to all nodes */
+       ctdb_daemon_send_message(ctdb, ctdb->pnn,
+                                CTDB_SRVID_PUSH_NODE_FLAGS, data);
+
+       if (c.new_flags & NODE_FLAGS_UNHEALTHY) {
+               state_str = "UNHEALTHY";
+       } else {
+               state_str = "HEALTHY";
+       }
+
+       /* ask the recmaster to reallocate all addresses */
+       DEBUG(DEBUG_ERR,("Node became %s. Ask recovery master %u to perform ip reallocation\n",
+                        state_str, ctdb->recovery_master));
+       ret = ctdb_daemon_send_message(ctdb, ctdb->recovery_master, CTDB_SRVID_TAKEOVER_RUN, rddata);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to send ip takeover run request message to %u\n", ctdb->recovery_master));
+       }
+}
+
+
+/*
+  called when the startup event script finishes
+ */
+static void ctdb_startup_callback(struct ctdb_context *ctdb, int status, void *p)
+{
+       if (status != 0) {
+               DEBUG(DEBUG_ERR,("startup event failed\n"));
+       } else if (status == 0) {
+               DEBUG(DEBUG_NOTICE,("startup event OK - enabling monitoring\n"));
+               ctdb_set_runstate(ctdb, CTDB_RUNSTATE_RUNNING);
+               ctdb->monitor->next_interval = 2;
+               ctdb_run_notification_script(ctdb, "startup");
+       }
+
+       event_add_timed(ctdb->ev, ctdb->monitor->monitor_context, 
+                       timeval_current_ofs(ctdb->monitor->next_interval, 0),
+                       ctdb_check_health, ctdb);
+}
+
+
+/*
+  wait until we have finished initial recoveries before we start the
+  monitoring events; polls once per second until then
+ */
+static void ctdb_wait_until_recovered(struct event_context *ev, struct timed_event *te,
+                             struct timeval t, void *private_data)
+{
+       struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+       int ret;
+       static int count = 0; /* invocation counter, only used to throttle the notice below */
+
+       count++;
+
+       if (count < 60 || count%600 == 0) {
+               DEBUG(DEBUG_NOTICE,("CTDB_WAIT_UNTIL_RECOVERED\n"));
+               if (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_STOPPED) {
+                       DEBUG(DEBUG_NOTICE,("Node is STOPPED. Node will NOT recover.\n"));
+               }
+       }
+
+       if (ctdb->vnn_map->generation == INVALID_GENERATION) {
+               ctdb->db_persistent_startup_generation = INVALID_GENERATION;
+
+               event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
+                                    timeval_current_ofs(1, 0),
+                                    ctdb_wait_until_recovered, ctdb);
+               return;
+       }
+
+       if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
+               ctdb->db_persistent_startup_generation = INVALID_GENERATION;
+
+               DEBUG(DEBUG_NOTICE,(__location__ " in recovery. Wait one more second\n"));
+               event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
+                                    timeval_current_ofs(1, 0),
+                                    ctdb_wait_until_recovered, ctdb);
+               return;
+       }
+
+
+       if (!fast_start && timeval_elapsed(&ctdb->last_recovery_finished) < (ctdb->tunable.rerecovery_timeout + 3)) {
+               ctdb->db_persistent_startup_generation = INVALID_GENERATION;
+
+               DEBUG(DEBUG_NOTICE,(__location__ " wait for pending recoveries to end. Wait one more second.\n"));
+
+               event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
+                                    timeval_current_ofs(1, 0),
+                                    ctdb_wait_until_recovered, ctdb);
+               return;
+       }
+
+       if (ctdb->vnn_map->generation == ctdb->db_persistent_startup_generation) {
+               DEBUG(DEBUG_INFO,(__location__ " skip ctdb_recheck_persistent_health() "
+                                 "until the next recovery\n"));
+               event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
+                                    timeval_current_ofs(1, 0),
+                                    ctdb_wait_until_recovered, ctdb);
+               return;
+       }
+
+       ctdb->db_persistent_startup_generation = ctdb->vnn_map->generation;
+       ret = ctdb_recheck_persistent_health(ctdb);
+       if (ret != 0) {
+               ctdb->db_persistent_check_errors++;
+               if (ctdb->db_persistent_check_errors < ctdb->max_persistent_check_errors) {
+                       DEBUG(ctdb->db_persistent_check_errors==1?DEBUG_ERR:DEBUG_WARNING,
+                             (__location__ " ctdb_recheck_persistent_health() "
+                             "failed (%llu of %llu times) - retry later\n",
+                             (unsigned long long)ctdb->db_persistent_check_errors,
+                             (unsigned long long)ctdb->max_persistent_check_errors));
+                       event_add_timed(ctdb->ev,
+                                       ctdb->monitor->monitor_context,
+                                       timeval_current_ofs(1, 0),
+                                       ctdb_wait_until_recovered, ctdb);
+                       return;
+               }
+               DEBUG(DEBUG_ALERT,(__location__
+                                 " ctdb_recheck_persistent_health() failed (%llu times) - prepare shutdown\n",
+                                 (unsigned long long)ctdb->db_persistent_check_errors));
+               ctdb_shutdown_sequence(ctdb, 11);
+               /* In case above returns due to duplicate shutdown */
+               return;
+       }
+       ctdb->db_persistent_check_errors = 0;
+       /* all checks passed: hand over to the health check timer right away */
+       event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
+                            timeval_current(),
+                            ctdb_check_health, ctdb);
+}
+
+
+/*
+  see if the event scripts think we are healthy (timer callback; runs the "startup" or "monitor" event)
+ */
+static void ctdb_check_health(struct event_context *ev, struct timed_event *te, 
+                             struct timeval t, void *private_data)
+{
+       struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+       int ret = 0;
+
+       if (ctdb->runstate < CTDB_RUNSTATE_STARTUP) {
+               DEBUG(DEBUG_NOTICE,("Not yet in startup runstate. Wait one more second\n"));
+               event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
+                               timeval_current_ofs(1, 0), 
+                               ctdb_check_health, ctdb);
+               return;
+       }
+       /* during recovery, or while monitoring is disabled in the RUNNING state: just re-arm the timer */
+       if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL ||
+           (ctdb->monitor->monitoring_mode == CTDB_MONITORING_DISABLED &&
+            ctdb->runstate == CTDB_RUNSTATE_RUNNING)) {
+               event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
+                               timeval_current_ofs(ctdb->monitor->next_interval, 0), 
+                               ctdb_check_health, ctdb);
+               return;
+       }
+       /* on success the event callback is responsible for scheduling the next check */
+       if (ctdb->runstate == CTDB_RUNSTATE_STARTUP) {
+               DEBUG(DEBUG_NOTICE,("Recoveries finished. Running the \"startup\" event.\n"));
+               ret = ctdb_event_script_callback(ctdb, 
+                                                ctdb->monitor->monitor_context, ctdb_startup_callback, 
+                                                ctdb, false,
+                                                CTDB_EVENT_STARTUP, "%s", "");
+       } else {
+               int i;
+               int skip_monitoring = 0;
+               
+               if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) { /* NOTE(review): appears unreachable - the same condition already returned above */
+                       skip_monitoring = 1;
+                       DEBUG(DEBUG_ERR,("Skip monitoring during recovery\n"));
+               }
+               for (i=1; i<=NUM_DB_PRIORITIES; i++) { /* priorities are indexed from 1 */
+                       if (ctdb->freeze_handles[i] != NULL) {
+                               DEBUG(DEBUG_ERR,("Skip monitoring since databases are frozen\n"));
+                               skip_monitoring = 1;
+                               break;
+                       }
+               }
+               if (skip_monitoring != 0) {
+                       event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
+                                       timeval_current_ofs(ctdb->monitor->next_interval, 0), 
+                                       ctdb_check_health, ctdb);
+                       return;
+               } else {
+                       ret = ctdb_event_script_callback(ctdb, 
+                                       ctdb->monitor->monitor_context, ctdb_health_callback,
+                                       ctdb, false,
+                                       CTDB_EVENT_MONITOR, "%s", "");
+               }
+       }
+
+       if (ret != 0) { /* the event could not be started: retry in 5 seconds */
+               DEBUG(DEBUG_ERR,("Unable to launch monitor event script\n"));
+               ctdb->monitor->next_interval = 5;
+               event_add_timed(ctdb->ev, ctdb->monitor->monitor_context, 
+                       timeval_current_ofs(5, 0), 
+                       ctdb_check_health, ctdb);
+       }
+}
+
+/*
+  (Temporarily) disabling monitoring will stop the monitor event scripts
+  from running, but node health checks will still occur
+*/
+void ctdb_disable_monitoring(struct ctdb_context *ctdb)
+{
+       ctdb->monitor->monitoring_mode = CTDB_MONITORING_DISABLED; /* only the mode flag changes; no timer is removed here */
+       DEBUG(DEBUG_INFO,("Monitoring has been disabled\n"));
+}
+
+/* 
+   Re-enable running monitor events after they have been disabled
+ */
+void ctdb_enable_monitoring(struct ctdb_context *ctdb)
+{
+       ctdb->monitor->monitoring_mode  = CTDB_MONITORING_ACTIVE;
+       ctdb->monitor->next_interval = 5;
+       DEBUG(DEBUG_INFO,("Monitoring has been enabled\n"));
+}
+
+/* stop any monitoring 
+   this should only be done when shutting down the daemon
+*/
+void ctdb_stop_monitoring(struct ctdb_context *ctdb)
+{
+       talloc_free(ctdb->monitor->monitor_context);
+       ctdb->monitor->monitor_context = NULL;
+
+       ctdb->monitor->monitoring_mode  = CTDB_MONITORING_DISABLED;
+       ctdb->monitor->next_interval = 5;
+       DEBUG(DEBUG_NOTICE,("Monitoring has been stopped\n"));
+}
+
+/*
+  start watching for nodes that might be dead
+ */
+void ctdb_start_monitoring(struct ctdb_context *ctdb)
+{
+       if (ctdb->monitor != NULL) {
+               return;
+       }
+
+       ctdb->monitor = talloc(ctdb, struct ctdb_monitor_state);
+       CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor);
+
+       ctdb->monitor->next_interval = 5;
+
+       ctdb->monitor->monitor_context = talloc_new(ctdb->monitor);
+       CTDB_NO_MEMORY_FATAL(ctdb, ctdb->monitor->monitor_context);
+
+       event_add_timed(ctdb->ev, ctdb->monitor->monitor_context,
+                            timeval_current_ofs(1, 0), 
+                            ctdb_wait_until_recovered, ctdb);
+
+       ctdb->monitor->monitoring_mode  = CTDB_MONITORING_ACTIVE;
+       DEBUG(DEBUG_NOTICE,("Monitoring has been started\n"));
+}
+
+
+/*
+  modify flags on a node
+ */
+int32_t ctdb_control_modflags(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+       struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)indata.dptr;
+       struct ctdb_node *node;
+       uint32_t old_flags;
+
+       if (c->pnn >= ctdb->num_nodes) {
+               DEBUG(DEBUG_ERR,(__location__ " Node %d is invalid, num_nodes :%d\n", c->pnn, ctdb->num_nodes));
+               return -1;
+       }
+
+       node         = ctdb->nodes[c->pnn];
+       old_flags    = node->flags;
+       if (c->pnn != ctdb->pnn) {
+               c->old_flags  = node->flags;
+       }
+       node->flags   = c->new_flags & ~NODE_FLAGS_DISCONNECTED;
+       node->flags  |= (c->old_flags & NODE_FLAGS_DISCONNECTED);
+
+       /* we dont let other nodes modify our STOPPED status */
+       if (c->pnn == ctdb->pnn) {
+               node->flags &= ~NODE_FLAGS_STOPPED;
+               if (old_flags & NODE_FLAGS_STOPPED) {
+                       node->flags |= NODE_FLAGS_STOPPED;
+               }
+       }
+
+       /* we dont let other nodes modify our BANNED status */
+       if (c->pnn == ctdb->pnn) {
+               node->flags &= ~NODE_FLAGS_BANNED;
+               if (old_flags & NODE_FLAGS_BANNED) {
+                       node->flags |= NODE_FLAGS_BANNED;
+               }
+       }
+
+       if (node->flags == c->old_flags) {
+               DEBUG(DEBUG_INFO, ("Control modflags on node %u - Unchanged - flags 0x%x\n", c->pnn, node->flags));
+               return 0;
+       }
+
+       DEBUG(DEBUG_INFO, ("Control modflags on node %u - flags now 0x%x\n", c->pnn, node->flags));
+
+       if (node->flags == 0 && ctdb->runstate <= CTDB_RUNSTATE_STARTUP) {
+               DEBUG(DEBUG_ERR, (__location__ " Node %u became healthy - force recovery for startup\n",
+                                 c->pnn));
+               ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+       }
+
+       /* tell the recovery daemon something has changed */
+       ctdb_daemon_send_message(ctdb, ctdb->pnn,
+                                CTDB_SRVID_SET_NODE_FLAGS, indata);
+
+       /* if we have become banned, we should go into recovery mode */
+       if ((node->flags & NODE_FLAGS_BANNED) && !(c->old_flags & NODE_FLAGS_BANNED) && (node->pnn == ctdb->pnn)) {
+               ctdb_local_node_got_banned(ctdb);
+       }
+       
+       return 0;
+}
+
+/*
+  return the monitoring mode
+ */
+int32_t ctdb_monitoring_mode(struct ctdb_context *ctdb)
+{
+       if (ctdb->monitor == NULL) {
+               return CTDB_MONITORING_DISABLED;
+       }
+       return ctdb->monitor->monitoring_mode;
+}
+
+/*
+ * Check if monitoring has been stopped, i.e. the monitor_context is NULL (ctdb_stop_monitoring() frees it and sets it to NULL)
+ */
+bool ctdb_stopped_monitoring(struct ctdb_context *ctdb)
+{
+       return (ctdb->monitor->monitor_context == NULL ? true : false); /* NOTE(review): ctdb->monitor is dereferenced unchecked, unlike ctdb_monitoring_mode() - confirm callers run only after ctdb_start_monitoring() */
+}
diff --git a/ctdb/server/ctdb_persistent.c b/ctdb/server/ctdb_persistent.c
new file mode 100644 (file)
index 0000000..cfbea63
--- /dev/null
@@ -0,0 +1,382 @@
+/* 
+   persistent store logic
+
+   Copyright (C) Andrew Tridgell  2007
+   Copyright (C) Ronnie Sahlberg  2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "db_wrap.h"
+#include "tdb.h"
+#include "../include/ctdb_private.h"
+
+/*
+  bookkeeping for one in-flight persistent store / trans3 commit request
+ */
+struct ctdb_persistent_state {
+       struct ctdb_context *ctdb;
+       struct ctdb_db_context *ctdb_db; /* used by trans3_commit */
+       struct ctdb_client *client;      /* used by trans3_commit */
+       struct ctdb_req_control *c;      /* control that receives the final reply */
+       const char *errormsg;            /* message of the most recent failed reply */
+       uint32_t num_pending;            /* update_record replies still outstanding */
+       int32_t status;                  /* status of the most recent failed reply */
+       uint32_t num_failed;             /* replies that carried a non-zero status */
+       uint32_t num_sent;               /* update_record controls sent out */
+};
+
+/*
+  1) all nodes fail, and all nodes reply
+  2) some nodes fail, all nodes reply
+  3) some nodes timeout
+  4) all nodes succeed
+ */
+
+/*
+  reply handler for one ctdb_control_update_record call sent to a node
+
+  Replies that arrive while recovery is active are dropped.  A non-zero
+  status is recorded in the state and switches this node into recovery
+  mode without answering the client.  When the last outstanding reply has
+  been counted down, the original control is answered and the state is
+  freed.
+ */
+static void ctdb_persistent_callback(struct ctdb_context *ctdb,
+                                    int32_t status, TDB_DATA data, 
+                                    const char *errormsg,
+                                    void *private_data)
+{
+       struct ctdb_persistent_state *pstate;
+
+       pstate = talloc_get_type(private_data, struct ctdb_persistent_state);
+
+       if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
+               DEBUG(DEBUG_INFO, ("ctdb_persistent_callback: ignoring reply "
+                                  "during recovery\n"));
+               return;
+       }
+
+       if (status == 0) {
+               /* one more node acknowledged the update */
+               pstate->num_pending--;
+               if (pstate->num_pending == 0) {
+                       ctdb_request_control_reply(pstate->ctdb, pstate->c,
+                                                  NULL, 0, pstate->errormsg);
+                       talloc_free(pstate);
+               }
+               return;
+       }
+
+       DEBUG(DEBUG_ERR,("ctdb_persistent_callback failed with status %d (%s)\n",
+                status, errormsg?errormsg:"no error message given"));
+       pstate->status = status;
+       pstate->errormsg = errormsg;
+       pstate->num_failed++;
+
+       /*
+        * A node that could not complete the update_record control means
+        * that either a recovery is already running or something bad is
+        * going on.  Trigger a recovery and let the recovery finish the
+        * transaction; it sends the reply for the trans3_commit control
+        * back to the client.
+        */
+       ctdb->recovery_mode = CTDB_RECOVERY_ACTIVE;
+}
+
+/*
+  timer handler: the persistent store did not complete in time
+
+  Does nothing while recovery is active.  Otherwise the pending control
+  is answered with status 1 and the state is released.
+ */
+static void ctdb_persistent_store_timeout(struct event_context *ev, struct timed_event *te, 
+                                        struct timeval t, void *private_data)
+{
+       struct ctdb_persistent_state *pstate;
+       struct ctdb_context *ctdb;
+
+       pstate = talloc_get_type(private_data, struct ctdb_persistent_state);
+       ctdb = pstate->ctdb;
+
+       if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
+               ctdb_request_control_reply(ctdb, pstate->c, NULL, 1,
+                                          "timeout in ctdb_persistent_state");
+               talloc_free(pstate);
+               return;
+       }
+
+       DEBUG(DEBUG_INFO, ("ctdb_persistent_store_timeout: ignoring "
+                          "timeout during recovery\n"));
+}
+
+/**
+ * Answer every trans3 commit control that is still pending.
+ *
+ * Walks the database list and, for each database that carries a
+ * persistent state, replies to the stored control with status 2 and
+ * frees the state.  According to the original note this is called by
+ * the end-recovery control to clean up transactions that a recovery
+ * interrupted.  Nothing is done while recovery is still active.
+ */
+void ctdb_persistent_finish_trans3_commits(struct ctdb_context *ctdb)
+{
+       struct ctdb_db_context *db;
+
+       if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
+               DEBUG(DEBUG_INFO, ("ctdb_persistent_finish_trans3_commits: "
+                                  "skipping execution when recovery is "
+                                  "active\n"));
+               return;
+       }
+
+       db = ctdb->db_list;
+       while (db != NULL) {
+               struct ctdb_persistent_state *pending = db->persistent_state;
+
+               if (pending != NULL) {
+                       ctdb_request_control_reply(ctdb, pending->c, NULL, 2,
+                                                  "trans3 commit ended by recovery");
+
+                       /* The destructor sets ctdb_db->persistent_state to NULL. */
+                       talloc_free(pending);
+               }
+
+               db = db->next;
+       }
+}
+
+/*
+  talloc destructor for a trans3 commit state
+
+  Detaches the state from its database and clears the client's db_id,
+  the two markers ctdb_control_trans3_commit() tests before it accepts a
+  new commit.  Always returns 0.
+ */
+static int ctdb_persistent_state_destructor(struct ctdb_persistent_state *state)
+{
+       struct ctdb_db_context *db = state->ctdb_db;
+       struct ctdb_client *owner = state->client;
+
+       if (db != NULL) {
+               db->persistent_state = NULL;
+       }
+
+       if (owner != NULL) {
+               owner->db_id = 0;
+       }
+
+       return 0;
+}
+
+/*
+ * Store a set of persistent records.
+ * This is used to roll out a transaction to all nodes.
+ *
+ * recdata is interpreted as a ctdb_marshall_buffer; its db_id selects the
+ * database.  A CTDB_CONTROL_UPDATE_RECORD control carrying recdata is sent
+ * to every node of the vnn map that is not flagged inactive, and
+ * ctdb_persistent_callback() collects the replies.
+ *
+ * Returns -1 when recovery is active, the client or database cannot be
+ * found, the client or database already has a commit in progress, or a
+ * control cannot be sent.  Returns 0 otherwise; *async_reply is set to
+ * true only when at least one control was sent, in which case the reply
+ * to the client is produced later by the callback, the timeout or the
+ * recovery.
+ *
+ * NOTE(review): m->db_id is read without checking recdata.dsize here;
+ * presumably the caller validates the size - confirm.
+ */
+int32_t ctdb_control_trans3_commit(struct ctdb_context *ctdb,
+                                  struct ctdb_req_control *c,
+                                  TDB_DATA recdata, bool *async_reply)
+{
+       struct ctdb_client *client;
+       struct ctdb_persistent_state *state;
+       int i;
+       struct ctdb_marshall_buffer *m = (struct ctdb_marshall_buffer *)recdata.dptr;
+       struct ctdb_db_context *ctdb_db;
+
+       if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
+               DEBUG(DEBUG_INFO,("rejecting ctdb_control_trans3_commit when recovery active\n"));
+               return -1;
+       }
+
+       client = ctdb_reqid_find(ctdb, c->client_id, struct ctdb_client);
+       if (client == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " can not match persistent_store "
+                                "to a client. Returning error\n"));
+               return -1;
+       }
+
+       if (client->db_id != 0) { /* this client already has a commit in flight */
+               DEBUG(DEBUG_ERR,(__location__ " ERROR: trans3_commit: "
+                                "client-db_id[0x%08x] != 0 "
+                                "(client_id[0x%08x]): trans3_commit active?\n",
+                                client->db_id, client->client_id));
+               return -1;
+       }
+
+       ctdb_db = find_ctdb_db(ctdb, m->db_id);
+       if (ctdb_db == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control_trans3_commit: "
+                                "Unknown database db_id[0x%08x]\n", m->db_id));
+               return -1;
+       }
+
+       if (ctdb_db->persistent_state != NULL) { /* only one commit per database at a time */
+               DEBUG(DEBUG_ERR, (__location__ " Error: "
+                                 "ctdb_control_trans3_commit "
+                                 "called while a transaction commit is "
+                                 "active. db_id[0x%08x]\n", m->db_id));
+               return -1;
+       }
+
+       ctdb_db->persistent_state = talloc_zero(ctdb_db,
+                                               struct ctdb_persistent_state);
+       CTDB_NO_MEMORY(ctdb, ctdb_db->persistent_state);
+
+       client->db_id = m->db_id; /* marks the client as busy; reset by the state destructor */
+
+       state = ctdb_db->persistent_state;
+       state->ctdb = ctdb;
+       state->ctdb_db = ctdb_db;
+       state->c    = c;
+       state->client = client;
+
+       talloc_set_destructor(state, ctdb_persistent_state_destructor);
+
+       for (i = 0; i < ctdb->vnn_map->size; i++) {
+               struct ctdb_node *node = ctdb->nodes[ctdb->vnn_map->map[i]];
+               int ret;
+
+               /* only send to active nodes */
+               if (node->flags & NODE_FLAGS_INACTIVE) {
+                       continue;
+               }
+
+               ret = ctdb_daemon_send_control(ctdb, node->pnn, 0,
+                                              CTDB_CONTROL_UPDATE_RECORD,
+                                              c->client_id, 0, recdata,
+                                              ctdb_persistent_callback,
+                                              state);
+               if (ret == -1) {
+                       DEBUG(DEBUG_ERR,("Unable to send "
+                                        "CTDB_CONTROL_UPDATE_RECORD "
+                                        "to pnn %u\n", node->pnn));
+                       talloc_free(state); /* destructor clears client->db_id and persistent_state */
+                       return -1;
+               }
+
+               state->num_pending++;
+               state->num_sent++;
+       }
+
+       if (state->num_pending == 0) { /* nothing was sent, so reply synchronously */
+               talloc_free(state);
+               return 0;
+       }
+
+       /* we need to wait for the replies */
+       *async_reply = true;
+
+       /* need to keep the control structure around:
+        * c becomes a talloc child of state and is freed together with it */
+       talloc_steal(state, c);
+
+       /* but we won't wait forever: the timer is a talloc child of state and
+        * fires after tunable.control_timeout (presumably seconds - confirm) */
+       event_add_timed(ctdb->ev, state,
+                       timeval_current_ofs(ctdb->tunable.control_timeout, 0),
+                       ctdb_persistent_store_timeout, state);
+
+       return 0;
+}
+
+
+/*
+  backwards compatibility:
+
+  note that a client has started a persistent store operation by bumping
+  its num_persistent_updates counter.  The recdata passed by the client
+  is ignored.  According to the original note, a client that disconnects
+  before it issues the matching persistent_update call triggers a full
+  recovery so that the databases are brought back in sync; that handling
+  is not part of this function.
+
+  Returns 0 on success and -1 when the control cannot be matched to a
+  client.
+ */
+int32_t ctdb_control_start_persistent_update(struct ctdb_context *ctdb, 
+                                     struct ctdb_req_control *c,
+                                     TDB_DATA recdata)
+{
+       struct ctdb_client *caller;
+
+       caller = ctdb_reqid_find(ctdb, c->client_id, struct ctdb_client);
+       if (caller != NULL) {
+               caller->num_persistent_updates++;
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " can not match start_persistent_update to a client. Returning error\n"));
+       return -1;
+}
+
+/*
+  backwards compatibility:
+
+  called to tell ctdbd that the client is no longer doing a persistent
+  update.  The client's num_persistent_updates counter is decremented,
+  but never below zero.
+
+  Returns 0 on success and -1 when the control cannot be matched to a
+  client.
+*/
+int32_t ctdb_control_cancel_persistent_update(struct ctdb_context *ctdb, 
+                                             struct ctdb_req_control *c,
+                                             TDB_DATA recdata)
+{
+       struct ctdb_client *caller;
+
+       caller = ctdb_reqid_find(ctdb, c->client_id, struct ctdb_client);
+       if (caller == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " can not match cancel_persistent_update to a client. Returning error\n"));
+               return -1;
+       }
+
+       if (caller->num_persistent_updates <= 0) {
+               return 0;
+       }
+
+       caller->num_persistent_updates--;
+       return 0;
+}
+
+/*
+  read the sequence number record of a database
+
+  Looks up the record stored under CTDB_DB_SEQNUM_KEY (the key includes
+  the terminating NUL) in the database identified by db_id.  On a
+  successful fetch *seqnum receives the stored 64-bit value, or 0 when
+  the record does not have exactly the size of a uint64_t.
+
+  Returns -1 for an unknown database, otherwise the result of
+  ctdb_ltdb_fetch().  *seqnum is only written when the fetch succeeded.
+ */
+static int32_t ctdb_get_db_seqnum(struct ctdb_context *ctdb,
+                                 uint32_t db_id,
+                                 uint64_t *seqnum)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       const char *seq_keyname = CTDB_DB_SEQNUM_KEY;
+       struct ctdb_ltdb_header hdr;
+       struct ctdb_db_context *db;
+       TDB_DATA seq_key;
+       TDB_DATA seq_val;
+       int32_t rc;
+
+       db = find_ctdb_db(ctdb, db_id);
+       if (db == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", db_id));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       seq_key.dsize = strlen(seq_keyname) + 1;
+       seq_key.dptr = (uint8_t *)discard_const(seq_keyname);
+
+       rc = (int32_t)ctdb_ltdb_fetch(db, seq_key, &hdr, tmp_ctx, &seq_val);
+       if (rc == 0) {
+               if (seq_val.dsize == sizeof(uint64_t)) {
+                       *seqnum = *(uint64_t *)seq_val.dptr;
+               } else {
+                       *seqnum = 0;
+               }
+       }
+
+       /* the fetched data lives on tmp_ctx and goes away with it */
+       talloc_free(tmp_ctx);
+       return rc;
+}
+
+/**
+ * Get the sequence number of a persistent database.
+ *
+ * indata must start with the uint32_t database id.  On success outdata
+ * describes a freshly allocated uint64_t (a talloc child of outdata)
+ * that holds the complete sequence number.
+ *
+ * Returns 0 on success, the error returned by ctdb_get_db_seqnum(), or
+ * -1 when indata is too short or the reply cannot be allocated.  outdata
+ * is left untouched on every failure path.
+ */
+int32_t ctdb_control_get_db_seqnum(struct ctdb_context *ctdb,
+                                  TDB_DATA indata,
+                                  TDB_DATA *outdata)
+{
+       uint32_t db_id;
+       uint64_t seqnum;
+       uint64_t *reply;
+       int32_t ret;
+
+       /* never read a db id out of a buffer that cannot contain one */
+       if (indata.dptr == NULL || indata.dsize < sizeof(uint32_t)) {
+               DEBUG(DEBUG_ERR,(__location__ " Invalid data size %u for get_db_seqnum\n",
+                                (unsigned)indata.dsize));
+               return -1;
+       }
+
+       db_id = *(uint32_t *)indata.dptr;
+       ret = ctdb_get_db_seqnum(ctdb, db_id, &seqnum);
+       if (ret != 0) {
+               return ret;
+       }
+
+       reply = talloc_zero(outdata, uint64_t);
+       if (reply == NULL) {
+               return -1;
+       }
+
+       /*
+        * Store through a uint64_t pointer.  Assigning through
+        * outdata->dptr (a uint8_t pointer) would keep only the lowest
+        * byte of the sequence number.
+        */
+       *reply = seqnum;
+
+       outdata->dptr = (uint8_t *)reply;
+       outdata->dsize = sizeof(uint64_t);
+
+       return 0;
+}
diff --git a/ctdb/server/ctdb_recover.c b/ctdb/server/ctdb_recover.c
new file mode 100644 (file)
index 0000000..1cbcc59
--- /dev/null
@@ -0,0 +1,1396 @@
+/* 
+   ctdb recovery code
+
+   Copyright (C) Andrew Tridgell  2007
+   Copyright (C) Ronnie Sahlberg  2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "includes.h"
+#include "tdb.h"
+#include "system/time.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "../include/ctdb_private.h"
+#include "lib/util/dlinklist.h"
+#include "db_wrap.h"
+
+
+/*
+  control handler: return this node's vnn map in wire format
+
+  The request must carry no data.  The reply is a ctdb_vnn_map_wire
+  allocated on outdata that holds the generation, the size and a copy of
+  the map array.  Returns 0 on success.
+ */
+int 
+ctdb_control_getvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
+{
+       struct ctdb_vnn_map_wire *map;
+       size_t wire_len;
+
+       CHECK_CONTROL_DATA_SIZE(0);
+
+       /* fixed header followed by one uint32_t per map entry */
+       wire_len = offsetof(struct ctdb_vnn_map_wire, map) + sizeof(uint32_t)*ctdb->vnn_map->size;
+       map = talloc_size(outdata, wire_len);
+       CTDB_NO_MEMORY(ctdb, map);
+
+       map->size = ctdb->vnn_map->size;
+       map->generation = ctdb->vnn_map->generation;
+       memcpy(map->map, ctdb->vnn_map->map, sizeof(uint32_t)*map->size);
+
+       outdata->dptr  = (uint8_t *)map;
+       outdata->dsize = wire_len;
+
+       return 0;
+}
+
+/*
+  control handler: replace this node's vnn map with the one in indata
+
+  indata is interpreted as a ctdb_vnn_map_wire.  The request is refused
+  (-1) unless every database priority is frozen, and also when indata is
+  too short to hold the header plus the number of map entries it
+  announces.
+
+  The replacement map is built completely before the current one is
+  released, so an allocation failure leaves ctdb->vnn_map unchanged
+  instead of NULL or half initialised.  Returns 0 on success.
+ */
+int 
+ctdb_control_setvnnmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
+{
+       struct ctdb_vnn_map_wire *map = (struct ctdb_vnn_map_wire *)indata.dptr;
+       const size_t hdr_len = offsetof(struct ctdb_vnn_map_wire, map);
+       struct ctdb_vnn_map *vnn_map;
+       int i;
+
+       for(i=1; i<=NUM_DB_PRIORITIES; i++) {
+               if (ctdb->freeze_mode[i] != CTDB_FREEZE_FROZEN) {
+                       DEBUG(DEBUG_ERR,("Attempt to set vnnmap when not frozen\n"));
+                       return -1;
+               }
+       }
+
+       /* the announced entry count must fit into the data we were given;
+        * written as a division so the check itself cannot overflow */
+       if (indata.dptr == NULL || indata.dsize < hdr_len ||
+           (indata.dsize - hdr_len) / sizeof(uint32_t) < map->size) {
+               DEBUG(DEBUG_ERR,(__location__ " Invalid data size %u in setvnnmap\n",
+                                (unsigned)indata.dsize));
+               return -1;
+       }
+
+       vnn_map = talloc(ctdb, struct ctdb_vnn_map);
+       CTDB_NO_MEMORY(ctdb, vnn_map);
+
+       vnn_map->generation = map->generation;
+       vnn_map->size       = map->size;
+       vnn_map->map = talloc_array(vnn_map, uint32_t, map->size);
+       if (vnn_map->map == NULL) {
+               /* drop the partial replacement, then fail the usual way */
+               talloc_free(vnn_map);
+               vnn_map = NULL;
+       }
+       CTDB_NO_MEMORY(ctdb, vnn_map);
+
+       memcpy(vnn_map->map, map->map, sizeof(uint32_t)*map->size);
+
+       /* only now give up the old map */
+       talloc_free(ctdb->vnn_map);
+       ctdb->vnn_map = vnn_map;
+
+       return 0;
+}
+
+/*
+  control handler: list the attached databases
+
+  The request must carry no data.  The reply is a ctdb_dbid_map with one
+  entry per database on ctdb->db_list; each entry holds the database id
+  and the PERSISTENT / READONLY / STICKY flags that apply to it.  The
+  process exits if the reply cannot be allocated.  Returns 0 on success.
+ */
+int 
+ctdb_control_getdbmap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
+{
+       struct ctdb_dbid_map *dbid_map;
+       struct ctdb_db_context *db;
+       uint32_t num_dbs = 0;
+       uint32_t idx = 0;
+
+       CHECK_CONTROL_DATA_SIZE(0);
+
+       /* first pass: count the databases */
+       for (db = ctdb->db_list; db != NULL; db = db->next) {
+               num_dbs++;
+       }
+
+       outdata->dsize = offsetof(struct ctdb_dbid_map, dbs) + sizeof(dbid_map->dbs[0])*num_dbs;
+       outdata->dptr  = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
+       if (outdata->dptr == NULL) {
+               DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate dbmap array\n"));
+               exit(1);
+       }
+
+       dbid_map = (struct ctdb_dbid_map *)outdata->dptr;
+       dbid_map->num = num_dbs;
+
+       /* second pass: fill in the entries; flags start out zeroed */
+       db = ctdb->db_list;
+       while (db != NULL) {
+               dbid_map->dbs[idx].dbid = db->db_id;
+               if (db->persistent != 0) {
+                       dbid_map->dbs[idx].flags |= CTDB_DB_FLAGS_PERSISTENT;
+               }
+               if (db->readonly != 0) {
+                       dbid_map->dbs[idx].flags |= CTDB_DB_FLAGS_READONLY;
+               }
+               if (db->sticky != 0) {
+                       dbid_map->dbs[idx].flags |= CTDB_DB_FLAGS_STICKY;
+               }
+
+               idx++;
+               db = db->next;
+       }
+
+       return 0;
+}
+
+/*
+  control handler: return the node map
+
+  The request must carry no data.  The reply is a ctdb_node_map with one
+  entry per configured node holding its parsed address, pnn and flags.
+  An address that cannot be parsed is only logged; the entry is still
+  filled in.  The process exits if the reply cannot be allocated.
+  Returns 0 on success.
+ */
+int 
+ctdb_control_getnodemap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
+{
+       struct ctdb_node_map *node_map;
+       uint32_t num_nodes;
+       uint32_t n;
+
+       CHECK_CONTROL_DATA_SIZE(0);
+
+       num_nodes = ctdb->num_nodes;
+
+       outdata->dsize = offsetof(struct ctdb_node_map, nodes) + num_nodes*sizeof(struct ctdb_node_and_flags);
+       outdata->dptr  = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
+       if (outdata->dptr == NULL) {
+               DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate nodemap array\n"));
+               exit(1);
+       }
+
+       node_map = (struct ctdb_node_map *)outdata->dptr;
+       node_map->num = num_nodes;
+
+       n = 0;
+       while (n < num_nodes) {
+               int parsed;
+
+               parsed = parse_ip(ctdb->nodes[n]->address.address,
+                                 NULL, /* TODO: pass in the correct interface here*/
+                                 0,
+                                 &node_map->nodes[n].addr);
+               if (parsed == 0) {
+                       DEBUG(DEBUG_ERR, (__location__ " Failed to parse %s into a sockaddr\n", ctdb->nodes[n]->address.address));
+               }
+
+               node_map->nodes[n].flags = ctdb->nodes[n]->flags;
+               node_map->nodes[n].pnn   = ctdb->nodes[n]->pnn;
+               n++;
+       }
+
+       return 0;
+}
+
+/*
+   get an old style ipv4-only nodemap
+
+   Same layout idea as the regular node map, but built from
+   ctdb_node_and_flagsv4 entries.  Unlike the regular variant, an address
+   that cannot be parsed aborts the request with -1.  The process exits
+   if the reply cannot be allocated.  Returns 0 on success.
+*/
+int 
+ctdb_control_getnodemapv4(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
+{
+       struct ctdb_node_mapv4 *node_map;
+       uint32_t num_nodes;
+       uint32_t n;
+
+       CHECK_CONTROL_DATA_SIZE(0);
+
+       num_nodes = ctdb->num_nodes;
+
+       outdata->dsize = offsetof(struct ctdb_node_mapv4, nodes) + num_nodes*sizeof(struct ctdb_node_and_flagsv4);
+       outdata->dptr  = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
+       if (outdata->dptr == NULL) {
+               DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate nodemap array\n"));
+               exit(1);
+       }
+
+       node_map = (struct ctdb_node_mapv4 *)outdata->dptr;
+       node_map->num = num_nodes;
+
+       n = 0;
+       while (n < num_nodes) {
+               if (parse_ipv4(ctdb->nodes[n]->address.address, 0, &node_map->nodes[n].sin) == 0) {
+                       DEBUG(DEBUG_ERR, (__location__ " Failed to parse %s into a sockaddr\n", ctdb->nodes[n]->address.address));
+                       return -1;
+               }
+
+               node_map->nodes[n].flags = ctdb->nodes[n]->flags;
+               node_map->nodes[n].pnn   = ctdb->nodes[n]->pnn;
+               n++;
+       }
+
+       return 0;
+}
+
+/*
+  timed event: re-read the nodes file and reconcile it with the node
+  list that is currently in use
+
+  The old node array is parked on a temporary talloc context while the
+  nodes file is loaded again.  A slot whose address is unchanged keeps
+  its old node structure; any other slot that is not flagged deleted is
+  added to and connected through the transport methods.  Finally the
+  recovery daemon is told to reload its nodes file too, and whatever is
+  left of the old array is freed.
+ */
+static void
+ctdb_reload_nodes_event(struct event_context *ev, struct timed_event *te, 
+                              struct timeval t, void *private_data)
+{
+       struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+       struct ctdb_node **old_nodes;
+       TALLOC_CTX *tmp_ctx;
+       int old_count;
+       int i;
+
+       tmp_ctx = talloc_new(ctdb);
+
+       /* steal the old nodes file for a while */
+       old_nodes = ctdb->nodes;
+       talloc_steal(tmp_ctx, old_nodes);
+       ctdb->nodes = NULL;
+       old_count = ctdb->num_nodes;
+       ctdb->num_nodes = 0;
+
+       /* load the new nodes file */
+       ctdb_load_nodes_file(ctdb);
+
+       for (i=0; i<ctdb->num_nodes; i++) {
+               bool unchanged;
+
+               unchanged = (i < old_count) &&
+                           ctdb_same_address(&ctdb->nodes[i]->address, &old_nodes[i]->address);
+
+               if (unchanged) {
+                       /* keep any identical pre-existing nodes and connections */
+                       talloc_free(ctdb->nodes[i]);
+                       ctdb->nodes[i] = talloc_steal(ctdb->nodes, old_nodes[i]);
+               } else if (!(ctdb->nodes[i]->flags & NODE_FLAGS_DELETED)) {
+                       /* any new or different nodes must be added */
+                       if (ctdb->methods->add_node(ctdb->nodes[i]) != 0) {
+                               DEBUG(DEBUG_CRIT, (__location__ " methods->add_node failed at %d\n", i));
+                               ctdb_fatal(ctdb, "failed to add node. shutting down\n");
+                       }
+                       if (ctdb->methods->connect_node(ctdb->nodes[i]) != 0) {
+                               DEBUG(DEBUG_CRIT, (__location__ " methods->add_connect failed at %d\n", i));
+                               ctdb_fatal(ctdb, "failed to connect to node. shutting down\n");
+                       }
+               }
+       }
+
+       /* tell the recovery daemon to reload the nodes file too */
+       ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELOAD_NODES, tdb_null);
+
+       talloc_free(tmp_ctx);
+}
+
+/*
+  schedule a reload of the nodes file
+
+  The reload runs from a timed event after a short delay, so that the
+  response to this control can be sent back first.  Always returns 0.
+*/
+int 
+ctdb_control_reload_nodes_file(struct ctdb_context *ctdb, uint32_t opcode)
+{
+       struct timeval when = timeval_current_ofs(1,0);
+
+       event_add_timed(ctdb->ev, ctdb, when, ctdb_reload_nodes_event, ctdb);
+
+       return 0;
+}
+
+/* 
+   state handed to the traverse function traverse_pulldb() while all
+   relevant records are pulled from a database for a pulldb request
+ */
+struct pulldb_data {
+       struct ctdb_context *ctdb;
+       struct ctdb_db_context *ctdb_db; /* database being traversed */
+       struct ctdb_marshall_buffer *pulldata; /* growing blob the marshalled records are appended to */
+       uint32_t len; /* bytes of pulldata filled so far */
+       uint32_t allocated_len; /* size pulldata was last (re)allocated to */
+       bool failed; /* set when a record could not be marshalled */
+};
+
+/*
+  tdb traverse callback: append one record to the pulldb reply blob
+
+  The key/data pair is marshalled into a ctdb_rec_data and copied to the
+  end of the blob, which is grown by the record size plus the
+  pulldb_preallocation_size tunable whenever it would not fit.  Records
+  larger than the db_record_size_warn tunable are logged.
+
+  Returns 0 to continue the traverse, or -1 (with failed set) when the
+  record cannot be marshalled.  Failure to grow the blob is fatal.
+ */
+static int traverse_pulldb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
+{
+       struct pulldb_data *pull = (struct pulldb_data *)p;
+       struct ctdb_db_context *ctdb_db = pull->ctdb_db;
+       struct ctdb_context *ctdb = pull->ctdb;
+       struct ctdb_rec_data *rec;
+
+       /* add the record to the blob */
+       rec = ctdb_marshall_record(pull->pulldata, 0, key, NULL, data);
+       if (rec == NULL) {
+               pull->failed = true;
+               return -1;
+       }
+
+       /* grow the blob when this record would not fit */
+       if (pull->len + rec->length >= pull->allocated_len) {
+               pull->allocated_len = rec->length + pull->len + ctdb->tunable.pulldb_preallocation_size;
+               pull->pulldata = talloc_realloc_size(NULL, pull->pulldata, pull->allocated_len);
+       }
+       if (pull->pulldata == NULL) {
+               DEBUG(DEBUG_CRIT,(__location__ " Failed to expand pulldb_data to %u\n", rec->length + pull->len));
+               ctdb_fatal(pull->ctdb, "failed to allocate memory for recovery. shutting down\n");
+       }
+
+       memcpy((uint8_t *)pull->pulldata + pull->len, rec, rec->length);
+       pull->pulldata->count++;
+       pull->len += rec->length;
+
+       if (ctdb->tunable.db_record_size_warn != 0 && rec->length > ctdb->tunable.db_record_size_warn) {
+               DEBUG(DEBUG_ERR,("Data record in %s is big. Record size is %d bytes\n", ctdb_db->db_name, (int)rec->length));
+       }
+
+       /* the marshalled copy is no longer needed once it is in the blob */
+       talloc_free(rec);
+
+       return 0;
+}
+
+/*
+  pull a bunch of records from a ltdb
+
+  Marshals every record of the database named in indata into a single
+  ctdb_marshall_buffer and hands it back through outdata.  The priority
+  level of the database must be frozen.
+
+  Returns 0 on success and -1 when the database is unknown, is not
+  frozen, cannot be lock-marked, or the traverse fails.  A record that
+  cannot be marshalled also fails the request: the traverse callback
+  reports that through params.failed, and stopping a traverse from the
+  callback does not by itself make tdb_traverse_read() return -1.
+
+  NOTE(review): the original comment said "filtering by lmaster", but no
+  lmaster check is visible in this function or in traverse_pulldb().
+ */
+int32_t ctdb_control_pull_db(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
+{
+       struct ctdb_control_pulldb *pull;
+       struct ctdb_db_context *ctdb_db;
+       struct pulldb_data params;
+       struct ctdb_marshall_buffer *reply;
+
+       pull = (struct ctdb_control_pulldb *)indata.dptr;
+
+       ctdb_db = find_ctdb_db(ctdb, pull->db_id);
+       if (!ctdb_db) {
+               DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", pull->db_id));
+               return -1;
+       }
+
+       if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
+               DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_pull_db when not frozen\n"));
+               return -1;
+       }
+
+       reply = talloc_zero(outdata, struct ctdb_marshall_buffer);
+       CTDB_NO_MEMORY(ctdb, reply);
+
+       reply->db_id = pull->db_id;
+
+       /* the blob starts out as just the marshall buffer header */
+       params.ctdb = ctdb;
+       params.ctdb_db = ctdb_db;
+       params.pulldata = reply;
+       params.len = offsetof(struct ctdb_marshall_buffer, data);
+       params.allocated_len = params.len;
+       params.failed = false;
+
+       if (ctdb_db->unhealthy_reason) {
+               /* this is just a warning, as the tdb should be empty anyway */
+               DEBUG(DEBUG_WARNING,("db(%s) unhealty in ctdb_control_pull_db: %s\n",
+                                    ctdb_db->db_name, ctdb_db->unhealthy_reason));
+       }
+
+       if (ctdb_lockall_mark_prio(ctdb, ctdb_db->priority) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
+               return -1;
+       }
+
+       /* a failed marshalling step must not be returned as a short but
+        * "successful" blob, so check the callback's failure flag as well */
+       if (tdb_traverse_read(ctdb_db->ltdb->tdb, traverse_pulldb, &params) == -1 ||
+           params.failed) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to get traverse db '%s'\n", ctdb_db->db_name));
+               ctdb_lockall_unmark_prio(ctdb, ctdb_db->priority);
+               talloc_free(params.pulldata);
+               return -1;
+       }
+
+       ctdb_lockall_unmark_prio(ctdb, ctdb_db->priority);
+
+       /* params.pulldata may have been reallocated by the traverse */
+       outdata->dptr = (uint8_t *)params.pulldata;
+       outdata->dsize = params.len;
+
+       if (ctdb->tunable.db_record_count_warn != 0 && params.pulldata->count > ctdb->tunable.db_record_count_warn) {
+               DEBUG(DEBUG_ERR,("Database %s is big. Contains %d records\n", ctdb_db->db_name, params.pulldata->count));
+       }
+       if (ctdb->tunable.db_size_warn != 0 && outdata->dsize > ctdb->tunable.db_size_warn) {
+               DEBUG(DEBUG_ERR,("Database %s is big. Contains %d bytes\n", ctdb_db->db_name, (int)outdata->dsize));
+       }
+
+       return 0;
+}
+
+/*
+  push a bunch of records into a ltdb, filtering by rsn
+
+  indata is a ctdb_marshall_buffer: a header naming the database and the
+  record count, followed by that many ctdb_rec_data entries.  Every
+  record is stored into the local tdb of that database while the
+  database priority is frozen and lock-marked.
+
+  Returns 0 on success and -1 when the buffer is shorter than the header,
+  the database is unknown or not frozen, the lock cannot be marked, a
+  record is too small to contain an ltdb header, or a store fails.
+
+  NOTE(review): no rsn comparison is visible in this function; every
+  record is passed to ctdb_ltdb_store() - confirm where the filtering
+  mentioned above happens.
+  NOTE(review): the record walk trusts reply->count and each rec->length;
+  it is not bounds-checked against indata.dsize.
+ */
+int32_t ctdb_control_push_db(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+       struct ctdb_marshall_buffer *reply = (struct ctdb_marshall_buffer *)indata.dptr;
+       struct ctdb_db_context *ctdb_db;
+       int i, ret;
+       struct ctdb_rec_data *rec;
+
+       if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
+               DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
+               return -1;
+       }
+
+       ctdb_db = find_ctdb_db(ctdb, reply->db_id);
+       if (!ctdb_db) {
+               DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", reply->db_id));
+               return -1;
+       }
+
+       if (ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_FROZEN) {
+               DEBUG(DEBUG_DEBUG,("rejecting ctdb_control_push_db when not frozen\n"));
+               return -1;
+       }
+
+       if (ctdb_lockall_mark_prio(ctdb, ctdb_db->priority) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to get lock on entired db - failing\n"));
+               return -1;
+       }
+
+       rec = (struct ctdb_rec_data *)&reply->data[0]; /* first record follows the header */
+
+       DEBUG(DEBUG_INFO,("starting push of %u records for dbid 0x%x\n",
+                reply->count, reply->db_id));
+
+       for (i=0;i<reply->count;i++) {
+               TDB_DATA key, data;
+               struct ctdb_ltdb_header *hdr;
+
+               key.dptr = &rec->data[0]; /* key bytes come first ... */
+               key.dsize = rec->keylen;
+               data.dptr = &rec->data[key.dsize]; /* ... directly followed by the data bytes */
+               data.dsize = rec->datalen;
+
+               if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
+                       DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
+                       goto failed;
+               }
+               hdr = (struct ctdb_ltdb_header *)data.dptr; /* the data starts with the ltdb header */
+               /* strip off any read only record flags. All readonly records
+                  are revoked implicitly by a recovery.
+                  The flags are cleared in place, inside the indata buffer.
+               */
+               hdr->flags &= ~CTDB_REC_RO_FLAGS;
+
+               data.dptr += sizeof(*hdr); /* payload is what follows the header */
+               data.dsize -= sizeof(*hdr);
+
+               ret = ctdb_ltdb_store(ctdb_db, key, hdr, data);
+               if (ret != 0) {
+                       DEBUG(DEBUG_CRIT, (__location__ " Unable to store record\n"));
+                       goto failed;
+               }
+
+               rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec); /* step to the next record */
+       }           
+
+       DEBUG(DEBUG_DEBUG,("finished push of %u records for dbid 0x%x\n",
+                reply->count, reply->db_id));
+
+       if (ctdb_db->readonly) { /* database has read-only delegation support enabled */
+               DEBUG(DEBUG_CRIT,("Clearing the tracking database for dbid 0x%x\n",
+                                 ctdb_db->db_id));
+               if (tdb_wipe_all(ctdb_db->rottdb) != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to wipe tracking database for 0x%x. Dropping read-only delegation support\n", ctdb_db->db_id));
+                       ctdb_db->readonly = false;
+                       tdb_close(ctdb_db->rottdb);
+                       ctdb_db->rottdb = NULL;
+                       ctdb_db->readonly = false; /* repeats the assignment made three lines up */
+               }
+               while (ctdb_db->revokechild_active != NULL) { /* presumably each free unlinks the entry via a destructor - confirm */
+                       talloc_free(ctdb_db->revokechild_active);
+               }
+       }
+
+       ctdb_lockall_unmark_prio(ctdb, ctdb_db->priority);
+       return 0;
+
+failed: /* common error exit: drop the lock mark taken above */
+       ctdb_lockall_unmark_prio(ctdb, ctdb_db->priority);
+       return -1;
+}
+
+struct ctdb_set_recmode_state { /* state of one set_recmode request that waits for a child process */
+       struct ctdb_context *ctdb;
+       struct ctdb_req_control *c; /* control answered by the handler or by the timeout */
+       uint32_t recmode; /* recovery mode to install in ctdb->recovery_mode */
+       int fd[2]; /* fd[0] is read by set_recmode_handler() for the child's status byte */
+       struct timed_event *te; /* timeout event; freed once the child has answered */
+       struct fd_event *fde;
+       pid_t child; /* sent SIGKILL by the destructor */
+       struct timeval start_time; /* reference point for the latency the destructor reports */
+};
+
+/*
+  timer handler: the set_recmode child did not answer in time, which
+  would happen if ctdb_recovery_lock() blocks
+
+  The requested recovery mode is installed anyway and the control is
+  answered with status 0.
+ */
+static void ctdb_set_recmode_timeout(struct event_context *ev, struct timed_event *te, 
+                                        struct timeval t, void *private_data)
+{
+       struct ctdb_set_recmode_state *rstate;
+       struct ctdb_context *ctdb;
+
+       rstate = talloc_get_type(private_data, struct ctdb_set_recmode_state);
+       ctdb = rstate->ctdb;
+
+       /* This counts as success rather than failure: the child did not
+          get the recovery lock, which is the outcome we wanted.  A very
+          slow cluster filesystem arbitrating locks right after a node
+          failure can cause it.
+        */
+       DEBUG(DEBUG_ERR,(__location__ " set_recmode child process hung/timedout CFS slow to grant locks? (allowing recmode set anyway)\n"));
+       ctdb->recovery_mode = rstate->recmode;
+       ctdb_request_control_reply(ctdb, rstate->c, NULL, 0, NULL);
+       talloc_free(rstate);
+}
+
+
+/* talloc destructor for the recmode state: records how long the reclock
+   check took and kills any child process that is still around.  The two
+   descriptor slots are only reset to -1 here; nothing is closed by this
+   function.
+*/
+static int set_recmode_destructor(struct ctdb_set_recmode_state *state)
+{
+       double l = timeval_elapsed(&state->start_time);
+
+       CTDB_UPDATE_RECLOCK_LATENCY(state->ctdb, "daemon reclock", reclock.ctdbd, l);
+
+       /* forget both descriptors */
+       state->fd[0] = -1;
+       state->fd[1] = -1;
+
+       ctdb_kill(state->ctdb, state->child, SIGKILL);
+       return 0;
+}
+
+/* fd callback: the child process has finished its ctdb_recovery_lock()
+   attempt and has written the outcome back to us through the pipe.
+*/
+static void set_recmode_handler(struct event_context *ev, struct fd_event *fde,
+                            uint16_t flags, void *private_data)
+{
+       struct ctdb_set_recmode_state *state;
+       char status = 0;
+       int nread;
+
+       state = talloc_get_type(private_data, struct ctdb_set_recmode_state);
+
+       /* the child answered, so the timeout is not needed any more */
+       talloc_free(state->te);
+       state->te = NULL;
+
+       /* The child writes 0 when everything is fine and 1 when it
+          managed to lock the reclock file.  Being able to lock it is
+          a problem: we were asked to leave recovery, and at this time
+          the file SHOULD be locked by the recovery daemon on the
+          recmaster.  A failed or short read is handled like a 1.
+       */
+       nread = read(state->fd[0], &status, 1);
+       if (nread == 1 && status == 0) {
+               state->ctdb->recovery_mode = state->recmode;
+
+               /* release any deferred attach calls from clients */
+               if (state->recmode == CTDB_RECOVERY_NORMAL) {
+                       ctdb_process_deferred_attach(state->ctdb);
+               }
+
+               ctdb_request_control_reply(state->ctdb, state->c, NULL, 0, NULL);
+       } else {
+               ctdb_request_control_reply(state->ctdb, state->c, NULL, -1, "managed to lock reclock file from inside daemon");
+       }
+
+       talloc_free(state);
+}
+
+/*
+ * timer callback armed by ctdb_deferred_drop_all_ips(): recovery has
+ * lasted too long, so release every public ip
+ */
+static void
+ctdb_drop_all_ips_event(struct event_context *ev, struct timed_event *te,
+                              struct timeval t, void *private_data)
+{
+       struct ctdb_context *ctdb;
+
+       ctdb = talloc_get_type(private_data, struct ctdb_context);
+
+       DEBUG(DEBUG_ERR,(__location__ " Been in recovery mode for too long. Dropping all IPS\n"));
+
+       /* the context the timer was created on is no longer needed */
+       talloc_free(ctdb->release_ips_ctx);
+       ctdb->release_ips_ctx = NULL;
+
+       ctdb_release_all_ips(ctdb);
+}
+
+/*
+ * Arm a timer that drops all public ips if we are still in recovery
+ * after the recovery_drop_all_ips tunable (in seconds) has passed.
+ * A timer left over from an earlier call is replaced.
+ */
+int ctdb_deferred_drop_all_ips(struct ctdb_context *ctdb)
+{
+       struct timeval deadline;
+
+       /* freeing the old context also removes the timer created on it */
+       if (ctdb->release_ips_ctx != NULL) {
+               talloc_free(ctdb->release_ips_ctx);
+       }
+
+       ctdb->release_ips_ctx = talloc_new(ctdb);
+       CTDB_NO_MEMORY(ctdb, ctdb->release_ips_ctx);
+
+       deadline = timeval_current_ofs(ctdb->tunable.recovery_drop_all_ips, 0);
+       event_add_timed(ctdb->ev, ctdb->release_ips_ctx, deadline,
+                       ctdb_drop_all_ips_event, ctdb);
+       return 0;
+}
+
+/*
+  set the recovery mode
+
+  A request that does not end an active recovery is applied at once.
+  Ending an active recovery thaws the databases and releases deferred
+  attach calls.  When the verify_recovery_lock tunable is set, a child
+  process is forked first to check that the recovery lock can NOT be
+  taken from here; the reply is then sent later from
+  set_recmode_handler() or ctdb_set_recmode_timeout() and *async_reply
+  is set to true.
+
+  returns 0 when the mode was set or the asynchronous check was
+  started, and -1 on failure.
+ */
+int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb, 
+                                struct ctdb_req_control *c,
+                                TDB_DATA indata, bool *async_reply,
+                                const char **errormsg)
+{
+       uint32_t recmode = *(uint32_t *)indata.dptr;
+       int i, ret;
+       struct ctdb_set_recmode_state *state;
+       pid_t parent = getpid();
+
+       /* if we enter recovery but stay in recovery for too long
+          we will eventually drop all our ip addresses
+       */
+       if (recmode == CTDB_RECOVERY_NORMAL) {
+               talloc_free(ctdb->release_ips_ctx);
+               ctdb->release_ips_ctx = NULL;
+       } else {
+               if (ctdb_deferred_drop_all_ips(ctdb) != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to set up deferred drop all ips\n"));
+               }
+       }
+
+       if (recmode != ctdb->recovery_mode) {
+               DEBUG(DEBUG_NOTICE,(__location__ " Recovery mode set to %s\n", 
+                        recmode==CTDB_RECOVERY_NORMAL?"NORMAL":"ACTIVE"));
+       }
+
+       if (recmode != CTDB_RECOVERY_NORMAL ||
+           ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) {
+               ctdb->recovery_mode = recmode;
+               return 0;
+       }
+
+       /* some special handling when ending recovery mode */
+
+       /* force the databases to thaw */
+       for (i=1; i<=NUM_DB_PRIORITIES; i++) {
+               if (ctdb->freeze_handles[i] != NULL) {
+                       ctdb_control_thaw(ctdb, i);
+               }
+       }
+
+       /* release any deferred attach calls from clients; recmode can
+          only be CTDB_RECOVERY_NORMAL once we get here */
+       ctdb_process_deferred_attach(ctdb);
+
+       if (ctdb->tunable.verify_recovery_lock == 0) {
+               /* dont need to verify the reclock file */
+               ctdb->recovery_mode = recmode;
+               return 0;
+       }
+
+       /* The state is only allocated once we know the asynchronous
+          check is needed, so the early return above leaves nothing
+          behind.
+       */
+       state = talloc(ctdb, struct ctdb_set_recmode_state);
+       CTDB_NO_MEMORY(ctdb, state);
+
+       /* Initialise every field before the destructor is installed:
+          it reads ctdb, child and start_time, and it also runs on the
+          error paths below.
+       */
+       state->ctdb       = ctdb;
+       state->c          = NULL;
+       state->recmode    = recmode;
+       state->te         = NULL;
+       state->fde        = NULL;
+       state->child      = (pid_t)-1;
+       state->start_time = timeval_current();
+       state->fd[0] = -1;
+       state->fd[1] = -1;
+
+       /* For the rest of what needs to be done, we need to do this in
+          a child process since 
+          1, the call to ctdb_recovery_lock() can block if the cluster
+             filesystem is in the process of recovery.
+       */
+       ret = pipe(state->fd);
+       if (ret != 0) {
+               talloc_free(state);
+               DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for set_recmode child\n"));
+               return -1;
+       }
+
+       state->child = ctdb_fork(ctdb);
+       if (state->child == (pid_t)-1) {
+               close(state->fd[0]);
+               close(state->fd[1]);
+               talloc_free(state);
+               return -1;
+       }
+
+       if (state->child == 0) {
+               char cc = 0;
+               close(state->fd[0]);
+
+               ctdb_set_process_name("ctdb_recmode");
+               debug_extra = talloc_asprintf(NULL, "set_recmode:");
+               /* we should not be able to get the lock on the reclock file, 
+                 as it should  be held by the recovery master 
+               */
+               if (ctdb_recovery_lock(ctdb, false)) {
+                       DEBUG(DEBUG_CRIT,("ERROR: recovery lock file %s not locked when recovering!\n", ctdb->recovery_lock_file));
+                       cc = 1;
+               }
+
+               write(state->fd[1], &cc, 1);
+               /* make sure we die when our parent dies */
+               while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
+                       sleep(5);
+                       write(state->fd[1], &cc, 1);
+               }
+               _exit(0);
+       }
+       close(state->fd[1]);
+       set_close_on_exec(state->fd[0]);
+
+       state->fd[1] = -1;
+
+       /* from here on freeing the state kills the child */
+       talloc_set_destructor(state, set_recmode_destructor);
+
+       DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for setrecmode\n", state->fd[0]));
+
+       state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(5, 0),
+                                   ctdb_set_recmode_timeout, state);
+       if (state->te == NULL) {
+               /* without the timeout a hung child would leave the
+                  control unanswered forever */
+               close(state->fd[0]);
+               talloc_free(state);
+               return -1;
+       }
+
+       state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
+                               EVENT_FD_READ,
+                               set_recmode_handler,
+                               (void *)state);
+
+       if (state->fde == NULL) {
+               /* no fd event exists that could auto-close the read
+                  end, so it is closed by hand */
+               close(state->fd[0]);
+               talloc_free(state);
+               return -1;
+       }
+       tevent_fd_set_auto_close(state->fde);
+
+       /* take ownership of the request only once nothing can fail any
+          more, so that a -1 return never frees it */
+       state->c       = talloc_steal(state, c);
+
+       *async_reply = true;
+
+       return 0;
+}
+
+
+/*
+  try and get the recovery lock in shared storage - should only work
+  on the recovery master recovery daemon. Anywhere else is a bug
+
+  With keep set the locked descriptor stays in ctdb->recovery_lock_fd;
+  otherwise it is closed again and the call only probes the lock.
+  returns true when the lock could be taken, false otherwise.
+ */
+bool ctdb_recovery_lock(struct ctdb_context *ctdb, bool keep)
+{
+       struct flock lock;
+       int fd;
+
+       if (keep) {
+               DEBUG(DEBUG_ERR, ("Take the recovery lock\n"));
+       }
+
+       /* a descriptor kept from an earlier call is dropped first */
+       if (ctdb->recovery_lock_fd != -1) {
+               close(ctdb->recovery_lock_fd);
+               ctdb->recovery_lock_fd = -1;
+       }
+
+       fd = open(ctdb->recovery_lock_file, O_RDWR|O_CREAT, 0600);
+       if (fd == -1) {
+               DEBUG(DEBUG_ERR,("ctdb_recovery_lock: Unable to open %s - (%s)\n", 
+                        ctdb->recovery_lock_file, strerror(errno)));
+               return false;
+       }
+
+       set_close_on_exec(fd);
+
+       /* write lock on the first byte of the file, without waiting */
+       lock.l_type = F_WRLCK;
+       lock.l_whence = SEEK_SET;
+       lock.l_start = 0;
+       lock.l_len = 1;
+       lock.l_pid = 0;
+
+       if (fcntl(fd, F_SETLK, &lock) != 0) {
+               close(fd);
+               if (keep) {
+                       DEBUG(DEBUG_CRIT,("ctdb_recovery_lock: Failed to get recovery lock on '%s'\n", ctdb->recovery_lock_file));
+               }
+               return false;
+       }
+
+       if (keep) {
+               ctdb->recovery_lock_fd = fd;
+               DEBUG(DEBUG_NOTICE, ("Recovery lock taken successfully\n"));
+       } else {
+               close(fd);
+       }
+
+       DEBUG(DEBUG_NOTICE,("ctdb_recovery_lock: Got recovery lock on '%s'\n", ctdb->recovery_lock_file));
+
+       return true;
+}
+
+/*
+  delete a record as part of the vacuum process
+
+  The record is only deleted when we are neither its lmaster nor its
+  dmaster, the stored rsn is <= the provided rsn and no readonly flags
+  are set on either copy.  Only non-blocking locks are used.
+
+  returns 0 if the record does not exist when the function returns (a
+  corrupt, too short record also yields 0, whether or not it could be
+  removed) and -1 if the record was left in the tdb.
+ */
+static int delete_tdb_record(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, struct ctdb_rec_data *rec)
+{
+       TDB_DATA key, data, data2;
+       struct ctdb_ltdb_header *hdr, *hdr2;
+       struct tdb_context *tdb;
+       int ret = -1;
+
+       /* these are really internal tdb functions - but we need them here for
+          non-blocking lock of the freelist */
+       int tdb_lock_nonblock(struct tdb_context *tdb, int list, int ltype);
+       int tdb_unlock(struct tdb_context *tdb, int list, int ltype);
+
+       key.dptr  = &rec->data[0];
+       key.dsize = rec->keylen;
+       data.dptr = &rec->data[rec->keylen];
+       data.dsize = rec->datalen;
+
+       if (ctdb_lmaster(ctdb, &key) == ctdb->pnn) {
+               DEBUG(DEBUG_INFO,(__location__ " Called delete on record where we are lmaster\n"));
+               return -1;
+       }
+
+       if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
+               DEBUG(DEBUG_ERR,(__location__ " Bad record size\n"));
+               return -1;
+       }
+
+       hdr = (struct ctdb_ltdb_header *)data.dptr;
+       tdb = ctdb_db->ltdb->tdb;
+
+       /* use a non-blocking lock */
+       if (tdb_chainlock_nonblock(tdb, key) != 0) {
+               return -1;
+       }
+
+       /* from here on every exit goes through "done", which drops the
+          chain lock and frees the fetched copy */
+       data2 = tdb_fetch(tdb, key);
+       if (data2.dptr == NULL) {
+               /* nothing stored locally */
+               ret = 0;
+               goto done;
+       }
+
+       if (data2.dsize < sizeof(struct ctdb_ltdb_header)) {
+               if (tdb_lock_nonblock(tdb, -1, F_WRLCK) == 0) {
+                       if (tdb_delete(tdb, key) != 0) {
+                               DEBUG(DEBUG_CRIT,(__location__ " Failed to delete corrupt record\n"));
+                       }
+                       tdb_unlock(tdb, -1, F_WRLCK);
+                       DEBUG(DEBUG_CRIT,(__location__ " Deleted corrupt record\n"));
+               }
+               ret = 0;
+               goto done;
+       }
+
+       hdr2 = (struct ctdb_ltdb_header *)data2.dptr;
+
+       if (hdr2->rsn > hdr->rsn) {
+               DEBUG(DEBUG_INFO,(__location__ " Skipping record with rsn=%llu - called with rsn=%llu\n",
+                        (unsigned long long)hdr2->rsn, (unsigned long long)hdr->rsn));
+               goto done;
+       }
+
+       /* do not allow deleting record that have readonly flags set,
+          neither on the copy we were sent nor on the stored one */
+       if ((hdr->flags & CTDB_REC_RO_FLAGS) || (hdr2->flags & CTDB_REC_RO_FLAGS)) {
+               DEBUG(DEBUG_INFO,(__location__ " Skipping record with readonly flags set\n"));
+               goto done;
+       }
+
+       if (hdr2->dmaster == ctdb->pnn) {
+               DEBUG(DEBUG_INFO,(__location__ " Attempted delete record where we are the dmaster\n"));
+               goto done;
+       }
+
+       if (tdb_lock_nonblock(tdb, -1, F_WRLCK) != 0) {
+               goto done;
+       }
+
+       if (tdb_delete(tdb, key) != 0) {
+               DEBUG(DEBUG_INFO,(__location__ " Failed to delete record\n"));
+       } else {
+               ret = 0;
+       }
+       tdb_unlock(tdb, -1, F_WRLCK);
+
+done:
+       tdb_chainunlock(tdb, key);
+       free(data2.dptr);
+       return ret;
+}
+
+
+
+struct recovery_callback_state {
+       struct ctdb_req_control *c;     /* control request answered when the event script has finished */
+};
+
+
+/*
+  called when the 'recovered' event script has finished: monitoring is
+  switched back on and the script status is passed on to the waiting
+  control
+ */
+static void ctdb_end_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
+{
+       struct recovery_callback_state *cb_state;
+
+       cb_state = talloc_get_type(p, struct recovery_callback_state);
+
+       ctdb_enable_monitoring(ctdb);
+       CTDB_INCREMENT_STAT(ctdb, num_recoveries);
+
+       if (status != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " recovered event script failed (status %d)\n", status));
+       }
+       if (status == -ETIME) {
+               /* the script timed out */
+               ctdb_ban_self(ctdb);
+       }
+
+       ctdb_request_control_reply(ctdb, cb_state->c, NULL, status, NULL);
+       talloc_free(cb_state);
+
+       gettimeofday(&ctdb->last_recovery_finished, NULL);
+
+       if (ctdb->runstate == CTDB_RUNSTATE_FIRST_RECOVERY) {
+               ctdb_set_runstate(ctdb, CTDB_RUNSTATE_STARTUP);
+       }
+}
+
+/*
+  recovery has finished: run the 'recovered' event script with
+  monitoring disabled and reply to the control from
+  ctdb_end_recovery_callback() once the script is done
+ */
+int32_t ctdb_control_end_recovery(struct ctdb_context *ctdb, 
+                               struct ctdb_req_control *c,
+                               bool *async_reply)
+{
+       struct recovery_callback_state *cb_state;
+       int ret;
+
+       DEBUG(DEBUG_NOTICE,("Recovery has finished\n"));
+
+       ctdb_persistent_finish_trans3_commits(ctdb);
+
+       cb_state = talloc(ctdb, struct recovery_callback_state);
+       CTDB_NO_MEMORY(ctdb, cb_state);
+
+       /* only borrowed for now, ownership is taken on success below */
+       cb_state->c = c;
+
+       ctdb_disable_monitoring(ctdb);
+
+       ret = ctdb_event_script_callback(ctdb, cb_state,
+                                        ctdb_end_recovery_callback,
+                                        cb_state,
+                                        false,
+                                        CTDB_EVENT_RECOVERED, "%s", "");
+       if (ret == 0) {
+               /* tell the control that we will reply asynchronously */
+               cb_state->c = talloc_steal(cb_state, c);
+               *async_reply = true;
+               return 0;
+       }
+
+       ctdb_enable_monitoring(ctdb);
+
+       DEBUG(DEBUG_ERR,(__location__ " Failed to end recovery\n"));
+       talloc_free(cb_state);
+       return -1;
+}
+
+/*
+  called when the 'startrecovery' event script has finished; the script
+  status is passed on to the waiting control
+ */
+static void ctdb_start_recovery_callback(struct ctdb_context *ctdb, int status, void *p)
+{
+       struct recovery_callback_state *cb_state;
+
+       cb_state = talloc_get_type(p, struct recovery_callback_state);
+
+       if (status != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " startrecovery event script failed (status %d)\n", status));
+       }
+
+       ctdb_request_control_reply(ctdb, cb_state->c, NULL, status, NULL);
+       talloc_free(cb_state);
+}
+
+/*
+  run the startrecovery eventscript
+
+  Monitoring is disabled and the 'startrecovery' event script is
+  started.  On success *async_reply is set to true and the reply is
+  sent from ctdb_start_recovery_callback() when the script has
+  finished.
+
+  returns 0 when the script was started and -1 on failure; on failure
+  the request c is left untouched.
+ */
+int32_t ctdb_control_start_recovery(struct ctdb_context *ctdb, 
+                               struct ctdb_req_control *c,
+                               bool *async_reply)
+{
+       int ret;
+       struct recovery_callback_state *state;
+
+       DEBUG(DEBUG_NOTICE,(__location__ " startrecovery eventscript has been invoked\n"));
+       gettimeofday(&ctdb->last_recovery_started, NULL);
+
+       state = talloc(ctdb, struct recovery_callback_state);
+       CTDB_NO_MEMORY(ctdb, state);
+
+       /* Only borrow the request here.  If it were stolen onto state
+          already, the talloc_free(state) on the failure path below
+          would free the request even though we return -1 without
+          having promised an asynchronous reply.  This is the same
+          order as in ctdb_control_end_recovery().
+       */
+       state->c    = c;
+
+       ctdb_disable_monitoring(ctdb);
+
+       ret = ctdb_event_script_callback(ctdb, state,
+                                        ctdb_start_recovery_callback, 
+                                        state, false,
+                                        CTDB_EVENT_START_RECOVERY,
+                                        "%s", "");
+
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to start recovery\n"));
+               talloc_free(state);
+               return -1;
+       }
+
+       /* tell the control that we will reply asynchronously */
+       state->c    = talloc_steal(state, c);
+       *async_reply = true;
+       return 0;
+}
+
+/*
+ vacuuming: try to delete every record marshalled in indata and return,
+ in outdata, a marshall buffer holding the records we failed to delete
+*/
+int32_t ctdb_control_try_delete_records(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
+{
+       struct ctdb_marshall_buffer *request = (struct ctdb_marshall_buffer *)indata.dptr;
+       struct ctdb_marshall_buffer *undeleted;
+       struct ctdb_db_context *ctdb_db;
+       struct ctdb_rec_data *rec;
+       int i;
+
+       if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
+               DEBUG(DEBUG_ERR,(__location__ " invalid data in try_delete_records\n"));
+               return -1;
+       }
+
+       ctdb_db = find_ctdb_db(ctdb, request->db_id);
+       if (ctdb_db == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Unknown db 0x%08x\n", request->db_id));
+               return -1;
+       }
+
+       DEBUG(DEBUG_DEBUG,("starting try_delete_records of %u records for dbid 0x%x\n",
+                request->count, request->db_id));
+
+       /* start with an empty buffer for the records we could not delete */
+       undeleted = (struct ctdb_marshall_buffer *)
+                       talloc_zero_size(outdata,
+                                   offsetof(struct ctdb_marshall_buffer, data));
+       if (undeleted == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+               return -1;
+       }
+       undeleted->db_id = ctdb_db->db_id;
+
+       rec = (struct ctdb_rec_data *)&request->data[0];
+       for (i=0; i<request->count; i++) {
+               if (rec->datalen < sizeof(struct ctdb_ltdb_header)) {
+                       DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record in indata\n"));
+                       return -1;
+               }
+
+               /* A record we cant delete must be added to the reply
+                  so the lmaster knows it may not purge this record
+               */
+               if (delete_tdb_record(ctdb, ctdb_db, rec) != 0) {
+                       TDB_DATA key;
+                       size_t used;
+
+                       key.dptr = &rec->data[0];
+                       key.dsize = rec->keylen;
+
+                       DEBUG(DEBUG_INFO, (__location__ " Failed to vacuum delete record with hash 0x%08x\n", ctdb_hash(&key)));
+
+                       /* append the whole marshalled record to the reply */
+                       used = talloc_get_size(undeleted);
+                       undeleted = talloc_realloc_size(outdata, undeleted, used + rec->length);
+                       if (undeleted == NULL) {
+                               DEBUG(DEBUG_ERR,(__location__ " Failed to expand\n"));
+                               return -1;
+                       }
+                       undeleted->count++;
+                       memcpy((uint8_t *)undeleted + used, rec, rec->length);
+               }
+
+               /* step to the next marshalled record */
+               rec = (struct ctdb_rec_data *)((uint8_t *)rec + rec->length);
+       }
+
+       outdata->dptr = (uint8_t *)undeleted;
+       outdata->dsize = talloc_get_size(undeleted);
+
+       return 0;
+}
+
+/**
+ * Store a record as part of the vacuum process:
+ * This is called from the RECEIVE_RECORD control which
+ * the lmaster uses to send the current empty copy
+ * to all nodes for storing, before it lets the other
+ * nodes delete the records in the second phase with
+ * the TRY_DELETE_RECORDS control.
+ *
+ * Only store if we are not lmaster or dmaster, and our
+ * rsn is <= the provided rsn. Use non-blocking locks.
+ *
+ * return 0 if the record was successfully stored.
+ * return !0 if the record was not stored.
+ */
+static int store_tdb_record(struct ctdb_context *ctdb,
+                           struct ctdb_db_context *ctdb_db,
+                           struct ctdb_rec_data *rec)
+{
+       TDB_DATA key, data, data2;
+       struct ctdb_ltdb_header *hdr, *hdr2;
+       int ret;
+
+       key.dsize = rec->keylen;
+       key.dptr = &rec->data[0];
+       data.dsize = rec->datalen;
+       data.dptr = &rec->data[rec->keylen];
+
+       if (ctdb_lmaster(ctdb, &key) == ctdb->pnn) {
+               DEBUG(DEBUG_INFO, (__location__ " Called store_tdb_record "
+                                  "where we are lmaster\n"));
+               return -1;
+       }
+
+       if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
+               DEBUG(DEBUG_ERR, (__location__ " Bad record size\n"));
+               return -1;
+       }
+
+       hdr = (struct ctdb_ltdb_header *)data.dptr;
+
+       /* use a non-blocking lock */
+       if (tdb_chainlock_nonblock(ctdb_db->ltdb->tdb, key) != 0) {
+               DEBUG(DEBUG_INFO, (__location__ " Failed to lock chain in non-blocking mode\n"));
+               return -1;
+       }
+
+       data2 = tdb_fetch(ctdb_db->ltdb->tdb, key);
+       if (data2.dptr == NULL || data2.dsize < sizeof(struct ctdb_ltdb_header)) {
+               /* No usable local copy, so there is nothing to compare
+                  against.  A failed store must still be reported, the
+                  same way as on the regular path further down;
+                  otherwise the caller would count the record as stored.
+               */
+               if (tdb_store(ctdb_db->ltdb->tdb, key, data, 0) != 0) {
+                       DEBUG(DEBUG_INFO,(__location__ " Failed to store record\n"));
+                       ret = -1;
+                       goto done;
+               }
+               DEBUG(DEBUG_INFO, (__location__ " Stored record\n"));
+               ret = 0;
+               goto done;
+       }
+
+       hdr2 = (struct ctdb_ltdb_header *)data2.dptr;
+
+       if (hdr2->rsn > hdr->rsn) {
+               DEBUG(DEBUG_INFO, (__location__ " Skipping record with "
+                                  "rsn=%llu - called with rsn=%llu\n",
+                                  (unsigned long long)hdr2->rsn,
+                                  (unsigned long long)hdr->rsn));
+               ret = -1;
+               goto done;
+       }
+
+       /* do not allow vacuuming of records that have readonly flags set. */
+       if (hdr->flags & CTDB_REC_RO_FLAGS) {
+               DEBUG(DEBUG_INFO,(__location__ " Skipping record with readonly "
+                                 "flags set\n"));
+               ret = -1;
+               goto done;
+       }
+       if (hdr2->flags & CTDB_REC_RO_FLAGS) {
+               DEBUG(DEBUG_INFO,(__location__ " Skipping record with readonly "
+                                 "flags set\n"));
+               ret = -1;
+               goto done;
+       }
+
+       if (hdr2->dmaster == ctdb->pnn) {
+               DEBUG(DEBUG_INFO, (__location__ " Attempted to store record "
+                                  "where we are the dmaster\n"));
+               ret = -1;
+               goto done;
+       }
+
+       if (tdb_store(ctdb_db->ltdb->tdb, key, data, 0) != 0) {
+               DEBUG(DEBUG_INFO,(__location__ " Failed to store record\n"));
+               ret = -1;
+               goto done;
+       }
+
+       ret = 0;
+
+done:
+       /* common exit: drop the chain lock and free the fetched copy
+          (free(NULL) is fine when nothing was fetched) */
+       tdb_chainunlock(ctdb_db->ltdb->tdb, key);
+       free(data2.dptr);
+       return  ret;
+}
+
+
+
+/**
+ * Vacuuming: try to store every record marshalled in indata and
+ * return, in outdata, a marshall buffer holding the records we
+ * failed to store.
+ */
+int32_t ctdb_control_receive_records(struct ctdb_context *ctdb,
+                                    TDB_DATA indata, TDB_DATA *outdata)
+{
+       struct ctdb_marshall_buffer *request = (struct ctdb_marshall_buffer *)indata.dptr;
+       struct ctdb_marshall_buffer *unstored;
+       struct ctdb_db_context *ctdb_db;
+       struct ctdb_rec_data *rec;
+       int i;
+
+       if (indata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
+               DEBUG(DEBUG_ERR,
+                     (__location__ " invalid data in receive_records\n"));
+               return -1;
+       }
+
+       ctdb_db = find_ctdb_db(ctdb, request->db_id);
+       if (ctdb_db == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " Unknown db 0x%08x\n",
+                                 request->db_id));
+               return -1;
+       }
+
+       DEBUG(DEBUG_DEBUG, ("starting receive_records of %u records for "
+                           "dbid 0x%x\n", request->count, request->db_id));
+
+       /* start with an empty buffer for the records we could not store */
+       unstored = (struct ctdb_marshall_buffer *)
+                       talloc_zero_size(outdata,
+                               offsetof(struct ctdb_marshall_buffer, data));
+       if (unstored == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
+               return -1;
+       }
+       unstored->db_id = ctdb_db->db_id;
+
+       rec = (struct ctdb_rec_data *)&request->data[0];
+       for (i=0; i<request->count; i++) {
+               if (rec->datalen < sizeof(struct ctdb_ltdb_header)) {
+                       DEBUG(DEBUG_CRIT, (__location__ " bad ltdb record "
+                                          "in indata\n"));
+                       return -1;
+               }
+
+               /*
+                * A record we can not store must be added to the reply
+                * so the lmaster knows it may not purge this record.
+                */
+               if (store_tdb_record(ctdb, ctdb_db, rec) != 0) {
+                       TDB_DATA key;
+                       size_t used;
+
+                       key.dptr = &rec->data[0];
+                       key.dsize = rec->keylen;
+
+                       DEBUG(DEBUG_INFO, (__location__ " Failed to store "
+                                          "record with hash 0x%08x in vacuum "
+                                          "via RECEIVE_RECORDS\n",
+                                          ctdb_hash(&key)));
+
+                       /* append the whole marshalled record to the reply */
+                       used = talloc_get_size(unstored);
+                       unstored = talloc_realloc_size(outdata, unstored,
+                                                      used + rec->length);
+                       if (unstored == NULL) {
+                               DEBUG(DEBUG_ERR, (__location__ " Failed to "
+                                                 "expand\n"));
+                               return -1;
+                       }
+                       unstored->count++;
+                       memcpy((uint8_t *)unstored + used, rec, rec->length);
+               }
+
+               /* step to the next marshalled record */
+               rec = (struct ctdb_rec_data *)((uint8_t *)rec + rec->length);
+       }
+
+       outdata->dptr = (uint8_t *)unstored;
+       outdata->dsize = talloc_get_size(unstored);
+
+       return 0;
+}
+
+
+/*
+  report capabilities: outdata receives a freshly allocated uint32_t
+  holding ctdb->capabilities
+ */
+int32_t ctdb_control_get_capabilities(struct ctdb_context *ctdb, TDB_DATA *outdata)
+{
+       uint32_t *caps;
+
+       caps = talloc(outdata, uint32_t);
+       CTDB_NO_MEMORY(ctdb, caps);
+       *caps = ctdb->capabilities;
+
+       outdata->dptr = (uint8_t *)caps;
+       outdata->dsize = sizeof(uint32_t);
+
+       return 0;
+}
+
+/* The recovery daemon pings us at regular intervals.  This timer fires
+   when no ping arrived in time; after recd_ping_failcount misses we
+   assume the recovery daemon is inoperable and restart it.
+*/
+static void ctdb_recd_ping_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
+{
+       struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
+       uint32_t *missed = talloc_get_type(ctdb->recd_ping_count, uint32_t);
+
+       DEBUG(DEBUG_ERR, ("Recovery daemon ping timeout. Count : %u\n", *missed));
+
+       if (*missed >= ctdb->tunable.recd_ping_failcount) {
+               DEBUG(DEBUG_ERR, ("Final timeout for recovery daemon ping. Restarting recovery daemon. (This can be caused if the cluster filesystem has hung)\n"));
+
+               ctdb_stop_recoverd(ctdb);
+               ctdb_start_recoverd(ctdb);
+               return;
+       }
+
+       /* not at the limit yet: count the miss and wait one more period */
+       (*missed)++;
+       event_add_timed(ctdb->ev, ctdb->recd_ping_count,
+               timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
+               ctdb_recd_ping_timeout, ctdb);
+}
+
+/* a ping from the recovery daemon: start over with a zeroed miss
+   counter and, unless recd_ping_timeout is 0, a fresh timeout
+*/
+int32_t ctdb_control_recd_ping(struct ctdb_context *ctdb)
+{
+       /* the previous counter is the talloc parent of the previous
+          timer, so this cancels the old timeout as well */
+       talloc_free(ctdb->recd_ping_count);
+
+       ctdb->recd_ping_count = talloc_zero(ctdb, uint32_t);
+       CTDB_NO_MEMORY(ctdb, ctdb->recd_ping_count);
+
+       if (ctdb->tunable.recd_ping_timeout == 0) {
+               return 0;
+       }
+
+       event_add_timed(ctdb->ev, ctdb->recd_ping_count,
+               timeval_current_ofs(ctdb->tunable.recd_ping_timeout, 0),
+               ctdb_recd_ping_timeout, ctdb);
+
+       return 0;
+}
+
+
+
+/* record which node is the recovery master; indata carries the pnn of
+   the new recmaster as one uint32_t.  A notice is logged when this
+   node gains or loses the role.
+*/
+int32_t ctdb_control_set_recmaster(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata)
+{
+       uint32_t new_recmaster;
+       bool was_recmaster, becomes_recmaster;
+
+       CHECK_CONTROL_DATA_SIZE(sizeof(uint32_t));
+       new_recmaster = *(uint32_t *)indata.dptr;
+
+       was_recmaster = (ctdb->recovery_master == ctdb->pnn);
+       becomes_recmaster = (new_recmaster == ctdb->pnn);
+
+       if (was_recmaster && !becomes_recmaster) {
+               DEBUG(DEBUG_NOTICE,
+                     ("This node (%u) is no longer the recovery master\n", ctdb->pnn));
+       }
+
+       if (!was_recmaster && becomes_recmaster) {
+               DEBUG(DEBUG_NOTICE,
+                     ("This node (%u) is now the recovery master\n", ctdb->pnn));
+       }
+
+       ctdb->recovery_master = new_recmaster;
+       return 0;
+}
+
+
+/* stop this node: disable monitoring and set NODE_FLAGS_STOPPED on our
+   own entry in the node table
+*/
+int32_t ctdb_control_stop_node(struct ctdb_context *ctdb)
+{
+       uint32_t self;
+
+       DEBUG(DEBUG_NOTICE, ("Stopping node\n"));
+       ctdb_disable_monitoring(ctdb);
+
+       self = ctdb->pnn;
+       ctdb->nodes[self]->flags |= NODE_FLAGS_STOPPED;
+
+       return 0;
+}
+
+/* continue this node: clear NODE_FLAGS_STOPPED on our own entry in the
+   node table
+*/
+int32_t ctdb_control_continue_node(struct ctdb_context *ctdb)
+{
+       uint32_t self;
+
+       DEBUG(DEBUG_NOTICE, ("Continue node\n"));
+
+       self = ctdb->pnn;
+       ctdb->nodes[self]->flags &= ~NODE_FLAGS_STOPPED;
+
+       return 0;
+}
+
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
new file mode 100644 (file)
index 0000000..5caf7c0
--- /dev/null
@@ -0,0 +1,4328 @@
+/* 
+   ctdb recovery daemon
+
+   Copyright (C) Ronnie Sahlberg  2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "system/time.h"
+#include "system/network.h"
+#include "system/wait.h"
+#include "popt.h"
+#include "cmdline.h"
+#include "../include/ctdb_client.h"
+#include "../include/ctdb_private.h"
+#include "db_wrap.h"
+#include "dlinklist.h"
+
+
+/* List of SRVID requests that need to be processed */
+struct srvid_list {
+       struct srvid_list *next, *prev; /* list links, maintained via DLIST_ADD() */
+       struct srvid_request *request;  /* queued request; made a talloc child of this node by srvid_request_add() */
+};
+
+struct srvid_requests {
+       struct srvid_list *requests;    /* head of the list; NULL while empty (the container is talloc_zero'd) */
+};
+/* Send 'result' to the requester at request->pnn / request->srvid, then talloc_free() the request. */
+static void srvid_request_reply(struct ctdb_context *ctdb,
+                               struct srvid_request *request,
+                               TDB_DATA result)
+{
+       /* Someone that sent srvid==0 does not want a reply */
+       if (request->srvid == 0) {
+               talloc_free(request);
+               return;
+       }
+
+       if (ctdb_client_send_message(ctdb, request->pnn, request->srvid,
+                                    result) == 0) {
+               DEBUG(DEBUG_INFO,("Sent SRVID reply to %u:%llu\n",
+                                 (unsigned)request->pnn,
+                                 (unsigned long long)request->srvid));
+       } else {
+               DEBUG(DEBUG_ERR,("Failed to send SRVID reply to %u:%llu\n",
+                                (unsigned)request->pnn,
+                                (unsigned long long)request->srvid));
+       }
+       /* a send failure is only logged; the request is consumed either way, so callers must not reuse it */
+       talloc_free(request);
+}
+/* Reply with 'result' to every queued request, then free the list container. *requests is dereferenced unconditionally, so it must not be NULL. */
+static void srvid_requests_reply(struct ctdb_context *ctdb,
+                                struct srvid_requests **requests,
+                                TDB_DATA result)
+{
+       struct srvid_list *r;
+       /* each reply frees r->request but not the list node r itself, so reading r->next afterwards is still valid */
+       for (r = (*requests)->requests; r != NULL; r = r->next) {
+               srvid_request_reply(ctdb, r->request, result);
+       }
+
+       /* Free the list structure (the nodes are talloc children of it); TALLOC_FREE presumably also resets *requests to NULL - confirm */
+       TALLOC_FREE(*requests);
+}
+/* Queue 'request' on *requests, creating the container on first use. On allocation failure the requester is sent -ENOMEM instead. */
+static void srvid_request_add(struct ctdb_context *ctdb,
+                             struct srvid_requests **requests,
+                             struct srvid_request *request)
+{
+       struct srvid_list *t;
+       int32_t ret;
+       TDB_DATA result;
+
+       if (*requests == NULL) {
+               *requests = talloc_zero(ctdb, struct srvid_requests);
+               if (*requests == NULL) {
+                       goto nomem;
+               }
+       }
+
+       t = talloc_zero(*requests, struct srvid_list);
+       if (t == NULL) {
+               /* If *requests was just allocated above then free it */
+               if ((*requests)->requests == NULL) {
+                       TALLOC_FREE(*requests);
+               }
+               goto nomem;
+       }
+       /* reparent the request under its list node, so it is released together with the node */
+       t->request = (struct srvid_request *)talloc_steal(t, request);
+       DLIST_ADD((*requests)->requests, t);
+
+       return;
+
+nomem:
+       /* Failed to add the request to the list.  Send a fail: the payload is the int32_t -ENOMEM, and srvid_request_reply() also frees the request. */
+       DEBUG(DEBUG_ERR, (__location__
+                         " Out of memory, failed to queue SRVID request\n"));
+       ret = -ENOMEM;
+       result.dsize = sizeof(ret);
+       result.dptr = (uint8_t *)&ret;
+       srvid_request_reply(ctdb, request, result);
+}
+/* Per-node misbehaviour record, stored in ctdb->nodes[n]->ban_state by ctdb_set_culprit_count() */
+struct ctdb_banning_state {
+       uint32_t count;                     /* accumulated culprit credits */
+       struct timeval last_reported_time;  /* when credits were last added */
+};
+
+/* private state of recovery daemon; it is handed to the async-control fail
+   callbacks as callback_data and recovered there with talloc_get_type()
+ */
+struct ctdb_recoverd {
+       struct ctdb_context *ctdb;
+       uint32_t recmaster;
+       uint32_t num_active;
+       uint32_t num_lmasters;
+       uint32_t num_connected;
+       uint32_t last_culprit_node; /* most recent node blamed via ctdb_set_culprit_count() */
+       struct ctdb_node_map *nodemap; /* nodemap->num is used as the credit count by the freeze/transaction fail callbacks */
+       struct timeval priority_time;
+       bool need_takeover_run;
+       bool need_recovery;
+       uint32_t node_flags; /* tested against NODE_FLAGS_INACTIVE before other nodes are blamed */
+       struct timed_event *send_election_te;
+       struct timed_event *election_timeout;
+       struct vacuum_info *vacuum_info;
+       struct srvid_requests *reallocate_requests; /* NOTE(review): presumably filled via srvid_request_add(); its users are outside this excerpt */
+       bool takeover_run_in_progress;
+       TALLOC_CTX *takeover_runs_disable_ctx;
+       struct ctdb_control_get_ifaces *ifaces;
+       uint32_t *force_rebalance_nodes;
+};
+/* Both macros expand to an expression that reads a variable named 'ctdb', so one must be in scope wherever they are used. */
+#define CONTROL_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_timeout, 0)
+#define MONITOR_TIMEOUT() timeval_current_ofs(ctdb->tunable.recover_interval, 0)
+
+static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, struct timeval t, void *private_data);
+
+/* ban a node for a period of time (ban_time, in seconds per the log message).
+   The set-ban control is addressed to node pnn itself; an invalid pnn or a
+   failed control is logged and the function simply returns. */
+static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t pnn, uint32_t ban_time)
+{
+       int ret;
+       struct ctdb_context *ctdb = rec->ctdb;
+       struct ctdb_ban_time bantime;
+       
+       if (!ctdb_validate_pnn(ctdb, pnn)) {
+               DEBUG(DEBUG_ERR,("Bad pnn %u in ctdb_ban_node\n", pnn));
+               return;
+       }
+
+       DEBUG(DEBUG_NOTICE,("Banning node %u for %u seconds\n", pnn, ban_time));
+
+       bantime.pnn  = pnn;
+       bantime.time = ban_time;
+
+       ret = ctdb_ctrl_set_ban(ctdb, CONTROL_TIMEOUT(), pnn, &bantime);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to ban node %d\n", pnn));
+               return;
+       }
+
+}
+/* NOTE(review): presumably the outcome codes of the recovery daemon's monitoring checks; the users are outside this excerpt - confirm */
+enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
+
+
+/* remember the trouble maker: add 'count' ban credits to node 'culprit' and
+   record it as rec->last_culprit_node. Credits older than the
+   recovery_grace_period tunable are forgiven before the new ones are added. */
+static void ctdb_set_culprit_count(struct ctdb_recoverd *rec, uint32_t culprit, uint32_t count)
+{
+       struct ctdb_context *ctdb = talloc_get_type(rec->ctdb, struct ctdb_context);
+       struct ctdb_banning_state *ban_state;
+       /* ctdb->nodes[] is indexed 0..num_nodes-1, so culprit == num_nodes is out of range as well */
+       if (culprit >= ctdb->num_nodes) {
+               DEBUG(DEBUG_ERR,("Trying to set culprit %d but num_nodes is %d\n", culprit, ctdb->num_nodes));
+               return;
+       }
+
+       /* If we are banned or stopped, do not set other nodes as culprits */
+       if (rec->node_flags & NODE_FLAGS_INACTIVE) {
+               DEBUG(DEBUG_NOTICE, ("This node is INACTIVE, cannot set culprit node %d\n", culprit));
+               return;
+       }
+
+       if (ctdb->nodes[culprit]->ban_state == NULL) {
+               ctdb->nodes[culprit]->ban_state = talloc_zero(ctdb->nodes[culprit], struct ctdb_banning_state);
+               CTDB_NO_MEMORY_VOID(ctdb, ctdb->nodes[culprit]->ban_state);
+
+               
+       }
+       ban_state = ctdb->nodes[culprit]->ban_state;
+       if (timeval_elapsed(&ban_state->last_reported_time) > ctdb->tunable.recovery_grace_period) {
+               /* this was the first time in a long while this node
+                  misbehaved so we will forgive any old transgressions.
+               */
+               ban_state->count = 0;
+       }
+
+       ban_state->count += count;
+       ban_state->last_reported_time = timeval_current();
+       rec->last_culprit_node = culprit;
+}
+
+/*
+  remember the trouble maker: charge a single ban credit to node 'culprit'
+ */
+static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
+{
+       ctdb_set_culprit_count(rec, culprit, 1);
+}
+
+
+/* this callback is called for every node that failed to execute the
+   recovered event; callback_data must be the struct ctdb_recoverd, and the
+   failing node is charged one culprit credit (res and outdata are unused) */
+static void recovered_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
+{
+       struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
+
+       DEBUG(DEBUG_ERR, (__location__ " Node %u failed the recovered event. Setting it as recovery fail culprit\n", node_pnn));
+
+       ctdb_set_culprit(rec, node_pnn);
+}
+
+/* run the "recovered" eventscript on all active nodes by sending them
+   CTDB_CONTROL_END_RECOVERY; 'caller' only labels the error message.
+   Returns 0 on success, -1 if the async control reports failure. */
+static int run_recovered_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, const char *caller)
+{
+       TALLOC_CTX *tmp_ctx;
+       uint32_t *nodes;
+       struct ctdb_context *ctdb = rec->ctdb;
+
+       tmp_ctx = talloc_new(ctdb);
+       CTDB_NO_MEMORY(ctdb, tmp_ctx);
+       /* the node list is allocated with tmp_ctx as its context; nodes that fail the control are blamed by recovered_fail_callback */
+       nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_END_RECOVERY,
+                                       nodes, 0,
+                                       CONTROL_TIMEOUT(), false, tdb_null,
+                                       NULL, recovered_fail_callback,
+                                       rec) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event when called from %s\n", caller));
+
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+/* this callback is called for every node that failed to execute the
+   start recovery event; callback_data must be the struct ctdb_recoverd, and
+   the failing node is charged one culprit credit (res and outdata are unused) */
+static void startrecovery_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
+{
+       struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
+
+       DEBUG(DEBUG_ERR, (__location__ " Node %u failed the startrecovery event. Setting it as recovery fail culprit\n", node_pnn));
+
+       ctdb_set_culprit(rec, node_pnn);
+}
+
+/* run the "startrecovery" eventscript on all active nodes by sending them
+   CTDB_CONTROL_START_RECOVERY.
+   Returns 0 on success, -1 if the async control reports failure. */
+static int run_startrecovery_eventscript(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
+{
+       TALLOC_CTX *tmp_ctx;
+       uint32_t *nodes;
+       struct ctdb_context *ctdb = rec->ctdb;
+
+       tmp_ctx = talloc_new(ctdb);
+       CTDB_NO_MEMORY(ctdb, tmp_ctx);
+       /* the node list is allocated with tmp_ctx as its context; nodes that fail the control are blamed by startrecovery_fail_callback */
+       nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_START_RECOVERY,
+                                       nodes, 0,
+                                       CONTROL_TIMEOUT(), false, tdb_null,
+                                       NULL,
+                                       startrecovery_fail_callback,
+                                       rec) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event. Recovery failed.\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+/* Per-node reply callback for GET_CAPABILITIES: stores the uint32_t reply in ctdb->nodes[node_pnn]->capabilities and mirrors our own entry into ctdb->capabilities. */
+static void async_getcap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
+{
+       if ( (outdata.dsize != sizeof(uint32_t)) || (outdata.dptr == NULL) ) {
+               DEBUG(DEBUG_ERR, (__location__ " Invalid length/pointer for getcap callback : %u %p\n",  (unsigned)outdata.dsize, outdata.dptr));
+               return;
+       }
+       if (node_pnn < ctdb->num_nodes) {
+               ctdb->nodes[node_pnn]->capabilities = *((uint32_t *)outdata.dptr);
+       }
+       /* NOTE(review): this test is not covered by the bounds check above; it relies on ctdb->pnn being a valid index into ctdb->nodes[] */
+       if (node_pnn == ctdb->pnn) {
+               ctdb->capabilities = ctdb->nodes[node_pnn]->capabilities;
+       }
+}
+
+/* update the node capabilities for all connected nodes: sends
+   CTDB_CONTROL_GET_CAPABILITIES and lets async_getcap_callback store each reply.
+   Returns 0 on success, -1 if the async control reports failure. */
+static int update_capabilities(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+{
+       uint32_t *nodes;
+       TALLOC_CTX *tmp_ctx;
+
+       tmp_ctx = talloc_new(ctdb);
+       CTDB_NO_MEMORY(ctdb, tmp_ctx);
+       /* unlike most controls in this file, this one targets connected nodes rather than active ones */
+       nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_CAPABILITIES,
+                                       nodes, 0,
+                                       CONTROL_TIMEOUT(),
+                                       false, tdb_null,
+                                       async_getcap_callback, NULL,
+                                       NULL) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Failed to read node capabilities.\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+/* Fail callback for the freeze controls sent by set_recovery_mode(): charges the node rec->nodemap->num culprit credits. callback_data must be the struct ctdb_recoverd. */
+static void set_recmode_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
+{
+       struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
+
+       DEBUG(DEBUG_ERR,("Failed to freeze node %u during recovery. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
+       ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
+}
+/* Fail callback: logs that node_pnn could not start the recovery transaction and charges it rec->nodemap->num culprit credits. callback_data must be the struct ctdb_recoverd. */
+static void transaction_start_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
+{
+       struct ctdb_recoverd *rec = talloc_get_type(callback_data, struct ctdb_recoverd);
+
+       DEBUG(DEBUG_ERR,("Failed to start recovery transaction on node %u. Set it as ban culprit for %d credits\n", node_pnn, rec->nodemap->num));
+       ctdb_set_culprit_count(rec, node_pnn, rec->nodemap->num);
+}
+
+/* change recovery mode on all active nodes. When rec_mode is
+   CTDB_RECOVERY_ACTIVE the nodes are frozen first, one db priority at a time.
+   Returns 0 on success, -1 if any of the async controls reports failure. */
+static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t rec_mode)
+{
+       TDB_DATA data;
+       uint32_t *nodes;
+       TALLOC_CTX *tmp_ctx;
+
+       tmp_ctx = talloc_new(ctdb);
+       CTDB_NO_MEMORY(ctdb, tmp_ctx);
+
+       /* target list for both steps: all active nodes; they are frozen first when entering recovery */
+       nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+       if (rec_mode == CTDB_RECOVERY_ACTIVE) {
+               int i;
+               /* priorities run from 1 to NUM_DB_PRIORITIES; i goes in the argument slot that is 0 for the other controls in this file */
+               for (i=1; i<=NUM_DB_PRIORITIES; i++) {
+                       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
+                                               nodes, i,
+                                               CONTROL_TIMEOUT(),
+                                               false, tdb_null,
+                                               NULL,
+                                               set_recmode_fail_callback,
+                                               rec) != 0) {
+                               DEBUG(DEBUG_ERR, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
+                               talloc_free(tmp_ctx);
+                               return -1;
+                       }
+               }
+       }
+
+       /* then set the requested mode on the same nodes; no per-node fail callback is registered for this step */
+       data.dsize = sizeof(uint32_t);
+       data.dptr = (unsigned char *)&rec_mode;
+
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMODE,
+                                       nodes, 0,
+                                       CONTROL_TIMEOUT(),
+                                       false, data,
+                                       NULL, NULL,
+                                       NULL) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode. Recovery failed.\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+/* change recovery master on all active nodes: sends CTDB_CONTROL_SET_RECMASTER
+   with 'pnn' (the new recovery master) as a uint32_t payload.
+   Returns 0 on success, -1 if the async control reports failure. */
+static int set_recovery_master(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn)
+{
+       TDB_DATA data;
+       TALLOC_CTX *tmp_ctx;
+       uint32_t *nodes;
+
+       tmp_ctx = talloc_new(ctdb);
+       CTDB_NO_MEMORY(ctdb, tmp_ctx);
+
+       data.dsize = sizeof(uint32_t);
+       data.dptr = (unsigned char *)&pnn;
+
+       nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECMASTER,
+                                       nodes, 0,
+                                       CONTROL_TIMEOUT(), false, data,
+                                       NULL, NULL,
+                                       NULL) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to set recmaster. Recovery failed.\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+/* update all remote nodes to use the same db priority that we have.
+   this can fail if the remote node has not yet been upgraded to
+   support this function, so we always return success (0) and never fail
+   a recovery if this call fails; failures are only logged.
+*/
+static int update_db_priority_on_remote_nodes(struct ctdb_context *ctdb,
+       struct ctdb_node_map *nodemap, 
+       uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
+{
+       int db;
+       uint32_t *nodes;
+
+       nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
+
+       /* step through all local databases */
+       for (db=0; db<dbmap->num;db++) {
+               TDB_DATA data;
+               struct ctdb_db_priority db_prio;
+               int ret;
+
+               db_prio.db_id     = dbmap->dbs[db].dbid;
+               ret = ctdb_ctrl_get_db_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, dbmap->dbs[db].dbid, &db_prio.priority);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to read database priority from local node for db 0x%08x\n", dbmap->dbs[db].dbid));
+                       continue;
+               }
+
+               DEBUG(DEBUG_INFO,("Update DB priority for db 0x%08x to %u\n", dbmap->dbs[db].dbid, db_prio.priority)); 
+
+               data.dptr  = (uint8_t *)&db_prio;
+               data.dsize = sizeof(db_prio);
+
+               if (ctdb_client_async_control(ctdb,
+                                       CTDB_CONTROL_SET_DB_PRIORITY,
+                                       nodes, 0,
+                                       CONTROL_TIMEOUT(), false, data,
+                                       NULL, NULL,
+                                       NULL) != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to set DB priority for 0x%08x\n", db_prio.db_id));
+               }
+       }
+
+       return 0;
+}                      
+
+/* ensure all other nodes have attached to any databases that we have: each db in dbmap
+   (ours, node pnn) that an active remote node lacks is created there with the same persistence.
+   Returns 0 on success, -1 as soon as any control fails. */
+static int create_missing_remote_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, 
+                                          uint32_t pnn, struct ctdb_dbid_map *dbmap, TALLOC_CTX *mem_ctx)
+{
+       int i, j, db, ret;
+       struct ctdb_dbid_map *remote_dbmap;
+
+       /* verify that all other nodes have all our databases */
+       for (j=0; j<nodemap->num; j++) {
+               /* we don't need to check ourselves */
+               if (nodemap->nodes[j].pnn == pnn) {
+                       continue;
+               }
+               /* dont check nodes that are unavailable */
+               if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+                       continue;
+               }
+
+               ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, 
+                                        mem_ctx, &remote_dbmap);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", nodemap->nodes[j].pnn));
+                       return -1;
+               }
+
+               /* step through all local databases */
+               for (db=0; db<dbmap->num;db++) {
+                       const char *name;
+
+                       /* look for this local dbid in the remote node's map */
+                       for (i=0;i<remote_dbmap->num;i++) {
+                               if (dbmap->dbs[db].dbid == remote_dbmap->dbs[i].dbid) {
+                                       break;
+                               }
+                       }
+                       /* the remote node already have this database */
+                       if (i!=remote_dbmap->num) {
+                               continue;
+                       }
+                       /* ok so we need to create this database; store each result in ret so the checks test the call just made */
+                       ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), pnn, dbmap->dbs[db].dbid, 
+                                                 mem_ctx, &name);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", pnn));
+                               return -1;
+                       }
+                       ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, 
+                                                mem_ctx, name,
+                                                dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_ERR, (__location__ " Unable to create remote db:%s\n", name));
+                               return -1;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+
+/* ensure we are attached to any databases that anyone else is attached to: each db found on an
+   active remote node but missing from *dbmap is created locally (node pnn) and *dbmap is re-read.
+   Returns 0 on success, -1 as soon as any control fails. */
+static int create_missing_local_databases(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, 
+                                         uint32_t pnn, struct ctdb_dbid_map **dbmap, TALLOC_CTX *mem_ctx)
+{
+       int i, j, db, ret;
+       struct ctdb_dbid_map *remote_dbmap;
+
+       /* verify that we have all database any other node has */
+       for (j=0; j<nodemap->num; j++) {
+               /* we don't need to check ourselves */
+               if (nodemap->nodes[j].pnn == pnn) {
+                       continue;
+               }
+               /* dont check nodes that are unavailable */
+               if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+                       continue;
+               }
+
+               ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, 
+                                        mem_ctx, &remote_dbmap);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node %u\n", nodemap->nodes[j].pnn));
+                       return -1;
+               }
+
+               /* step through all databases on the remote node */
+               for (db=0; db<remote_dbmap->num;db++) {
+                       const char *name;
+
+                       for (i=0;i<(*dbmap)->num;i++) {
+                               if (remote_dbmap->dbs[db].dbid == (*dbmap)->dbs[i].dbid) {
+                                       break;
+                               }
+                       }
+                       /* we already have this db locally */
+                       if (i!=(*dbmap)->num) {
+                               continue;
+                       }
+                       /* ok so we need to create this database and
+                          rebuild dbmap; each control's result is stored in ret
+                          so that the checks below test the call just made */
+                       ret = ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, 
+                                                 remote_dbmap->dbs[db].dbid, mem_ctx, &name);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_ERR, (__location__ " Unable to get dbname from node %u\n", 
+                                         nodemap->nodes[j].pnn));
+                               return -1;
+                       }
+                       ret = ctdb_ctrl_createdb(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, name, 
+                                                remote_dbmap->dbs[db].flags & CTDB_DB_FLAGS_PERSISTENT);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_ERR, (__location__ " Unable to create local db:%s\n", name));
+                               return -1;
+                       }
+                       ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, dbmap);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_ERR, (__location__ " Unable to reread dbmap on node %u\n", pnn));
+                               return -1;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+
+/* pull the remote database contents from one node (srcnode) into the recdb.
+   A record already present in recdb is only overwritten when the pulled copy wins the rsn test below.
+   Returns 0 on success, -1 on a failed pull, a malformed reply or a failed store. */
+static int pull_one_remote_database(struct ctdb_context *ctdb, uint32_t srcnode, 
+                                   struct tdb_wrap *recdb, uint32_t dbid)
+{
+       int ret;
+       TDB_DATA outdata;
+       struct ctdb_marshall_buffer *reply;
+       struct ctdb_rec_data *rec;
+       int i;
+       TALLOC_CTX *tmp_ctx = talloc_new(recdb);
+       /* tmp_ctx is passed as the memory context for the pull and is freed on every exit path below */
+       ret = ctdb_ctrl_pulldb(ctdb, srcnode, dbid, CTDB_LMASTER_ANY, tmp_ctx,
+                              CONTROL_TIMEOUT(), &outdata);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Unable to copy db from node %u\n", srcnode));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       reply = (struct ctdb_marshall_buffer *)outdata.dptr;
+
+       if (outdata.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
+               DEBUG(DEBUG_ERR,(__location__ " invalid data in pulldb reply\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+       /* records are packed back to back after the header; each one carries its own total length */
+       rec = (struct ctdb_rec_data *)&reply->data[0];
+       /* NOTE(review): count, length, keylen and datalen come from the remote node and are not checked against outdata.dsize */
+       for (i=0;
+            i<reply->count;
+            rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec), i++) {
+               TDB_DATA key, data;
+               struct ctdb_ltdb_header *hdr;
+               TDB_DATA existing;
+               /* within a record the key bytes come first, immediately followed by the data bytes */
+               key.dptr = &rec->data[0];
+               key.dsize = rec->keylen;
+               data.dptr = &rec->data[key.dsize];
+               data.dsize = rec->datalen;
+               
+               hdr = (struct ctdb_ltdb_header *)data.dptr;
+
+               if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
+                       DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
+                       talloc_free(tmp_ctx);
+                       return -1;
+               }
+
+               /* fetch the existing record, if any */
+               existing = tdb_fetch(recdb->tdb, key);
+               /* keep what recdb already has unless the pulled copy has a higher rsn, or an equal rsn while the stored copy's dmaster is not the recovery master */
+               if (existing.dptr != NULL) {
+                       struct ctdb_ltdb_header header;
+                       if (existing.dsize < sizeof(struct ctdb_ltdb_header)) {
+                               DEBUG(DEBUG_CRIT,(__location__ " Bad record size %u from node %u\n", 
+                                        (unsigned)existing.dsize, srcnode));
+                               free(existing.dptr);
+                               talloc_free(tmp_ctx);
+                               return -1;
+                       }
+                       header = *(struct ctdb_ltdb_header *)existing.dptr;
+                       free(existing.dptr); /* the fetched buffer is released with free(), not talloc */
+                       if (!(header.rsn < hdr->rsn ||
+                             (header.dmaster != ctdb->recovery_master && header.rsn == hdr->rsn))) {
+                               continue;
+                       }
+               }
+               
+               if (tdb_store(recdb->tdb, key, data, TDB_REPLACE) != 0) {
+                       DEBUG(DEBUG_CRIT,(__location__ " Failed to store record\n"));
+                       talloc_free(tmp_ctx);
+                       return -1;                              
+               }
+       }
+
+       talloc_free(tmp_ctx);
+
+       return 0;
+}
+
+/* Accumulator shared by the GET_DB_SEQNUM callbacks: tracks the highest sequence number seen and which node reported it */
+struct pull_seqnum_cbdata {
+       int failed;      /* non-zero once any node failed or sent a malformed reply */
+       uint32_t pnn;    /* node that reported 'seqnum' */
+       uint64_t seqnum; /* highest sequence number seen so far */
+};
+/* Per-node reply callback for GET_DB_SEQNUM: remembers the highest uint64_t seqnum and its node in the pull_seqnum_cbdata passed as callback_data. */
+static void pull_seqnum_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
+{
+       struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
+       uint64_t seqnum;
+
+       if (cb_data->failed != 0) {
+               DEBUG(DEBUG_ERR, ("Got seqnum from node %d but we have already failed the entire operation\n", node_pnn));
+               return;
+       }
+
+       if (res != 0) {
+               DEBUG(DEBUG_ERR, ("Error when pulling seqnum from node %d\n", node_pnn));
+               cb_data->failed = 1;
+               return;
+       }
+
+       if (outdata.dsize != sizeof(uint64_t)) {
+               DEBUG(DEBUG_ERR, ("Error when reading pull seqnum from node %d, got %d bytes but expected %d\n", node_pnn, (int)outdata.dsize, (int)sizeof(uint64_t)));
+               cb_data->failed = -1; /* any non-zero value marks failure; the flag is only ever tested with != 0 */
+               return;
+       }
+
+       seqnum = *((uint64_t *)outdata.dptr);
+       /* strictly greater: on a tie the node that reported the value first is kept */
+       if (seqnum > cb_data->seqnum) {
+               cb_data->seqnum = seqnum;
+               cb_data->pnn = node_pnn;
+       }
+}
+/* Per-node fail callback for GET_DB_SEQNUM: logs the node and marks the whole operation as failed in the pull_seqnum_cbdata. */
+static void pull_seqnum_fail_cb(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
+{
+       struct pull_seqnum_cbdata *cb_data = talloc_get_type(callback_data, struct pull_seqnum_cbdata);
+
+       DEBUG(DEBUG_ERR, ("Failed to pull db seqnum from node %d\n", node_pnn));
+       cb_data->failed = 1;
+}
+/* Ask all active nodes for the db's sequence number and pull the whole db into recdb from the node reporting the highest one. Returns 0 on success, -1 otherwise. */
+static int pull_highest_seqnum_pdb(struct ctdb_context *ctdb,
+                               struct ctdb_recoverd *rec, 
+                               struct ctdb_node_map *nodemap, 
+                               struct tdb_wrap *recdb, uint32_t dbid)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       uint32_t *nodes;
+       TDB_DATA data;
+       uint32_t outdata[2];
+       struct pull_seqnum_cbdata *cb_data;
+
+       DEBUG(DEBUG_NOTICE, ("Scan for highest seqnum pdb for db:0x%08x\n", dbid));
+       /* control payload: the dbid followed by a zero uint32_t (despite its name, outdata is what we send) */
+       outdata[0] = dbid;
+       outdata[1] = 0;
+
+       data.dsize = sizeof(outdata);
+       data.dptr  = (uint8_t *)&outdata[0];
+
+       cb_data = talloc(tmp_ctx, struct pull_seqnum_cbdata);
+       if (cb_data == NULL) {
+               DEBUG(DEBUG_ERR, ("Failed to allocate pull highest seqnum cb_data structure\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+       /* pnn stays at (uint32_t)-1 and seqnum at 0 until some node reports a larger seqnum */
+       cb_data->failed = 0;
+       cb_data->pnn    = -1;
+       cb_data->seqnum = 0;
+       
+       nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_DB_SEQNUM,
+                                       nodes, 0,
+                                       CONTROL_TIMEOUT(), false, data,
+                                       pull_seqnum_cb,
+                                       pull_seqnum_fail_cb,
+                                       cb_data) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Failed to run async GET_DB_SEQNUM\n"));
+
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       if (cb_data->failed != 0) {
+               DEBUG(DEBUG_NOTICE, ("Failed to pull sequence numbers for DB 0x%08x\n", dbid));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+       /* a highest seqnum of 0 (or no node recorded) is treated as "no usable answer" */
+       if (cb_data->seqnum == 0 || cb_data->pnn == -1) {
+               DEBUG(DEBUG_NOTICE, ("Failed to find a node with highest sequence numbers for DB 0x%08x\n", dbid));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       DEBUG(DEBUG_NOTICE, ("Pull persistent db:0x%08x from node %d with highest seqnum:%lld\n", dbid, cb_data->pnn, (long long)cb_data->seqnum)); 
+
+       if (pull_one_remote_database(ctdb, cb_data->pnn, recdb, dbid) != 0) {
+               DEBUG(DEBUG_ERR, ("Failed to pull higest seqnum database 0x%08x from node %d\n", dbid, cb_data->pnn));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+
+/* pull all the remote database contents into the recdb. For a persistent db with the
+   recover_pdb_by_seqnum tunable set, the highest-seqnum pull is tried first; if that is skipped or
+   fails, every active node is pulled and merged. Returns 0 on success, -1 when a node's pull fails. */
+static int pull_remote_database(struct ctdb_context *ctdb,
+                               struct ctdb_recoverd *rec, 
+                               struct ctdb_node_map *nodemap, 
+                               struct tdb_wrap *recdb, uint32_t dbid,
+                               bool persistent)
+{
+       int j;
+
+       if (persistent && ctdb->tunable.recover_pdb_by_seqnum != 0) {
+               int ret;
+               ret = pull_highest_seqnum_pdb(ctdb, rec, nodemap, recdb, dbid);
+               if (ret == 0) {
+                       return 0;
+               }
+       }
+
+       /* pull all records from all other nodes across onto this node
+          (this merges based on rsn)
+       */
+       for (j=0; j<nodemap->num; j++) {
+               /* dont merge from nodes that are unavailable */
+               if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+                       continue;
+               }
+               if (pull_one_remote_database(ctdb, nodemap->nodes[j].pnn, recdb, dbid) != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to pull remote database from node %u\n", 
+                                nodemap->nodes[j].pnn));
+                       ctdb_set_culprit_count(rec, nodemap->nodes[j].pnn, nodemap->num);
+                       return -1;
+               }
+       }
+       
+       return 0;
+}
+
+
+/*
+  update flags on all active nodes
+
+  Issues a single ctdb_ctrl_modflags() control for node pnn, passing flags and
+  its complement ~flags as the last two arguments.  nodemap is not consulted.
+
+  Returns 0 on success, -1 if the control failed.
+ */
+static int update_flags_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, uint32_t pnn, uint32_t flags)
+{
+       if (ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), pnn, flags, ~flags) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  ensure all nodes have the same vnnmap we do
+
+  Sends vnnmap with ctdb_ctrl_setvnnmap() to every node in nodemap that does
+  not have NODE_FLAGS_INACTIVE set, using mem_ctx for the control.
+
+  The pnn argument is not used to select targets; it is kept so the
+  signature stays unchanged for existing callers.
+
+  Returns 0 when every node accepted the map, -1 at the first failure.
+ */
+static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap, 
+                                     uint32_t pnn, struct ctdb_vnn_map *vnnmap, TALLOC_CTX *mem_ctx)
+{
+       int j, ret;
+
+       /* push the new vnn map out to all the nodes */
+       for (j=0; j<nodemap->num; j++) {
+               /* dont push to nodes that are unavailable */
+               if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+                       continue;
+               }
+
+               ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, mem_ctx, vnnmap);
+               if (ret != 0) {
+                       /* name the node the control was sent to, not the pnn argument */
+                       DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n",
+                                         nodemap->nodes[j].pnn));
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+
+struct vacuum_info { /* state of one in-progress vacuum fetch run */
+       struct vacuum_info *next, *prev; /* links in the rec->vacuum_info list */
+       struct ctdb_recoverd *rec; /* recovery daemon state whose list holds this entry */
+       uint32_t srcnode; /* copied from the reqid field of the first record in the message */
+       struct ctdb_db_context *ctdb_db; /* attached database the records belong to */
+       struct ctdb_marshall_buffer *recs; /* private copy of the received buffer; count is decremented as records are consumed */
+       struct ctdb_rec_data *r; /* next record to process inside recs */
+};
+
+static void vacuum_fetch_next(struct vacuum_info *v); /* forward declaration: called from vacuum_fetch_callback */
+
+/*
+  Completion callback for one vacuum fetch call: release the finished call
+  state and move on to the next record of the same vacuum_info.
+ */
+static void vacuum_fetch_callback(struct ctdb_client_call_state *state)
+{
+       struct vacuum_info *info;
+
+       /* pick up our context before the call state goes away */
+       info = talloc_get_type(state->async.private_data, struct vacuum_info);
+
+       talloc_free(state);
+       vacuum_fetch_next(info);
+}
+
+
+/*
+  process the next element from the vacuum list
+
+  Walks the remaining records in v->recs.  For each record the key is looked
+  up in the local tdb under a non-blocking chainlock; records that cannot be
+  locked, are missing, are too short to hold a ctdb_ltdb_header, or already
+  have this node as dmaster are skipped.  For the first record that passes
+  these checks a ctdb_call_send() is issued and the function returns; the
+  walk resumes from vacuum_fetch_callback() when that call completes.
+
+  v is freed when the list is exhausted or when a call cannot be set up.
+*/
+static void vacuum_fetch_next(struct vacuum_info *v)
+{
+       struct ctdb_call call;
+       struct ctdb_rec_data *r;
+
+       while (v->recs->count) {
+               struct ctdb_client_call_state *state;
+               TDB_DATA data;
+               struct ctdb_ltdb_header *hdr;
+
+               ZERO_STRUCT(call);
+               call.call_id = CTDB_NULL_FUNC;
+               call.flags = CTDB_IMMEDIATE_MIGRATION;
+               call.flags |= CTDB_CALL_FLAG_VACUUM_MIGRATION;
+
+               r = v->r; /* record handled in this iteration */
+               v->r = (struct ctdb_rec_data *)(r->length + (uint8_t *)r); /* advance the cursor by this record's length */
+               v->recs->count--;
+
+               call.key.dptr = &r->data[0]; /* the key is the first keylen bytes of the record data */
+               call.key.dsize = r->keylen;
+
+               /* ensure we don't block this daemon - just skip a record if we can't get
+                  the chainlock */
+               if (tdb_chainlock_nonblock(v->ctdb_db->ltdb->tdb, call.key) != 0) {
+                       continue;
+               }
+
+               data = tdb_fetch(v->ctdb_db->ltdb->tdb, call.key);
+               if (data.dptr == NULL) { /* no local copy of this key */
+                       tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
+                       continue;
+               }
+
+               if (data.dsize < sizeof(struct ctdb_ltdb_header)) { /* too short to contain a header */
+                       free(data.dptr);
+                       tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
+                       continue;
+               }
+               
+               hdr = (struct ctdb_ltdb_header *)data.dptr;
+               if (hdr->dmaster == v->rec->ctdb->pnn) {
+                       /* its already local */
+                       free(data.dptr);
+                       tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
+                       continue;
+               }
+
+               free(data.dptr); /* only the header was needed; hdr must not be used past this point */
+
+               state = ctdb_call_send(v->ctdb_db, &call); /* issued while the chainlock is still held */
+               tdb_chainunlock(v->ctdb_db->ltdb->tdb, call.key);
+               if (state == NULL) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to setup vacuum fetch call\n"));
+                       talloc_free(v); /* abandon the remaining records */
+                       return;
+               }
+               state->async.fn = vacuum_fetch_callback;
+               state->async.private_data = v;
+               return; /* one call in flight at a time; the callback continues the walk */
+       }
+
+       talloc_free(v); /* all records consumed */
+}
+
+
+/*
+  talloc destructor for a vacuum_info: unlink it from the recovery daemon's
+  vacuum_info list.  Always returns 0.
+ */
+static int vacuum_info_destructor(struct vacuum_info *v)
+{
+       struct ctdb_recoverd *owner = v->rec;
+
+       DLIST_REMOVE(owner->vacuum_info, v);
+
+       return 0;
+}
+
+
+/*
+  handler for vacuum fetch
+
+  data carries a ctdb_marshall_buffer of records.  The handler
+    - ignores buffers that are too short to hold the headers it reads,
+    - ignores empty buffers,
+    - ignores the request if a vacuum_info for the same source node and
+      database is already queued on rec->vacuum_info,
+    - otherwise looks up the database (flags and name) on the local node,
+      attaches to it, copies the buffer into a new vacuum_info and starts
+      processing it with vacuum_fetch_next().
+
+  private_data is the struct ctdb_recoverd.  Errors are logged and the
+  message is dropped; nothing is returned to the sender.
+*/
+static void vacuum_fetch_handler(struct ctdb_context *ctdb, uint64_t srvid, 
+                                TDB_DATA data, void *private_data)
+{
+       struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
+       struct ctdb_marshall_buffer *recs;
+       int ret, i;
+       TALLOC_CTX *tmp_ctx;
+       const char *name;
+       struct ctdb_dbid_map *dbmap=NULL;
+       bool persistent = false;
+       struct ctdb_db_context *ctdb_db;
+       struct ctdb_rec_data *r;
+       uint32_t srcnode;
+       struct vacuum_info *v;
+
+       /* the fixed buffer header must be present before count/db_id are read */
+       if (data.dptr == NULL ||
+           data.dsize < offsetof(struct ctdb_marshall_buffer, data)) {
+               DEBUG(DEBUG_ERR, (__location__ " Ignoring truncated vacuum fetch message (%u bytes)\n",
+                                 (unsigned)data.dsize));
+               return;
+       }
+
+       recs = (struct ctdb_marshall_buffer *)data.dptr;
+
+       if (recs->count == 0) {
+               return;
+       }
+
+       /* a non-empty buffer must at least hold the first record header,
+          since the source node is read from it */
+       if (data.dsize < offsetof(struct ctdb_marshall_buffer, data) +
+                        offsetof(struct ctdb_rec_data, data)) {
+               DEBUG(DEBUG_ERR, (__location__ " Ignoring vacuum fetch message without a complete record (%u bytes)\n",
+                                 (unsigned)data.dsize));
+               return;
+       }
+
+       r = (struct ctdb_rec_data *)&recs->data[0];
+       srcnode = r->reqid;
+
+       for (v=rec->vacuum_info;v;v=v->next) {
+               if (srcnode == v->srcnode && recs->db_id == v->ctdb_db->db_id) {
+                       /* we're already working on records from this node */
+                       return;
+               }
+       }
+
+       /* scratch context for the lookups below; freed on every exit path */
+       tmp_ctx = talloc_new(ctdb);
+
+       /* work out if the database is persistent */
+       ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &dbmap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from local node\n"));
+               talloc_free(tmp_ctx);
+               return;
+       }
+
+       for (i=0;i<dbmap->num;i++) {
+               if (dbmap->dbs[i].dbid == recs->db_id) {
+                       persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
+                       break;
+               }
+       }
+       if (i == dbmap->num) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to find db_id 0x%x on local node\n", recs->db_id));
+               talloc_free(tmp_ctx);
+               return;
+       }
+
+       /* find the name of this database */
+       if (ctdb_ctrl_getdbname(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, recs->db_id, tmp_ctx, &name) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", recs->db_id));
+               talloc_free(tmp_ctx);
+               return;
+       }
+
+       /* attach to it */
+       ctdb_db = ctdb_attach(ctdb, CONTROL_TIMEOUT(), name, persistent, 0);
+       if (ctdb_db == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
+               talloc_free(tmp_ctx);
+               return;
+       }
+
+       v = talloc_zero(rec, struct vacuum_info);
+       if (v == NULL) {
+               DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
+               talloc_free(tmp_ctx);
+               return;
+       }
+
+       v->rec = rec;
+       v->srcnode = srcnode;
+       v->ctdb_db = ctdb_db;
+       /* keep our own copy: the message buffer is not ours to hold on to
+          while the fetch calls are in flight */
+       v->recs = talloc_memdup(v, recs, data.dsize);
+       if (v->recs == NULL) {
+               DEBUG(DEBUG_CRIT,(__location__ " Out of memory\n"));
+               talloc_free(v);
+               talloc_free(tmp_ctx);
+               return;
+       }
+       v->r =  (struct ctdb_rec_data *)&v->recs->data[0];
+
+       DLIST_ADD(rec->vacuum_info, v);
+
+       /* the destructor unlinks v from rec->vacuum_info again */
+       talloc_set_destructor(v, vacuum_info_destructor);
+
+       vacuum_fetch_next(v);
+       talloc_free(tmp_ctx);
+}
+
+
+/*
+  Timer callback armed by ctdb_wait_timeout(): p points at the caller's
+  uint32_t flag, which is set to 1 to signal that the wait is over.
+ */
+static void ctdb_wait_handler(struct event_context *ev, struct timed_event *te, 
+                             struct timeval yt, void *p)
+{
+       uint32_t *expired = p;
+
+       *expired = 1;
+}
+
+/*
+  Wait for secs seconds (fractions allowed) while continuing to run the
+  event loop: a timed event flips a local flag and we spin on
+  event_loop_once() until it does.
+ */
+static void ctdb_wait_timeout(struct ctdb_context *ctdb, double secs)
+{
+       uint32_t expired = 0;
+       time_t usecs = (secs - (time_t)secs) * 1000000;
+
+       event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(secs, usecs),
+                       ctdb_wait_handler, &expired);
+
+       for (;;) {
+               if (expired) {
+                       break;
+               }
+               event_loop_once(ctdb->ev);
+       }
+}
+
+/*
+  Timer callback fired when an election times out (ends).  p is the
+  struct ctdb_recoverd; its election_timeout handle is cleared so that
+  ctdb_wait_election() stops waiting, and fast_start is switched off.
+ */
+static void ctdb_election_timeout(struct event_context *ev, struct timed_event *te, 
+                                 struct timeval t, void *p)
+{
+       struct ctdb_recoverd *rec;
+
+       rec = talloc_get_type(p, struct ctdb_recoverd);
+
+       fast_start = false;
+       rec->election_timeout = NULL;
+
+       DEBUG(DEBUG_WARNING,(__location__ " Election timed out\n"));
+}
+
+
+/*
+  Run the event loop until the current election is over, i.e. until
+  rec->election_timeout has been cleared (ctdb_election_timeout() does that
+  when the election timer fires).
+ */
+static void ctdb_wait_election(struct ctdb_recoverd *rec)
+{
+       struct event_context *ev = rec->ctdb->ev;
+
+       while (rec->election_timeout != NULL) {
+               event_loop_once(ev);
+       }
+}
+
+/*
+  Update our local flags from all remote connected nodes. 
+  This is only run when we are or we believe we are the recovery master
+
+  For every node in nodemap that is not disconnected and is not this node,
+  fetch that node's nodemap and compare its own flags entry with ours.  On a
+  mismatch the remote value is pushed out with ctdb_ctrl_modflags() and
+  copied into nodemap.
+
+  Returns MONITOR_OK on success, MONITOR_FAILED if a remote nodemap could not
+  be fetched (the node is marked as culprit), and -1 if pushing the flags
+  failed.  The temporary talloc context is released on every path.
+ */
+static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
+{
+       int j;
+       struct ctdb_context *ctdb = rec->ctdb;
+       TALLOC_CTX *mem_ctx = talloc_new(ctdb);
+
+       /* get the nodemap for all active remote nodes and verify
+          they are the same as for this node
+        */
+       for (j=0; j<nodemap->num; j++) {
+               struct ctdb_node_map *remote_nodemap=NULL;
+               int ret;
+
+               if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
+                       continue;
+               }
+               if (nodemap->nodes[j].pnn == ctdb->pnn) {
+                       continue;
+               }
+
+               ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, 
+                                          mem_ctx, &remote_nodemap);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from remote node %u\n", 
+                                 nodemap->nodes[j].pnn));
+                       ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
+                       talloc_free(mem_ctx);
+                       return MONITOR_FAILED;
+               }
+               if (nodemap->nodes[j].flags != remote_nodemap->nodes[j].flags) {
+                       /* We should tell our daemon about this so it
+                          updates its flags or else we will log the same 
+                          message again in the next iteration of recovery.
+                          Since we are the recovery master we can just as
+                          well update the flags on all nodes.
+                       */
+                       ret = ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags, ~remote_nodemap->nodes[j].flags);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_ERR, (__location__ " Unable to update nodeflags on remote nodes\n"));
+                               /* mem_ctx hangs off the long-lived ctdb context, so
+                                  it must be released here too.  The -1 return
+                                  value is kept for existing callers.
+                               */
+                               talloc_free(mem_ctx);
+                               return -1;
+                       }
+
+                       /* Update our local copy of the flags in the recovery
+                          daemon.
+                       */
+                       DEBUG(DEBUG_NOTICE,("Remote node %u had flags 0x%x, local had 0x%x - updating local\n",
+                                nodemap->nodes[j].pnn, remote_nodemap->nodes[j].flags,
+                                nodemap->nodes[j].flags));
+                       nodemap->nodes[j].flags = remote_nodemap->nodes[j].flags;
+               }
+               talloc_free(remote_nodemap);
+       }
+       talloc_free(mem_ctx);
+       return MONITOR_OK;
+}
+
+
+/* Create a new random generation id.
+   Values are drawn from random() until one differs from
+   INVALID_GENERATION, which must never be handed out.
+*/
+static uint32_t new_generation(void)
+{
+       uint32_t candidate;
+
+       do {
+               candidate = random();
+       } while (candidate == INVALID_GENERATION);
+
+       return candidate;
+}
+
+
+/*
+  Create the temporary working database used during recovery.
+
+  The file is <db_directory_state>/recdb.tdb.<pnn>.  Any existing file of
+  that name is unlinked first and the database is then opened with
+  O_CREAT|O_EXCL.  The returned tdb_wrap is allocated on mem_ctx; NULL is
+  returned on failure.
+ */
+static struct tdb_wrap *create_recdb(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
+{
+       unsigned tdb_flags = TDB_NOLOCK | TDB_INCOMPATIBLE_HASH | TDB_DISALLOW_NESTING;
+       struct tdb_wrap *db;
+       char *path;
+
+       path = talloc_asprintf(mem_ctx, "%s/recdb.tdb.%u",
+                              ctdb->db_directory_state,
+                              ctdb->pnn);
+       if (path == NULL) {
+               return NULL;
+       }
+
+       /* get rid of any previous file so the exclusive create can succeed */
+       unlink(path);
+
+       if (ctdb->valgrinding) {
+               tdb_flags |= TDB_NOMMAP;
+       }
+
+       db = tdb_wrap_open(mem_ctx, path, ctdb->tunable.database_hash_size,
+                          tdb_flags, O_RDWR|O_CREAT|O_EXCL, 0600);
+       if (db == NULL) {
+               DEBUG(DEBUG_CRIT,(__location__ " Failed to create temp recovery database '%s'\n", path));
+       }
+
+       /* the path string is only needed for opening and logging */
+       talloc_free(path);
+
+       return db;
+}
+
+
+/* 
+   state shared between push_recdb_database() and its traverse callback
+   traverse_recdb() while the records of recdb are marshalled into one blob
+ */
+struct recdb_data {
+       struct ctdb_context *ctdb; /* used for the local pnn and the pulldb_preallocation_size tunable */
+       struct ctdb_marshall_buffer *recdata; /* blob being built; may be reallocated during the traverse */
+       uint32_t len; /* bytes of recdata filled so far */
+       uint32_t allocated_len; /* size recdata was last (re)allocated to */
+       bool failed; /* set by the callback when marshalling or reallocation fails */
+       bool persistent; /* true when the database being pushed is persistent */
+};
+
+static int traverse_recdb(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p) /* tdb traverse callback: appends one record to the blob in p (struct recdb_data); returns 0 to continue, -1 to abort */
+{
+       struct recdb_data *params = (struct recdb_data *)p;
+       struct ctdb_rec_data *rec;
+       struct ctdb_ltdb_header *hdr;
+
+       /*
+        * skip empty records - but NOT for persistent databases:
+        *
+        * The record-by-record mode of recovery deletes empty records.
+        * For persistent databases, this can lead to data corruption
+        * by deleting records that should be there:
+        *
+        * - Assume the cluster has been running for a while.
+        *
+        * - A record R in a persistent database D has been created and
+        *   deleted a couple of times, the last operation being deletion,
+        *   leaving an empty record with a high RSN, say 10.
+        *
+        * - Now a node N is turned off.
+        *
+        * - This leaves the local database copy of D on N with the empty
+        *   copy of R and RSN 10. On all other nodes, the recovery has deleted
+        *   the copy of record R.
+        *
+        * - Now the record is created again while node N is turned off.
+        *   This creates R with RSN = 1 on all nodes except for N.
+        *
+        * - Now node N is turned on again. The following recovery will choose
+        *   the older empty copy of R due to RSN 10 > RSN 1.
+        *
+        * ==> Hence the record is gone after the recovery.
+        *
+        * On databases like Samba's registry, this can damage the higher-level
+        * data structures built from the various tdb-level records.
+        */
+       if (!params->persistent && data.dsize <= sizeof(struct ctdb_ltdb_header)) {
+               return 0;
+       }
+
+       /* update the dmaster field to point to us */
+       hdr = (struct ctdb_ltdb_header *)data.dptr;
+       if (!params->persistent) { /* headers of persistent records are left untouched */
+               hdr->dmaster = params->ctdb->pnn;
+               hdr->flags |= CTDB_REC_FLAG_MIGRATED_WITH_DATA;
+       }
+
+       /* add the record to the blob ready to send to the nodes */
+       rec = ctdb_marshall_record(params->recdata, 0, key, NULL, data);
+       if (rec == NULL) {
+               params->failed = true;
+               return -1;
+       }
+       if (params->len + rec->length >= params->allocated_len) { /* grow with extra headroom so that not every record triggers a realloc */
+               params->allocated_len = rec->length + params->len + params->ctdb->tunable.pulldb_preallocation_size;
+               params->recdata = talloc_realloc_size(NULL, params->recdata, params->allocated_len);
+       }
+       if (params->recdata == NULL) { /* NOTE(review): the previous buffer pointer is overwritten on failure - confirm it is still reclaimed through its talloc parent */
+               DEBUG(DEBUG_CRIT,(__location__ " Failed to expand recdata to %u\n",
+                        rec->length + params->len));
+               params->failed = true;
+               return -1;
+       }
+       params->recdata->count++;
+       memcpy(params->len+(uint8_t *)params->recdata, rec, rec->length); /* append at the current end of the blob */
+       params->len += rec->length;
+       talloc_free(rec); /* the temporary marshalled copy is no longer needed */
+
+       return 0;
+}
+
+/*
+  push the recdb database out to all nodes
+
+  Traverses recdb with traverse_recdb(), which marshals the records into a
+  single ctdb_marshall_buffer tagged with dbid, and then sends that buffer
+  with CTDB_CONTROL_PUSH_DB to the nodes returned by list_of_active_nodes().
+
+  persistent is handed to the traverse callback, which treats persistent
+  databases differently (see traverse_recdb()).
+
+  Returns 0 on success, -1 on allocation, traverse or control failure.
+ */
+static int push_recdb_database(struct ctdb_context *ctdb, uint32_t dbid,
+                              bool persistent,
+                              struct tdb_wrap *recdb, struct ctdb_node_map *nodemap)
+{
+       struct recdb_data params;
+       struct ctdb_marshall_buffer *recdata;
+       TDB_DATA outdata;
+       TALLOC_CTX *tmp_ctx;
+       uint32_t *nodes;
+
+       /* Allocate the buffer first: it hangs off recdb, so an early
+          return from CTDB_NO_MEMORY() cannot strand it.  The scratch
+          context hangs off the long-lived ctdb context and must only
+          exist on paths that free it explicitly.
+       */
+       recdata = talloc_zero(recdb, struct ctdb_marshall_buffer);
+       CTDB_NO_MEMORY(ctdb, recdata);
+
+       tmp_ctx = talloc_new(ctdb);
+       if (tmp_ctx == NULL) {
+               talloc_free(recdata);
+       }
+       CTDB_NO_MEMORY(ctdb, tmp_ctx);
+
+       recdata->db_id = dbid;
+
+       params.ctdb = ctdb;
+       params.recdata = recdata;
+       params.len = offsetof(struct ctdb_marshall_buffer, data);
+       params.allocated_len = params.len;
+       params.failed = false;
+       params.persistent = persistent;
+
+       if (tdb_traverse_read(recdb->tdb, traverse_recdb, &params) == -1) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
+               talloc_free(params.recdata);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       if (params.failed) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to traverse recdb database\n"));
+               talloc_free(params.recdata);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* the callback may have reallocated the buffer */
+       recdata = params.recdata;
+
+       outdata.dptr = (void *)recdata;
+       outdata.dsize = params.len;
+
+       nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
+                                       nodes, 0,
+                                       CONTROL_TIMEOUT(), false, outdata,
+                                       NULL, NULL,
+                                       NULL) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to push recdb records to nodes for db 0x%x\n", dbid));
+               talloc_free(recdata);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pushed remote database 0x%x of size %u\n", 
+                 dbid, recdata->count));
+
+       talloc_free(recdata);
+       talloc_free(tmp_ctx);
+
+       return 0;
+}
+
+
+/*
+  go through a full recovery on one database 
+
+  Steps:
+    1. create a temporary recovery database (recdb) on mem_ctx
+    2. pull the database contents from the other nodes into recdb
+    3. send CTDB_CONTROL_WIPE_DATABASE for dbid/transaction_id to the nodes
+       returned by list_of_active_nodes()
+    4. push the contents of recdb back out with push_recdb_database()
+
+  recdb is released on every path once it has been created.
+  Returns 0 on success, -1 on any failure.
+ */
+static int recover_database(struct ctdb_recoverd *rec, 
+                           TALLOC_CTX *mem_ctx,
+                           uint32_t dbid,
+                           bool persistent,
+                           uint32_t pnn, 
+                           struct ctdb_node_map *nodemap,
+                           uint32_t transaction_id)
+{
+       struct tdb_wrap *recdb;
+       int ret;
+       struct ctdb_context *ctdb = rec->ctdb;
+       TDB_DATA data;
+       struct ctdb_control_wipe_database w;
+       uint32_t *nodes;
+
+       recdb = create_recdb(ctdb, mem_ctx);
+       if (recdb == NULL) {
+               return -1;
+       }
+
+       /* pull all remote databases onto the recdb */
+       ret = pull_remote_database(ctdb, rec, nodemap, recdb, dbid, persistent);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to pull remote database 0x%x\n", dbid));
+               /* release the temporary database here as on the other
+                  error paths instead of leaving it on mem_ctx */
+               talloc_free(recdb);
+               return -1;
+       }
+
+       DEBUG(DEBUG_NOTICE, (__location__ " Recovery - pulled remote database 0x%x\n", dbid));
+
+       /* wipe all the remote databases. This is safe as we are in a transaction */
+       w.db_id = dbid;
+       w.transaction_id = transaction_id;
+
+       data.dptr = (void *)&w;
+       data.dsize = sizeof(w);
+
+       /* the node list is allocated on recdb and goes away with it */
+       nodes = list_of_active_nodes(ctdb, nodemap, recdb, true);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
+                                       nodes, 0,
+                                       CONTROL_TIMEOUT(), false, data,
+                                       NULL, NULL,
+                                       NULL) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to wipe database. Recovery failed.\n"));
+               talloc_free(recdb);
+               return -1;
+       }
+
+       /* push out the correct database. This sets the dmaster and skips
+          the empty records - both only for non-persistent databases */
+       ret = push_recdb_database(ctdb, dbid, persistent, recdb, nodemap);
+       if (ret != 0) {
+               talloc_free(recdb);
+               return -1;
+       }
+
+       /* all done with this database */
+       talloc_free(recdb);
+
+       return 0;
+}
+
+/*
+  Refresh the cached public IP lists of every node.
+
+  For each entry the previously cached known/available lists are dropped.
+  For nodes not flagged NODE_FLAGS_INACTIVE both lists are then fetched again
+  with ctdb_ctrl_get_public_ips_flags().  When do_checkpublicip is set and
+  takeover runs are not disabled, the known list is also checked with
+  verify_remote_ip_allocation() and a takeover run is requested if that
+  check reports a problem.
+
+  Returns 0 on success.  On failure -1 is returned and, if culprit is not
+  NULL, *culprit is set to the pnn of the node held responsible (our own
+  pnn when ctdb->num_nodes and nodemap->num disagree).
+ */
+static int ctdb_reload_remote_public_ips(struct ctdb_context *ctdb,
+                                        struct ctdb_recoverd *rec,
+                                        struct ctdb_node_map *nodemap,
+                                        uint32_t *culprit)
+{
+       int idx;
+       int status;
+
+       if (ctdb->num_nodes != nodemap->num) {
+               DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) invalid param\n",
+                                 ctdb->num_nodes, nodemap->num));
+               if (culprit != NULL) {
+                       *culprit = ctdb->pnn;
+               }
+               return -1;
+       }
+
+       for (idx = 0; idx < nodemap->num; idx++) {
+               struct ctdb_node *node = ctdb->nodes[idx];
+
+               /* drop whatever was cached from the previous reload */
+               if (node->known_public_ips != NULL) {
+                       talloc_free(node->known_public_ips);
+                       node->known_public_ips = NULL;
+               }
+               if (node->available_public_ips != NULL) {
+                       talloc_free(node->available_public_ips);
+                       node->available_public_ips = NULL;
+               }
+
+               if ((nodemap->nodes[idx].flags & NODE_FLAGS_INACTIVE) != 0) {
+                       continue;
+               }
+
+               /* all public IPs the node knows about */
+               status = ctdb_ctrl_get_public_ips_flags(ctdb,
+                                       CONTROL_TIMEOUT(),
+                                       node->pnn,
+                                       ctdb->nodes,
+                                       0,
+                                       &node->known_public_ips);
+               if (status != 0) {
+                       DEBUG(DEBUG_ERR,
+                             ("Failed to read known public IPs from node: %u\n",
+                              node->pnn));
+                       if (culprit != NULL) {
+                               *culprit = node->pnn;
+                       }
+                       return -1;
+               }
+
+               if (ctdb->do_checkpublicip &&
+                   rec->takeover_runs_disable_ctx == NULL &&
+                   verify_remote_ip_allocation(ctdb,
+                                                node->known_public_ips,
+                                                node->pnn)) {
+                       DEBUG(DEBUG_ERR,("Trigger IP reallocation\n"));
+                       rec->need_takeover_run = true;
+               }
+
+               /* the subset of public IPs the node can currently host */
+               status = ctdb_ctrl_get_public_ips_flags(ctdb,
+                                       CONTROL_TIMEOUT(),
+                                       node->pnn,
+                                       ctdb->nodes,
+                                       CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE,
+                                       &node->available_public_ips);
+               if (status != 0) {
+                       DEBUG(DEBUG_ERR,
+                             ("Failed to read available public IPs from node: %u\n",
+                              node->pnn));
+                       if (culprit != NULL) {
+                               *culprit = node->pnn;
+                       }
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+/* At the start of a recovery, send our reclock file setting to the nodes
+   returned by list_of_active_nodes() using CTDB_CONTROL_SET_RECLOCK_FILE.
+   An unset recovery_lock_file is sent as an empty payload; otherwise the
+   path is sent including its terminating NUL.
+   Returns 0 on success, -1 if the control failed.
+*/
+static int sync_recovery_lock_file_across_cluster(struct ctdb_recoverd *rec)
+{
+       struct ctdb_context *ctdb = rec->ctdb;
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       TDB_DATA data;
+       uint32_t *nodes;
+       int status = 0;
+
+       if (ctdb->recovery_lock_file != NULL) {
+               data.dsize = strlen(ctdb->recovery_lock_file) + 1;
+               data.dptr  = (uint8_t *)ctdb->recovery_lock_file;
+       } else {
+               data.dptr  = NULL;
+               data.dsize = 0;
+       }
+
+       nodes = list_of_active_nodes(ctdb, rec->nodemap, tmp_ctx, true);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_SET_RECLOCK_FILE,
+                                       nodes, 0,
+                                       CONTROL_TIMEOUT(),
+                                       false, data,
+                                       NULL, NULL,
+                                       rec) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Failed to sync reclock file settings\n"));
+               status = -1;
+       }
+
+       talloc_free(tmp_ctx);
+       return status;
+}
+
+
+/*
+ * Per-node failure callback for ctdb_takeover_run(): logs the node that
+ * failed and, when callback_data carries the struct ctdb_recoverd, marks
+ * that node as culprit.  A NULL callback_data means no culprit is recorded.
+ */
+static void takeover_fail_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
+{
+       struct ctdb_recoverd *rec;
+
+       DEBUG(DEBUG_ERR, ("Node %u failed the takeover run\n", node_pnn));
+
+       if (callback_data == NULL) {
+               return;
+       }
+
+       rec = talloc_get_type(callback_data, struct ctdb_recoverd);
+
+       DEBUG(DEBUG_ERR, ("Setting node %u as recovery fail culprit\n", node_pnn));
+
+       ctdb_set_culprit(rec, node_pnn);
+}
+
+
+/*
+  Ban every node whose banning credit count has reached twice the number of
+  nodes.  Each such node is banned for recovery_ban_period seconds and its
+  credit count is reset to zero.  *self_ban is set to true if this node was
+  among the banned ones, false otherwise.
+ */
+static void ban_misbehaving_nodes(struct ctdb_recoverd *rec, bool *self_ban)
+{
+       struct ctdb_context *ctdb = rec->ctdb;
+       int idx;
+
+       *self_ban = false;
+
+       for (idx = 0; idx < ctdb->num_nodes; idx++) {
+               struct ctdb_node *node = ctdb->nodes[idx];
+               struct ctdb_banning_state *credits =
+                       (struct ctdb_banning_state *)node->ban_state;
+
+               /* nothing recorded, or not enough credits yet */
+               if (credits == NULL || credits->count < 2*ctdb->num_nodes) {
+                       continue;
+               }
+
+               DEBUG(DEBUG_NOTICE,("Node %u reached %u banning credits - banning it for %u seconds\n",
+                       node->pnn, credits->count,
+                       ctdb->tunable.recovery_ban_period));
+               ctdb_ban_node(rec, node->pnn, ctdb->tunable.recovery_ban_period);
+               credits->count = 0;
+
+               /* Banning ourself? */
+               if (node->pnn == rec->ctdb->pnn) {
+                       *self_ban = true;
+               }
+       }
+}
+
+/*
+  Perform a takeover run (IP reallocation) for the cluster.
+
+  While ctdb_takeover_run() executes, takeover runs are disabled for 60
+  seconds on the nodes returned by list_of_connected_nodes() by sending
+  CTDB_SRVID_DISABLE_TAKEOVER_RUNS, and re-enabled afterwards.  Failures
+  to send those messages are only logged.
+
+  If banning_credits_on_fail is true, rec is handed to
+  takeover_fail_callback() so that failing nodes are marked as culprits.
+
+  rec->need_takeover_run is set to the negation of the result.  Returns true
+  if the run completed successfully, false if it failed, takeover runs are
+  disabled, or another run is already in progress.
+
+  A call rejected because a run is already in progress leaves
+  rec->takeover_run_in_progress set: that flag belongs to the run which is
+  still executing further up the stack.
+ */
+static bool do_takeover_run(struct ctdb_recoverd *rec,
+                           struct ctdb_node_map *nodemap,
+                           bool banning_credits_on_fail)
+{
+       uint32_t *nodes = NULL;
+       struct srvid_request dtr;
+       TDB_DATA data;
+       int i;
+       uint32_t *rebalance_nodes = rec->force_rebalance_nodes;
+       int ret;
+       bool ok;
+
+       DEBUG(DEBUG_NOTICE, ("Takeover run starting\n"));
+
+       if (rec->takeover_run_in_progress) {
+               DEBUG(DEBUG_ERR, (__location__
+                                 " takeover run already in progress \n"));
+               /* Do not go through "done": clearing the in-progress flag
+                  here would let a later nested call start a second run
+                  while the first one is still active.
+               */
+               rec->need_takeover_run = true;
+               DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", "unsuccessful"));
+               return false;
+       }
+
+       rec->takeover_run_in_progress = true;
+
+       /* If takeover runs are in disabled then fail... */
+       if (rec->takeover_runs_disable_ctx != NULL) {
+               DEBUG(DEBUG_ERR,
+                     ("Takeover runs are disabled so refusing to run one\n"));
+               ok = false;
+               goto done;
+       }
+
+       /* Disable IP checks (takeover runs, really) on other nodes
+        * while doing this takeover run.  This will stop those other
+        * nodes from triggering takeover runs when think they should
+        * be hosting an IP but it isn't yet on an interface.  Don't
+        * wait for replies since a failure here might cause some
+        * noise in the logs but will not actually cause a problem.
+        */
+       dtr.srvid = 0; /* No reply */
+       dtr.pnn = -1;
+
+       /* data points at dtr, so later changes to dtr.data are picked up */
+       data.dptr  = (uint8_t*)&dtr;
+       data.dsize = sizeof(dtr);
+
+       nodes = list_of_connected_nodes(rec->ctdb, nodemap, rec, false);
+
+       /* Disable for 60 seconds.  This can be a tunable later if
+        * necessary.
+        */
+       dtr.data = 60;
+       for (i = 0; i < talloc_array_length(nodes); i++) {
+               if (ctdb_client_send_message(rec->ctdb, nodes[i],
+                                            CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
+                                            data) != 0) {
+                       DEBUG(DEBUG_INFO,("Failed to disable takeover runs\n"));
+               }
+       }
+
+       ret = ctdb_takeover_run(rec->ctdb, nodemap,
+                               rec->force_rebalance_nodes,
+                               takeover_fail_callback,
+                               banning_credits_on_fail ? rec : NULL);
+
+       /* Reenable takeover runs and IP checks on other nodes */
+       dtr.data = 0;
+       for (i = 0; i < talloc_array_length(nodes); i++) {
+               if (ctdb_client_send_message(rec->ctdb, nodes[i],
+                                            CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
+                                            data) != 0) {
+                       DEBUG(DEBUG_INFO,("Failed to reenable takeover runs\n"));
+               }
+       }
+
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("ctdb_takeover_run() failed\n"));
+               ok = false;
+               goto done;
+       }
+
+       ok = true;
+       /* Takeover run was successful so clear force rebalance targets */
+       if (rebalance_nodes == rec->force_rebalance_nodes) {
+               TALLOC_FREE(rec->force_rebalance_nodes);
+       } else {
+               DEBUG(DEBUG_WARNING,
+                     ("Rebalance target nodes changed during takeover run - not clearing\n"));
+       }
+done:
+       rec->need_takeover_run = !ok;
+       talloc_free(nodes);
+       rec->takeover_run_in_progress = false;
+
+       DEBUG(DEBUG_NOTICE, ("Takeover run %s\n", ok ? "completed successfully" : "unsuccessful"));
+       return ok;
+}
+
+
+/*
+  we are the recmaster, and recovery is needed - start a recovery run
+
+  Steps, in the order performed below: ban misbehaving nodes, take the
+  recovery lock (only when the verify_recovery_lock tunable is set),
+  make sure every node has every database, switch the cluster to
+  recovery mode, run the "startrecovery" event, push node flags,
+  recover each database inside a cluster-wide transaction, rebuild and
+  push the vnnmap, set the recmaster, return to normal mode, do a
+  takeover run and finally run the "recovered" event.
+
+  rec->need_recovery is set on entry and only cleared after the
+  "recovered" event has run, so an aborted run leaves it set.
+
+  Returns 0 on success; the explicit failure paths below return -1.
+ */
+static int do_recovery(struct ctdb_recoverd *rec, 
+                      TALLOC_CTX *mem_ctx, uint32_t pnn,
+                      struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap)
+{
+       struct ctdb_context *ctdb = rec->ctdb;
+       int i, j, ret;
+       uint32_t generation;
+       struct ctdb_dbid_map *dbmap;
+       TDB_DATA data;
+       uint32_t *nodes;
+       struct timeval start_time;
+       uint32_t culprit = (uint32_t)-1;
+       bool self_ban;
+
+       DEBUG(DEBUG_NOTICE, (__location__ " Starting do_recovery\n"));
+
+       /* if recovery fails, force it again (the flag is only
+          cleared after the "recovered" event has run below) */
+       rec->need_recovery = true;
+
+       ban_misbehaving_nodes(rec, &self_ban);
+       if (self_ban) {
+               DEBUG(DEBUG_NOTICE, ("This node was banned, aborting recovery\n"));
+               return -1;
+       }
+
+        if (ctdb->tunable.verify_recovery_lock != 0) {
+               DEBUG(DEBUG_ERR,("Taking out recovery lock from recovery daemon\n"));
+               start_time = timeval_current();
+               if (!ctdb_recovery_lock(ctdb, true)) {
+                       DEBUG(DEBUG_ERR,("Unable to get recovery lock - aborting recovery "
+                                        "and ban ourself for %u seconds\n",
+                                        ctdb->tunable.recovery_ban_period));
+                       ctdb_ban_node(rec, pnn, ctdb->tunable.recovery_ban_period);
+                       return -1;
+               }
+               ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&start_time));
+               DEBUG(DEBUG_NOTICE,("Recovery lock taken successfully by recovery daemon\n"));
+       }
+
+       DEBUG(DEBUG_NOTICE, (__location__ " Recovery initiated due to problem with node %u\n", rec->last_culprit_node));
+
+       /* get a list of all databases */
+       ret = ctdb_ctrl_getdbmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &dbmap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to get dbids from node :%u\n", pnn));
+               return -1;
+       }
+
+       /* we do the db creation before we set the recovery mode, so the freeze happens
+          on all databases we will be dealing with. */
+
+       /* verify that we have all the databases any other node has
+          (dbmap is passed by address, so it may be replaced here) */
+       ret = create_missing_local_databases(ctdb, nodemap, pnn, &dbmap, mem_ctx);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to create missing local databases\n"));
+               return -1;
+       }
+
+       /* verify that all other nodes have all our databases */
+       ret = create_missing_remote_databases(ctdb, nodemap, pnn, dbmap, mem_ctx);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to create missing remote databases\n"));
+               return -1;
+       }
+       DEBUG(DEBUG_NOTICE, (__location__ " Recovery - created remote databases\n"));
+
+       /* update the database priority for all remote databases.
+          A failure here is only logged; the recovery carries on and
+          the NOTICE message below is printed either way. */
+       ret = update_db_priority_on_remote_nodes(ctdb, nodemap, pnn, dbmap, mem_ctx);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to set db priority on remote nodes\n"));
+       }
+       DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated db priority for all databases\n"));
+
+
+       /* update all other nodes to use the same setting for reclock files
+          as the local recovery master.
+       */
+       sync_recovery_lock_file_across_cluster(rec);
+
+       /* set recovery mode to active on all nodes */
+       ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
+               return -1;
+       }
+
+       /* execute the "startrecovery" event script on all nodes */
+       ret = run_startrecovery_eventscript(rec, nodemap);
+       if (ret!=0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
+               return -1;
+       }
+
+       /*
+         update all nodes to have the same flags that we have.
+         Disconnected nodes are skipped.  In this first pass a failure
+         is tolerated (warning only) when the node concerned is
+         inactive; for any other node it aborts the recovery.
+        */
+       for (i=0;i<nodemap->num;i++) {
+               if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
+                       continue;
+               }
+
+               ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
+               if (ret != 0) {
+                       if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
+                               DEBUG(DEBUG_WARNING, (__location__ "Unable to update flags on inactive node %d\n", i));
+                       } else {
+                               DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
+                               return -1;
+                       }
+               }
+       }
+
+       DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
+
+       /* pick a new generation number */
+       generation = new_generation();
+
+       /* change the vnnmap on this node to use the new generation 
+          number but not on any other nodes.
+          this guarantees that if we abort the recovery prematurely
+          for some reason (a node stops responding?)
+          that we can just return immediately and we will reenter
+          recovery shortly again.
+          I.e. we deliberately leave the cluster with an inconsistent
+          generation id to allow us to abort recovery at any stage and
+          just restart it from scratch.
+        */
+       vnnmap->generation = generation;
+       ret = ctdb_ctrl_setvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, vnnmap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to set vnnmap for node %u\n", pnn));
+               return -1;
+       }
+
+       data.dptr = (void *)&generation; /* payload for TRANSACTION_START and TRANSACTION_COMMIT */
+       data.dsize = sizeof(uint32_t);
+
+       nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
+                                       nodes, 0,
+                                       CONTROL_TIMEOUT(), false, data,
+                                       NULL,
+                                       transaction_start_fail_callback,
+                                       rec) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to start transactions. Recovery failed.\n"));
+               if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_CANCEL,
+                                       nodes, 0,
+                                       CONTROL_TIMEOUT(), false, tdb_null,
+                                       NULL,
+                                       NULL,
+                                       NULL) != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to cancel recovery transaction\n"));
+               }
+               return -1;
+       }
+
+       DEBUG(DEBUG_NOTICE,(__location__ " started transactions on all nodes\n"));
+
+       for (i=0;i<dbmap->num;i++) {
+               ret = recover_database(rec, mem_ctx,
+                                      dbmap->dbs[i].dbid,
+                                      dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT,
+                                      pnn, nodemap, generation);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, (__location__ " Failed to recover database 0x%x\n", dbmap->dbs[i].dbid));
+                       return -1; /* NOTE(review): no TRANSACTION_CANCEL is sent on this path - confirm intended */
+               }
+       }
+
+       DEBUG(DEBUG_NOTICE, (__location__ " Recovery - starting database commits\n"));
+
+       /* commit all the changes */
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
+                                       nodes, 0,
+                                       CONTROL_TIMEOUT(), false, data,
+                                       NULL, NULL,
+                                       NULL) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to commit recovery changes. Recovery failed.\n"));
+               return -1;
+       }
+
+       DEBUG(DEBUG_NOTICE, (__location__ " Recovery - committed databases\n"));
+       
+
+       /* update the capabilities for all nodes */
+       ret = update_capabilities(ctdb, nodemap);
+       if (ret!=0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
+               return -1;
+       }
+
+       /* build a new vnn map with all the currently active and
+          unbanned nodes that have the LMASTER capability.  The
+          vnnmap parameter is replaced by a fresh allocation on
+          mem_ctx with a second new generation number; the map array
+          is grown by one entry per accepted node. */
+       generation = new_generation();
+       vnnmap = talloc(mem_ctx, struct ctdb_vnn_map);
+       CTDB_NO_MEMORY(ctdb, vnnmap);
+       vnnmap->generation = generation;
+       vnnmap->size = 0;
+       vnnmap->map = talloc_zero_array(vnnmap, uint32_t, vnnmap->size);
+       CTDB_NO_MEMORY(ctdb, vnnmap->map);
+       for (i=j=0;i<nodemap->num;i++) {
+               if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
+                       continue;
+               }
+               if (!(ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER)) {
+                       /* this node can not be an lmaster */
+                       DEBUG(DEBUG_DEBUG, ("Node %d cant be a LMASTER, skipping it\n", i));
+                       continue;
+               }
+
+               vnnmap->size++;
+               vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
+               CTDB_NO_MEMORY(ctdb, vnnmap->map);
+               vnnmap->map[j++] = nodemap->nodes[i].pnn;
+
+       }
+       if (vnnmap->size == 0) {
+               DEBUG(DEBUG_NOTICE, ("No suitable lmasters found. Adding local node (recmaster) anyway.\n"));
+               vnnmap->size++;
+               vnnmap->map = talloc_realloc(vnnmap, vnnmap->map, uint32_t, vnnmap->size);
+               CTDB_NO_MEMORY(ctdb, vnnmap->map);
+               vnnmap->map[0] = pnn;
+       }       
+
+       /* update to the new vnnmap on all nodes */
+       ret = update_vnnmap_on_all_nodes(ctdb, nodemap, pnn, vnnmap, mem_ctx);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to update vnnmap on all nodes\n"));
+               return -1;
+       }
+
+       DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated vnnmap\n"));
+
+       /* update recmaster to point to us for all nodes */
+       ret = set_recovery_master(ctdb, nodemap, pnn);
+       if (ret!=0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery master\n"));
+               return -1;
+       }
+
+       DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated recmaster\n"));
+
+       /*
+         update all nodes to have the same flags that we have.
+         Unlike the first pass above, any failure here aborts the
+         recovery, whether or not the node is inactive.
+        */
+       for (i=0;i<nodemap->num;i++) {
+               if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
+                       continue;
+               }
+
+               ret = update_flags_on_all_nodes(ctdb, nodemap, i, nodemap->nodes[i].flags);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, (__location__ " Unable to update flags on all nodes for node %d\n", i));
+                       return -1;
+               }
+       }
+
+       DEBUG(DEBUG_NOTICE, (__location__ " Recovery - updated flags\n"));
+
+       /* disable recovery mode */
+       ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_NORMAL);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to normal on cluster\n"));
+               return -1;
+       }
+
+       DEBUG(DEBUG_NOTICE, (__location__ " Recovery - disabled recovery mode\n"));
+
+       /* Fetch known/available public IPs from each active node.
+          On failure a takeover run is requested for later and the
+          recovery is reported as failed. */
+       ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
+                                culprit));
+               rec->need_takeover_run = true;
+               return -1;
+       }
+
+       do_takeover_run(rec, nodemap, false); /* return value is not checked here */
+
+       /* execute the "recovered" event script on all nodes */
+       ret = run_recovered_eventscript(rec, nodemap, "do_recovery");
+       if (ret!=0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Recovery process failed.\n"));
+               return -1;
+       }
+
+       DEBUG(DEBUG_NOTICE, (__location__ " Recovery - finished the recovered event\n"));
+
+       /* send a message to all clients telling them that the cluster 
+          has been reconfigured (the send result is not checked) */
+       ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECONFIGURE, tdb_null);
+
+       DEBUG(DEBUG_NOTICE, (__location__ " Recovery complete\n"));
+
+       rec->need_recovery = false;
+
+       /* we managed to complete a full recovery, make sure to forgive
+          any past sins by the nodes that could now participate in the
+          recovery.
+          Only connected nodes that already have a ban_state get
+          their count reset.
+       */
+       DEBUG(DEBUG_ERR,("Resetting ban count to 0 for all nodes\n"));
+       for (i=0;i<nodemap->num;i++) {
+               struct ctdb_banning_state *ban_state;
+
+               if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
+                       continue;
+               }
+
+               ban_state = (struct ctdb_banning_state *)ctdb->nodes[nodemap->nodes[i].pnn]->ban_state;
+               if (ban_state == NULL) {
+                       continue;
+               }
+
+               ban_state->count = 0;
+       }
+
+
+       /* We just finished a recovery successfully. 
+          We now wait for rerecovery_timeout before we allow 
+          another recovery to take place.
+          The wait happens inside this function, so the caller only
+          gets the return value after ctdb_wait_timeout() returns.
+       */
+       DEBUG(DEBUG_NOTICE, ("Just finished a recovery. New recoveries will now be supressed for the rerecovery timeout (%d seconds)\n", ctdb->tunable.rerecovery_timeout));
+       ctdb_wait_timeout(ctdb, ctdb->tunable.rerecovery_timeout);
+       DEBUG(DEBUG_NOTICE, ("The rerecovery timeout has elapsed. We now allow recoveries to trigger again.\n"));
+
+       return 0;
+}
+
+
+/*
+  elections are won by first checking the number of connected nodes, then
+  the priority time, then the pnn
+
+  An instance of this struct is filled in by ctdb_election_data() and
+  its raw bytes are used as the payload of the election request sent
+  by send_election_request().
+  NOTE(review): because the bytes go on the wire as-is, the field
+  order and sizes presumably have to match on every node - confirm
+  before changing the layout.
+ */
+struct election_message {
+       uint32_t num_connected;        /* nodes not flagged DISCONNECTED; forced to 0 without the RECMASTER capability */
+       struct timeval priority_time;  /* copied from rec->priority_time; set to "now" without the RECMASTER capability */
+       uint32_t pnn;                  /* pnn of the node that built the message */
+       uint32_t node_flags;           /* that node's own flags taken from its nodemap */
+};
+
+/*
+  Fill *em with this node's election credentials.
+
+  em is zeroed first, then pnn and priority_time are set.  The local
+  nodemap is fetched to record our own flags (also cached in
+  rec->node_flags) and to count the nodes that are not disconnected.
+  If the nodemap cannot be fetched the function returns early and em
+  keeps only pnn and priority_time.
+
+  A node without the RECMASTER capability advertises zero connected
+  nodes and a priority time of "now" so that it does not try to win.
+ */
+static void ctdb_election_data(struct ctdb_recoverd *rec, struct election_message *em)
+{
+       struct ctdb_context *ctdb = rec->ctdb;
+       struct ctdb_node_map *nodemap;
+       uint32_t connected = 0;
+       int idx;
+
+       ZERO_STRUCTP(em);
+
+       em->priority_time = rec->priority_time;
+       em->pnn = rec->ctdb->pnn;
+
+       if (ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, rec, &nodemap) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
+               return;
+       }
+
+       /* remember our own flags and publish them in the message */
+       rec->node_flags = nodemap->nodes[ctdb->pnn].flags;
+       em->node_flags = rec->node_flags;
+
+       /* tally every node that is still connected */
+       for (idx = 0; idx < nodemap->num; idx++) {
+               if (nodemap->nodes[idx].flags & NODE_FLAGS_DISCONNECTED) {
+                       continue;
+               }
+               connected++;
+       }
+
+       if (ctdb->capabilities & CTDB_CAP_RECMASTER) {
+               em->num_connected = connected;
+       } else {
+               /* we shouldnt try to win this election if we cant be a recmaster */
+               em->num_connected = 0;
+               em->priority_time = timeval_current();
+       }
+
+       talloc_free(nodemap);
+}
+
+/*
+  Decide whether this node beats the candidate described by *em.
+
+  Returns true when we win.  We can never win without the RECMASTER
+  capability or while banned/stopped; otherwise we always win against
+  a banned or stopped candidate.  Remaining ties are broken by number
+  of connected nodes, then priority time, then pnn.
+ */
+static bool ctdb_election_win(struct ctdb_recoverd *rec, struct election_message *em)
+{
+       struct election_message mine;
+       int diff;
+
+       /* must come first: besides building our own data this call
+          may update rec->node_flags, which is tested below */
+       ctdb_election_data(rec, &mine);
+
+       /* we cant win if we dont have the recmaster capability */
+       if ((rec->ctdb->capabilities & CTDB_CAP_RECMASTER) == 0) {
+               return false;
+       }
+
+       /* we cant win if we are banned or stopped */
+       if (rec->node_flags & (NODE_FLAGS_BANNED | NODE_FLAGS_STOPPED)) {
+               return false;
+       }
+
+       /* we will automatically win if the other node is banned or stopped */
+       if (em->node_flags & (NODE_FLAGS_BANNED | NODE_FLAGS_STOPPED)) {
+               return true;
+       }
+
+       /* try to use the most connected node */
+       diff = (int)mine.num_connected - (int)em->num_connected;
+
+       /* then the longest running node */
+       if (diff == 0) {
+               diff = timeval_compare(&em->priority_time, &mine.priority_time);
+       }
+
+       /* and finally compare the pnn values */
+       if (diff == 0) {
+               diff = (int)mine.pnn - (int)em->pnn;
+       }
+
+       return diff > 0;
+}
+
+/*
+  Broadcast an election request carrying this node's election data.
+
+  Before broadcasting, the recmaster on node pnn is set to pnn, i.e. we
+  optimistically assume we will win.  Returns -1 if that control
+  fails (nothing is broadcast then), otherwise 0.  The result of the
+  broadcast itself is not checked.
+ */
+static int send_election_request(struct ctdb_recoverd *rec, uint32_t pnn)
+{
+       struct ctdb_context *ctdb = rec->ctdb;
+       struct election_message emsg;
+       TDB_DATA election_data;
+
+       /* gather our credentials and wrap them as the message payload */
+       ctdb_election_data(rec, &emsg);
+       election_data.dptr  = (unsigned char *)&emsg;
+       election_data.dsize = sizeof(struct election_message);
+
+       /* first we assume we will win the election and set
+          recoverymaster to be ourself on the current node
+        */
+       if (ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), pnn, pnn) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
+               return -1;
+       }
+
+       /* send the election message as a CTDB_BROADCAST_ALL broadcast */
+       DEBUG(DEBUG_INFO,(__location__ " Send election request to all active nodes\n"));
+       ctdb_client_send_message(ctdb, CTDB_BROADCAST_ALL, CTDB_SRVID_RECOVERY, election_data);
+
+       return 0;
+}
+
+/*
+  this function will unban all nodes in the cluster
+
+  The local nodemap is fetched and, for every node that is connected
+  and currently flagged BANNED, a modflags control clearing
+  NODE_FLAGS_BANNED is sent to that node.  The result of each
+  modflags control is not checked (best effort).
+
+  The temporary talloc context is released on every exit path,
+  including the case where the nodemap cannot be fetched.
+*/
+static void unban_all_nodes(struct ctdb_context *ctdb)
+{
+       int ret, i;
+       struct ctdb_node_map *nodemap;
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       
+       ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " failed to get nodemap to unban all nodes\n"));
+               /* tmp_ctx hangs off ctdb, so it must be freed here or
+                  it would stay allocated for as long as ctdb lives */
+               goto done;
+       }
+
+       for (i=0;i<nodemap->num;i++) {
+               if ( (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED))
+                 && (nodemap->nodes[i].flags & NODE_FLAGS_BANNED) ) {
+                       ctdb_ctrl_modflags(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[i].pnn, 0, NODE_FLAGS_BANNED);
+               }
+       }
+
+done:
+       talloc_free(tmp_ctx);
+}
+
+
+/*
+  Timed-event callback used while we think we are winning the
+  election: broadcast another election request, then drop the timer
+  handle stored in rec->send_election_te.
+ */
+static void election_send_request(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
+{
+       struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
+
+       if (send_election_request(rec, ctdb_get_pnn(rec->ctdb)) != 0) {
+               DEBUG(DEBUG_ERR,("Failed to send election request!\n"));
+       }
+
+       /* release the timer and forget the pointer in one step */
+       TALLOC_FREE(rec->send_election_te);
+}
+
+/*
+  handler for memory dumps
+
+  The payload must be exactly one struct srvid_request naming where
+  the reply goes.  A memory dump is produced with ctdb_dump_memory()
+  and sent to rd->pnn / rd->srvid.  Every failure is logged and the
+  handler simply returns; all temporaries live on one scratch
+  context that is freed at the single exit point.
+*/
+static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid, 
+                            TDB_DATA data, void *private_data)
+{
+       TALLOC_CTX *scratch = talloc_new(ctdb);
+       struct srvid_request *reply_to;
+       TDB_DATA *dump;
+
+       if (data.dsize != sizeof(struct srvid_request)) {
+               DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
+               goto out;
+       }
+       reply_to = (struct srvid_request *)data.dptr;
+
+       dump = talloc_zero(scratch, TDB_DATA);
+       if (dump == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " Failed to allocate memory for memdump\n"));
+               goto out;
+       }
+
+       if (ctdb_dump_memory(ctdb, dump) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " ctdb_dump_memory() failed\n"));
+               goto out;
+       }
+
+       DEBUG(DEBUG_ERR, ("recovery master memory dump\n"));
+
+       if (ctdb_client_send_message(ctdb, reply_to->pnn, reply_to->srvid, *dump) != 0) {
+               DEBUG(DEBUG_ERR,("Failed to send rd memdump reply message\n"));
+       }
+
+out:
+       talloc_free(scratch);
+}
+
+/*
+  handler for getlog
+
+  The payload must be exactly one struct ctdb_get_log_addr.  The log
+  collection is done in a forked child: the child renames itself,
+  switches into client mode, calls ctdb_collect_log() and exits.  The
+  parent returns right after the fork.
+*/
+static void getlog_handler(struct ctdb_context *ctdb, uint64_t srvid, 
+                          TDB_DATA data, void *private_data)
+{
+       struct ctdb_get_log_addr *reply_addr;
+       pid_t pid;
+
+       if (data.dsize != sizeof(struct ctdb_get_log_addr)) {
+               DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
+               return;
+       }
+       reply_addr = (struct ctdb_get_log_addr *)data.dptr;
+
+       pid = ctdb_fork_no_free_ringbuffer(ctdb);
+       if (pid == (pid_t)-1) {
+               DEBUG(DEBUG_ERR,("Failed to fork a log collector child\n"));
+               return;
+       }
+       if (pid != 0) {
+               /* parent: nothing more to do */
+               return;
+       }
+
+       /* child from here on; it never returns to the caller */
+       ctdb_set_process_name("ctdb_rec_log_collector");
+       if (switch_from_server_to_client(ctdb, "recoverd-log-collector") != 0) {
+               DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch log collector child into client mode.\n"));
+               _exit(1);
+       }
+       ctdb_collect_log(ctdb, reply_addr);
+       _exit(0);
+}
+
+/*
+  handler for clearlog
+
+  Simply calls ctdb_clear_log(ctdb).  The srvid, data and
+  private_data arguments are not used and no reply is sent from
+  this function.
+*/
+static void clearlog_handler(struct ctdb_context *ctdb, uint64_t srvid, 
+                            TDB_DATA data, void *private_data)
+{
+       ctdb_clear_log(ctdb);
+}
+
+/*
+  handler for reload_nodes
+
+  Logs the request and calls ctdb_load_nodes_file() on rec->ctdb.
+  private_data must be the struct ctdb_recoverd (it is resolved with
+  talloc_get_type); the message payload in data is not examined.
+*/
+static void reload_nodes_handler(struct ctdb_context *ctdb, uint64_t srvid, 
+                            TDB_DATA data, void *private_data)
+{
+       struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
+
+       DEBUG(DEBUG_ERR, (__location__ " Reload nodes file from recovery daemon\n"));
+
+       ctdb_load_nodes_file(rec->ctdb);
+}
+
+
+/*
+  Timed-event callback for the deferred rebalance: if rebalance target
+  nodes are still recorded in rec->force_rebalance_nodes, trigger a
+  takeover run; otherwise only log that there was nothing to do.
+ */
+static void ctdb_rebalance_timeout(struct event_context *ev,
+                                  struct timed_event *te,
+                                  struct timeval t, void *p)
+{
+       struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
+
+       if (rec->force_rebalance_nodes != NULL) {
+               DEBUG(DEBUG_NOTICE,
+                     ("Rebalance timeout occurred - do takeover run\n"));
+               do_takeover_run(rec, rec->nodemap, false);
+               return;
+       }
+
+       DEBUG(DEBUG_ERR,
+             ("Rebalance timeout occurred - no nodes to rebalance\n"));
+}
+
+       
+/*
+  Handler for node rebalance messages.
+
+  Ignored unless this node is the recmaster.  The payload must be
+  exactly one uint32_t holding the PNN to rebalance IPs to.  That PNN
+  is appended to rec->force_rebalance_nodes (the array is rebuilt, see
+  below) and, if the deferred_rebalance_on_node_add tunable is
+  non-zero, a timer is armed that runs ctdb_rebalance_timeout().
+ */
+static void recd_node_rebalance_handler(struct ctdb_context *ctdb,
+                                       uint64_t srvid,
+                                       TDB_DATA data, void *private_data)
+{
+       uint32_t pnn;
+       uint32_t *t;
+       size_t len;
+       uint32_t deferred_rebalance;
+       struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
+
+       if (rec->recmaster != ctdb_get_pnn(ctdb)) {
+               return;
+       }
+
+       if (data.dsize != sizeof(uint32_t)) {
+               /* both arguments are size_t, so %zu is the matching conversion */
+               DEBUG(DEBUG_ERR,(__location__ " Incorrect size of node rebalance message. Was %zu but expected %zu bytes\n", data.dsize, sizeof(uint32_t)));
+               return;
+       }
+
+       /* copy rather than dereference a cast pointer: the payload
+        * buffer is not known to be suitably aligned for uint32_t
+        */
+       memcpy(&pnn, data.dptr, sizeof(pnn));
+
+       DEBUG(DEBUG_NOTICE,("Setting up rebalance of IPs to node %u\n", pnn));
+
+       /* Copy any existing list of nodes.  There's probably some
+        * sort of realloc variant that will do this but we need to
+        * make sure that freeing the old array also cancels the timer
+        * event for the timeout... not sure if realloc will do that.
+        */
+       len = (rec->force_rebalance_nodes != NULL) ?
+               talloc_array_length(rec->force_rebalance_nodes) :
+               0;
+
+       /* This allows duplicates to be added but they don't cause
+        * harm.  A call to add a duplicate PNN arguably means that
+        * the timeout should be reset, so this is the simplest
+        * solution.
+        */
+       t = talloc_zero_array(rec, uint32_t, len+1);
+       CTDB_NO_MEMORY_VOID(ctdb, t);
+       if (len > 0) {
+               memcpy(t, rec->force_rebalance_nodes, sizeof(uint32_t) * len);
+       }
+       t[len] = pnn;
+
+       talloc_free(rec->force_rebalance_nodes);
+
+       rec->force_rebalance_nodes = t;
+
+       /* If configured, setup a deferred takeover run to make sure
+        * that certain nodes get IPs rebalanced to them.  This will
+        * be cancelled if a successful takeover run happens before
+        * the timeout.  Assign tunable value to variable for
+        * readability.
+        */
+       deferred_rebalance = ctdb->tunable.deferred_rebalance_on_node_add;
+       if (deferred_rebalance != 0) {
+               /* The timer is allocated on the node array itself so
+                * that freeing the array also drops the timer.
+                */
+               if (event_add_timed(ctdb->ev, rec->force_rebalance_nodes,
+                                   timeval_current_ofs(deferred_rebalance, 0),
+                                   ctdb_rebalance_timeout, rec) == NULL) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to schedule deferred rebalance\n"));
+               }
+       }
+}
+
+
+
+/*
+  Handler for "update ip" messages.
+
+  Only acted on by the recmaster.  The payload must be exactly one
+  struct ctdb_public_ip, which is handed to
+  update_ip_assignment_tree().  Anything else is logged and dropped.
+ */
+static void recd_update_ip_handler(struct ctdb_context *ctdb, uint64_t srvid, 
+                            TDB_DATA data, void *private_data)
+{
+       struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
+       struct ctdb_public_ip *ip;
+
+       if (rec->recmaster != rec->ctdb->pnn) {
+               DEBUG(DEBUG_INFO,("Not recmaster, ignore update ip message\n"));
+               return;
+       }
+
+       if (data.dsize != sizeof(struct ctdb_public_ip)) {
+               /* both arguments are size_t, so %zu is the matching conversion */
+               DEBUG(DEBUG_ERR,(__location__ " Incorrect size of recd update ip message. Was %zu but expected %zu bytes\n", data.dsize, sizeof(struct ctdb_public_ip)));
+               return;
+       }
+
+       ip = (struct ctdb_public_ip *)data.dptr;
+
+       update_ip_assignment_tree(rec->ctdb, ip);
+}
+
+
+/*
+  Re-enable takeover runs: free the context whose existence marks
+  them as disabled and reset the pointer to NULL.
+ */
+static void clear_takeover_runs_disable(struct ctdb_recoverd *rec)
+{
+       if (rec->takeover_runs_disable_ctx != NULL) {
+               talloc_free(rec->takeover_runs_disable_ctx);
+               rec->takeover_runs_disable_ctx = NULL;
+       }
+}
+
+/*
+  Timed-event callback: the disable period has expired, so log it and
+  drop the context that marks takeover runs as disabled.
+ */
+static void reenable_takeover_runs(struct event_context *ev,
+                                  struct timed_event *te,
+                                  struct timeval yt, void *p)
+{
+       struct ctdb_recoverd *rec = talloc_get_type(p, struct ctdb_recoverd);
+
+       DEBUG(DEBUG_NOTICE,("Reenabling takeover runs after timeout\n"));
+
+       /* same effect as clear_takeover_runs_disable(rec) */
+       TALLOC_FREE(rec->takeover_runs_disable_ctx);
+}
+
+/*
+  Handler for CTDB_SRVID_DISABLE_TAKEOVER_RUNS messages.
+
+  The payload must be exactly one struct srvid_request whose data
+  field is a timeout in seconds:
+    - timeout == 0 re-enables takeover runs immediately;
+    - timeout  > 0 disables them for that many seconds, unless this
+      node is inactive (-EHOSTDOWN) or a takeover run is in progress
+      (-EAGAIN).
+
+  The int32_t result handed to srvid_request_reply() is this node's
+  PNN on success or a negative errno value on failure.
+
+  A malformed message (wrong size or NULL payload) carries no usable
+  request, so it is logged and dropped without attempting a reply.
+ */
+static void disable_takeover_runs_handler(struct ctdb_context *ctdb,
+                                         uint64_t srvid, TDB_DATA data,
+                                         void *private_data)
+{
+       struct ctdb_recoverd *rec = talloc_get_type(private_data,
+                                                   struct ctdb_recoverd);
+       struct srvid_request *r;
+       uint32_t timeout;
+       TDB_DATA result;
+       int32_t ret = 0;
+
+       /* Validate input data.  These checks must return rather than
+        * jump to "done": r has not been set yet, so there is no
+        * request that a reply could be built from.
+        */
+       if (data.dsize != sizeof(struct srvid_request)) {
+               DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
+                                "expecting %lu\n", (long unsigned)data.dsize,
+                                (long unsigned)sizeof(struct srvid_request)));
+               return;
+       }
+       if (data.dptr == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
+               return;
+       }
+
+       r = (struct srvid_request *)data.dptr;
+       timeout = r->data;
+
+       if (timeout == 0) {
+               DEBUG(DEBUG_NOTICE,("Reenabling takeover runs\n"));
+               clear_takeover_runs_disable(rec);
+               ret = ctdb_get_pnn(ctdb);
+               goto done;
+       }
+
+       if (rec->node_flags & NODE_FLAGS_INACTIVE) {
+               DEBUG(DEBUG_ERR,
+                     ("Refusing to disable takeover runs on inactive node\n"));
+               ret = -EHOSTDOWN;
+               goto done;
+       }
+
+       if (rec->takeover_run_in_progress) {
+               DEBUG(DEBUG_ERR,
+                     ("Unable to disable takeover runs - in progress\n"));
+               ret = -EAGAIN;
+               goto done;
+       }
+
+       DEBUG(DEBUG_NOTICE,("Disabling takeover runs for %u seconds\n", timeout));
+
+       /* Clear any old timers */
+       clear_takeover_runs_disable(rec);
+
+       /* When this is non-NULL it indicates that takeover runs are
+        * disabled.  This context also holds the timeout timer.
+        */
+       rec->takeover_runs_disable_ctx = talloc_new(rec);
+       if (rec->takeover_runs_disable_ctx == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Unable to allocate memory\n"));
+               ret = -ENOMEM;
+               goto done;
+       }
+
+       /* Arrange for the timeout to occur.  If the timer cannot be
+        * created, undo the disable: otherwise takeover runs would
+        * stay disabled with nothing scheduled to re-enable them.
+        */
+       if (event_add_timed(ctdb->ev, rec->takeover_runs_disable_ctx,
+                           timeval_current_ofs(timeout, 0),
+                           reenable_takeover_runs,
+                           rec) == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Unable to schedule reenable timer\n"));
+               clear_takeover_runs_disable(rec);
+               ret = -ENOMEM;
+               goto done;
+       }
+
+       /* Returning our PNN tells the caller that we succeeded */
+       ret = ctdb_get_pnn(ctdb);
+done:
+       result.dsize = sizeof(int32_t);
+       result.dptr  = (uint8_t *)&ret;
+       srvid_request_reply(ctdb, r, result);
+}
+
+/* Backward compatibility for this SRVID.
+ *
+ * The old message carries only a uint32_t timeout.  It is wrapped in
+ * a freshly allocated struct srvid_request that asks for no reply
+ * (srvid 0, pnn -1) and handed to disable_takeover_runs_handler().
+ * NOTE(review): the request is allocated on ctdb and not freed in
+ * this function - presumably the reply path releases it; confirm.
+ */
+static void disable_ip_check_handler(struct ctdb_context *ctdb, uint64_t srvid,
+                                    TDB_DATA data, void *private_data)
+{
+       struct ctdb_recoverd *rec = talloc_get_type(private_data,
+                                                   struct ctdb_recoverd);
+       struct srvid_request *wrapped;
+       TDB_DATA forwarded;
+
+       if (data.dsize != sizeof(uint32_t)) {
+               DEBUG(DEBUG_ERR,(__location__ " Wrong size for data :%lu "
+                                "expecting %lu\n", (long unsigned)data.dsize,
+                                (long unsigned)sizeof(uint32_t)));
+               return;
+       }
+       if (data.dptr == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " No data received\n"));
+               return;
+       }
+
+       wrapped = talloc(ctdb, struct srvid_request);
+       CTDB_NO_MEMORY_VOID(ctdb, wrapped);
+
+       wrapped->data = *((uint32_t *)data.dptr); /* Timeout */
+       wrapped->pnn = -1;
+       wrapped->srvid = 0; /* No reply */
+
+       forwarded.dptr = (uint8_t *)wrapped;
+       forwarded.dsize = sizeof(*wrapped);
+
+       disable_takeover_runs_handler(rec->ctdb,
+                                     CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
+                                     forwarded, rec);
+}
+
+/*
+  handler for ip reallocate: the request is only queued on
+  rec->reallocate_requests here.  It is dealt with later in the
+  monitor_cluster loop so that we do not recurse with other requests
+  to takeover_run()
+*/
+static void ip_reallocate_handler(struct ctdb_context *ctdb, uint64_t srvid,
+                                 TDB_DATA data, void *private_data)
+{
+       struct ctdb_recoverd *rec = talloc_get_type(private_data,
+                                                   struct ctdb_recoverd);
+
+       /* the payload has to be exactly one srvid_request */
+       if (data.dsize != sizeof(struct srvid_request)) {
+               DEBUG(DEBUG_ERR, (__location__ " Wrong size of return address.\n"));
+               return;
+       }
+
+       srvid_request_add(ctdb, &rec->reallocate_requests,
+                         (struct srvid_request *)data.dptr);
+}
+
+/*
+ * Serve all queued ip reallocation requests in one go.
+ *
+ * The public ip lists of the remote nodes are reloaded first.  If that
+ * works a takeover run is performed.  Every queued requester gets the
+ * same int32_t answer: our PNN on success, -1 if the takeover run
+ * failed, or the error returned by the reload.
+ */
+static void process_ipreallocate_requests(struct ctdb_context *ctdb,
+                                         struct ctdb_recoverd *rec)
+{
+       uint32_t culprit;
+       int32_t ret;
+       TDB_DATA result;
+
+       DEBUG(DEBUG_INFO, ("recovery master forced ip reallocation\n"));
+
+       /* refresh the list of public ips that each connected node
+          can handle
+       */
+       ret = ctdb_reload_remote_public_ips(ctdb, rec, rec->nodemap, &culprit);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
+                                culprit));
+               /* remember that a takeover run is still outstanding */
+               rec->need_takeover_run = true;
+       } else if (do_takeover_run(rec, rec->nodemap, false)) {
+               ret = ctdb_get_pnn(ctdb);
+       } else {
+               ret = -1;
+       }
+
+       result.dptr  = (uint8_t *)&ret;
+       result.dsize = sizeof(int32_t);
+
+       srvid_requests_reply(ctdb, &rec->reallocate_requests, result);
+}
+
+
+/*
+  handler for recovery master elections
+
+  Every well-formed election packet restarts the election timeout.
+  If ctdb_election_win() says we beat the sender we schedule our own
+  election request (unless one is already pending).  Otherwise we stop
+  campaigning, give up the recovery lock when lock verification is
+  enabled, and let the sender become recmaster.
+*/
+static void election_handler(struct ctdb_context *ctdb, uint64_t srvid, 
+                            TDB_DATA data, void *private_data)
+{
+       struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
+       int ret;
+       struct election_message *em = (struct election_message *)data.dptr;
+
+       /* em is dereferenced below (and inside ctdb_election_win()), so
+          refuse empty or truncated packets before touching any state
+       */
+       if (data.dptr == NULL || data.dsize < sizeof(*em)) {
+               DEBUG(DEBUG_ERR,(__location__ " Invalid election message size %lu "
+                                "expecting %lu\n", (long unsigned)data.dsize,
+                                (long unsigned)sizeof(*em)));
+               return;
+       }
+
+       /* we got an election packet - update the timeout for the election */
+       talloc_free(rec->election_timeout);
+       rec->election_timeout = event_add_timed(ctdb->ev, ctdb, 
+                                               fast_start ?
+                                               timeval_current_ofs(0, 500000) :
+                                               timeval_current_ofs(ctdb->tunable.election_timeout, 0), 
+                                               ctdb_election_timeout, rec);
+
+       /* someone called an election. check their election data
+          and if we disagree and we would rather be the elected node, 
+          send a new election message to all other nodes
+        */
+       if (ctdb_election_win(rec, em)) {
+               if (!rec->send_election_te) {
+                       rec->send_election_te = event_add_timed(ctdb->ev, rec, 
+                                                               timeval_current_ofs(0, 500000),
+                                                               election_send_request, rec);
+               }
+               /*unban_all_nodes(ctdb);*/
+               return;
+       }
+
+       /* we didn't win */
+       talloc_free(rec->send_election_te);
+       rec->send_election_te = NULL;
+
+       if (ctdb->tunable.verify_recovery_lock != 0) {
+               /* release the recmaster lock */
+               if (em->pnn != ctdb->pnn &&
+                   ctdb->recovery_lock_fd != -1) {
+                       close(ctdb->recovery_lock_fd);
+                       ctdb->recovery_lock_fd = -1;
+                       unban_all_nodes(ctdb);
+               }
+       }
+
+       /* ok, let that guy become recmaster then */
+       ret = ctdb_ctrl_setrecmaster(ctdb, CONTROL_TIMEOUT(), ctdb_get_pnn(ctdb), em->pnn);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " failed to send recmaster election request\n"));
+               return;
+       }
+}
+
+
+/*
+  force the start of the election process
+
+  Puts the cluster into recovery mode, (re)arms the election timeout,
+  sends our own election request for pnn and then waits in
+  ctdb_wait_election().  Any failure is logged and ends the attempt
+  early; nothing is returned to the caller.
+ */
+static void force_election(struct ctdb_recoverd *rec, uint32_t pnn, 
+                          struct ctdb_node_map *nodemap)
+{
+       int ret;
+       struct ctdb_context *ctdb = rec->ctdb;
+
+       DEBUG(DEBUG_INFO,(__location__ " Force an election\n"));
+
+       /* set all nodes to recovery mode to stop all internode traffic */
+       ret = set_recovery_mode(ctdb, rec, nodemap, CTDB_RECOVERY_ACTIVE);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to set recovery mode to active on cluster\n"));
+               return;
+       }
+
+       /* replace any previous election timeout with a fresh one */
+       talloc_free(rec->election_timeout);
+       rec->election_timeout = event_add_timed(ctdb->ev, ctdb, 
+                                               fast_start ?
+                                               timeval_current_ofs(0, 500000) :
+                                               timeval_current_ofs(ctdb->tunable.election_timeout, 0), 
+                                               ctdb_election_timeout, rec);
+
+       ret = send_election_request(rec, pnn);
+       if (ret!=0) {
+               /* terminate the message with a newline like every other
+                  log line in this file */
+               DEBUG(DEBUG_ERR, (__location__ " failed to initiate recmaster election\n"));
+               return;
+       }
+
+       /* wait for a few seconds to collect all responses */
+       ctdb_wait_election(rec);
+}
+
+
+
+/*
+  handler for when a node changes its flags
+
+  The change is checked against a freshly fetched local nodemap.  If we
+  are the recovery master, the cluster is in normal recovery mode and
+  the NODE_FLAGS_DISABLED bits differ from what the nodemap held, a
+  takeover run is requested through rec->need_takeover_run.
+*/
+static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid, 
+                           TDB_DATA data, void *private_data)
+{
+       struct ctdb_recoverd *rec = talloc_get_type(private_data, struct ctdb_recoverd);
+       struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
+       struct ctdb_node_map *nodemap=NULL;
+       TALLOC_CTX *tmp_ctx;
+       int disabled_flag_changed;
+       int ret;
+       int i;
+
+       if (data.dsize != sizeof(*c)) {
+               DEBUG(DEBUG_ERR,(__location__ "Invalid data in ctdb_node_flag_change\n"));
+               return;
+       }
+
+       tmp_ctx = talloc_new(ctdb);
+       CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
+
+       ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ "ctdb_ctrl_getnodemap failed in monitor_handler\n"));
+               talloc_free(tmp_ctx);
+               return;
+       }
+
+       /* locate the slot of the node whose flags changed */
+       i = 0;
+       while (i < nodemap->num && nodemap->nodes[i].pnn != c->pnn) {
+               i++;
+       }
+
+       if (i == nodemap->num) {
+               DEBUG(DEBUG_CRIT,(__location__ "Flag change for non-existant node %u\n", c->pnn));
+               talloc_free(tmp_ctx);
+               return;
+       }
+
+       if (c->old_flags != c->new_flags) {
+               DEBUG(DEBUG_NOTICE,("Node %u has changed flags - now 0x%x  was 0x%x\n", c->pnn, c->new_flags, c->old_flags));
+       }
+
+       /* compare against the nodemap copy before overwriting it */
+       disabled_flag_changed =  (nodemap->nodes[i].flags ^ c->new_flags) & NODE_FLAGS_DISABLED;
+
+       nodemap->nodes[i].flags = c->new_flags;
+
+       ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), 
+                                    CTDB_CURRENT_NODE, &ctdb->recovery_master);
+       if (ret == 0) {
+               ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, CONTROL_TIMEOUT(), 
+                                          CTDB_CURRENT_NODE, &ctdb->recovery_mode);
+       }
+
+       /* Only do the takeover run if the perm disabled or unhealthy
+          flags changed since these will cause an ip failover but not
+          a recovery.
+          If the node became disconnected or banned this will also
+          lead to an ip address failover but that is handled 
+          during recovery
+       */
+       if (ret == 0 && disabled_flag_changed &&
+           ctdb->recovery_master == ctdb->pnn &&
+           ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
+               rec->need_takeover_run = true;
+       }
+
+       talloc_free(tmp_ctx);
+}
+
+/*
+  handler for when we need to push out flag changes to all other nodes
+
+  The payload is a struct ctdb_node_flag_change.  It is validated
+  against the recovery master's nodemap and then forwarded unchanged,
+  as a CTDB_CONTROL_MODIFY_FLAGS control, to all connected nodes.
+*/
+static void push_flags_handler(struct ctdb_context *ctdb, uint64_t srvid, 
+                           TDB_DATA data, void *private_data)
+{
+       int ret;
+       struct ctdb_node_flag_change *c = (struct ctdb_node_flag_change *)data.dptr;
+       struct ctdb_node_map *nodemap=NULL;
+       TALLOC_CTX *tmp_ctx;
+       uint32_t recmaster;
+       uint32_t *nodes;
+
+       /* c->pnn is read below, so make sure the message really is a
+          ctdb_node_flag_change (same check as monitor_handler())
+       */
+       if (data.dptr == NULL || data.dsize != sizeof(*c)) {
+               DEBUG(DEBUG_ERR,(__location__ " Invalid data in ctdb_node_flag_change\n"));
+               return;
+       }
+
+       tmp_ctx = talloc_new(ctdb);
+       CTDB_NO_MEMORY_VOID(ctdb, tmp_ctx);
+
+       /* find the recovery master */
+       ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &recmaster);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from local node\n"));
+               talloc_free(tmp_ctx);
+               return;
+       }
+
+       /* read the node flags from the recmaster */
+       ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), recmaster, tmp_ctx, &nodemap);
+       if (ret != 0) {
+               /* the nodemap was requested from the recmaster, so that is
+                  the node to name in the error */
+               DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", recmaster));
+               talloc_free(tmp_ctx);
+               return;
+       }
+       if (c->pnn >= nodemap->num) {
+               DEBUG(DEBUG_ERR,(__location__ " Nodemap from recmaster does not contain node %u\n", c->pnn));
+               talloc_free(tmp_ctx);
+               return;
+       }
+
+       /* send the flags update to all connected nodes */
+       nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
+
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_MODIFY_FLAGS,
+                                     nodes, 0, CONTROL_TIMEOUT(),
+                                     false, data,
+                                     NULL, NULL,
+                                     NULL) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " ctdb_control to modify node flags failed\n"));
+
+               talloc_free(tmp_ctx);
+               return;
+       }
+
+       talloc_free(tmp_ctx);
+}
+
+
+/* Shared state for the async getrecmode controls sent by verify_recmode() */
+struct verify_recmode_normal_data {
+       /* number of controls sent whose callback has not run yet */
+       uint32_t count;
+       /* overall verdict; starts as MONITOR_OK and is changed by the callback */
+       enum monitor_result status;
+};
+
+/*
+ * Completion callback for one async getrecmode control.
+ *
+ * Decrements the outstanding counter and folds this node's answer into
+ * the shared verify_recmode_normal_data status.
+ */
+static void verify_recmode_normal_callback(struct ctdb_client_control_state *state)
+{
+       struct verify_recmode_normal_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmode_normal_data);
+
+       /* one more node has responded with recmode data*/
+       rmdata->count--;
+
+       if (state->state != CTDB_CONTROL_DONE) {
+               /* we failed to get the recmode: report an error so the
+                  main loop tries again, but do not hide a verdict that
+                  another node has already set
+               */
+               if (rmdata->status == MONITOR_OK) {
+                       rmdata->status = MONITOR_FAILED;
+               }
+       } else if (state->status != CTDB_RECOVERY_NORMAL) {
+               /* on success the recmode is stored in the status field */
+               DEBUG(DEBUG_NOTICE, ("Node:%u was in recovery mode. Start recovery process\n", state->c->hdr.destnode));
+               rmdata->status = MONITOR_RECOVERY_NEEDED;
+       }
+}
+
+
+/*
+ * verify that all nodes are in normal recovery mode
+ *
+ * An async getrecmode control is sent to every node that is not
+ * flagged NODE_FLAGS_INACTIVE and the event loop is run until each of
+ * them has completed.  Returns the status collected by
+ * verify_recmode_normal_callback(), or MONITOR_FAILED if a control
+ * could not be sent.
+ */
+static enum monitor_result verify_recmode(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+{
+       TALLOC_CTX *mem_ctx = talloc_new(ctdb);
+       struct verify_recmode_normal_data *rmdata;
+       struct ctdb_client_control_state *state;
+       enum monitor_result status;
+       int j;
+
+       rmdata = talloc(mem_ctx, struct verify_recmode_normal_data);
+       CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
+       rmdata->status = MONITOR_OK;
+       rmdata->count  = 0;
+
+       /* send an async getrecmode call to every active node */
+       j = 0;
+       while (j < nodemap->num) {
+               if (!(nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE)) {
+                       state = ctdb_ctrl_getrecmode_send(ctdb, mem_ctx, 
+                                               CONTROL_TIMEOUT(), 
+                                               nodemap->nodes[j].pnn);
+                       if (state == NULL) {
+                               /* we failed to send the control, treat this as 
+                                  an error and try again next iteration
+                               */
+                               DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmode_send during monitoring\n"));
+                               talloc_free(mem_ctx);
+                               return MONITOR_FAILED;
+                       }
+
+                       /* hook up the completion callback */
+                       state->async.private_data = rmdata;
+                       state->async.fn = verify_recmode_normal_callback;
+
+                       /* one more control to wait for to complete */
+                       rmdata->count++;
+               }
+               j++;
+       }
+
+       /* run the event loop until every control we sent has
+          completed (each completion decrements count)
+       */
+       while (rmdata->count > 0) {
+               event_loop_once(ctdb->ev);
+       }
+
+       /* copy the verdict out before the context holding it is freed */
+       status = rmdata->status;
+       talloc_free(mem_ctx);
+       return status;
+}
+
+
+/* Shared state for the async getrecmaster controls sent by verify_recmaster() */
+struct verify_recmaster_data {
+       /* recovery daemon state, passed to ctdb_set_culprit() by the callback */
+       struct ctdb_recoverd *rec;
+       /* number of controls sent whose callback has not run yet */
+       uint32_t count;
+       /* the pnn every node is expected to report as recmaster */
+       uint32_t pnn;
+       /* overall verdict; starts as MONITOR_OK and is changed by the callback */
+       enum monitor_result status;
+};
+
+/*
+ * Completion callback for one async getrecmaster control.
+ *
+ * Decrements the outstanding counter and compares the recmaster the
+ * node reported with the one we expect (rmdata->pnn).
+ */
+static void verify_recmaster_callback(struct ctdb_client_control_state *state)
+{
+       struct verify_recmaster_data *rmdata = talloc_get_type(state->async.private_data, struct verify_recmaster_data);
+
+       /* one more node has responded with recmaster data*/
+       rmdata->count--;
+
+       if (state->state != CTDB_CONTROL_DONE) {
+               /* we failed to get the recmaster: report an error so the
+                  main loop tries again, but do not hide a verdict that
+                  another node has already set
+               */
+               if (rmdata->status == MONITOR_OK) {
+                       rmdata->status = MONITOR_FAILED;
+               }
+       } else if (state->status != rmdata->pnn) {
+               /* on success the recmaster is stored in the status field */
+               DEBUG(DEBUG_ERR,("Node %d thinks node %d is recmaster. Need a new recmaster election\n", state->c->hdr.destnode, state->status));
+               ctdb_set_culprit(rmdata->rec, state->c->hdr.destnode);
+               rmdata->status = MONITOR_ELECTION_NEEDED;
+       }
+}
+
+
+/*
+ * verify that all nodes agree that we are the recmaster
+ *
+ * An async getrecmaster control is sent to every node that is not
+ * flagged NODE_FLAGS_INACTIVE and the event loop is run until each of
+ * them has completed.  Returns the status collected by
+ * verify_recmaster_callback(), or MONITOR_FAILED if a control could
+ * not be sent.
+ */
+static enum monitor_result verify_recmaster(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap, uint32_t pnn)
+{
+       struct ctdb_context *ctdb = rec->ctdb;
+       TALLOC_CTX *mem_ctx = talloc_new(ctdb);
+       struct verify_recmaster_data *rmdata;
+       struct ctdb_client_control_state *state;
+       enum monitor_result status;
+       int j;
+
+       rmdata = talloc(mem_ctx, struct verify_recmaster_data);
+       CTDB_NO_MEMORY_FATAL(ctdb, rmdata);
+       rmdata->status = MONITOR_OK;
+       rmdata->pnn    = pnn;
+       rmdata->count  = 0;
+       rmdata->rec    = rec;
+
+       /* send an async getrecmaster call to every active node */
+       j = 0;
+       while (j < nodemap->num) {
+               if (!(nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE)) {
+                       state = ctdb_ctrl_getrecmaster_send(ctdb, mem_ctx, 
+                                               CONTROL_TIMEOUT(),
+                                               nodemap->nodes[j].pnn);
+                       if (state == NULL) {
+                               /* we failed to send the control, treat this as 
+                                  an error and try again next iteration
+                               */
+                               DEBUG(DEBUG_ERR,("Failed to call ctdb_ctrl_getrecmaster_send during monitoring\n"));
+                               talloc_free(mem_ctx);
+                               return MONITOR_FAILED;
+                       }
+
+                       /* hook up the completion callback */
+                       state->async.private_data = rmdata;
+                       state->async.fn = verify_recmaster_callback;
+
+                       /* one more control to wait for to complete */
+                       rmdata->count++;
+               }
+               j++;
+       }
+
+       /* run the event loop until every control we sent has
+          completed (each completion decrements count)
+       */
+       while (rmdata->count > 0) {
+               event_loop_once(ctdb->ev);
+       }
+
+       /* copy the verdict out before the context holding it is freed */
+       status = rmdata->status;
+       talloc_free(mem_ctx);
+       return status;
+}
+
+/*
+ * Fetch the interface list of the local node and compare it with the
+ * copy cached in rec->ifaces.
+ *
+ * Returns true if there is no cached copy yet, if the number of
+ * interfaces differs, if a name or link state differs in any slot, or
+ * if the list could not be fetched at all.  On a successful fetch the
+ * cached copy is replaced with the new list.
+ */
+static bool interfaces_have_changed(struct ctdb_context *ctdb,
+                                   struct ctdb_recoverd *rec)
+{
+       TALLOC_CTX *mem_ctx = talloc_new(NULL);
+       struct ctdb_control_get_ifaces *ifaces = NULL;
+       bool ret = false;
+       int i;
+
+       /* Read the interfaces from the local node */
+       if (ctdb_ctrl_get_ifaces(ctdb, CONTROL_TIMEOUT(),
+                                CTDB_CURRENT_NODE, mem_ctx, &ifaces) != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get interfaces from local node %u\n", ctdb->pnn));
+               /* We could return an error.  However, this will be
+                * rare so we'll decide that the interfaces have
+                * actually changed, just in case.
+                */
+               talloc_free(mem_ctx);
+               return true;
+       }
+
+       if (rec->ifaces == NULL) {
+               /* We haven't been here before so things have changed */
+               DEBUG(DEBUG_NOTICE, ("Initial interface fetched\n"));
+               ret = true;
+       } else if (rec->ifaces->num != ifaces->num) {
+               /* Number of interfaces has changed */
+               DEBUG(DEBUG_NOTICE, ("Interface count changed from %d to %d\n",
+                                    rec->ifaces->num, ifaces->num));
+               ret = true;
+       } else {
+               /* Same count: stop at the first slot whose name or
+                * link state differs
+                */
+               for (i = 0; !ret && i < rec->ifaces->num; i++) {
+                       struct ctdb_control_iface_info *old_if = &rec->ifaces->ifaces[i];
+                       struct ctdb_control_iface_info *new_if = &ifaces->ifaces[i];
+
+                       if (strcmp(old_if->name, new_if->name) != 0) {
+                               DEBUG(DEBUG_NOTICE,
+                                     ("Interface in slot %d changed: %s => %s\n",
+                                      i, old_if->name, new_if->name));
+                               ret = true;
+                       } else if (old_if->link_state != new_if->link_state) {
+                               DEBUG(DEBUG_NOTICE,
+                                     ("Interface %s changed state: %d => %d\n",
+                                      old_if->name, old_if->link_state,
+                                      new_if->link_state));
+                               ret = true;
+                       }
+               }
+       }
+
+       /* the new list becomes the cached copy, owned by rec */
+       talloc_free(rec->ifaces);
+       rec->ifaces = talloc_steal(rec, ifaces);
+
+       talloc_free(mem_ctx);
+       return ret;
+}
+
+/* called to check that the local allocation of public ip addresses is ok.
+ *
+ * The uptime of the local node is sampled before and after the
+ * interface check; if a recovery started or finished in between, or is
+ * still in progress, the ip check is skipped.  Otherwise the public
+ * ips reported by the local daemon are compared with what is on the
+ * interfaces.  If anything looks wrong a CTDB_SRVID_TAKEOVER_RUN
+ * message is sent to rec->recmaster.
+ *
+ * Returns 0 if the check ran or was skipped, -1 if a control to the
+ * local daemon failed.
+ */
+static int verify_local_ip_allocation(struct ctdb_context *ctdb, struct ctdb_recoverd *rec, uint32_t pnn, struct ctdb_node_map *nodemap)
+{
+       TALLOC_CTX *mem_ctx = talloc_new(NULL);
+       struct ctdb_uptime *uptime1 = NULL;
+       struct ctdb_uptime *uptime2 = NULL;
+       int ret, j;
+       bool need_takeover_run = false;
+
+       ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
+                               CTDB_CURRENT_NODE, &uptime1);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
+               talloc_free(mem_ctx);
+               return -1;
+       }
+
+       if (interfaces_have_changed(ctdb, rec)) {
+               DEBUG(DEBUG_NOTICE, ("The interfaces status has changed on "
+                                    "local node %u - force takeover run\n",
+                                    pnn));
+               need_takeover_run = true;
+       }
+
+       ret = ctdb_ctrl_uptime(ctdb, mem_ctx, CONTROL_TIMEOUT(),
+                               CTDB_CURRENT_NODE, &uptime2);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get uptime from local node %u\n", pnn));
+               talloc_free(mem_ctx);
+               return -1;
+       }
+
+       /* skip the check if the startrecovery time has changed */
+       if (timeval_compare(&uptime1->last_recovery_started,
+                           &uptime2->last_recovery_started) != 0) {
+               DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
+               talloc_free(mem_ctx);
+               return 0;
+       }
+
+       /* skip the check if the endrecovery time has changed */
+       if (timeval_compare(&uptime1->last_recovery_finished,
+                           &uptime2->last_recovery_finished) != 0) {
+               DEBUG(DEBUG_NOTICE, (__location__ " last recovery time changed while we read the public ip list. skipping public ip address check\n"));
+               talloc_free(mem_ctx);
+               return 0;
+       }
+
+       /* skip the check if we have started but not finished recovery */
+       if (timeval_compare(&uptime1->last_recovery_finished,
+                           &uptime1->last_recovery_started) != 1) {
+               DEBUG(DEBUG_INFO, (__location__ " in the middle of recovery or ip reallocation. skipping public ip address check\n"));
+               talloc_free(mem_ctx);
+
+               return 0;
+       }
+
+       /* verify that we have the ip addresses we should have
+          and we dont have ones we shouldnt have.
+          if we find an inconsistency we ask the recmaster for a
+          takeover run (see the end of this function).
+          also if the pnn is -1 and we are healthy and can host the ip
+          we also request a ip reallocation.
+       */
+       if (ctdb->tunable.disable_ip_failover == 0) {
+               struct ctdb_all_public_ips *ips = NULL;
+               bool can_host_ips = false;
+
+               /* nodemap is indexed by pnn below; make sure our own
+                  pnn is inside the map before reading our flags
+               */
+               if (pnn < nodemap->num) {
+                       can_host_ips = (nodemap->nodes[pnn].flags == 0);
+               } else {
+                       DEBUG(DEBUG_ERR,(__location__ " Nodemap does not contain local node %u\n", pnn));
+               }
+
+               /* read the *available* IPs from the local node */
+               ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, &ips);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, ("Unable to get available public IPs from local node %u\n", pnn));
+                       talloc_free(mem_ctx);
+                       return -1;
+               }
+
+               for (j=0; j<ips->num; j++) {
+                       if (ips->ips[j].pnn == -1 && can_host_ips) {
+                               DEBUG(DEBUG_CRIT,("Public IP '%s' is not assigned and we could serve it\n",
+                                                 ctdb_addr_to_str(&ips->ips[j].addr)));
+                               need_takeover_run = true;
+                       }
+               }
+
+               talloc_free(ips);
+
+               /* read the *known* IPs from the local node */
+               ret = ctdb_ctrl_get_public_ips_flags(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, mem_ctx, 0, &ips);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, ("Unable to get known public IPs from local node %u\n", pnn));
+                       talloc_free(mem_ctx);
+                       return -1;
+               }
+
+               for (j=0; j<ips->num; j++) {
+                       if (ips->ips[j].pnn == pnn) {
+                               if (ctdb->do_checkpublicip && !ctdb_sys_have_ip(&ips->ips[j].addr)) {
+                                       DEBUG(DEBUG_CRIT,("Public IP '%s' is assigned to us but not on an interface\n",
+                                               ctdb_addr_to_str(&ips->ips[j].addr)));
+                                       need_takeover_run = true;
+                               }
+                       } else {
+                               if (ctdb->do_checkpublicip &&
+                                   ctdb_sys_have_ip(&ips->ips[j].addr)) {
+
+                                       DEBUG(DEBUG_CRIT,("We are still serving a public IP '%s' that we should not be serving. Removing it\n", 
+                                               ctdb_addr_to_str(&ips->ips[j].addr)));
+
+                                       if (ctdb_ctrl_release_ip(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ips->ips[j]) != 0) {
+                                               DEBUG(DEBUG_ERR,("Failed to release local IP address\n"));
+                                       }
+                               }
+                       }
+               }
+       }
+
+       if (need_takeover_run) {
+               struct srvid_request rd;
+               TDB_DATA data;
+
+               DEBUG(DEBUG_CRIT,("Trigger takeoverrun\n"));
+
+               /* the whole struct goes over the wire, so clear it first:
+                  no uninitialised stack bytes (the data member, padding)
+                  may leave this process
+               */
+               memset(&rd, 0, sizeof(rd));
+               rd.pnn = ctdb->pnn;
+               rd.srvid = 0;
+               data.dptr = (uint8_t *)&rd;
+               data.dsize = sizeof(rd);
+
+               ret = ctdb_client_send_message(ctdb, rec->recmaster, CTDB_SRVID_TAKEOVER_RUN, data);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to send ipreallocate to recmaster :%d\n", (int)rec->recmaster));
+               }
+       }
+       talloc_free(mem_ctx);
+       return 0;
+}
+
+
+/*
+ * Per-node callback for the CTDB_CONTROL_GET_NODEMAP controls sent by
+ * get_remote_nodemaps().  The returned buffer is re-parented onto the
+ * remote_nodemaps array and stored in the slot indexed by the
+ * responding node's pnn.
+ */
+static void async_getnodemap_callback(struct ctdb_context *ctdb, uint32_t node_pnn, int32_t res, TDB_DATA outdata, void *callback_data)
+{
+       struct ctdb_node_map **remote_nodemaps = callback_data;
+
+       if (node_pnn < ctdb->num_nodes) {
+               remote_nodemaps[node_pnn] = (struct ctdb_node_map *)talloc_steal(remote_nodemaps, outdata.dptr);
+               return;
+       }
+
+       /* the pnn would index past the array - drop the reply */
+       DEBUG(DEBUG_ERR,(__location__ " pnn from invalid node\n"));
+}
+
+/*
+ * Ask every active node for its nodemap.  The answers are stored in
+ * remote_nodemaps[] by async_getnodemap_callback().
+ *
+ * Returns 0 on success, -1 if the async control reported a failure.
+ */
+static int get_remote_nodemaps(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
+       struct ctdb_node_map *nodemap,
+       struct ctdb_node_map **remote_nodemaps)
+{
+       uint32_t *nodes = list_of_active_nodes(ctdb, nodemap, mem_ctx, true);
+
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_NODEMAP,
+                                       nodes, 0,
+                                       CONTROL_TIMEOUT(), false, tdb_null,
+                                       async_getnodemap_callback,
+                                       NULL,
+                                       remote_nodemaps) == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR, (__location__ " Unable to pull all remote nodemaps\n"));
+       return -1;
+}
+
+/* Outcome of the reclock check: still running, child reported success,
+ * child reported failure (or the pipe read failed), or the timeout fired.
+ */
+enum reclock_child_status { RECLOCK_CHECKING, RECLOCK_OK, RECLOCK_FAILED, RECLOCK_TIMEOUT};
+/* State of one check_recovery_lock() run */
+struct ctdb_check_reclock_state {
+       struct ctdb_context *ctdb;
+       /* when the check started; used by the destructor to report latency */
+       struct timeval start_time;
+       /* pipe from child to parent: the parent reads fd[0], the child writes fd[1] */
+       int fd[2];
+       /* pid of the forked child; sent SIGKILL by the destructor */
+       pid_t child;
+       /* timeout event; its handler sets status to RECLOCK_TIMEOUT */
+       struct timed_event *te;
+       /* read event on fd[0]; its handler sets RECLOCK_OK or RECLOCK_FAILED */
+       struct fd_event *fde;
+       /* check_recovery_lock() loops while this is RECLOCK_CHECKING */
+       enum reclock_child_status status;
+};
+
+/* talloc destructor for the reclock check state: report how long the
+   check took, close whichever pipe ends are still open and kill the
+   child process.
+*/
+static int check_reclock_destructor(struct ctdb_check_reclock_state *state)
+{
+       struct ctdb_context *ctdb = state->ctdb;
+       int i;
+
+       ctdb_ctrl_report_recd_lock_latency(ctdb, CONTROL_TIMEOUT(), timeval_elapsed(&state->start_time));
+
+       /* read end first, then write end */
+       for (i = 0; i < 2; i++) {
+               if (state->fd[i] == -1) {
+                       continue;
+               }
+               close(state->fd[i]);
+               state->fd[i] = -1;
+       }
+
+       ctdb_kill(ctdb, state->child, SIGKILL);
+       return 0;
+}
+
+/*
+  timed event handler: the check_reclock child did not answer in time.
+  this would happen if i/o to the reclock file blocks.  Only the status
+  is changed here; check_recovery_lock() sees it and stops waiting.
+ */
+static void ctdb_check_reclock_timeout(struct event_context *ev, struct timed_event *te, 
+                                        struct timeval t, void *private_data)
+{
+       struct ctdb_check_reclock_state *state;
+
+       state = talloc_get_type(private_data, struct ctdb_check_reclock_state);
+
+       DEBUG(DEBUG_ERR,(__location__ " check_reclock child process hung/timedout CFS slow to grant locks?\n"));
+       state->status = RECLOCK_TIMEOUT;
+}
+
+/* fd event handler: the child process has finished checking the
+   reclock file and has written its one-byte verdict to the pipe.
+*/
+static void reclock_child_handler(struct event_context *ev, struct fd_event *fde, 
+                            uint16_t flags, void *private_data)
+{
+       struct ctdb_check_reclock_state *state;
+       char c = 0;
+       int ret;
+
+       state = talloc_get_type(private_data, struct ctdb_check_reclock_state);
+
+       /* the child answered, so the timeout is no longer needed */
+       talloc_free(state->te);
+       state->te = NULL;
+
+       ret = read(state->fd[0], &c, 1);
+       if (ret == 1 && c == RECLOCK_OK) {
+               state->status = RECLOCK_OK;
+               return;
+       }
+
+       /* short read, read error, or a verdict other than RECLOCK_OK */
+       DEBUG(DEBUG_ERR,(__location__ " reclock child process returned error %d\n", c));
+       state->status = RECLOCK_FAILED;
+}
+
+/*
+ * Check that the recovery lock file can still be read.
+ *
+ * A child process is forked to pread() one byte from
+ * ctdb->recovery_lock_fd and to report RECLOCK_OK or RECLOCK_FAILED
+ * through a pipe; the parent runs the event loop until the child
+ * answers or a 15 second timeout fires.
+ *
+ * Returns -1 if we do not hold the lock, if the check could not be set
+ * up, or if the child reported a failure (in which case
+ * recovery_lock_fd is closed as well).  Returns 0 otherwise; note that
+ * a timeout (RECLOCK_TIMEOUT) also returns 0.
+ */
+static int check_recovery_lock(struct ctdb_context *ctdb)
+{
+       int ret;
+       struct ctdb_check_reclock_state *state;
+       pid_t parent = getpid();
+
+       if (ctdb->recovery_lock_fd == -1) {
+               DEBUG(DEBUG_CRIT,("recovery master doesn't have the recovery lock\n"));
+               return -1;
+       }
+
+       state = talloc(ctdb, struct ctdb_check_reclock_state);
+       CTDB_NO_MEMORY(ctdb, state);
+
+       state->ctdb = ctdb;
+       state->start_time = timeval_current();
+       state->status = RECLOCK_CHECKING;
+       state->fd[0] = -1;
+       state->fd[1] = -1;
+
+       ret = pipe(state->fd);
+       if (ret != 0) {
+               talloc_free(state);
+               DEBUG(DEBUG_CRIT,(__location__ " Failed to open pipe for check_reclock child\n"));
+               return -1;
+       }
+
+       state->child = ctdb_fork(ctdb);
+       if (state->child == (pid_t)-1) {
+               DEBUG(DEBUG_CRIT,(__location__ " fork() failed in check_reclock child\n"));
+               close(state->fd[0]);
+               state->fd[0] = -1;
+               close(state->fd[1]);
+               state->fd[1] = -1;
+               talloc_free(state);
+               return -1;
+       }
+
+       if (state->child == 0) {
+               /* child: cc is the verdict sent to the parent.  The byte
+                  read from the lock file goes into a separate scratch
+                  variable so that file content can never overwrite the
+                  verdict.
+               */
+               char cc = RECLOCK_OK;
+               char scratch = 0;
+
+               close(state->fd[0]);
+               state->fd[0] = -1;
+
+               ctdb_set_process_name("ctdb_rec_reclock");
+               debug_extra = talloc_asprintf(NULL, "recovery-lock:");
+               if (pread(ctdb->recovery_lock_fd, &scratch, 1, 0) == -1) {
+                       DEBUG(DEBUG_CRIT,("failed read from recovery_lock_fd - %s\n", strerror(errno)));
+                       cc = RECLOCK_FAILED;
+               }
+
+               write(state->fd[1], &cc, 1);
+               /* make sure we die when our parent dies */
+               while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
+                       sleep(5);
+               }
+               _exit(0);
+       }
+
+       /* parent: only the read end of the pipe is needed */
+       close(state->fd[1]);
+       state->fd[1] = -1;
+       set_close_on_exec(state->fd[0]);
+
+       DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for check_recovery_lock\n", state->fd[0]));
+
+       /* from here on freeing state also kills the child */
+       talloc_set_destructor(state, check_reclock_destructor);
+
+       state->te = event_add_timed(ctdb->ev, state, timeval_current_ofs(15, 0),
+                                   ctdb_check_reclock_timeout, state);
+       if (state->te == NULL) {
+               DEBUG(DEBUG_CRIT,(__location__ " Failed to create a timed event for reclock child\n"));
+               talloc_free(state);
+               return -1;
+       }
+
+       state->fde = event_add_fd(ctdb->ev, state, state->fd[0],
+                               EVENT_FD_READ,
+                               reclock_child_handler,
+                               (void *)state);
+
+       if (state->fde == NULL) {
+               DEBUG(DEBUG_CRIT,(__location__ " Failed to create an fd event for reclock child\n"));
+               talloc_free(state);
+               return -1;
+       }
+       /* NOTE(review): check_reclock_destructor() also closes fd[0], so
+          with auto-close enabled the descriptor may be closed twice
+          when state is freed - confirm and drop one of the two.
+       */
+       tevent_fd_set_auto_close(state->fde);
+
+       /* wait for the child's verdict or for the timeout */
+       while (state->status == RECLOCK_CHECKING) {
+               event_loop_once(ctdb->ev);
+       }
+
+       if (state->status == RECLOCK_FAILED) {
+               DEBUG(DEBUG_ERR,(__location__ " reclock child failed when checking file\n"));
+               close(ctdb->recovery_lock_fd);
+               ctdb->recovery_lock_fd = -1;
+               talloc_free(state);
+               return -1;
+       }
+
+       talloc_free(state);
+       return 0;
+}
+
+/*
+  Bring ctdb->recovery_lock_file in line with the reclock file name that
+  the local daemon currently reports.
+
+  - daemon reports no reclock file: forget any cached name, close the
+    reclock fd and switch verify_recovery_lock off
+  - daemon reports the name we already cache: nothing to do
+  - daemon reports a new or different name: cache a copy of it and close
+    the reclock fd (it refers to the previous file, if any); when an old
+    name is being replaced verify_recovery_lock is also switched off
+
+  Returns 0 on success, -1 if the daemon could not be queried or if we
+  ran out of memory.  On failure the cached name and fd are untouched.
+ */
+static int update_recovery_lock_file(struct ctdb_context *ctdb)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       const char *reclockfile = NULL;
+       char *new_reclock;
+
+       if (tmp_ctx == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to create temp context\n"));
+               return -1;
+       }
+
+       if (ctdb_ctrl_getreclock(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, tmp_ctx, &reclockfile) != 0) {
+               DEBUG(DEBUG_ERR,("Failed to read reclock file from daemon\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       if (reclockfile == NULL) {
+               if (ctdb->recovery_lock_file != NULL) {
+                       DEBUG(DEBUG_ERR,("Reclock file disabled\n"));
+                       talloc_free(ctdb->recovery_lock_file);
+                       ctdb->recovery_lock_file = NULL;
+                       if (ctdb->recovery_lock_fd != -1) {
+                               close(ctdb->recovery_lock_fd);
+                               ctdb->recovery_lock_fd = -1;
+                       }
+               }
+               ctdb->tunable.verify_recovery_lock = 0;
+               talloc_free(tmp_ctx);
+               return 0;
+       }
+
+       if (ctdb->recovery_lock_file != NULL &&
+           strcmp(reclockfile, ctdb->recovery_lock_file) == 0) {
+               /* same file as before - keep the fd we already have */
+               talloc_free(tmp_ctx);
+               return 0;
+       }
+
+       /* take the copy before touching any state so that an allocation
+          failure cannot leave us without a cached name
+       */
+       new_reclock = talloc_strdup(ctdb, reclockfile);
+       if (new_reclock == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory copying reclock file name\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       if (ctdb->recovery_lock_file != NULL) {
+               /* the name changed: drop the old one and stop verifying
+                  the lock
+               */
+               talloc_free(ctdb->recovery_lock_file);
+               ctdb->tunable.verify_recovery_lock = 0;
+       }
+       ctdb->recovery_lock_file = new_reclock;
+
+       /* any open fd refers to the previous reclock file */
+       if (ctdb->recovery_lock_fd != -1) {
+               close(ctdb->recovery_lock_fd);
+               ctdb->recovery_lock_fd = -1;
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+static void main_loop(struct ctdb_context *ctdb, struct ctdb_recoverd *rec,
+                     TALLOC_CTX *mem_ctx)
+{
+       uint32_t pnn;
+       struct ctdb_node_map *nodemap=NULL;
+       struct ctdb_node_map *recmaster_nodemap=NULL;
+       struct ctdb_node_map **remote_nodemaps=NULL;
+       struct ctdb_vnn_map *vnnmap=NULL;
+       struct ctdb_vnn_map *remote_vnnmap=NULL;
+       int32_t debug_level;
+       int i, j, ret;
+       bool self_ban;
+
+
+       /* verify that the main daemon is still running */
+       if (ctdb_kill(ctdb, ctdb->ctdbd_pid, 0) != 0) {
+               DEBUG(DEBUG_CRIT,("CTDB daemon is no longer available. Shutting down recovery daemon\n"));
+               exit(-1);
+       }
+
+       /* ping the local daemon to tell it we are alive */
+       ctdb_ctrl_recd_ping(ctdb);
+
+       if (rec->election_timeout) {
+               /* an election is in progress */
+               return;
+       }
+
+       /* read the debug level from the parent and update locally */
+       ret = ctdb_ctrl_get_debuglevel(ctdb, CTDB_CURRENT_NODE, &debug_level);
+       if (ret !=0) {
+               DEBUG(DEBUG_ERR, (__location__ " Failed to read debuglevel from parent\n"));
+               return;
+       }
+       LogLevel = debug_level;
+
+       /* get relevant tunables */
+       ret = ctdb_ctrl_get_all_tunables(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->tunable);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to get tunables - retrying\n"));
+               return;
+       }
+
+       /* get the current recovery lock file from the server */
+       if (update_recovery_lock_file(ctdb) != 0) {
+               DEBUG(DEBUG_ERR,("Failed to update the recovery lock file\n"));
+               return;
+       }
+
+       /* Make sure that if recovery lock verification becomes disabled when
+          we close the file
+       */
+        if (ctdb->tunable.verify_recovery_lock == 0) {
+               if (ctdb->recovery_lock_fd != -1) {
+                       close(ctdb->recovery_lock_fd);
+                       ctdb->recovery_lock_fd = -1;
+               }
+       }
+
+       pnn = ctdb_get_pnn(ctdb);
+
+       /* get the vnnmap */
+       ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), pnn, mem_ctx, &vnnmap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from node %u\n", pnn));
+               return;
+       }
+
+
+       /* get number of nodes */
+       if (rec->nodemap) {
+               talloc_free(rec->nodemap);
+               rec->nodemap = NULL;
+               nodemap=NULL;
+       }
+       ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), pnn, rec, &rec->nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from node %u\n", pnn));
+               return;
+       }
+       nodemap = rec->nodemap;
+
+       /* remember our own node flags */
+       rec->node_flags = nodemap->nodes[pnn].flags;
+
+       ban_misbehaving_nodes(rec, &self_ban);
+       if (self_ban) {
+               DEBUG(DEBUG_NOTICE, ("This node was banned, restart main_loop\n"));
+               return;
+       }
+
+       /* if the local daemon is STOPPED or BANNED, we verify that the databases are
+          also frozen and that the recmode is set to active.
+       */
+       if (rec->node_flags & (NODE_FLAGS_STOPPED | NODE_FLAGS_BANNED)) {
+               /* If this node has become inactive then we want to
+                * reduce the chances of it taking over the recovery
+                * master role when it becomes active again.  This
+                * helps to stabilise the recovery master role so that
+                * it stays on the most stable node.
+                */
+               rec->priority_time = timeval_current();
+
+               ret = ctdb_ctrl_getrecmode(ctdb, mem_ctx, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, &ctdb->recovery_mode);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to read recmode from local node\n"));
+               }
+               if (ctdb->recovery_mode == CTDB_RECOVERY_NORMAL) {
+                       DEBUG(DEBUG_ERR,("Node is stopped or banned but recovery mode is not active. Activate recovery mode and lock databases\n"));
+
+                       ret = ctdb_ctrl_freeze_priority(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, 1);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_ERR,(__location__ " Failed to freeze node in STOPPED or BANNED state\n"));
+                               return;
+                       }
+                       ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), CTDB_CURRENT_NODE, CTDB_RECOVERY_ACTIVE);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_ERR,(__location__ " Failed to activate recovery mode in STOPPED or BANNED state\n"));
+
+                               return;
+                       }
+               }
+
+               /* If this node is stopped or banned then it is not the recovery
+                * master, so don't do anything. This prevents stopped or banned
+                * node from starting election and sending unnecessary controls.
+                */
+               return;
+       }
+
+       /* check which node is the recovery master */
+       ret = ctdb_ctrl_getrecmaster(ctdb, mem_ctx, CONTROL_TIMEOUT(), pnn, &rec->recmaster);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to get recmaster from node %u\n", pnn));
+               return;
+       }
+
+       /* If we are not the recmaster then do some housekeeping */
+       if (rec->recmaster != pnn) {
+               /* Ignore any IP reallocate requests - only recmaster
+                * processes them
+                */
+               TALLOC_FREE(rec->reallocate_requests);
+               /* Clear any nodes that should be force rebalanced in
+                * the next takeover run.  If the recovery master role
+                * has moved then we don't want to process these some
+                * time in the future.
+                */
+               TALLOC_FREE(rec->force_rebalance_nodes);
+       }
+
+       /* This is a special case.  When recovery daemon is started, recmaster
+        * is set to -1.  If a node is not started in stopped state, then
+        * start election to decide recovery master
+        */
+       if (rec->recmaster == (uint32_t)-1) {
+               DEBUG(DEBUG_NOTICE,(__location__ " Initial recovery master set - forcing election\n"));
+               force_election(rec, pnn, nodemap);
+               return;
+       }
+
+       /* update the capabilities for all nodes */
+       ret = update_capabilities(ctdb, nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to update node capabilities.\n"));
+               return;
+       }
+
+       /*
+        * If the current recmaster does not have CTDB_CAP_RECMASTER,
+        * but we have, then force an election and try to become the new
+        * recmaster.
+        */
+       if ((rec->ctdb->nodes[rec->recmaster]->capabilities & CTDB_CAP_RECMASTER) == 0 &&
+           (rec->ctdb->capabilities & CTDB_CAP_RECMASTER) &&
+            !(nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE)) {
+               DEBUG(DEBUG_ERR, (__location__ " Current recmaster node %u does not have CAP_RECMASTER,"
+                                 " but we (node %u) have - force an election\n",
+                                 rec->recmaster, pnn));
+               force_election(rec, pnn, nodemap);
+               return;
+       }
+
+       /* count how many active nodes there are */
+       rec->num_active    = 0;
+       rec->num_lmasters  = 0;
+       rec->num_connected = 0;
+       for (i=0; i<nodemap->num; i++) {
+               if (!(nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE)) {
+                       rec->num_active++;
+                       if (rec->ctdb->nodes[i]->capabilities & CTDB_CAP_LMASTER) {
+                               rec->num_lmasters++;
+                       }
+               }
+               if (!(nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED)) {
+                       rec->num_connected++;
+               }
+       }
+
+
+       /* verify that the recmaster node is still active */
+       for (j=0; j<nodemap->num; j++) {
+               if (nodemap->nodes[j].pnn==rec->recmaster) {
+                       break;
+               }
+       }
+
+       if (j == nodemap->num) {
+               DEBUG(DEBUG_ERR, ("Recmaster node %u not in list. Force reelection\n", rec->recmaster));
+               force_election(rec, pnn, nodemap);
+               return;
+       }
+
+       /* if recovery master is disconnected we must elect a new recmaster */
+       if (nodemap->nodes[j].flags & NODE_FLAGS_DISCONNECTED) {
+               DEBUG(DEBUG_NOTICE, ("Recmaster node %u is disconnected. Force reelection\n", nodemap->nodes[j].pnn));
+               force_election(rec, pnn, nodemap);
+               return;
+       }
+
+       /* get nodemap from the recovery master to check if it is inactive */
+       ret = ctdb_ctrl_getnodemap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, 
+                                  mem_ctx, &recmaster_nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Unable to get nodemap from recovery master %u\n", 
+                         nodemap->nodes[j].pnn));
+               return;
+       }
+
+
+       if ((recmaster_nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) &&
+           (rec->node_flags & NODE_FLAGS_INACTIVE) == 0) {
+               DEBUG(DEBUG_NOTICE, ("Recmaster node %u no longer available. Force reelection\n", nodemap->nodes[j].pnn));
+               /*
+                * update our nodemap to carry the recmaster's notion of
+                * its own flags, so that we don't keep freezing the
+                * inactive recmaster node...
+                */
+               nodemap->nodes[j].flags = recmaster_nodemap->nodes[j].flags;
+               force_election(rec, pnn, nodemap);
+               return;
+       }
+
+       /* verify that we have all ip addresses we should have and we dont
+        * have addresses we shouldnt have.
+        */ 
+       if (ctdb->tunable.disable_ip_failover == 0 &&
+           rec->takeover_runs_disable_ctx == NULL) {
+               if (verify_local_ip_allocation(ctdb, rec, pnn, nodemap) != 0) {
+                       DEBUG(DEBUG_ERR, (__location__ " Public IPs were inconsistent.\n"));
+               }
+       }
+
+
+       /* if we are not the recmaster then we do not need to check
+          if recovery is needed
+        */
+       if (pnn != rec->recmaster) {
+               return;
+       }
+
+
+       /* ensure our local copies of flags are right */
+       ret = update_local_flags(rec, nodemap);
+       if (ret == MONITOR_ELECTION_NEEDED) {
+               DEBUG(DEBUG_NOTICE,("update_local_flags() called for a re-election.\n"));
+               force_election(rec, pnn, nodemap);
+               return;
+       }
+       if (ret != MONITOR_OK) {
+               DEBUG(DEBUG_ERR,("Unable to update local flags\n"));
+               return;
+       }
+
+       if (ctdb->num_nodes != nodemap->num) {
+               DEBUG(DEBUG_ERR, (__location__ " ctdb->num_nodes (%d) != nodemap->num (%d) reloading nodes file\n", ctdb->num_nodes, nodemap->num));
+               ctdb_load_nodes_file(ctdb);
+               return;
+       }
+
+       /* verify that all active nodes agree that we are the recmaster */
+       switch (verify_recmaster(rec, nodemap, pnn)) {
+       case MONITOR_RECOVERY_NEEDED:
+               /* can not happen */
+               return;
+       case MONITOR_ELECTION_NEEDED:
+               force_election(rec, pnn, nodemap);
+               return;
+       case MONITOR_OK:
+               break;
+       case MONITOR_FAILED:
+               return;
+       }
+
+
+       if (rec->need_recovery) {
+               /* a previous recovery didn't finish */
+               do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
+               return;
+       }
+
+       /* verify that all active nodes are in normal mode 
+          and not in recovery mode 
+       */
+       switch (verify_recmode(ctdb, nodemap)) {
+       case MONITOR_RECOVERY_NEEDED:
+               do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
+               return;
+       case MONITOR_FAILED:
+               return;
+       case MONITOR_ELECTION_NEEDED:
+               /* can not happen */
+       case MONITOR_OK:
+               break;
+       }
+
+
+        if (ctdb->tunable.verify_recovery_lock != 0) {
+               /* we should have the reclock - check its not stale */
+               ret = check_recovery_lock(ctdb);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,("Failed check_recovery_lock. Force a recovery\n"));
+                       ctdb_set_culprit(rec, ctdb->pnn);
+                       do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
+                       return;
+               }
+       }
+
+
+       /* if there are takeovers requested, perform it and notify the waiters */
+       if (rec->takeover_runs_disable_ctx == NULL &&
+           rec->reallocate_requests) {
+               process_ipreallocate_requests(ctdb, rec);
+       }
+
+       /* get the nodemap for all active remote nodes
+        */
+       remote_nodemaps = talloc_array(mem_ctx, struct ctdb_node_map *, nodemap->num);
+       if (remote_nodemaps == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " failed to allocate remote nodemap array\n"));
+               return;
+       }
+       for(i=0; i<nodemap->num; i++) {
+               remote_nodemaps[i] = NULL;
+       }
+       if (get_remote_nodemaps(ctdb, mem_ctx, nodemap, remote_nodemaps) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to read remote nodemaps\n"));
+               return;
+       } 
+
+       /* verify that all other nodes have the same nodemap as we have
+       */
+       for (j=0; j<nodemap->num; j++) {
+               if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+                       continue;
+               }
+
+               if (remote_nodemaps[j] == NULL) {
+                       DEBUG(DEBUG_ERR,(__location__ " Did not get a remote nodemap for node %d, restarting monitoring\n", j));
+                       ctdb_set_culprit(rec, j);
+
+                       return;
+               }
+
+               /* if the nodes disagree on how many nodes there are
+                  then this is a good reason to try recovery
+                */
+               if (remote_nodemaps[j]->num != nodemap->num) {
+                       DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different node count. %u vs %u of the local node\n",
+                                 nodemap->nodes[j].pnn, remote_nodemaps[j]->num, nodemap->num));
+                       ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
+                       do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
+                       return;
+               }
+
+               /* if the nodes disagree on which nodes exist and are
+                  active, then that is also a good reason to do recovery
+                */
+               for (i=0;i<nodemap->num;i++) {
+                       if (remote_nodemaps[j]->nodes[i].pnn != nodemap->nodes[i].pnn) {
+                               DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different nodemap pnn for %d (%u vs %u).\n", 
+                                         nodemap->nodes[j].pnn, i, 
+                                         remote_nodemaps[j]->nodes[i].pnn, nodemap->nodes[i].pnn));
+                               ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
+                               do_recovery(rec, mem_ctx, pnn, nodemap, 
+                                           vnnmap);
+                               return;
+                       }
+               }
+       }
+
+       /*
+        * Update node flags obtained from each active node. This ensure we have
+        * up-to-date information for all the nodes.
+        */
+       for (j=0; j<nodemap->num; j++) {
+               if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+                       continue;
+               }
+               nodemap->nodes[j].flags = remote_nodemaps[j]->nodes[j].flags;
+       }
+
+       for (j=0; j<nodemap->num; j++) {
+               if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+                       continue;
+               }
+
+               /* verify the flags are consistent
+               */
+               for (i=0; i<nodemap->num; i++) {
+                       if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
+                               continue;
+                       }
+                       
+                       if (nodemap->nodes[i].flags != remote_nodemaps[j]->nodes[i].flags) {
+                               DEBUG(DEBUG_ERR, (__location__ " Remote node:%u has different flags for node %u. It has 0x%02x vs our 0x%02x\n", 
+                                 nodemap->nodes[j].pnn, 
+                                 nodemap->nodes[i].pnn, 
+                                 remote_nodemaps[j]->nodes[i].flags,
+                                 nodemap->nodes[i].flags));
+                               if (i == j) {
+                                       DEBUG(DEBUG_ERR,("Use flags 0x%02x from remote node %d for cluster update of its own flags\n", remote_nodemaps[j]->nodes[i].flags, j));
+                                       update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, remote_nodemaps[j]->nodes[i].flags);
+                                       ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
+                                       do_recovery(rec, mem_ctx, pnn, nodemap, 
+                                                   vnnmap);
+                                       return;
+                               } else {
+                                       DEBUG(DEBUG_ERR,("Use flags 0x%02x from local recmaster node for cluster update of node %d flags\n", nodemap->nodes[i].flags, i));
+                                       update_flags_on_all_nodes(ctdb, nodemap, nodemap->nodes[i].pnn, nodemap->nodes[i].flags);
+                                       ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
+                                       do_recovery(rec, mem_ctx, pnn, nodemap, 
+                                                   vnnmap);
+                                       return;
+                               }
+                       }
+               }
+       }
+
+
+       /* There must be the same number of lmasters in the vnn map as
+        * there are active nodes with the lmaster capability...  or
+        * do a recovery.
+        */
+       if (vnnmap->size != rec->num_lmasters) {
+               DEBUG(DEBUG_ERR, (__location__ " The vnnmap count is different from the number of active lmaster nodes: %u vs %u\n",
+                         vnnmap->size, rec->num_lmasters));
+               ctdb_set_culprit(rec, ctdb->pnn);
+               do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
+               return;
+       }
+
+       /* verify that all active nodes in the nodemap also exist in 
+          the vnnmap.
+        */
+       for (j=0; j<nodemap->num; j++) {
+               if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+                       continue;
+               }
+               if (nodemap->nodes[j].pnn == pnn) {
+                       continue;
+               }
+
+               for (i=0; i<vnnmap->size; i++) {
+                       if (vnnmap->map[i] == nodemap->nodes[j].pnn) {
+                               break;
+                       }
+               }
+               if (i == vnnmap->size) {
+                       DEBUG(DEBUG_ERR, (__location__ " Node %u is active in the nodemap but did not exist in the vnnmap\n", 
+                                 nodemap->nodes[j].pnn));
+                       ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
+                       do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
+                       return;
+               }
+       }
+
+       
+       /* verify that all other nodes have the same vnnmap
+          and are from the same generation
+        */
+       for (j=0; j<nodemap->num; j++) {
+               if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+                       continue;
+               }
+               if (nodemap->nodes[j].pnn == pnn) {
+                       continue;
+               }
+
+               ret = ctdb_ctrl_getvnnmap(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].pnn, 
+                                         mem_ctx, &remote_vnnmap);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, (__location__ " Unable to get vnnmap from remote node %u\n", 
+                                 nodemap->nodes[j].pnn));
+                       return;
+               }
+
+               /* verify the vnnmap generation is the same */
+               if (vnnmap->generation != remote_vnnmap->generation) {
+                       DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different generation of vnnmap. %u vs %u (ours)\n", 
+                                 nodemap->nodes[j].pnn, remote_vnnmap->generation, vnnmap->generation));
+                       ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
+                       do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
+                       return;
+               }
+
+               /* verify the vnnmap size is the same */
+               if (vnnmap->size != remote_vnnmap->size) {
+                       DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different size of vnnmap. %u vs %u (ours)\n", 
+                                 nodemap->nodes[j].pnn, remote_vnnmap->size, vnnmap->size));
+                       ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
+                       do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
+                       return;
+               }
+
+               /* verify the vnnmap is the same */
+               for (i=0;i<vnnmap->size;i++) {
+                       if (remote_vnnmap->map[i] != vnnmap->map[i]) {
+                               DEBUG(DEBUG_ERR, (__location__ " Remote node %u has different vnnmap.\n", 
+                                         nodemap->nodes[j].pnn));
+                               ctdb_set_culprit(rec, nodemap->nodes[j].pnn);
+                               do_recovery(rec, mem_ctx, pnn, nodemap, 
+                                           vnnmap);
+                               return;
+                       }
+               }
+       }
+
+       /* we might need to change who has what IP assigned */
+       if (rec->need_takeover_run) {
+               uint32_t culprit = (uint32_t)-1;
+
+               rec->need_takeover_run = false;
+
+               /* update the list of public ips that a node can handle for
+                  all connected nodes
+               */
+               ret = ctdb_reload_remote_public_ips(ctdb, rec, nodemap, &culprit);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to read public ips from remote node %d\n",
+                                        culprit));
+                       rec->need_takeover_run = true;
+                       return;
+               }
+
+               /* execute the "startrecovery" event script on all nodes */
+               ret = run_startrecovery_eventscript(rec, nodemap);
+               if (ret!=0) {
+                       DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'startrecovery' event on cluster\n"));
+                       ctdb_set_culprit(rec, ctdb->pnn);
+                       do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
+                       return;
+               }
+
+               /* If takeover run fails, then the offending nodes are
+                * assigned ban culprit counts. And we re-try takeover.
+                * If takeover run fails repeatedly, the node would get
+                * banned.
+                *
+                * If rec->need_takeover_run is not set to true at this
+                * failure, monitoring is disabled cluster-wide (via
+                * startrecovery eventscript) and will not get enabled.
+                */
+               if (!do_takeover_run(rec, nodemap, true)) {
+                       return;
+               }
+
+               /* execute the "recovered" event script on all nodes */
+               ret = run_recovered_eventscript(rec, nodemap, "monitor_cluster");
+#if 0
+// we cant check whether the event completed successfully
+// since this script WILL fail if the node is in recovery mode
+// and if that race happens, the code here would just cause a second
+// cascading recovery.
+               if (ret!=0) {
+                       DEBUG(DEBUG_ERR, (__location__ " Unable to run the 'recovered' event on cluster. Update of public ips failed.\n"));
+                       ctdb_set_culprit(rec, ctdb->pnn);
+                       do_recovery(rec, mem_ctx, pnn, nodemap, vnnmap);
+               }
+#endif
+       }
+}
+
+/*
+  Entry point of the recovery daemon's monitoring: allocate the recoverd
+  state, hook up every message handler the daemon services and then run
+  main_loop() forever, pacing the passes by the recover_interval tunable.
+  This function never returns.
+ */
+static void monitor_cluster(struct ctdb_context *ctdb)
+{
+       struct ctdb_recoverd *rec;
+
+       DEBUG(DEBUG_NOTICE,("monitor_cluster starting\n"));
+
+       rec = talloc_zero(ctdb, struct ctdb_recoverd);
+       CTDB_NO_MEMORY_FATAL(ctdb, rec);
+
+       rec->ctdb = ctdb;
+       rec->takeover_run_in_progress = false;
+       rec->priority_time = timeval_current();
+
+       /* memory dump requests */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_MEM_DUMP,
+                                       mem_dump_handler, rec);
+
+       /* log retrieval requests */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_GETLOG,
+                                       getlog_handler, rec);
+
+       /* log clearing requests */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_CLEARLOG,
+                                       clearlog_handler, rec);
+
+       /* recovery master elections */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECOVERY,
+                                       election_handler, rec);
+
+       /* nodes being disabled/enabled */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_SET_NODE_FLAGS,
+                                       monitor_handler, rec);
+
+       /* requests to push out a flag change */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_PUSH_NODE_FLAGS,
+                                       push_flags_handler, rec);
+
+       /* vacuum fetch */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_VACUUM_FETCH,
+                                       vacuum_fetch_handler, rec);
+
+       /* reloadnodes */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RELOAD_NODES,
+                                       reload_nodes_handler, rec);
+
+       /* requests to perform a takeover run */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_TAKEOVER_RUN,
+                                       ip_reallocate_handler, rec);
+
+       /* requests to disable the ip check for a short while */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_IP_CHECK,
+                                       disable_ip_check_handler, rec);
+
+       /* updates of the recovery daemon's node assignment for an ip */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECD_UPDATE_IP,
+                                       recd_update_ip_handler, rec);
+
+       /* requests to force a rebalance of a node at the next
+          reallocation */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_REBALANCE_NODE,
+                                       recd_node_rebalance_handler, rec);
+
+       /* requests to disable takeover runs */
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_DISABLE_TAKEOVER_RUNS,
+                                       disable_takeover_runs_handler, rec);
+
+       while (true) {
+               struct timeval pass_start;
+               double pass_secs;
+               /* scratch context for one pass, released right after it */
+               TALLOC_CTX *pass_ctx = talloc_new(ctdb);
+
+               if (pass_ctx == NULL) {
+                       DEBUG(DEBUG_CRIT,(__location__
+                                         " Failed to create temp context\n"));
+                       exit(-1);
+               }
+
+               pass_start = timeval_current();
+               main_loop(ctdb, rec, pass_ctx);
+               talloc_free(pass_ctx);
+
+               /* run at most one pass per recover_interval: sleep for
+                  whatever is left of the interval */
+               pass_secs = timeval_elapsed(&pass_start);
+               if (pass_secs < ctdb->tunable.recover_interval) {
+                       ctdb_wait_timeout(ctdb,
+                                         ctdb->tunable.recover_interval - pass_secs);
+               }
+       }
+}
+
+/*
+  event handler for when the main ctdbd dies
+
+  ctdb_start_recoverd() registers this on the read end of the pipe it
+  shares with the parent daemon (EVENT_FD_READ).  When it fires the
+  recovery daemon logs an alert and leaves immediately with _exit(1);
+  none of the callback arguments are looked at.
+  NOTE(review): the parent is not seen writing to the pipe, so the fd
+  presumably only becomes readable at EOF, i.e. once the parent is gone.
+ */
+static void ctdb_recoverd_parent(struct event_context *ev, struct fd_event *fde, 
+                                uint16_t flags, void *private_data)
+{
+       DEBUG(DEBUG_ALERT,("recovery daemon parent died - exiting\n"));
+       _exit(1);
+}
+
+/*
+  periodic liveness probe for the recovery daemon, run in the main daemon
+
+  p is the struct ctdb_context.  The recovery daemon pid is probed with
+  ctdb_kill(..., 0).  While the probe succeeds the check re-arms itself
+  30 seconds ahead on recd_ctx; once it fails an immediate
+  ctdb_restart_recd() is scheduled instead and the check is not re-armed
+  from here.
+ */
+static void ctdb_check_recd(struct event_context *ev, struct timed_event *te, 
+                             struct timeval yt, void *p)
+{
+       struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
+       int probe = ctdb_kill(ctdb, ctdb->recoverd_pid, 0);
+
+       if (probe == 0) {
+               /* still there - look again in half a minute */
+               event_add_timed(ctdb->ev, ctdb->recd_ctx,
+                               timeval_current_ofs(30, 0),
+                               ctdb_check_recd, ctdb);
+               return;
+       }
+
+       DEBUG(DEBUG_ERR,("Recovery daemon (pid:%d) is no longer running. Trying to restart recovery daemon.\n", (int)ctdb->recoverd_pid));
+
+       /* the restart is hung off ctdb itself, not off recd_ctx */
+       event_add_timed(ctdb->ev, ctdb, timeval_zero(),
+                       ctdb_restart_recd, ctdb);
+}
+
+/*
+  SIGCHLD handler of the recovery daemon
+
+  Reaps every child that has already exited without ever blocking
+  (WNOHANG).  Each reaped pid is reported at debug level.  The loop ends
+  when waitpid() reports that no further child is ready (0) or fails
+  (-1); a failure other than ECHILD is logged.  None of the callback
+  arguments are used.
+ */
+static void recd_sig_child_handler(struct event_context *ev,
+       struct signal_event *se, int signum, int count,
+       void *dont_care, 
+       void *private_data)
+{
+       int status;
+
+       for (;;) {
+               pid_t reaped = waitpid(-1, &status, WNOHANG);
+
+               if (reaped == 0) {
+                       /* children remain, but none has exited yet */
+                       break;
+               }
+               if (reaped == -1) {
+                       /* ECHILD just means there is nothing left to reap */
+                       if (errno != ECHILD) {
+                               DEBUG(DEBUG_ERR, (__location__ " waitpid() returned error. errno:%s(%d)\n", strerror(errno),errno));
+                       }
+                       return;
+               }
+               if (reaped > 0) {
+                       DEBUG(DEBUG_DEBUG, ("RECD SIGCHLD from %d\n", (int)reaped));
+               }
+       }
+}
+
+/*
+  startup the recovery daemon as a child of the main ctdb daemon
+
+  A pipe is created and the process is forked.
+
+  Parent: closes the read end, replaces recd_ctx with a fresh context,
+  arms the 30 second ctdb_check_recd() timer on it and returns 0.  The
+  write end is deliberately left open so that the child's read end only
+  signals once the parent has gone away.
+  NOTE(review): that write end is never closed, so every restart of the
+  recovery daemon appears to leave one more descriptor open in the
+  parent - confirm whether that is intended.
+
+  Child: closes the write end, switches into client mode, watches the
+  read end with ctdb_recoverd_parent(), installs the SIGCHLD handler and
+  enters monitor_cluster().
+
+  Returns 0 in the parent on success and -1 if the pipe, the fork or
+  (through CTDB_NO_MEMORY) the recd_ctx allocation fails.  The child only
+  returns, with -1, if monitor_cluster() ever comes back; setup failures
+  in the child end in exit(1).
+ */
+int ctdb_start_recoverd(struct ctdb_context *ctdb)
+{
+       int fd[2];
+       struct signal_event *se;
+       struct tevent_fd *fde;
+
+       if (pipe(fd) != 0) {
+               return -1;
+       }
+
+       ctdb->ctdbd_pid = getpid();
+
+       ctdb->recoverd_pid = ctdb_fork_no_free_ringbuffer(ctdb);
+       if (ctdb->recoverd_pid == -1) {
+               /* no child exists, so neither end of the pipe has a user;
+                  keep the errno of the failed fork for the caller */
+               int saved_errno = errno;
+
+               close(fd[0]);
+               close(fd[1]);
+               errno = saved_errno;
+               return -1;
+       }
+
+       if (ctdb->recoverd_pid != 0) {
+               /* parent: give up the read end first so that the early
+                  return inside CTDB_NO_MEMORY can not leak it */
+               close(fd[0]);
+
+               talloc_free(ctdb->recd_ctx);
+               ctdb->recd_ctx = talloc_new(ctdb);
+               CTDB_NO_MEMORY(ctdb, ctdb->recd_ctx);
+
+               event_add_timed(ctdb->ev, ctdb->recd_ctx,
+                               timeval_current_ofs(30, 0),
+                               ctdb_check_recd, ctdb);
+               return 0;
+       }
+
+       /* child: from here on we are the recovery daemon */
+       close(fd[1]);
+
+       srandom(getpid() ^ time(NULL));
+
+       /* Clear the log ringbuffer */
+       ctdb_clear_log(ctdb);
+
+       ctdb_set_process_name("ctdb_recovered");
+       if (switch_from_server_to_client(ctdb, "recoverd") != 0) {
+               DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch recovery daemon into client mode. shutting down.\n"));
+               exit(1);
+       }
+
+       DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to recovery daemon\n", fd[0]));
+
+       /* the handler ignores its private pointer, so handing it the
+          address of a local is harmless */
+       fde = event_add_fd(ctdb->ev, ctdb, fd[0], EVENT_FD_READ,
+                    ctdb_recoverd_parent, &fd[0]);
+       if (fde == NULL) {
+               /* without this watch the death of the parent would go
+                  unnoticed; treat it like the signal handler below */
+               DEBUG(DEBUG_CRIT,("Failed to set up parent pipe handler in recovery daemon\n"));
+               exit(1);
+       }
+       tevent_fd_set_auto_close(fde);
+
+       /* set up a handler to pick up sigchld */
+       se = event_add_signal(ctdb->ev, ctdb,
+                                    SIGCHLD, 0,
+                                    recd_sig_child_handler,
+                                    ctdb);
+       if (se == NULL) {
+               DEBUG(DEBUG_CRIT,("Failed to set up signal handler for SIGCHLD in recovery daemon\n"));
+               exit(1);
+       }
+
+       monitor_cluster(ctdb);
+
+       DEBUG(DEBUG_ALERT,("ERROR: ctdb_recoverd finished!?\n"));
+       return -1;
+}
+
+/*
+  shutdown the recovery daemon
+
+  Nothing happens while recoverd_pid is 0.  Otherwise the recovery daemon
+  is sent SIGTERM through ctdb_kill() and both recd_ctx (the parent of
+  the ctdb_check_recd() timer) and recd_ping_count are released.
+  recoverd_pid itself is left untouched.
+ */
+void ctdb_stop_recoverd(struct ctdb_context *ctdb)
+{
+       if (ctdb->recoverd_pid != 0) {
+               DEBUG(DEBUG_NOTICE,("Shutting down recovery daemon\n"));
+               ctdb_kill(ctdb, ctdb->recoverd_pid, SIGTERM);
+
+               TALLOC_FREE(ctdb->recd_ctx);
+               TALLOC_FREE(ctdb->recd_ping_count);
+       }
+}
+
+/*
+  timed event: stop the current recovery daemon, then start a new one
+
+  private_data is the struct ctdb_context.  The result of
+  ctdb_start_recoverd() is not examined.
+ */
+static void ctdb_restart_recd(struct event_context *ev, struct timed_event *te, 
+                      struct timeval t, void *private_data)
+{
+       struct ctdb_context *daemon_ctx;
+
+       daemon_ctx = talloc_get_type(private_data, struct ctdb_context);
+
+       DEBUG(DEBUG_ERR,("Restarting recovery daemon\n"));
+
+       ctdb_stop_recoverd(daemon_ctx);
+       ctdb_start_recoverd(daemon_ctx);
+}
diff --git a/ctdb/server/ctdb_server.c b/ctdb/server/ctdb_server.c
new file mode 100644 (file)
index 0000000..c45f4cb
--- /dev/null
@@ -0,0 +1,690 @@
+/* 
+   ctdb main protocol code
+
+   Copyright (C) Andrew Tridgell  2006
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "tdb.h"
+#include "lib/util/dlinklist.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "../include/ctdb_private.h"
+
+/*
+  choose the transport we will use
+
+  Stores a talloc copy of the transport string, owned by ctdb, in
+  ctdb->transport and returns 0.  A failed copy is handled by
+  CTDB_NO_MEMORY() (presumably logging and returning -1 - confirm
+  against the macro definition).
+  NOTE(review): a previously set ctdb->transport is overwritten without
+  being freed.
+*/
+int ctdb_set_transport(struct ctdb_context *ctdb, const char *transport)
+{
+       ctdb->transport = talloc_strdup(ctdb, transport);
+       CTDB_NO_MEMORY(ctdb, ctdb->transport);
+
+       return 0;
+}
+
+/*
+  Look up a node by its address string.
+
+  Walks the node table in index order, ignoring entries flagged
+  NODE_FLAGS_DELETED, and returns the index of the first node whose
+  address string is exactly equal to nodeip.  Returns -1 if there is no
+  such node.
+*/
+int ctdb_ip_to_nodeid(struct ctdb_context *ctdb, const char *nodeip)
+{
+       int idx;
+
+       for (idx = 0; idx < ctdb->num_nodes; idx++) {
+               struct ctdb_node *candidate = ctdb->nodes[idx];
+
+               if (!(candidate->flags & NODE_FLAGS_DELETED) &&
+                   strcmp(candidate->address.address, nodeip) == 0) {
+                       return idx;
+               }
+       }
+
+       return -1;
+}
+
+/*
+  choose the recovery lock file
+
+  Any previously configured path is released first.  With a non-NULL
+  file a talloc copy owned by ctdb is stored.  With file == NULL no path
+  is stored and the verify_recovery_lock tunable is switched off.
+  Returns 0; a failed copy is handled by CTDB_NO_MEMORY().
+*/
+int ctdb_set_recovery_lock_file(struct ctdb_context *ctdb, const char *file)
+{
+       /* forget whatever was configured before */
+       if (ctdb->recovery_lock_file != NULL) {
+               talloc_free(ctdb->recovery_lock_file);
+               ctdb->recovery_lock_file = NULL;
+       }
+
+       if (file != NULL) {
+               ctdb->recovery_lock_file = talloc_strdup(ctdb, file);
+               CTDB_NO_MEMORY(ctdb, ctdb->recovery_lock_file);
+
+               return 0;
+       }
+
+       /* no file given: recovery lock checking is turned off */
+       DEBUG(DEBUG_ALERT,("Recovery lock file set to \"\". Disabling recovery lock checking\n"));
+       ctdb->tunable.verify_recovery_lock = 0;
+
+       return 0;
+}
+
+/*
+  add a node to the list of nodes
+
+  Grows ctdb->nodes by one slot, allocates a zeroed node in it and
+  parses nstr into the node's address.  The new node gets the name
+  "<address>:<port>", the pnn ctdb->num_nodes and the flags
+  DISCONNECTED|UNHEALTHY.  If ctdb->address is already set and equals
+  the node's address, ctdb->pnn is set to this node's pnn.
+
+  Returns 0 on success.  Returns -1 if the address can not be parsed or
+  an allocation fails (the latter through CTDB_NO_MEMORY); in those
+  cases ctdb->num_nodes is not incremented.
+*/
+static int ctdb_add_node(struct ctdb_context *ctdb, char *nstr)
+{
+       struct ctdb_node *node, **nodep;
+
+       nodep = talloc_realloc(ctdb, ctdb->nodes, struct ctdb_node *, ctdb->num_nodes+1);
+       CTDB_NO_MEMORY(ctdb, nodep);
+
+       ctdb->nodes = nodep;
+       nodep = &ctdb->nodes[ctdb->num_nodes];
+       (*nodep) = talloc_zero(ctdb->nodes, struct ctdb_node);
+       CTDB_NO_MEMORY(ctdb, *nodep);
+       node = *nodep;
+
+       if (ctdb_parse_address(ctdb, node, nstr, &node->address) != 0) {
+               return -1;
+       }
+       node->ctdb = ctdb;
+       node->name = talloc_asprintf(node, "%s:%u", 
+                                    node->address.address, 
+                                    node->address.port);
+       /* the name is printed with %s in log messages elsewhere, so a
+          failed allocation must not be carried along as NULL */
+       CTDB_NO_MEMORY(ctdb, node->name);
+
+       /* this assumes that the nodes are kept in sorted order, and no gaps */
+       node->pnn = ctdb->num_nodes;
+
+       /* nodes start out disconnected and unhealthy */
+       node->flags = (NODE_FLAGS_DISCONNECTED | NODE_FLAGS_UNHEALTHY);
+
+       if (ctdb->address.address &&
+           ctdb_same_address(&ctdb->address, &node->address)) {
+               /* for automatic binding to interfaces, see tcp_connect.c */
+               ctdb->pnn = node->pnn;
+       }
+
+       ctdb->num_nodes++;
+       node->dead_count = 0;
+
+       return 0;
+}
+
+/*
+  add an entry for a "deleted" node to the list of nodes.
+  a "deleted" node is a node that is commented out in the nodes file.
+  the placeholder keeps the pnn values of the nodes that follow it in
+  the list unchanged when a node is "deleted" by commenting it out and
+  "ctdb reloadnodes" is then used at runtime.
+
+  The placeholder gets the address 0.0.0.0, the name "0.0.0.0:0", the
+  pnn ctdb->num_nodes and the flags DELETED|DISCONNECTED.
+
+  Returns 0 on success.  Returns -1 if the address can not be set up or
+  an allocation fails (the latter through CTDB_NO_MEMORY); in those
+  cases ctdb->num_nodes is not incremented.
+*/
+static int ctdb_add_deleted_node(struct ctdb_context *ctdb)
+{
+       struct ctdb_node *node, **nodep;
+
+       nodep = talloc_realloc(ctdb, ctdb->nodes, struct ctdb_node *, ctdb->num_nodes+1);
+       CTDB_NO_MEMORY(ctdb, nodep);
+
+       ctdb->nodes = nodep;
+       nodep = &ctdb->nodes[ctdb->num_nodes];
+       (*nodep) = talloc_zero(ctdb->nodes, struct ctdb_node);
+       CTDB_NO_MEMORY(ctdb, *nodep);
+       node = *nodep;
+
+       if (ctdb_parse_address(ctdb, node, "0.0.0.0", &node->address) != 0) {
+               DEBUG(DEBUG_ERR,("Failed to setup deleted node %d\n", ctdb->num_nodes));
+               return -1;
+       }
+       node->ctdb = ctdb;
+       node->name = talloc_strdup(node, "0.0.0.0:0");
+       /* the name is printed with %s in log messages elsewhere, so a
+          failed allocation must not be carried along as NULL */
+       CTDB_NO_MEMORY(ctdb, node->name);
+
+       /* this assumes that the nodes are kept in sorted order, and no gaps */
+       node->pnn = ctdb->num_nodes;
+
+       /* this node is permanently deleted/disconnected */
+       node->flags = NODE_FLAGS_DELETED|NODE_FLAGS_DISCONNECTED;
+
+       ctdb->num_nodes++;
+       node->dead_count = 0;
+
+       return 0;
+}
+
+
+/*
+  setup the node list from a file
+
+  Replaces ctdb->nodes with the contents of the file nlist, one node per
+  line:
+    - trailing empty lines are dropped,
+    - leading spaces and tabs on a line are skipped,
+    - a line starting with '#' becomes a "deleted" placeholder node, so
+      that the nodes after it keep their pnn,
+    - other empty lines are skipped and do not consume a pnn,
+    - every other line is handed to ctdb_add_node().
+  Afterwards a fresh vnn_map is built that lists the index of every node
+  that is not deleted, with generation INVALID_GENERATION.
+
+  Returns 0 on success and -1 if the file can not be loaded, a node can
+  not be added or an allocation fails.
+*/
+static int ctdb_set_nlist(struct ctdb_context *ctdb, const char *nlist)
+{
+       char **lines;
+       int nlines;
+       int i, j, num_present;
+
+       talloc_free(ctdb->nodes);
+       ctdb->nodes     = NULL;
+       ctdb->num_nodes = 0;
+
+       lines = file_lines_load(nlist, &nlines, ctdb);
+       if (lines == NULL) {
+               ctdb_set_error(ctdb, "Failed to load nlist '%s'\n", nlist);
+               return -1;
+       }
+       while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
+               nlines--;
+       }
+
+       num_present = 0;
+       for (i=0; i < nlines; i++) {
+               char *node;
+
+               node = lines[i];
+               /* strip leading spaces */
+               while((*node == ' ') || (*node == '\t')) {
+                       node++;
+               }
+               if (*node == '#') {
+                       if (ctdb_add_deleted_node(ctdb) != 0) {
+                               talloc_free(lines);
+                               return -1;
+                       }
+                       continue;
+               }
+               if (strcmp(node, "") == 0) {
+                       continue;
+               }
+               if (ctdb_add_node(ctdb, node) != 0) {
+                       talloc_free(lines);
+                       return -1;
+               }
+               num_present++;
+       }
+
+       /* initialize the vnn mapping table now that we have the nodes list,
+          skipping any deleted nodes
+       */
+       ctdb->vnn_map = talloc(ctdb, struct ctdb_vnn_map);
+       CTDB_NO_MEMORY(ctdb, ctdb->vnn_map);
+
+       ctdb->vnn_map->generation = INVALID_GENERATION;
+       ctdb->vnn_map->size = num_present;
+       ctdb->vnn_map->map = talloc_array(ctdb->vnn_map, uint32_t, ctdb->vnn_map->size);
+       CTDB_NO_MEMORY(ctdb, ctdb->vnn_map->map);
+
+       /* walk the whole node table, not just the first num_present
+          entries: deleted placeholders occupy slots too, so stopping at
+          vnn_map->size would miss the nodes behind them and leave the
+          tail of the map unset.  Exactly num_present nodes are not
+          deleted, so j never runs past the end of the map. */
+       for(i=0, j=0; i < ctdb->num_nodes; i++) {
+               if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
+                       continue;
+               }
+               ctdb->vnn_map->map[j] = i;
+               j++;
+       }
+
+       talloc_free(lines);
+       return 0;
+}
+
+/*
+  load the node list from ctdb->nodes_file
+
+  A failure of ctdb_set_nlist() is fatal: it is logged together with the
+  recorded error string and the process exits with status 1.
+*/
+void ctdb_load_nodes_file(struct ctdb_context *ctdb)
+{
+       if (ctdb_set_nlist(ctdb, ctdb->nodes_file) == -1) {
+               DEBUG(DEBUG_ALERT,("ctdb_set_nlist failed - %s\n", ctdb_errstr(ctdb)));
+               exit(1);
+       }
+}
+
+/*
+  setup the local node address
+
+  Parses address into ctdb->address and derives ctdb->name from it as
+  "<address>:<port>".
+
+  Returns 0 on success, -1 if the address can not be parsed or the name
+  can not be allocated (the latter through CTDB_NO_MEMORY).
+*/
+int ctdb_set_address(struct ctdb_context *ctdb, const char *address)
+{
+       if (ctdb_parse_address(ctdb, ctdb, address, &ctdb->address) != 0) {
+               return -1;
+       }
+
+       ctdb->name = talloc_asprintf(ctdb, "%s:%u", 
+                                    ctdb->address.address, 
+                                    ctdb->address.port);
+       /* the name is printed with %s in log messages elsewhere, so a
+          failed allocation must not be carried along as NULL */
+       CTDB_NO_MEMORY(ctdb, ctdb->name);
+
+       return 0;
+}
+
+
+/*
+  count the nodes in the node table that do not have any of the
+  NODE_FLAGS_INACTIVE bits set
+*/
+uint32_t ctdb_get_num_active_nodes(struct ctdb_context *ctdb)
+{
+       uint32_t active = 0;
+       int n;
+
+       for (n = 0; n < ctdb->num_nodes; n++) {
+               if (ctdb->nodes[n]->flags & NODE_FLAGS_INACTIVE) {
+                       continue;
+               }
+               active++;
+       }
+
+       return active;
+}
+
+
+/*
+  called when we need to process a packet. This can be a requeued packet
+  after a lockwait, or a real packet from another node
+
+  ctdb is the daemon context, hdr the packet; hdr is reparented onto a
+  temporary talloc context for the duration of the call.
+  CTDB_REQ_CALL, CTDB_REPLY_CALL, CTDB_REQ_DMASTER and CTDB_REPLY_DMASTER
+  packets are dropped with a debug message while this node is banned or
+  when their generation differs from ours.  Every other packet is counted
+  through CTDB_INCREMENT_STAT and passed to the handler for its operation
+  type; CTDB_REQ_KEEPALIVE is only counted and unknown operations are
+  only logged.
+*/
+void ctdb_input_pkt(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+       TALLOC_CTX *tmp_ctx;
+
+       /* place the packet as a child of the tmp_ctx. We then use
+          talloc_free() below to free it. If any of the calls want
+          to keep it, then they will steal it somewhere else, and the
+          talloc_free() will only free the tmp_ctx
+          NOTE(review): the result of talloc_new() is not checked
+          before it is used */
+       tmp_ctx = talloc_new(ctdb);
+       talloc_steal(tmp_ctx, hdr);
+
+       DEBUG(DEBUG_DEBUG,(__location__ " ctdb request %u of type %u length %u from "
+                "node %u to %u\n", hdr->reqid, hdr->operation, hdr->length,
+                hdr->srcnode, hdr->destnode));
+
+       switch (hdr->operation) {
+       case CTDB_REQ_CALL:
+       case CTDB_REPLY_CALL:
+       case CTDB_REQ_DMASTER:
+       case CTDB_REPLY_DMASTER:
+               /* we dont allow these calls when banned; the packet is
+                  dropped without any reply being sent from here */
+               if (ctdb->nodes[ctdb->pnn]->flags & NODE_FLAGS_BANNED) {
+                       DEBUG(DEBUG_DEBUG,(__location__ " ctdb operation %u"
+                               " request %u"
+                               " length %u from node %u to %u while node"
+                               " is banned\n",
+                                hdr->operation, hdr->reqid,
+                                hdr->length, 
+                                hdr->srcnode, hdr->destnode));
+                       goto done;
+               }
+
+               /* for ctdb_call inter-node operations verify that the
+                  remote node that sent us the call is running in the
+                  same generation instance as this node
+               */
+               if (ctdb->vnn_map->generation != hdr->generation) {
+                       DEBUG(DEBUG_DEBUG,(__location__ " ctdb operation %u"
+                               " request %u"
+                               " length %u from node %u to %u had an"
+                               " invalid generation id:%u while our"
+                               " generation id is:%u\n", 
+                                hdr->operation, hdr->reqid,
+                                hdr->length, 
+                                hdr->srcnode, hdr->destnode, 
+                                hdr->generation, ctdb->vnn_map->generation));
+                       goto done;
+               }
+       }
+
+       switch (hdr->operation) {
+       case CTDB_REQ_CALL:
+               CTDB_INCREMENT_STAT(ctdb, node.req_call);
+               ctdb_request_call(ctdb, hdr);
+               break;
+
+       case CTDB_REPLY_CALL:
+               CTDB_INCREMENT_STAT(ctdb, node.reply_call);
+               ctdb_reply_call(ctdb, hdr);
+               break;
+
+       case CTDB_REPLY_ERROR:
+               CTDB_INCREMENT_STAT(ctdb, node.reply_error);
+               ctdb_reply_error(ctdb, hdr);
+               break;
+
+       case CTDB_REQ_DMASTER:
+               CTDB_INCREMENT_STAT(ctdb, node.req_dmaster);
+               ctdb_request_dmaster(ctdb, hdr);
+               break;
+
+       case CTDB_REPLY_DMASTER:
+               CTDB_INCREMENT_STAT(ctdb, node.reply_dmaster);
+               ctdb_reply_dmaster(ctdb, hdr);
+               break;
+
+       case CTDB_REQ_MESSAGE:
+               CTDB_INCREMENT_STAT(ctdb, node.req_message);
+               ctdb_request_message(ctdb, hdr);
+               break;
+
+       case CTDB_REQ_CONTROL:
+               CTDB_INCREMENT_STAT(ctdb, node.req_control);
+               ctdb_request_control(ctdb, hdr);
+               break;
+
+       case CTDB_REPLY_CONTROL:
+               CTDB_INCREMENT_STAT(ctdb, node.reply_control);
+               ctdb_reply_control(ctdb, hdr);
+               break;
+
+       case CTDB_REQ_KEEPALIVE:
+               CTDB_INCREMENT_STAT(ctdb, keepalive_packets_recv);
+               break;
+
+       default:
+               DEBUG(DEBUG_CRIT,("%s: Packet with unknown operation %u\n", 
+                        __location__, hdr->operation));
+               break;
+       }
+
+done:
+       talloc_free(tmp_ctx);
+}
+
+
+/*
+  transport layer callback: the connection to a node has been lost
+
+  A node that is already flagged disconnected is only reported at info
+  level.  Otherwise the connected count is lowered, the node is flagged
+  DISCONNECTED and UNHEALTHY, its rx and dead counters are reset and the
+  controls pending for it are cancelled.  Finally the transport is asked
+  to restart the node, unless the transport methods are already gone.
+*/
+void ctdb_node_dead(struct ctdb_node *node)
+{
+       struct ctdb_context *ctdb = node->ctdb;
+
+       if (node->flags & NODE_FLAGS_DISCONNECTED) {
+               DEBUG(DEBUG_INFO,("%s: node %s is already marked disconnected: %u connected\n", 
+                        ctdb->name, node->name,
+                        ctdb->num_connected));
+               return;
+       }
+
+       ctdb->num_connected--;
+       node->flags |= NODE_FLAGS_DISCONNECTED | NODE_FLAGS_UNHEALTHY;
+       node->rx_cnt = 0;
+       node->dead_count = 0;
+
+       DEBUG(DEBUG_NOTICE,("%s: node %s is dead: %u connected\n", 
+                ctdb->name, node->name, ctdb->num_connected));
+       ctdb_daemon_cancel_controls(ctdb, node);
+
+       if (ctdb->methods != NULL) {
+               ctdb->methods->restart(node);
+               return;
+       }
+
+       DEBUG(DEBUG_ERR,(__location__ " Can not restart transport while shutting down daemon.\n"));
+}
+
+/*
+  transport layer callback: the connection to a node has come up
+
+  A node that is not flagged disconnected is only reported at info
+  level.  Otherwise the connected count is raised, the dead counter is
+  reset, the DISCONNECTED flag is cleared and the node is flagged
+  UNHEALTHY.
+*/
+void ctdb_node_connected(struct ctdb_node *node)
+{
+       struct ctdb_context *ctdb = node->ctdb;
+
+       if ((node->flags & NODE_FLAGS_DISCONNECTED) == 0) {
+               DEBUG(DEBUG_INFO,("%s: node %s is already marked connected: %u connected\n", 
+                        ctdb->name, node->name,
+                        ctdb->num_connected));
+               return;
+       }
+
+       ctdb->num_connected++;
+       node->dead_count = 0;
+
+       /* connected, but flagged unhealthy */
+       node->flags &= ~NODE_FLAGS_DISCONNECTED;
+       node->flags |= NODE_FLAGS_UNHEALTHY;
+
+       DEBUG(DEBUG_NOTICE,
+             ("%s: connected to %s - %u connected\n", 
+              ctdb->name, node->name, ctdb->num_connected));
+}
+
+struct queue_next { /* a packet addressed to ourselves, parked by ctdb_defer_packet() */
+       struct ctdb_context *ctdb; /* context the packet is delivered to */
+       struct ctdb_req_header *hdr; /* private copy of the packet, made with talloc_memdup() */
+};
+
+
+/*
+  timed event: a packet deferred by ctdb_defer_packet() is due
+
+  private_data is the struct queue_next.  The packet is fed into
+  ctdb_input_pkt() and the queue entry is released afterwards.
+ */
+static void queue_next_trigger(struct event_context *ev, struct timed_event *te, 
+                              struct timeval t, void *private_data)
+{
+       struct queue_next *deferred;
+
+       deferred = talloc_get_type(private_data, struct queue_next);
+
+       ctdb_input_pkt(deferred->ctdb, deferred->hdr);
+       talloc_free(deferred);
+}
+
+/*
+  defer a packet, so it is processed on the next event loop
+  this is used for sending packets to ourselves
+
+  The packet is copied (hdr->length bytes), so the caller keeps
+  ownership of hdr.  The copy is handed to ctdb_input_pkt() from a zero
+  timeout timed event.  If the queue entry, the copy or the timed event
+  can not be allocated the packet is dropped with an error message and
+  nothing is left allocated.
+ */
+static void ctdb_defer_packet(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+       struct queue_next *q;
+       q = talloc(ctdb, struct queue_next);
+       if (q == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to allocate deferred packet\n"));
+               return;
+       }
+       q->ctdb = ctdb;
+       q->hdr = talloc_memdup(ctdb, hdr, hdr->length);
+       if (q->hdr == NULL) {
+               DEBUG(DEBUG_ERR,("Error copying deferred packet to self\n"));
+               /* the queue entry would otherwise stay allocated on ctdb
+                  forever */
+               talloc_free(q);
+               return;
+       }
+#if 0
+       /* use this to put packets directly into our recv function */
+       ctdb_input_pkt(q->ctdb, q->hdr);
+#else
+       /* the event is a child of q, so it goes away together with q */
+       if (event_add_timed(ctdb->ev, q, timeval_zero(), queue_next_trigger, q) == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to schedule deferred packet\n"));
+               /* nothing will ever fire for this entry, release both
+                  the copy (a child of ctdb) and the entry */
+               talloc_free(q->hdr);
+               talloc_free(q);
+       }
+#endif
+}
+
+
+/*
+  send a packet to every node in the node table that is not deleted
+
+  hdr->destnode is rewritten for each recipient and is left at the last
+  one afterwards.
+*/
+static void ctdb_broadcast_packet_all(struct ctdb_context *ctdb, 
+                                     struct ctdb_req_header *hdr)
+{
+       int n;
+
+       for (n = 0; n < ctdb->num_nodes; n++) {
+               if (!(ctdb->nodes[n]->flags & NODE_FLAGS_DELETED)) {
+                       hdr->destnode = ctdb->nodes[n]->pnn;
+                       ctdb_queue_packet(ctdb, hdr);
+               }
+       }
+}
+
+/*
+  send a packet to every node listed in the current vnnmap
+
+  hdr->destnode is rewritten for each recipient and is left at the last
+  one afterwards.
+*/
+static void ctdb_broadcast_packet_vnnmap(struct ctdb_context *ctdb, 
+                                        struct ctdb_req_header *hdr)
+{
+       int slot = 0;
+
+       while (slot < ctdb->vnn_map->size) {
+               hdr->destnode = ctdb->vnn_map->map[slot];
+               ctdb_queue_packet(ctdb, hdr);
+               slot++;
+       }
+}
+
+/*
+  send a packet to every node that is neither deleted nor disconnected
+
+  hdr->destnode is rewritten for each recipient and is left at the last
+  one afterwards.
+*/
+static void ctdb_broadcast_packet_connected(struct ctdb_context *ctdb, 
+                                           struct ctdb_req_header *hdr)
+{
+       int n;
+
+       for (n = 0; n < ctdb->num_nodes; n++) {
+               struct ctdb_node *peer = ctdb->nodes[n];
+
+               if (peer->flags & NODE_FLAGS_DELETED) {
+                       continue;
+               }
+               if (peer->flags & NODE_FLAGS_DISCONNECTED) {
+                       continue;
+               }
+               hdr->destnode = peer->pnn;
+               ctdb_queue_packet(ctdb, hdr);
+       }
+}
+
+/*
+  queue a packet or die
+
+  The three broadcast pseudo destinations are expanded by the matching
+  broadcast helper, which calls back in here once per recipient.  For a
+  single destination the packet is counted, then dropped with a log
+  message if the pnn is not valid, the node is deleted or the transport
+  is down.  A packet addressed to this node itself is deferred to the
+  next event loop pass.  Anything else goes to the transport; a
+  transport that refuses the packet is fatal.
+*/
+void ctdb_queue_packet(struct ctdb_context *ctdb, struct ctdb_req_header *hdr)
+{
+       struct ctdb_node *dest;
+
+       if (hdr->destnode == CTDB_BROADCAST_ALL) {
+               ctdb_broadcast_packet_all(ctdb, hdr);
+               return;
+       }
+       if (hdr->destnode == CTDB_BROADCAST_VNNMAP) {
+               ctdb_broadcast_packet_vnnmap(ctdb, hdr);
+               return;
+       }
+       if (hdr->destnode == CTDB_BROADCAST_CONNECTED) {
+               ctdb_broadcast_packet_connected(ctdb, hdr);
+               return;
+       }
+
+       /* counted even if the packet ends up being dropped below */
+       CTDB_INCREMENT_STAT(ctdb, node_packets_sent);
+
+       if (!ctdb_validate_pnn(ctdb, hdr->destnode)) {
+               DEBUG(DEBUG_CRIT,(__location__ " cant send to node %u that does not exist\n", 
+                        hdr->destnode));
+               return;
+       }
+
+       dest = ctdb->nodes[hdr->destnode];
+
+       if (dest->flags & NODE_FLAGS_DELETED) {
+               DEBUG(DEBUG_ERR, (__location__ " Can not queue packet to DELETED node %d\n", hdr->destnode));
+               return;
+       }
+
+       /* packets to ourselves never touch the transport */
+       if (dest->pnn == ctdb->pnn) {
+               ctdb_defer_packet(ctdb, hdr);
+               return;
+       }
+
+       if (ctdb->methods == NULL) {
+               DEBUG(DEBUG_ALERT, (__location__ " Can not queue packet. "
+                                   "Transport is DOWN\n"));
+               return;
+       }
+
+       dest->tx_cnt++;
+       if (ctdb->methods->queue_pkt(dest, (uint8_t *)hdr, hdr->length) != 0) {
+               ctdb_fatal(ctdb, "Unable to queue packet\n");
+       }
+}
+
+
+
+
+/*
+  a valgrind hack to allow us to get opcode specific backtraces
+  very ugly, and relies on no compiler optimisation!
+
+  Every case, and the default, makes the very same ctdb_queue_packet()
+  call; the only thing that differs between opcodes 1..100 is the source
+  line the call is made from.  DO_OP is defined inside the switch and is
+  not #undef'd afterwards.
+*/
+void ctdb_queue_packet_opcode(struct ctdb_context *ctdb, struct ctdb_req_header *hdr, unsigned opcode)
+{
+       switch (opcode) {
+#define DO_OP(x) case x: ctdb_queue_packet(ctdb, hdr); break
+               DO_OP(1);
+               DO_OP(2);
+               DO_OP(3);
+               DO_OP(4);
+               DO_OP(5);
+               DO_OP(6);
+               DO_OP(7);
+               DO_OP(8);
+               DO_OP(9);
+               DO_OP(10);
+               DO_OP(11);
+               DO_OP(12);
+               DO_OP(13);
+               DO_OP(14);
+               DO_OP(15);
+               DO_OP(16);
+               DO_OP(17);
+               DO_OP(18);
+               DO_OP(19);
+               DO_OP(20);
+               DO_OP(21);
+               DO_OP(22);
+               DO_OP(23);
+               DO_OP(24);
+               DO_OP(25);
+               DO_OP(26);
+               DO_OP(27);
+               DO_OP(28);
+               DO_OP(29);
+               DO_OP(30);
+               DO_OP(31);
+               DO_OP(32);
+               DO_OP(33);
+               DO_OP(34);
+               DO_OP(35);
+               DO_OP(36);
+               DO_OP(37);
+               DO_OP(38);
+               DO_OP(39);
+               DO_OP(40);
+               DO_OP(41);
+               DO_OP(42);
+               DO_OP(43);
+               DO_OP(44);
+               DO_OP(45);
+               DO_OP(46);
+               DO_OP(47);
+               DO_OP(48);
+               DO_OP(49);
+               DO_OP(50);
+               DO_OP(51);
+               DO_OP(52);
+               DO_OP(53);
+               DO_OP(54);
+               DO_OP(55);
+               DO_OP(56);
+               DO_OP(57);
+               DO_OP(58);
+               DO_OP(59);
+               DO_OP(60);
+               DO_OP(61);
+               DO_OP(62);
+               DO_OP(63);
+               DO_OP(64);
+               DO_OP(65);
+               DO_OP(66);
+               DO_OP(67);
+               DO_OP(68);
+               DO_OP(69);
+               DO_OP(70);
+               DO_OP(71);
+               DO_OP(72);
+               DO_OP(73);
+               DO_OP(74);
+               DO_OP(75);
+               DO_OP(76);
+               DO_OP(77);
+               DO_OP(78);
+               DO_OP(79);
+               DO_OP(80);
+               DO_OP(81);
+               DO_OP(82);
+               DO_OP(83);
+               DO_OP(84);
+               DO_OP(85);
+               DO_OP(86);
+               DO_OP(87);
+               DO_OP(88);
+               DO_OP(89);
+               DO_OP(90);
+               DO_OP(91);
+               DO_OP(92);
+               DO_OP(93);
+               DO_OP(94);
+               DO_OP(95);
+               DO_OP(96);
+               DO_OP(97);
+               DO_OP(98);
+               DO_OP(99);
+               DO_OP(100);
+       default: 
+               ctdb_queue_packet(ctdb, hdr);
+               break;
+       }
+}
diff --git a/ctdb/server/ctdb_serverids.c b/ctdb/server/ctdb_serverids.c
new file mode 100644 (file)
index 0000000..dba25ed
--- /dev/null
@@ -0,0 +1,191 @@
+/* 
+   ctdb_control protocol code to manage server ids
+
+   Copyright (C) Ronnie Sahlberg 2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "includes.h"
+#include "../include/ctdb_private.h"
+#include "../common/rb_tree.h"
+
+
+#define SERVER_ID_KEY_SIZE 3
+/*
+  build the tree key { type, pnn, server_id } for a server id
+
+  The key lives in a single static buffer: every call overwrites the
+  result of the previous one, so the returned pointer has to be consumed
+  before the next call.
+*/
+static uint32_t *get_server_id_key(struct ctdb_server_id *server_id)
+{
+       static uint32_t key[SERVER_ID_KEY_SIZE];
+       uint32_t *slot = key;
+
+       *slot++ = server_id->type;
+       *slot++ = server_id->pnn;
+       *slot   = server_id->server_id;
+
+       return key;
+}
+
+/* insert callback used when adding a server_id to the tree.
+   'data' is what the tree already holds for the key, 'parm' is the
+   candidate.  With nothing stored yet the candidate is returned, and
+   that is what gets inserted.  Otherwise this is a duplicate: the
+   candidate is talloc_free'd and the existing entry is returned, so it
+   stays in the tree.
+*/
+static void *add_server_id_callback(void *parm, void *data)
+{
+       if (data == NULL) {
+               return parm;
+       }
+
+       talloc_free(parm);
+       return data;
+}
+
+/*
+  register a server id
+  a serverid that is registered with ctdb will be automatically
+  unregistered once the client domain socket disappears.
+
+  client_id must resolve to a local struct ctdb_client; if it does not,
+  the control fails with 1.  Returns 0 once the server id has been
+  handed to the tree.
+ */
+int32_t ctdb_control_register_server_id(struct ctdb_context *ctdb, 
+                                uint32_t client_id,
+                                TDB_DATA indata)
+{
+       struct ctdb_client *owner;
+       struct ctdb_server_id *server_id;
+
+       owner = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
+       if (owner == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Could not find client parent structure. You can not send this control to a remote node\n"));
+               return 1;
+       }
+
+       /* the entry is a talloc child of the client, so it is destroyed
+          together with the client.
+          NOTE(review): per the original author's note a freed entry
+          also leaves the tree by itself; no destructor is visible in
+          this function - presumably the tree code takes care of it.
+       */
+       server_id = talloc_zero(owner, struct ctdb_server_id);
+       CTDB_NO_MEMORY(ctdb, server_id);
+       memcpy(server_id, indata.dptr, sizeof(struct ctdb_server_id));
+
+       /* a duplicate is freed again by add_server_id_callback() */
+       trbt_insertarray32_callback(ctdb->server_ids, SERVER_ID_KEY_SIZE,
+                                   get_server_id_key(server_id),
+                                   add_server_id_callback, server_id);
+
+       return 0;
+}
+
+
+/*
+  check whether a server id exists
+
+  indata.dptr is read as a struct ctdb_server_id.  Returns 1 if the tree
+  holds an entry for it and 0 if it does not.
+ */
+int32_t ctdb_control_check_server_id(struct ctdb_context *ctdb, 
+                                TDB_DATA indata)
+{
+       struct ctdb_server_id *wanted = (struct ctdb_server_id *)indata.dptr;
+
+       if (trbt_lookuparray32(ctdb->server_ids,
+                              SERVER_ID_KEY_SIZE,
+                              get_server_id_key(wanted)) == NULL) {
+               return 0;
+       }
+
+       return 1;
+}
+
+/*
+  unregisters a server id
+
+  indata.dptr is read as a struct ctdb_server_id.  Whatever the lookup
+  returns for it, NULL included, is passed to talloc_free().  Always
+  returns 0.
+ */
+int32_t ctdb_control_unregister_server_id(struct ctdb_context *ctdb, 
+                                TDB_DATA indata)
+{
+       struct ctdb_server_id *wanted = (struct ctdb_server_id *)indata.dptr;
+       uint32_t *key = get_server_id_key(wanted);
+
+       talloc_free(trbt_lookuparray32(ctdb->server_ids,
+                                      SERVER_ID_KEY_SIZE, key));
+
+       return 0;
+}
+
+
+
+
+struct count_server_ids { /* traverse state shared by server_id_count() and server_id_store() */
+       int count; /* entries counted in the first pass, next free slot in the second */
+       struct ctdb_server_id_list *list; /* reply buffer being filled; only set for the second pass */
+};
+
+/*
+  traverse callback: count one tree entry
+
+  param is the struct count_server_ids; data is not looked at.  Returns
+  0, or -1 if param is not of the expected talloc type.
+*/
+static int server_id_count(void *param, void *data)
+{
+       struct count_server_ids *tally;
+
+       tally = talloc_get_type(param, struct count_server_ids);
+       if (tally == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " Got null pointer for svid\n"));
+               return -1;
+       }
+
+       tally->count += 1;
+
+       return 0;
+}
+
+/*
+  traverse callback: copy one tree entry into the reply list
+
+  param is the struct count_server_ids whose list has been sized by a
+  preceding counting pass; data is the struct ctdb_server_id held by the
+  tree.  The entry is copied into the next free slot.
+
+  Returns 0 on success and -1 if either pointer is not of the expected
+  talloc type or if the tree holds more entries than were counted.
+*/
+static int server_id_store(void *param, void *data)
+{
+       struct count_server_ids *svid = talloc_get_type(param, 
+                                               struct count_server_ids);
+       struct ctdb_server_id *server_id = talloc_get_type(data, 
+                                               struct ctdb_server_id);
+
+       if (svid == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " Got null pointer for svid\n"));
+               return -1;
+       }
+
+       /* talloc_get_type() yields NULL on a type mismatch; check it
+          like svid above instead of handing NULL to memcpy() */
+       if (server_id == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " Got null pointer for server_id\n"));
+               return -1;
+       }
+
+       if (svid->count >= svid->list->num) {
+               DEBUG(DEBUG_ERR, (__location__ " size of server id tree changed during traverse\n"));
+               return -1;
+       }
+
+       memcpy(&svid->list->server_ids[svid->count], server_id, sizeof(struct ctdb_server_id));
+       svid->count++;
+       return 0;
+}
+
+/*
+   returns a list of all registered server ids for a node
+
+   The tree is walked twice: once to count the entries and size the
+   reply, once to copy them.  The reply is a struct ctdb_server_id_list
+   allocated on outdata, with outdata->dsize covering the list header
+   plus one struct ctdb_server_id per entry.  Returns 0; allocation
+   failures are handled by CTDB_NO_MEMORY().  The results of the two
+   traversals are not examined.
+*/
+int32_t ctdb_control_get_server_id_list(struct ctdb_context *ctdb, TDB_DATA *outdata)
+{
+       struct count_server_ids *svid;
+
+       svid = talloc_zero(outdata, struct count_server_ids);
+       CTDB_NO_MEMORY(ctdb, svid);
+
+       /* pass 1: how many entries are there? */
+       trbt_traversearray32(ctdb->server_ids, SERVER_ID_KEY_SIZE,
+                            server_id_count, svid);
+
+       outdata->dsize = offsetof(struct ctdb_server_id_list, server_ids)
+                        + sizeof(struct ctdb_server_id) * svid->count;
+       outdata->dptr  = talloc_size(outdata, outdata->dsize);
+       CTDB_NO_MEMORY(ctdb, outdata->dptr);
+
+       /* pass 2: copy them out; count restarts at 0 and doubles as the
+          write index */
+       svid->list = (struct ctdb_server_id_list *)(outdata->dptr);
+       svid->list->num = svid->count;
+       svid->count = 0;
+       trbt_traversearray32(ctdb->server_ids, SERVER_ID_KEY_SIZE,
+                            server_id_store, svid);
+
+       return 0;
+}
diff --git a/ctdb/server/ctdb_statistics.c b/ctdb/server/ctdb_statistics.c
new file mode 100644 (file)
index 0000000..96aad78
--- /dev/null
@@ -0,0 +1,78 @@
+/* 
+   ctdb statistics code
+
+   Copyright (C) Ronnie Sahlberg 2010
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include <string.h>
+#include "../include/ctdb_private.h"
+
+/*
+   timed event handler: move the current statistics sample to the front
+   of the history, start a new empty sample and re-arm the timer
+*/
+static void ctdb_statistics_update(struct event_context *ev, struct timed_event *te, 
+                                  struct timeval t, void *p)
+{
+       struct ctdb_context *ctdb = talloc_get_type(p, struct ctdb_context);
+
+       /* shift every history entry one slot back; the last one falls off */
+       memmove(&ctdb->statistics_history[1],
+               &ctdb->statistics_history[0],
+               (MAX_STAT_HISTORY-1)*sizeof(struct ctdb_statistics));
+
+       /* slot 0 becomes a time-stamped copy of the sample just finished */
+       memcpy(&ctdb->statistics_history[0],
+              &ctdb->statistics_current,
+              sizeof(struct ctdb_statistics));
+       ctdb->statistics_history[0].statistics_current_time = timeval_current();
+
+       /* wipe the current sample and stamp its new start time */
+       memset(&ctdb->statistics_current, 0, sizeof(struct ctdb_statistics));
+       ctdb->statistics_current.statistics_start_time = timeval_current();
+
+       /* schedule the next rotation */
+       event_add_timed(ctdb->ev, ctdb,
+                       timeval_current_ofs(ctdb->tunable.stat_history_interval, 0),
+                       ctdb_statistics_update, ctdb);
+}
+
+/*
+   reset the total, current and historic statistics and schedule the
+   first run of ctdb_statistics_update.  Always returns 0.
+*/
+int ctdb_statistics_init(struct ctdb_context *ctdb)
+{
+       /* totals since start */
+       memset(&ctdb->statistics, 0, sizeof(struct ctdb_statistics));
+       ctdb->statistics.statistics_start_time = timeval_current();
+
+       /* sample for the current interval */
+       memset(&ctdb->statistics_current, 0, sizeof(struct ctdb_statistics));
+       ctdb->statistics_current.statistics_start_time = timeval_current();
+
+       /* no history yet */
+       memset(ctdb->statistics_history, 0, sizeof(ctdb->statistics_history));
+
+       event_add_timed(ctdb->ev, ctdb,
+                       timeval_current_ofs(ctdb->tunable.stat_history_interval, 0),
+                       ctdb_statistics_update, ctdb);
+       return 0;
+}
+
+
+/*
+   control handler: hand back the whole statistics history
+
+   The reply is a struct ctdb_statistics_wire allocated on outdata with
+   num set to MAX_STAT_HISTORY followed by a copy of
+   ctdb->statistics_history.  Returns 0 on success, -1 if the reply
+   cannot be allocated.
+*/
+int32_t ctdb_control_get_stat_history(struct ctdb_context *ctdb, 
+                                     struct ctdb_req_control *c,
+                                     TDB_DATA *outdata)
+{
+       struct ctdb_statistics_wire *wire;
+       int wire_len = offsetof(struct ctdb_statistics_wire, stats)
+                      + MAX_STAT_HISTORY*sizeof(struct ctdb_statistics);
+
+       wire = talloc_size(outdata, wire_len);
+       if (wire == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to allocate statistics history structure\n"));
+               return -1;
+       }
+
+       wire->num = MAX_STAT_HISTORY;
+       memcpy(&wire->stats[0], &ctdb->statistics_history[0],
+              sizeof(ctdb->statistics_history));
+
+       outdata->dptr  = (uint8_t *)wire;
+       outdata->dsize = wire_len;
+
+       return 0;
+}
diff --git a/ctdb/server/ctdb_takeover.c b/ctdb/server/ctdb_takeover.c
new file mode 100644 (file)
index 0000000..91f3030
--- /dev/null
@@ -0,0 +1,4638 @@
+/* 
+   ctdb ip takeover code
+
+   Copyright (C) Ronnie Sahlberg  2007
+   Copyright (C) Andrew Tridgell  2007
+   Copyright (C) Martin Schwenke  2011
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "includes.h"
+#include "tdb.h"
+#include "lib/util/dlinklist.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "../include/ctdb_private.h"
+#include "../common/rb_tree.h"
+
+
+/* Expands to a timeval offset by the takeover_timeout tunable.  The
+ * expansion refers to a variable named ctdb, which must be in scope
+ * wherever the macro is used. */
+#define TAKEOVER_TIMEOUT() timeval_current_ofs(ctdb->tunable.takeover_timeout,0)
+
+/* Delay handed to timeval_current_ofs() each time
+ * ctdb_control_send_arp() re-arms itself. */
+#define CTDB_ARP_INTERVAL 1
+/* ctdb_control_send_arp() stops re-arming once it has run this many times. */
+#define CTDB_ARP_REPEAT   3
+
+/* Flags used in IP allocation algorithms. */
+struct ctdb_ipflags {
+       /* NOTE(review): presumably "this node must not take over IPs" -
+        * confirm against the code that sets and reads the flag */
+       bool noiptakeover;
+       /* NOTE(review): presumably "this node must not host IPs" -
+        * confirm against the code that sets and reads the flag */
+       bool noiphost;
+};
+
+/* One local network interface, kept on the ctdb->ifaces list. */
+struct ctdb_iface {
+       /* list linkage for ctdb->ifaces */
+       struct ctdb_iface *prev, *next;
+       /* interface name, allocated as a talloc child of this struct */
+       const char *name;
+       /* interfaces with link_up false are skipped when choosing an
+        * interface for a public address */
+       bool link_up;
+       /* number of vnns currently assigned to this interface; raised in
+        * ctdb_vnn_assign_iface() and lowered in ctdb_vnn_unassign_iface() */
+       uint32_t references;
+};
+
+/* Name of the interface a vnn is currently assigned to, or the
+ * placeholder "__none__" when no interface is assigned. */
+static const char *ctdb_vnn_iface_string(const struct ctdb_vnn *vnn)
+{
+       return (vnn->iface != NULL) ? vnn->iface->name : "__none__";
+}
+
+/*
+ * Register a local interface name on ctdb->ifaces.
+ *
+ * Does nothing when an interface with that name is already listed.
+ * Returns 0 on success, including the already-present case.
+ * Allocation failures are handled by the CTDB_NO_MEMORY* macros.
+ */
+static int ctdb_add_local_iface(struct ctdb_context *ctdb, const char *iface)
+{
+       struct ctdb_iface *i;
+       const char *name;
+
+       /* Verify that we don't have an entry for this interface yet */
+       for (i=ctdb->ifaces;i;i=i->next) {
+               if (strcmp(i->name, iface) == 0) {
+                       return 0;
+               }
+       }
+
+       /* create a new structure for this interface */
+       i = talloc_zero(ctdb, struct ctdb_iface);
+       CTDB_NO_MEMORY_FATAL(ctdb, i);
+       name = talloc_strdup(i, iface);
+       if (name == NULL) {
+               /*
+                * The entry is not on the list yet, so nothing else
+                * would ever free it: release it before bailing out.
+                */
+               talloc_free(i);
+       }
+       CTDB_NO_MEMORY(ctdb, name);
+       i->name = name;
+       /*
+        * If link_up defaults to true then IPs can be allocated to a
+        * node during the first recovery.  However, then an interface
+        * could have its link marked down during the startup event,
+        * causing the IP to move almost immediately.  If link_up
+        * defaults to false then, during normal operation, IPs added
+        * to a new interface can't be assigned until a monitor cycle
+        * has occurred and marked the new interfaces up.  This makes
+        * IP allocation unpredictable.  The following is a neat
+        * compromise: early in startup link_up defaults to false, so
+        * IPs can't be assigned, and after startup IPs can be
+        * assigned immediately.
+        */
+       i->link_up = (ctdb->runstate == CTDB_RUNSTATE_RUNNING);
+
+       DLIST_ADD(ctdb->ifaces, i);
+
+       return 0;
+}
+
+/* Does the NULL-terminated vnn->ifaces name list contain the given name? */
+static bool vnn_has_interface_with_name(struct ctdb_vnn *vnn,
+                                       const char *name)
+{
+       int idx = 0;
+
+       while (vnn->ifaces[idx] != NULL) {
+               if (strcmp(name, vnn->ifaces[idx]) == 0) {
+                       return true;
+               }
+               idx++;
+       }
+
+       return false;
+}
+
+/* If any interfaces now have no possible IPs then delete them.  This
+ * implementation is naive (i.e. simple) rather than clever
+ * (i.e. complex).  Given that this is run on delip and that operation
+ * is rare, this doesn't need to be efficient - it needs to be
+ * foolproof.  One alternative is reference counting, where the logic
+ * is distributed and can, therefore, be broken in multiple places.
+ * Another alternative is to build a red-black tree of interfaces that
+ * can have addresses (by walking ctdb->vnn and ctdb->single_ip_vnn
+ * once) and then walking ctdb->ifaces once and deleting those not in
+ * the tree.  Let's go to one of those if the naive implementation
+ * causes problems...  :-)
+ *
+ * Only interfaces named in the given vnn are candidates.  An orphaned
+ * interface is unlinked from ctdb->ifaces and reparented onto mem_ctx,
+ * so it is released when the caller frees mem_ctx.
+ */
+static void ctdb_remove_orphaned_ifaces(struct ctdb_context *ctdb,
+                                       struct ctdb_vnn *vnn,
+                                       TALLOC_CTX *mem_ctx)
+{
+       struct ctdb_iface *i, *next;
+
+       /* For each interface, check if there's an IP using it. */
+       for(i=ctdb->ifaces; i; i=next) {
+               struct ctdb_vnn *tv;
+               bool found;
+
+               /*
+                * Remember the successor now: once i has been removed
+                * from the list its own links can no longer be used to
+                * continue the walk.
+                */
+               next = i->next;
+
+               /* Only consider interfaces named in the given VNN. */
+               if (!vnn_has_interface_with_name(vnn, i->name)) {
+                       continue;
+               }
+
+               /* Is the "single IP" on this interface? */
+               if ((ctdb->single_ip_vnn != NULL) &&
+                   (ctdb->single_ip_vnn->ifaces[0] != NULL) &&
+                   (strcmp(i->name, ctdb->single_ip_vnn->ifaces[0]) == 0)) {
+                       /* Found, next interface please... */
+                       continue;
+               }
+               /* Search for a vnn with this interface. */
+               found = false;
+               for (tv=ctdb->vnn; tv; tv=tv->next) {
+                       if (vnn_has_interface_with_name(tv, i->name)) {
+                               found = true;
+                               break;
+                       }
+               }
+
+               if (!found) {
+                       /* None of the VNNs are using this interface. */
+                       DLIST_REMOVE(ctdb->ifaces, i);
+                       /* Caller will free mem_ctx when convenient. */
+                       talloc_steal(mem_ctx, i);
+               }
+       }
+}
+
+
+/* Look up an interface on ctdb->ifaces by name; NULL when not listed. */
+static struct ctdb_iface *ctdb_find_iface(struct ctdb_context *ctdb,
+                                         const char *iface)
+{
+       struct ctdb_iface *cur = ctdb->ifaces;
+
+       /* stop at the first name match or at the end of the list */
+       while (cur != NULL && strcmp(cur->name, iface) != 0) {
+               cur = cur->next;
+       }
+
+       return cur;
+}
+
+/*
+ * Pick the interface a vnn should use: among the interfaces named in
+ * vnn->ifaces that are known to the daemon and have link_up set, the
+ * one with the fewest references wins, and the earliest listed wins a
+ * tie.  Returns NULL when no interface qualifies.
+ */
+static struct ctdb_iface *ctdb_vnn_best_iface(struct ctdb_context *ctdb,
+                                             struct ctdb_vnn *vnn)
+{
+       struct ctdb_iface *best = NULL;
+       int n;
+
+       for (n = 0; vnn->ifaces[n] != NULL; n++) {
+               struct ctdb_iface *cand = ctdb_find_iface(ctdb, vnn->ifaces[n]);
+
+               /* unknown or down interfaces are never candidates */
+               if (cand == NULL || !cand->link_up) {
+                       continue;
+               }
+
+               if (best == NULL || cand->references < best->references) {
+                       best = cand;
+               }
+       }
+
+       return best;
+}
+
+/*
+ * Give a vnn an interface if it does not have one yet.
+ *
+ * A vnn that already has an interface is left alone.  Otherwise the
+ * best interface is chosen with ctdb_vnn_best_iface(), its reference
+ * count is raised and vnn->pnn is set to the local node.  Returns 0 on
+ * success (including "already assigned") and -1 when no usable
+ * interface exists.
+ */
+static int32_t ctdb_vnn_assign_iface(struct ctdb_context *ctdb,
+                                    struct ctdb_vnn *vnn)
+{
+       struct ctdb_iface *best = NULL;
+
+       if (vnn->iface) {
+               DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
+                                  "still assigned to iface '%s'\n",
+                                  ctdb_addr_to_str(&vnn->public_address),
+                                  ctdb_vnn_iface_string(vnn)));
+               return 0;
+       }
+
+       best = ctdb_vnn_best_iface(ctdb, vnn);
+       if (best == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " public address '%s' "
+                                 "cannot assign to iface any iface\n",
+                                 ctdb_addr_to_str(&vnn->public_address)));
+               return -1;
+       }
+
+       vnn->iface = best;
+       best->references++;
+       vnn->pnn = ctdb->pnn;
+
+       /* references is a uint32_t, so it is printed with %u */
+       DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
+                          "now assigned to iface '%s' refs[%u]\n",
+                          ctdb_addr_to_str(&vnn->public_address),
+                          ctdb_vnn_iface_string(vnn),
+                          best->references));
+       return 0;
+}
+
+/*
+ * Detach a vnn from its interface.
+ *
+ * Drops one reference on the interface (if any), clears vnn->iface and,
+ * when the vnn was recorded as held by the local node, resets vnn->pnn
+ * to -1.
+ */
+static void ctdb_vnn_unassign_iface(struct ctdb_context *ctdb,
+                                   struct ctdb_vnn *vnn)
+{
+       /* references is a uint32_t, so it is printed with %u */
+       DEBUG(DEBUG_INFO, (__location__ " public address '%s' "
+                          "now unassigned (old iface '%s' refs[%u])\n",
+                          ctdb_addr_to_str(&vnn->public_address),
+                          ctdb_vnn_iface_string(vnn),
+                          vnn->iface?vnn->iface->references:0));
+       if (vnn->iface) {
+               vnn->iface->references--;
+       }
+       vnn->iface = NULL;
+       if (vnn->pnn == ctdb->pnn) {
+               vnn->pnn = -1;
+       }
+}
+
+/*
+ * Can this vnn be hosted right now?  True when its assigned interface
+ * has link, or when any interface named in vnn->ifaces is known to the
+ * daemon and has link.
+ */
+static bool ctdb_vnn_available(struct ctdb_context *ctdb,
+                              struct ctdb_vnn *vnn)
+{
+       int n;
+
+       if (vnn->iface != NULL && vnn->iface->link_up) {
+               return true;
+       }
+
+       for (n = 0; vnn->ifaces[n] != NULL; n++) {
+               struct ctdb_iface *cand = ctdb_find_iface(ctdb, vnn->ifaces[n]);
+
+               if (cand != NULL && cand->link_up) {
+                       return true;
+               }
+       }
+
+       return false;
+}
+
+/* State carried across the repeated runs of ctdb_control_send_arp()
+ * for one public address. */
+struct ctdb_takeover_arp {
+       struct ctdb_context *ctdb;
+       /* number of times ctdb_control_send_arp() has run so far */
+       uint32_t count;
+       /* the address being announced (copied from vnn->public_address) */
+       ctdb_sock_addr addr;
+       /* connections to send tcp tickle acks for; may be NULL */
+       struct ctdb_tcp_array *tcparray;
+       /* the vnn the announcement belongs to */
+       struct ctdb_vnn *vnn;
+};
+
+
+/*
+  lists of tcp endpoints
+
+  doubly linked list element holding one struct ctdb_tcp_connection
+ */
+struct ctdb_tcp_list {
+       struct ctdb_tcp_list *prev, *next;
+       struct ctdb_tcp_connection connection;
+};
+
+/*
+  list of clients to kill on IP release
+
+  entries live on ctdb->client_ip_list and are walked by
+  release_kill_clients()
+ */
+struct ctdb_client_ip {
+       struct ctdb_client_ip *prev, *next;
+       struct ctdb_context *ctdb;
+       /* address the client is registered with */
+       ctdb_sock_addr addr;
+       /* id used with ctdb_reqid_find() to look up the struct ctdb_client */
+       uint32_t client_id;
+};
+
+
+/*
+  send a gratuitous arp
+
+  timed event handler: announces arp->addr on the interface of arp->vnn,
+  sends a tcp tickle ack for every connection in arp->tcparray, and then
+  either re-arms itself or, after CTDB_ARP_REPEAT runs, frees arp
+ */
+static void ctdb_control_send_arp(struct event_context *ev, struct timed_event *te, 
+                                 struct timeval t, void *private_data)
+{
+       struct ctdb_takeover_arp *arp = talloc_get_type(private_data, 
+                                                       struct ctdb_takeover_arp);
+       const char *iface = ctdb_vnn_iface_string(arp->vnn);
+       struct ctdb_tcp_array *tickles = arp->tcparray;
+       int n, ret;
+
+       ret = ctdb_sys_send_arp(&arp->addr, iface);
+       if (ret != 0) {
+               DEBUG(DEBUG_CRIT,(__location__ " sending of arp failed on iface '%s' (%s)\n",
+                                 iface, strerror(errno)));
+       }
+
+       if (tickles != NULL) {
+               for (n = 0; n < tickles->num; n++) {
+                       struct ctdb_tcp_connection *conn = &tickles->connections[n];
+
+                       DEBUG(DEBUG_INFO,("sending tcp tickle ack for %u->%s:%u\n",
+                               (unsigned)ntohs(conn->dst_addr.ip.sin_port), 
+                               ctdb_addr_to_str(&conn->src_addr),
+                               (unsigned)ntohs(conn->src_addr.ip.sin_port)));
+
+                       ret = ctdb_sys_send_tcp(&conn->src_addr,
+                                               &conn->dst_addr,
+                                               0, 0, 0);
+                       if (ret != 0) {
+                               DEBUG(DEBUG_CRIT,(__location__ " Failed to send tcp tickle ack for %s\n",
+                                       ctdb_addr_to_str(&conn->src_addr)));
+                       }
+               }
+       }
+
+       arp->count++;
+
+       /* enough announcements sent: drop the state and stop re-arming */
+       if (arp->count == CTDB_ARP_REPEAT) {
+               talloc_free(arp);
+               return;
+       }
+
+       event_add_timed(arp->ctdb->ev, arp->vnn->takeover_ctx, 
+                       timeval_current_ofs(CTDB_ARP_INTERVAL, 100000), 
+                       ctdb_control_send_arp, arp);
+}
+
+/*
+ * Start announcing a vnn's public address: allocate the arp state on
+ * vnn->takeover_ctx (created on demand), move the vnn's tcp connection
+ * array into it and schedule ctdb_control_send_arp() to run at once.
+ * Returns 0 on success and -1 on allocation failure.
+ */
+static int32_t ctdb_announce_vnn_iface(struct ctdb_context *ctdb,
+                                      struct ctdb_vnn *vnn)
+{
+       struct ctdb_takeover_arp *arp;
+
+       if (vnn->takeover_ctx == NULL) {
+               vnn->takeover_ctx = talloc_new(vnn);
+               if (vnn->takeover_ctx == NULL) {
+                       return -1;
+               }
+       }
+
+       arp = talloc_zero(vnn->takeover_ctx, struct ctdb_takeover_arp);
+       if (arp == NULL) {
+               return -1;
+       }
+
+       arp->ctdb = ctdb;
+       arp->vnn  = vnn;
+       arp->addr = vnn->public_address;
+
+       if (vnn->tcp_array != NULL) {
+               /* add all of the known tcp connections for this IP to the
+                  list of tcp connections to send tickle acks for */
+               arp->tcparray = talloc_steal(arp, vnn->tcp_array);
+
+               vnn->tcp_array = NULL;
+               vnn->tcp_update_needed = true;
+       }
+
+       event_add_timed(ctdb->ev, vnn->takeover_ctx,
+                       timeval_zero(), ctdb_control_send_arp, arp);
+
+       return 0;
+}
+
+/* Private data of release_ip_callback().
+ * NOTE(review): the fields presumably mirror ctdb_do_takeip_state
+ * (pending control, address, vnn) - confirm against the release path. */
+struct takeover_callback_state {
+       struct ctdb_req_control *c;
+       ctdb_sock_addr *addr;
+       struct ctdb_vnn *vnn;
+};
+
+/* State handed from ctdb_do_takeip() to ctdb_do_takeip_callback(). */
+struct ctdb_do_takeip_state {
+       /* the control request that gets the reply once the event finishes */
+       struct ctdb_req_control *c;
+       /* the public address being taken over */
+       struct ctdb_vnn *vnn;
+};
+
+/*
+  called when takeip event finishes
+
+  on failure the control is answered with the event status and the local
+  node is flagged unhealthy (and banned if the event timed out); on
+  success the address is announced when do_checkpublicip is set, a
+  CTDB_SRVID_TAKE_IP message is sent and the control is answered with 0
+ */
+static void ctdb_do_takeip_callback(struct ctdb_context *ctdb, int status,
+                                   void *private_data)
+{
+       struct ctdb_do_takeip_state *state =
+               talloc_get_type(private_data, struct ctdb_do_takeip_state);
+       TDB_DATA data;
+
+       if (status != 0) {
+               struct ctdb_node *node = ctdb->nodes[ctdb->pnn];
+
+               if (status == -ETIME) {
+                       ctdb_ban_self(ctdb);
+               }
+               DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
+                                ctdb_addr_to_str(&state->vnn->public_address),
+                                ctdb_vnn_iface_string(state->vnn)));
+               ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
+
+               node->flags |= NODE_FLAGS_UNHEALTHY;
+               talloc_free(state);
+               return;
+       }
+
+       /* announcing is only attempted when public ip checking is enabled */
+       if (ctdb->do_checkpublicip &&
+           ctdb_announce_vnn_iface(ctdb, state->vnn) != 0) {
+               ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
+               talloc_free(state);
+               return;
+       }
+
+       data.dptr  = (uint8_t *)ctdb_addr_to_str(&state->vnn->public_address);
+       data.dsize = strlen((char *)data.dptr) + 1;
+       DEBUG(DEBUG_INFO,(__location__ " sending TAKE_IP for '%s'\n", data.dptr));
+
+       ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_TAKE_IP, data);
+
+       /* the control succeeded */
+       ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
+       talloc_free(state);
+}
+
+/* talloc destructor for the takeip state: the update for this vnn is
+ * no longer in flight.  Always returns 0. */
+static int ctdb_takeip_destructor(struct ctdb_do_takeip_state *state)
+{
+       struct ctdb_vnn *vnn = state->vnn;
+
+       vnn->update_in_flight = false;
+       return 0;
+}
+
+/*
+  take over an ip address
+
+  assigns an interface to the vnn and starts the CTDB_EVENT_TAKE_IP
+  event; the control is answered from ctdb_do_takeip_callback().
+  Returns 0 when the event was started and -1 when an update is already
+  in flight, no interface is usable or the event could not be started.
+ */
+static int32_t ctdb_do_takeip(struct ctdb_context *ctdb,
+                             struct ctdb_req_control *c,
+                             struct ctdb_vnn *vnn)
+{
+       int ret;
+       struct ctdb_do_takeip_state *state;
+
+       if (vnn->update_in_flight) {
+               DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u rejected "
+                                   "update for this IP already in flight\n",
+                                   ctdb_addr_to_str(&vnn->public_address),
+                                   vnn->public_netmask_bits));
+               return -1;
+       }
+
+       ret = ctdb_vnn_assign_iface(ctdb, vnn);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Takeover of IP %s/%u failed to "
+                                "assign a usable interface\n",
+                                ctdb_addr_to_str(&vnn->public_address),
+                                vnn->public_netmask_bits));
+               return -1;
+       }
+
+       state = talloc(vnn, struct ctdb_do_takeip_state);
+       CTDB_NO_MEMORY(ctdb, state);
+
+       /*
+        * Do not take ownership of the request yet.  If the event cannot
+        * be started we return -1 without ever replying through state,
+        * and a request already reparented onto ctdb would then never be
+        * freed.
+        */
+       state->c = NULL;
+       state->vnn   = vnn;
+
+       vnn->update_in_flight = true;
+       talloc_set_destructor(state, ctdb_takeip_destructor);
+
+       DEBUG(DEBUG_NOTICE,("Takeover of IP %s/%u on interface %s\n",
+                           ctdb_addr_to_str(&vnn->public_address),
+                           vnn->public_netmask_bits,
+                           ctdb_vnn_iface_string(vnn)));
+
+       ret = ctdb_event_script_callback(ctdb,
+                                        state,
+                                        ctdb_do_takeip_callback,
+                                        state,
+                                        false,
+                                        CTDB_EVENT_TAKE_IP,
+                                        "%s %s %u",
+                                        ctdb_vnn_iface_string(vnn),
+                                        ctdb_addr_to_str(&vnn->public_address),
+                                        vnn->public_netmask_bits);
+
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to takeover IP %s on interface %s\n",
+                       ctdb_addr_to_str(&vnn->public_address),
+                       ctdb_vnn_iface_string(vnn)));
+               talloc_free(state);
+               return -1;
+       }
+
+       /*
+        * The event is running: keep the request alive until the
+        * callback replies.
+        * NOTE(review): assumes the callback is never invoked before
+        * ctdb_event_script_callback() returns - confirm.
+        */
+       state->c = talloc_steal(ctdb, c);
+
+       return 0;
+}
+
+/* State handed from ctdb_do_updateip() to ctdb_do_updateip_callback(). */
+struct ctdb_do_updateip_state {
+       /* the control request that gets the reply once the event finishes */
+       struct ctdb_req_control *c;
+       /* interface the address was on before the move; restored if the
+        * event fails */
+       struct ctdb_iface *old;
+       /* the public address being moved */
+       struct ctdb_vnn *vnn;
+};
+
+/*
+  called when updateip event finishes
+
+  on failure the vnn is put back on its old interface and the control is
+  answered with the event status (the node bans itself if the event
+  timed out); on success the address is announced when do_checkpublicip
+  is set and the control is answered with 0
+ */
+static void ctdb_do_updateip_callback(struct ctdb_context *ctdb, int status,
+                                     void *private_data)
+{
+       struct ctdb_do_updateip_state *state =
+               talloc_get_type(private_data, struct ctdb_do_updateip_state);
+
+       if (status != 0) {
+               if (status == -ETIME) {
+                       ctdb_ban_self(ctdb);
+               }
+               DEBUG(DEBUG_ERR,(__location__ " Failed to move IP %s from interface %s to %s\n",
+                       ctdb_addr_to_str(&state->vnn->public_address),
+                       state->old->name,
+                       ctdb_vnn_iface_string(state->vnn)));
+
+               /*
+                * All we can do is reset the old interface
+                * and let the next run fix it
+                */
+               ctdb_vnn_unassign_iface(ctdb, state->vnn);
+               state->vnn->iface = state->old;
+               state->vnn->iface->references++;
+
+               ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL);
+               talloc_free(state);
+               return;
+       }
+
+       /* announcing is only attempted when public ip checking is enabled */
+       if (ctdb->do_checkpublicip &&
+           ctdb_announce_vnn_iface(ctdb, state->vnn) != 0) {
+               ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
+               talloc_free(state);
+               return;
+       }
+
+       /* the control succeeded */
+       ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
+       talloc_free(state);
+}
+
+/* talloc destructor for the updateip state: the update for this vnn is
+ * no longer in flight.  Always returns 0. */
+static int ctdb_updateip_destructor(struct ctdb_do_updateip_state *state)
+{
+       struct ctdb_vnn *vnn = state->vnn;
+
+       vnn->update_in_flight = false;
+       return 0;
+}
+
+/*
+  update (move) an ip address
+
+  re-selects the interface for the vnn.  If the choice is unchanged the
+  control is answered right away; otherwise the CTDB_EVENT_UPDATE_IP
+  event is started and the control is answered from
+  ctdb_do_updateip_callback().  Returns 0 in both of those cases and -1
+  when an update is already in flight, no interface is usable or the
+  event could not be started.
+
+  The vnn must currently have an interface assigned: old is
+  dereferenced unconditionally.
+ */
+static int32_t ctdb_do_updateip(struct ctdb_context *ctdb,
+                               struct ctdb_req_control *c,
+                               struct ctdb_vnn *vnn)
+{
+       int ret;
+       struct ctdb_do_updateip_state *state;
+       struct ctdb_iface *old = vnn->iface;
+       const char *new_name;
+
+       if (vnn->update_in_flight) {
+               DEBUG(DEBUG_NOTICE,("Update of IP %s/%u rejected "
+                                   "update for this IP already in flight\n",
+                                   ctdb_addr_to_str(&vnn->public_address),
+                                   vnn->public_netmask_bits));
+               return -1;
+       }
+
+       ctdb_vnn_unassign_iface(ctdb, vnn);
+       ret = ctdb_vnn_assign_iface(ctdb, vnn);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("update of IP %s/%u failed to "
+                                "assin a usable interface (old iface '%s')\n",
+                                ctdb_addr_to_str(&vnn->public_address),
+                                vnn->public_netmask_bits,
+                                old->name));
+               return -1;
+       }
+
+       new_name = ctdb_vnn_iface_string(vnn);
+       if (old->name != NULL && new_name != NULL && !strcmp(old->name, new_name)) {
+               /* A benign update from one interface onto itself.
+                * no need to run the eventscripts in this case, just return
+                * success.
+                */
+               ctdb_request_control_reply(ctdb, c, NULL, 0, NULL);
+               return 0;
+       }
+
+       state = talloc(vnn, struct ctdb_do_updateip_state);
+       CTDB_NO_MEMORY(ctdb, state);
+
+       /*
+        * Do not take ownership of the request yet.  If the event cannot
+        * be started we return -1 without ever replying through state,
+        * and a request already reparented onto ctdb would then never be
+        * freed.
+        */
+       state->c = NULL;
+       state->old = old;
+       state->vnn = vnn;
+
+       vnn->update_in_flight = true;
+       talloc_set_destructor(state, ctdb_updateip_destructor);
+
+       DEBUG(DEBUG_NOTICE,("Update of IP %s/%u from "
+                           "interface %s to %s\n",
+                           ctdb_addr_to_str(&vnn->public_address),
+                           vnn->public_netmask_bits,
+                           old->name,
+                           new_name));
+
+       ret = ctdb_event_script_callback(ctdb,
+                                        state,
+                                        ctdb_do_updateip_callback,
+                                        state,
+                                        false,
+                                        CTDB_EVENT_UPDATE_IP,
+                                        "%s %s %s %u",
+                                        state->old->name,
+                                        new_name,
+                                        ctdb_addr_to_str(&vnn->public_address),
+                                        vnn->public_netmask_bits);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed update IP %s from interface %s to %s\n",
+                                ctdb_addr_to_str(&vnn->public_address),
+                                old->name, new_name));
+               talloc_free(state);
+               return -1;
+       }
+
+       /*
+        * The event is running: keep the request alive until the
+        * callback replies.
+        * NOTE(review): assumes the callback is never invoked before
+        * ctdb_event_script_callback() returns - confirm.
+        */
+       state->c = talloc_steal(ctdb, c);
+
+       return 0;
+}
+
+/*
+  Find the vnn on ctdb->vnn whose public address matches addr
+  returns NULL if the address is not known as a public address
+ */
+static struct ctdb_vnn *find_public_ip_vnn(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
+{
+       struct ctdb_vnn *cur = ctdb->vnn;
+
+       /* stop at the first matching address or at the end of the list */
+       while (cur != NULL && !ctdb_same_ip(&cur->public_address, addr)) {
+               cur = cur->next;
+       }
+
+       return cur;
+}
+
+/*
+  take over an ip address
+
+  indata carries a struct ctdb_public_ip naming the address and the node
+  that should host it.  Depending on the state of the vnn this starts a
+  takeip, starts an updateip (move to a better interface) or does
+  nothing.  *async_reply is set to true only when an event was started
+  and the reply will be sent later.  Returns 0 on success or when
+  nothing had to be done, -1 on error.
+
+  NOTE(review): indata.dsize is not checked here; presumably the control
+  dispatcher validates it before calling - confirm.
+ */
+int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb,
+                                struct ctdb_req_control *c,
+                                TDB_DATA indata,
+                                bool *async_reply)
+{
+       int ret;
+       struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
+       struct ctdb_vnn *vnn;
+       bool have_ip = false;
+       bool do_updateip = false;
+       bool do_takeip = false;
+       struct ctdb_iface *best_iface = NULL;
+
+       if (pip->pnn != ctdb->pnn) {
+               DEBUG(DEBUG_ERR,(__location__" takeoverip called for an ip '%s' "
+                                "with pnn %d, but we're node %d\n",
+                                ctdb_addr_to_str(&pip->addr),
+                                pip->pnn, ctdb->pnn));
+               return -1;
+       }
+
+       /* update our vnn list */
+       vnn = find_public_ip_vnn(ctdb, &pip->addr);
+       if (vnn == NULL) {
+               DEBUG(DEBUG_INFO,("takeoverip called for an ip '%s' that is not a public address\n",
+                       ctdb_addr_to_str(&pip->addr)));
+               return 0;
+       }
+
+       /* have_ip stays false unless public ip checking is enabled */
+       if (ctdb->do_checkpublicip) {
+               have_ip = ctdb_sys_have_ip(&pip->addr);
+       }
+       best_iface = ctdb_vnn_best_iface(ctdb, vnn);
+       if (best_iface == NULL) {
+               DEBUG(DEBUG_ERR,("takeoverip of IP %s/%u failed to find "
+                                "a usable interface (old %s, have_ip %d)\n",
+                                ctdb_addr_to_str(&vnn->public_address),
+                                vnn->public_netmask_bits,
+                                ctdb_vnn_iface_string(vnn),
+                                have_ip));
+               return -1;
+       }
+
+       if (vnn->iface == NULL && vnn->pnn == -1 && have_ip && best_iface != NULL) {
+               DEBUG(DEBUG_ERR,("Taking over newly created ip\n"));
+               have_ip = false;
+       }
+
+
+       if (vnn->iface == NULL && have_ip) {
+               DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
+                                 "but we have no interface assigned, has someone manually configured it? Ignore for now.\n",
+                                ctdb_addr_to_str(&vnn->public_address)));
+               return 0;
+       }
+
+       if (vnn->pnn != ctdb->pnn && have_ip && vnn->pnn != -1) {
+               DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
+                                 "and we have it on iface[%s], but it was assigned to node %d "
+                                 "and we are node %d, banning ourself\n",
+                                ctdb_addr_to_str(&vnn->public_address),
+                                ctdb_vnn_iface_string(vnn), vnn->pnn, ctdb->pnn));
+               ctdb_ban_self(ctdb);
+               return -1;
+       }
+
+       if (vnn->pnn == -1 && have_ip) {
+               vnn->pnn = ctdb->pnn;
+               DEBUG(DEBUG_CRIT,(__location__ " takeoverip of IP %s is known to the kernel, "
+                                 "and we already have it on iface[%s], update local daemon\n",
+                                ctdb_addr_to_str(&vnn->public_address),
+                                 ctdb_vnn_iface_string(vnn)));
+               return 0;
+       }
+
+       /* an assigned interface is only left when that is worthwhile */
+       if (vnn->iface) {
+               if (vnn->iface != best_iface) {
+                       if (!vnn->iface->link_up) {
+                               do_updateip = true;
+                       } else if (vnn->iface->references > (best_iface->references + 1)) {
+                               /* only move when the rebalance gains something */
+                                       do_updateip = true;
+                       }
+               }
+       }
+
+       /* without the ip a full takeip is needed, which also picks the
+        * interface, so a pending move is folded into it */
+       if (!have_ip) {
+               if (do_updateip) {
+                       ctdb_vnn_unassign_iface(ctdb, vnn);
+                       do_updateip = false;
+               }
+               do_takeip = true;
+       }
+
+       if (do_takeip) {
+               ret = ctdb_do_takeip(ctdb, c, vnn);
+               if (ret != 0) {
+                       return -1;
+               }
+       } else if (do_updateip) {
+               ret = ctdb_do_updateip(ctdb, c, vnn);
+               if (ret != 0) {
+                       return -1;
+               }
+       } else {
+               /*
+                * The interface is up and the kernel knows the ip
+                * => do nothing
+                */
+               DEBUG(DEBUG_INFO,("Redundant takeover of IP %s/%u on interface %s (ip already held)\n",
+                       ctdb_addr_to_str(&pip->addr),
+                       vnn->public_netmask_bits,
+                       ctdb_vnn_iface_string(vnn)));
+               return 0;
+       }
+
+       /* tell ctdb_control.c that we will be replying asynchronously */
+       *async_reply = true;
+
+       return 0;
+}
+
+/*
+  takeover an ip address old v4 style
+
+  copies the old-style request into a zeroed struct ctdb_public_ip
+  allocated on c and passes it to ctdb_control_takeover_ip().  Returns
+  that function's result, or -1 if the request is larger than a struct
+  ctdb_public_ip or the allocation fails.
+ */
+int32_t ctdb_control_takeover_ipv4(struct ctdb_context *ctdb, 
+                               struct ctdb_req_control *c,
+                               TDB_DATA indata, 
+                               bool *async_reply)
+{
+       TDB_DATA data;
+       
+       data.dsize = sizeof(struct ctdb_public_ip);
+
+       /* the copy below must never run past the buffer allocated here */
+       if (indata.dsize > data.dsize) {
+               DEBUG(DEBUG_ERR,(__location__ " Invalid data size %u for old style takeover ip\n",
+                                (unsigned)indata.dsize));
+               return -1;
+       }
+
+       data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
+       CTDB_NO_MEMORY(ctdb, data.dptr);
+       
+       memcpy(data.dptr, indata.dptr, indata.dsize);
+       return ctdb_control_takeover_ip(ctdb, c, data, async_reply);
+}
+
+/*
+  kill any clients that are registered with a IP that is being released
+
+  Walks ctdb->client_ip_list.  For every entry whose address matches
+  'addr' the owning client is looked up by its client_id and, if a
+  non-zero pid is recorded for it, that process is sent SIGKILL.
+ */
+static void release_kill_clients(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
+{
+       struct ctdb_client_ip *ip;
+
+       DEBUG(DEBUG_INFO,("release_kill_clients for ip %s\n",
+               ctdb_addr_to_str(addr)));
+
+       for (ip=ctdb->client_ip_list; ip; ip=ip->next) {
+               ctdb_sock_addr tmp_addr;
+               struct ctdb_client *client;
+
+               tmp_addr = ip->addr;
+               DEBUG(DEBUG_INFO,("checking for client %u with IP %s\n",
+                       ip->client_id,
+                       ctdb_addr_to_str(&ip->addr)));
+
+               if (!ctdb_same_ip(&tmp_addr, addr)) {
+                       continue;
+               }
+
+               client = ctdb_reqid_find(ctdb,
+                                        ip->client_id,
+                                        struct ctdb_client);
+               if (client == NULL) {
+                       /* NOTE(review): assumes the lookup yields NULL for
+                        * an unknown id - there is nothing to kill then */
+                       DEBUG(DEBUG_WARNING,(__location__ " No client found for client_id %u with IP %s\n",
+                               ip->client_id,
+                               ctdb_addr_to_str(&ip->addr)));
+                       continue;
+               }
+
+               DEBUG(DEBUG_INFO,("matched client %u with IP %s and pid %u\n",
+                       ip->client_id,
+                       ctdb_addr_to_str(&ip->addr),
+                       (unsigned)client->pid));
+
+               /* a pid of 0 would signal our own process group */
+               if (client->pid != 0) {
+                       DEBUG(DEBUG_INFO,(__location__ " Killing client pid %u for IP %s on client_id %u\n",
+                               (unsigned)client->pid,
+                               ctdb_addr_to_str(addr),
+                               ip->client_id));
+                       kill(client->pid, SIGKILL);
+               }
+       }
+}
+
+/*
+  called when releaseip event finishes
+
+  'private_data' is the struct takeover_callback_state set up by
+  ctdb_control_release_ip().  Every exit path below replies to the
+  pending control state->c and frees 'state'; freeing 'state' also runs
+  its destructor, which clears vnn->update_in_flight.
+ */
+static void release_ip_callback(struct ctdb_context *ctdb, int status,
+                               void *private_data)
+{
+       struct takeover_callback_state *state =
+               talloc_get_type(private_data, struct takeover_callback_state);
+       TDB_DATA data;
+
+       /* the event script timed out */
+       if (status == -ETIME) {
+               ctdb_ban_self(ctdb);
+       }
+
+       if (ctdb->do_checkpublicip && ctdb_sys_have_ip(state->addr)) {
+               DEBUG(DEBUG_ERR, ("IP %s still hosted during release IP callback, failing\n",
+                                 ctdb_addr_to_str(state->addr)));
+               ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
+               talloc_free(state);
+               return;
+       }
+
+       /* send a message to all clients of this node telling them
+          that the cluster has been reconfigured and they should
+          release any sockets on this IP */
+       data.dptr = (uint8_t *)talloc_strdup(state, ctdb_addr_to_str(state->addr));
+       if (data.dptr == NULL) {
+               /* Do not just return here: the control would never be
+                * answered and 'state' would stay allocated, leaving
+                * vnn->update_in_flight set for good. */
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory building RELEASE_IP message for %s\n",
+                                ctdb_addr_to_str(state->addr)));
+               ctdb_set_error(ctdb, "Out of memory in release_ip_callback\n");
+               ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL);
+               talloc_free(state);
+               return;
+       }
+       data.dsize = strlen((char *)data.dptr)+1;
+
+       DEBUG(DEBUG_INFO,(__location__ " sending RELEASE_IP for '%s'\n", data.dptr));
+
+       ctdb_daemon_send_message(ctdb, ctdb->pnn, CTDB_SRVID_RELEASE_IP, data);
+
+       /* kill clients that have registered with this IP */
+       release_kill_clients(ctdb, state->addr);
+
+       ctdb_vnn_unassign_iface(ctdb, state->vnn);
+
+       /* the control succeeded */
+       ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL);
+       talloc_free(state);
+}
+
+/*
+  destructor for the release-ip callback state
+
+  Installed with talloc_set_destructor() by ctdb_control_release_ip().
+  Clears the update_in_flight marker on the VNN the state refers to,
+  so a later update for that address is accepted again.  Always
+  returns 0.
+ */
+static int ctdb_releaseip_destructor(struct takeover_callback_state *state)
+{
+       state->vnn->update_in_flight = false;
+       return 0;
+}
+
+/*
+  release an ip address
+
+  'indata' carries a struct ctdb_public_ip naming the address and the
+  node that is supposed to host it.  The releaseip event script is
+  started and the control is answered later from release_ip_callback();
+  in that case *async_reply is set to true and 0 is returned.
+
+  Returns 0 without setting *async_reply when there is nothing to do
+  (address not public, not held, or its interface cannot be found) and
+  -1 on failure or when another update for the address is in flight.
+  In those cases the caller still owns 'c'.
+ */
+int32_t ctdb_control_release_ip(struct ctdb_context *ctdb,
+                               struct ctdb_req_control *c,
+                               TDB_DATA indata,
+                               bool *async_reply)
+{
+       int ret;
+       struct takeover_callback_state *state;
+       struct ctdb_public_ip *pip = (struct ctdb_public_ip *)indata.dptr;
+       struct ctdb_vnn *vnn;
+       char *iface;
+
+       /* update our vnn list */
+       vnn = find_public_ip_vnn(ctdb, &pip->addr);
+       if (vnn == NULL) {
+               DEBUG(DEBUG_INFO,("releaseip called for an ip '%s' that is not a public address\n",
+                       ctdb_addr_to_str(&pip->addr)));
+               return 0;
+       }
+       vnn->pnn = pip->pnn;
+
+       /* stop any previous arps */
+       talloc_free(vnn->takeover_ctx);
+       vnn->takeover_ctx = NULL;
+
+       /* Some ctdb tool commands (e.g. moveip, rebalanceip) send
+        * lazy multicast to drop an IP from any node that isn't the
+        * intended new node.  The following makes ctdbd ignore
+        * a release for any address it doesn't host.
+        */
+       if (ctdb->do_checkpublicip) {
+               if (!ctdb_sys_have_ip(&pip->addr)) {
+                       DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u on interface %s (ip not held)\n",
+                               ctdb_addr_to_str(&pip->addr),
+                               vnn->public_netmask_bits,
+                               ctdb_vnn_iface_string(vnn)));
+                       ctdb_vnn_unassign_iface(ctdb, vnn);
+                       return 0;
+               }
+       } else {
+               if (vnn->iface == NULL) {
+                       DEBUG(DEBUG_DEBUG,("Redundant release of IP %s/%u (ip not held)\n",
+                                          ctdb_addr_to_str(&pip->addr),
+                                          vnn->public_netmask_bits));
+                       return 0;
+               }
+       }
+
+       /* There is a potential race between take_ip and us because we
+        * update the VNN via a callback that runs when the
+        * eventscripts have been run.  Avoid the race by allowing one
+        * update to be in flight at a time.
+        */
+       if (vnn->update_in_flight) {
+               DEBUG(DEBUG_NOTICE,("Release of IP %s/%u rejected "
+                                   "update for this IP already in flight\n",
+                                   ctdb_addr_to_str(&vnn->public_address),
+                                   vnn->public_netmask_bits));
+               return -1;
+       }
+
+       /* 'iface' is heap allocated in both branches and must be
+        * released with free() on every path below */
+       if (ctdb->do_checkpublicip) {
+               iface = ctdb_sys_find_ifname(&pip->addr);
+               if (iface == NULL) {
+                       DEBUG(DEBUG_ERR, ("Could not find which interface the ip address is hosted on. can not release it\n"));
+                       return 0;
+               }
+               if (vnn->iface == NULL) {
+                       DEBUG(DEBUG_WARNING,
+                             ("Public IP %s is hosted on interface %s but we have no VNN\n",
+                              ctdb_addr_to_str(&pip->addr),
+                              iface));
+               } else if (strcmp(iface, ctdb_vnn_iface_string(vnn)) != 0) {
+                       DEBUG(DEBUG_WARNING,
+                             ("Public IP %s is hosted on inteterface %s but VNN says %s\n",
+                              ctdb_addr_to_str(&pip->addr),
+                              iface,
+                              ctdb_vnn_iface_string(vnn)));
+                       /* Should we fix vnn->iface?  If we do, what
+                        * happens to reference counts?
+                        */
+               }
+       } else {
+               iface = strdup(ctdb_vnn_iface_string(vnn));
+               CTDB_NO_MEMORY(ctdb, iface);
+       }
+
+       DEBUG(DEBUG_NOTICE,("Release of IP %s/%u on interface %s  node:%d\n",
+               ctdb_addr_to_str(&pip->addr),
+               vnn->public_netmask_bits,
+               iface,
+               pip->pnn));
+
+       state = talloc(ctdb, struct takeover_callback_state);
+       if (state == NULL) {
+               free(iface);
+               CTDB_NO_MEMORY(ctdb, state);
+       }
+
+       /* 'c' is only taken over once the event script is running.
+        * On the failure paths below this function returns -1 without
+        * setting *async_reply, so the caller still needs 'c' and
+        * freeing 'state' must not take 'c' with it. */
+       state->c = NULL;
+       state->addr = talloc(state, ctdb_sock_addr);
+       if (state->addr == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory releasing IP %s\n",
+                       ctdb_addr_to_str(&pip->addr)));
+               free(iface);
+               talloc_free(state);
+               return -1;
+       }
+       *state->addr = pip->addr;
+       state->vnn   = vnn;
+
+       /* cleared again by the destructor when 'state' is freed */
+       vnn->update_in_flight = true;
+       talloc_set_destructor(state, ctdb_releaseip_destructor);
+
+       ret = ctdb_event_script_callback(ctdb,
+                                        state, release_ip_callback, state,
+                                        false,
+                                        CTDB_EVENT_RELEASE_IP,
+                                        "%s %s %u",
+                                        iface,
+                                        ctdb_addr_to_str(&pip->addr),
+                                        vnn->public_netmask_bits);
+       free(iface);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to release IP %s on interface %s\n",
+                       ctdb_addr_to_str(&pip->addr),
+                       ctdb_vnn_iface_string(vnn)));
+               talloc_free(state);
+               return -1;
+       }
+
+       /* NOTE(review): assumes release_ip_callback() is only invoked
+        * later from the event loop, never from inside
+        * ctdb_event_script_callback() - confirm */
+       state->c = talloc_steal(state, c);
+
+       /* tell the control that we will reply asynchronously */
+       *async_reply = true;
+       return 0;
+}
+
+/*
+  release an ip address old v4 style
+
+  The old-style payload is copied into a zero-initialised
+  struct ctdb_public_ip allocated on the control 'c' and then handed to
+  ctdb_control_release_ip(), whose result is returned unchanged.
+  A payload larger than struct ctdb_public_ip is rejected with -1 so
+  that the copy can never run past the end of the new buffer.
+ */
+int32_t ctdb_control_release_ipv4(struct ctdb_context *ctdb,
+                               struct ctdb_req_control *c,
+                               TDB_DATA indata,
+                               bool *async_reply)
+{
+       TDB_DATA data;
+
+       /* never copy more than the destination buffer can hold */
+       if (indata.dsize > sizeof(struct ctdb_public_ip)) {
+               DEBUG(DEBUG_ERR,(__location__ " Oversized old-style release request (%u bytes)\n",
+                       (unsigned)indata.dsize));
+               return -1;
+       }
+
+       data.dsize = sizeof(struct ctdb_public_ip);
+       data.dptr  = (uint8_t *)talloc_zero(c, struct ctdb_public_ip);
+       CTDB_NO_MEMORY(ctdb, data.dptr);
+
+       /* the tail of the new-style structure stays zeroed */
+       memcpy(data.dptr, indata.dptr, indata.dsize);
+       return ctdb_control_release_ip(ctdb, c, data, async_reply);
+}
+
+
+/*
+  register one public address
+
+  'ifaces' is a comma separated list of interface names the address
+  may be hosted on.  Every interface must exist and the address must
+  not already be in ctdb->vnn.  When 'check_address' is set and the
+  address is already configured on this host, the new entry's pnn is
+  set to ctdb->pnn, otherwise it starts out as -1.
+
+  Returns 0 on success (a new struct ctdb_vnn has been added to
+  ctdb->vnn) and -1 on failure.
+ */
+static int ctdb_add_public_address(struct ctdb_context *ctdb,
+                                  ctdb_sock_addr *addr,
+                                  unsigned mask, const char *ifaces,
+                                  bool check_address)
+{
+       struct ctdb_vnn      *vnn;
+       uint32_t num = 0;
+       char *tmp;
+       const char *iface;
+       int i;
+       int ret;
+
+       /* strtok() modifies its input, so tokenise a private copy.
+        * With a NULL copy strtok() would silently continue some
+        * earlier scan instead, so check the allocation. */
+       tmp = strdup(ifaces);
+       CTDB_NO_MEMORY(ctdb, tmp);
+       for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
+               if (!ctdb_sys_check_iface_exists(iface)) {
+                       DEBUG(DEBUG_CRIT,("Interface %s does not exist. Can not add public-address : %s\n", iface, ctdb_addr_to_str(addr)));
+                       free(tmp);
+                       return -1;
+               }
+       }
+       free(tmp);
+
+       /* Verify that we dont have an entry for this ip yet */
+       for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
+               if (ctdb_same_sockaddr(addr, &vnn->public_address)) {
+                       DEBUG(DEBUG_CRIT,("Same ip '%s' specified multiple times in the public address list \n",
+                               ctdb_addr_to_str(addr)));
+                       return -1;
+               }
+       }
+
+       /* create a new vnn structure for this ip address */
+       vnn = talloc_zero(ctdb, struct ctdb_vnn);
+       CTDB_NO_MEMORY_FATAL(ctdb, vnn);
+       /* room for the NULL terminator even if the list is empty */
+       vnn->ifaces = talloc_array(vnn, const char *, num + 2);
+       CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
+       tmp = talloc_strdup(vnn, ifaces);
+       CTDB_NO_MEMORY_FATAL(ctdb, tmp);
+       for (iface = strtok(tmp, ","); iface; iface = strtok(NULL, ",")) {
+               /* keep one slot for this name and one for the terminator */
+               vnn->ifaces = talloc_realloc(vnn, vnn->ifaces, const char *, num + 2);
+               CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces);
+               vnn->ifaces[num] = talloc_strdup(vnn, iface);
+               CTDB_NO_MEMORY_FATAL(ctdb, vnn->ifaces[num]);
+               num++;
+       }
+       talloc_free(tmp);
+       vnn->ifaces[num] = NULL;
+       vnn->public_address      = *addr;
+       vnn->public_netmask_bits = mask;
+       vnn->pnn                 = -1;
+       if (check_address) {
+               if (ctdb_sys_have_ip(addr)) {
+                       DEBUG(DEBUG_ERR,("We are already hosting public address '%s'. setting PNN to ourself:%d\n", ctdb_addr_to_str(addr), ctdb->pnn));
+                       vnn->pnn = ctdb->pnn;
+               }
+       }
+
+       for (i=0; vnn->ifaces[i]; i++) {
+               ret = ctdb_add_local_iface(ctdb, vnn->ifaces[i]);
+               if (ret != 0) {
+                       DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
+                                          "for public_address[%s]\n",
+                                          vnn->ifaces[i], ctdb_addr_to_str(addr)));
+                       talloc_free(vnn);
+                       return -1;
+               }
+       }
+
+       DLIST_ADD(ctdb->vnn, vnn);
+
+       return 0;
+}
+
+/*
+  timed event: complain about configured interfaces that have vanished
+
+  Logs a critical message for every interface listed on any public
+  address that no longer exists, then schedules itself again in 30
+  seconds on ctdb->check_public_ifaces_ctx.
+ */
+static void ctdb_check_interfaces_event(struct event_context *ev, struct timed_event *te,
+                                 struct timeval t, void *private_data)
+{
+       struct ctdb_context *ctdb;
+       struct ctdb_vnn *vnn;
+
+       ctdb = talloc_get_type(private_data, struct ctdb_context);
+
+       vnn = ctdb->vnn;
+       while (vnn != NULL) {
+               const char **name;
+
+               /* the interface list is NULL terminated */
+               for (name = vnn->ifaces; *name != NULL; name++) {
+                       if (ctdb_sys_check_iface_exists(*name)) {
+                               continue;
+                       }
+                       DEBUG(DEBUG_CRIT,("Interface %s does not exist but is used by public ip %s\n",
+                               *name,
+                               ctdb_addr_to_str(&vnn->public_address)));
+               }
+
+               vnn = vnn->next;
+       }
+
+       /* run again in 30 seconds */
+       event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx,
+               timeval_current_ofs(30, 0),
+               ctdb_check_interfaces_event, ctdb);
+}
+
+
+/*
+  (re)start the periodic check of the public interfaces
+
+  Any context left over from an earlier start is freed, a fresh one is
+  allocated and the first check is scheduled 30 seconds from now.
+  Always returns 0; failure to allocate the context goes to
+  ctdb_fatal().
+ */
+int ctdb_start_monitoring_interfaces(struct ctdb_context *ctdb)
+{
+       /* throw away whatever an earlier call set up */
+       if (ctdb->check_public_ifaces_ctx) {
+               talloc_free(ctdb->check_public_ifaces_ctx);
+               ctdb->check_public_ifaces_ctx = NULL;
+       }
+
+       ctdb->check_public_ifaces_ctx = talloc_new(ctdb);
+       if (!ctdb->check_public_ifaces_ctx) {
+               ctdb_fatal(ctdb, "failed to allocate context for checking interfaces");
+       }
+
+       /* the event is allocated on the fresh context */
+       event_add_timed(ctdb->ev, ctdb->check_public_ifaces_ctx,
+               timeval_current_ofs(30, 0),
+               ctdb_check_interfaces_event, ctdb);
+
+       return 0;
+}
+
+
+/*
+  setup the public address lists from a file
+
+  Reads ctdb->public_addresses_file line by line.  Each line holds an
+  address (parsed together with its mask by parse_ip_mask()) and,
+  optionally, a second whitespace separated token naming the
+  interface(s); without it ctdb->default_public_interface is used.
+  Leading blanks are skipped, lines starting with '#' and empty lines
+  are ignored.
+
+  Returns 0 on success and -1 on the first line that cannot be
+  processed.  Addresses added by earlier lines are not removed again
+  in that case.
+*/
+int ctdb_set_public_addresses(struct ctdb_context *ctdb, bool check_addresses)
+{
+       char **lines;
+       int nlines;
+       int i;
+
+       lines = file_lines_load(ctdb->public_addresses_file, &nlines, ctdb);
+       if (lines == NULL) {
+               ctdb_set_error(ctdb, "Failed to load public address list '%s'\n", ctdb->public_addresses_file);
+               return -1;
+       }
+       /* ignore empty lines at the end of the file */
+       while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
+               nlines--;
+       }
+
+       for (i=0;i<nlines;i++) {
+               unsigned mask;
+               ctdb_sock_addr addr;
+               const char *addrstr;
+               const char *ifaces;
+               char *tok, *line;
+
+               line = lines[i];
+               /* skip leading blanks */
+               while ((*line == ' ') || (*line == '\t')) {
+                       line++;
+               }
+               /* comment line */
+               if (*line == '#') {
+                       continue;
+               }
+               if (strcmp(line, "") == 0) {
+                       continue;
+               }
+               /* strtok() splits the line in place: the first token is
+                * the address, the second (if any) the interface list */
+               tok = strtok(line, " \t");
+               addrstr = tok;
+               tok = strtok(NULL, " \t");
+               if (tok == NULL) {
+                       if (NULL == ctdb->default_public_interface) {
+                               DEBUG(DEBUG_CRIT,("No default public interface and no interface specified at line %u of public address list\n",
+                                        i+1));
+                               talloc_free(lines);
+                               return -1;
+                       }
+                       ifaces = ctdb->default_public_interface;
+               } else {
+                       ifaces = tok;
+               }
+
+               if (!addrstr || !parse_ip_mask(addrstr, ifaces, &addr, &mask)) {
+                       DEBUG(DEBUG_CRIT,("Badly formed line %u in public address list\n", i+1));
+                       talloc_free(lines);
+                       return -1;
+               }
+               /* any non-zero result means the address was rejected */
+               if (ctdb_add_public_address(ctdb, &addr, mask, ifaces, check_addresses)) {
+                       DEBUG(DEBUG_CRIT,("Failed to add line %u to the public address list\n", i+1));
+                       talloc_free(lines);
+                       return -1;
+               }
+       }
+
+
+       talloc_free(lines);
+       return 0;
+}
+
+/*
+  set up the single public ip given on the command line
+
+  Builds a struct ctdb_vnn for address 'ip' on interface 'iface',
+  registers the interface, marks it as having link, assigns it to the
+  vnn and stores the result in ctdb->single_ip_vnn.
+
+  Returns 0 on success and -1 on failure.  The partially built vnn is
+  freed on every failure path after its allocation.
+ */
+int ctdb_set_single_public_ip(struct ctdb_context *ctdb,
+                             const char *iface,
+                             const char *ip)
+{
+       struct ctdb_vnn *svnn;
+       struct ctdb_iface *cur = NULL;
+       bool ok;
+       int ret;
+
+       svnn = talloc_zero(ctdb, struct ctdb_vnn);
+       CTDB_NO_MEMORY(ctdb, svnn);
+
+       /* one interface plus the NULL terminator */
+       svnn->ifaces = talloc_array(svnn, const char *, 2);
+       if (svnn->ifaces == NULL) {
+               goto nomem;
+       }
+       svnn->ifaces[0] = talloc_strdup(svnn->ifaces, iface);
+       if (svnn->ifaces[0] == NULL) {
+               goto nomem;
+       }
+       svnn->ifaces[1] = NULL;
+
+       ok = parse_ip(ip, iface, 0, &svnn->public_address);
+       if (!ok) {
+               goto fail;
+       }
+
+       ret = ctdb_add_local_iface(ctdb, svnn->ifaces[0]);
+       if (ret != 0) {
+               DEBUG(DEBUG_CRIT, (__location__ " failed to add iface[%s] "
+                                  "for single_ip[%s]\n",
+                                  svnn->ifaces[0],
+                                  ctdb_addr_to_str(&svnn->public_address)));
+               goto fail;
+       }
+
+       /* assume the single public ip interface is initially "good" */
+       cur = ctdb_find_iface(ctdb, iface);
+       if (cur == NULL) {
+               DEBUG(DEBUG_CRIT,("Can not find public interface %s used by --single-public-ip", iface));
+               /* do not leak svnn here either */
+               goto fail;
+       }
+       cur->link_up = true;
+
+       ret = ctdb_vnn_assign_iface(ctdb, svnn);
+       if (ret != 0) {
+               goto fail;
+       }
+
+       ctdb->single_ip_vnn = svnn;
+       return 0;
+
+nomem:
+       DEBUG(DEBUG_ERR,(__location__ " Out of memory setting up single public ip %s\n", ip));
+fail:
+       talloc_free(svnn);
+       return -1;
+}
+
+/* One public address in the singly linked list that the allocation
+ * helpers in this file walk (see create_merged_ip_list()).
+ */
+struct ctdb_public_ip_list {
+       struct ctdb_public_ip_list *next;       /* next entry, NULL at the end */
+       uint32_t pnn;                           /* node the address is assigned to; compared against -1 for "unassigned" */
+       ctdb_sock_addr addr;                    /* the public address itself */
+};
+
+/* Count how many entries of the list 'ips' are currently assigned to
+   node 'pnn'.  'ctdb' is not used.
+*/
+static int node_ip_coverage(struct ctdb_context *ctdb,
+       int32_t pnn,
+       struct ctdb_public_ip_list *ips)
+{
+       struct ctdb_public_ip_list *cur;
+       int hosted = 0;
+
+       for (cur = ips; cur != NULL; cur = cur->next) {
+               if (cur->pnn != pnn) {
+                       continue;
+               }
+               hosted++;
+       }
+
+       return hosted;
+}
+
+
+/* Decide whether node 'pnn' may host address 'ip'.
+ *
+ * That is the case when NOIPHOST is not set in 'ipflags' and the
+ * address appears in the node's list of available public IPs.
+*/
+static bool can_node_host_ip(struct ctdb_context *ctdb, int32_t pnn,
+                            struct ctdb_ipflags ipflags,
+                            struct ctdb_public_ip_list *ip)
+{
+       struct ctdb_all_public_ips *avail;
+       int idx;
+
+       if (ipflags.noiphost) {
+               return false;
+       }
+
+       avail = ctdb->nodes[pnn]->available_public_ips;
+       if (avail == NULL) {
+               /* the node has no list of available addresses */
+               return false;
+       }
+
+       idx = 0;
+       while (idx < avail->num) {
+               if (ctdb_same_ip(&ip->addr, &avail->ips[idx].addr)) {
+                       /* found it, the node can serve this address */
+                       return true;
+               }
+               idx++;
+       }
+
+       return false;
+}
+
+/* Decide whether node 'pnn' may take over address 'ip': NOIPTAKEOVER
+ * must not be set in 'ipflags' and the node must be able to host the
+ * address at all (see can_node_host_ip()).
+ */
+static bool can_node_takeover_ip(struct ctdb_context *ctdb, int32_t pnn,
+                                struct ctdb_ipflags ipflags,
+                                struct ctdb_public_ip_list *ip)
+{
+       return !ipflags.noiptakeover &&
+               can_node_host_ip(ctdb, pnn, ipflags, ip);
+}
+
+/* Pick a node to take over address 'ip'.
+
+   Among all nodes that are allowed to take the address, the one that
+   currently serves the fewest entries of 'all_ips' wins (the lowest
+   node number on a tie), so that addresses get spread out evenly.
+   On success ip->pnn is set and 0 is returned; if no node qualifies
+   ip->pnn is left alone and -1 is returned.
+*/
+static int find_takeover_node(struct ctdb_context *ctdb,
+               struct ctdb_ipflags *ipflags,
+               struct ctdb_public_ip_list *ip,
+               struct ctdb_public_ip_list *all_ips)
+{
+       int best = -1;
+       int best_count = 0;
+       int node, count, numnodes;
+
+       /* ipflags holds one entry per node */
+       numnodes = talloc_array_length(ipflags);
+
+       for (node = 0; node < numnodes; node++) {
+               /* skip nodes that can not serve this ip */
+               if (!can_node_takeover_ip(ctdb, node, ipflags[node], ip)) {
+                       continue;
+               }
+
+               count = node_ip_coverage(ctdb, node, all_ips);
+
+               /* first candidate, or strictly less loaded than the best so far */
+               if (best == -1 || count < best_count) {
+                       best = node;
+                       best_count = count;
+               }
+       }
+
+       if (best == -1) {
+               DEBUG(DEBUG_WARNING,(__location__ " Could not find node to take over public address '%s'\n",
+                       ctdb_addr_to_str(&ip->addr)));
+
+               return -1;
+       }
+
+       ip->pnn = best;
+       return 0;
+}
+
+/* number of 32 bit words in a key built by ip_key() */
+#define IP_KEYLEN      4
+
+/* Turn an address into an array of IP_KEYLEN 32 bit words.
+ *
+ * An IPv4 address ends up in the last word with the others zero, an
+ * IPv6 address fills all four.  For any other family an error is
+ * logged and the all-zero key is returned.
+ *
+ * The result points to a static buffer that is overwritten by the
+ * next call.
+ */
+static uint32_t *ip_key(ctdb_sock_addr *ip)
+{
+       static uint32_t key[IP_KEYLEN];
+
+       bzero(key, sizeof(key));
+
+       if (ip->sa.sa_family == AF_INET) {
+               key[3]  = htonl(ip->ip.sin_addr.s_addr);
+       } else if (ip->sa.sa_family == AF_INET6) {
+               uint32_t *words = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
+               int w;
+
+               for (w = 0; w < 4; w++) {
+                       key[w] = htonl(words[w]);
+               }
+       } else {
+               DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
+       }
+
+       return key;
+}
+
+/* Insert callback used while building the IP tree.
+ *
+ * 'parm' is the entry being inserted and 'data' the entry already
+ * stored under the same key, or NULL.  If the new entry has no node
+ * assigned (pnn == -1) it inherits the node of the old one.  The new
+ * entry is always returned.
+ */
+static void *add_ip_callback(void *parm, void *data)
+{
+       struct ctdb_public_ip_list *incoming = parm;
+       struct ctdb_public_ip_list *existing = data;
+
+       if (existing != NULL && incoming->pnn == -1) {
+               incoming->pnn = existing->pnn;
+       }
+
+       return parm;
+}
+
+/* Traverse callback: push the tree entry 'data' onto the front of the
+ * list whose head pointer is passed in 'param'.  Always returns 0.
+ */
+static int getips_count_callback(void *param, void *data)
+{
+       struct ctdb_public_ip_list **head = param;
+       struct ctdb_public_ip_list *entry = data;
+
+       entry->next = *head;
+       *head = entry;
+
+       return 0;
+}
+
+/* Build one list of public addresses from what all nodes report.
+ *
+ * ctdb->ip_tree is thrown away and rebuilt.  For every node that is
+ * not flagged deleted, each of its known public IPs is inserted into
+ * the tree, keyed by address.  A node's claim about who hosts an
+ * address is only used when it names the node itself; otherwise the
+ * entry starts out unassigned (-1), subject to add_ip_callback().
+ *
+ * Returns the head of the list linked together from the tree entries.
+ * The entries are allocated on ctdb->ip_tree.
+ */
+static struct ctdb_public_ip_list *
+create_merged_ip_list(struct ctdb_context *ctdb)
+{
+       struct ctdb_public_ip_list *merged = NULL;
+       int node, k;
+
+       /* always start from an empty tree */
+       if (ctdb->ip_tree != NULL) {
+               talloc_free(ctdb->ip_tree);
+               ctdb->ip_tree = NULL;
+       }
+       ctdb->ip_tree = trbt_create(ctdb, 0);
+
+       for (node = 0; node < ctdb->num_nodes; node++) {
+               struct ctdb_all_public_ips *known;
+
+               if (ctdb->nodes[node]->flags & NODE_FLAGS_DELETED) {
+                       continue;
+               }
+
+               known = ctdb->nodes[node]->known_public_ips;
+               if (known == NULL) {
+                       /* this node has no public ips */
+                       continue;
+               }
+
+               for (k = 0; k < known->num; k++) {
+                       struct ctdb_public_ip_list *tmp_ip;
+
+                       tmp_ip = talloc_zero(ctdb->ip_tree, struct ctdb_public_ip_list);
+                       CTDB_NO_MEMORY_NULL(ctdb, tmp_ip);
+
+                       /* What a node says about addresses hosted
+                        * elsewhere may be stale, so ignore it */
+                       if (known->ips[k].pnn != ctdb->nodes[node]->pnn) {
+                               tmp_ip->pnn = -1;
+                       } else {
+                               tmp_ip->pnn = known->ips[k].pnn;
+                       }
+                       tmp_ip->addr = known->ips[k].addr;
+                       tmp_ip->next = NULL;
+
+                       trbt_insertarray32_callback(ctdb->ip_tree,
+                               IP_KEYLEN, ip_key(&known->ips[k].addr),
+                               add_ip_callback,
+                               tmp_ip);
+               }
+       }
+
+       /* chain all tree entries into one list */
+       trbt_traversearray32(ctdb->ip_tree, IP_KEYLEN, getips_count_callback, &merged);
+
+       return merged;
+}
+
+/*
+ * This is the length of the longest common prefix between the IPs.
+ * It is calculated by XOR-ing the 2 IPs together and counting the
+ * number of leading zeroes.  The implementation means that all
+ * addresses end up being 128 bits long.
+ *
+ * FIXME? Should we consider IPv4 and IPv6 separately given that the
+ * 12 bytes of 0 prefix padding will hurt the algorithm if there are
+ * lots of nodes and IP addresses?
+ *
+ * NOTE(review): the loop does not stop at the first word that
+ * differs; the leading zeroes (or the full 32 bits) of the following
+ * words are added as well.  Left as is because changing it would
+ * change every value computed from this function - confirm whether
+ * that is intended.
+ */
+static uint32_t ip_distance(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
+{
+       uint32_t ip1_k[IP_KEYLEN];
+       uint32_t *t;
+       int i;
+       uint32_t x;
+
+       uint32_t distance = 0;
+
+       /* ip_key() hands out a static buffer, so the first key has to
+        * be copied before the second one is requested */
+       memcpy(ip1_k, ip_key(ip1), sizeof(ip1_k));
+       t = ip_key(ip2);
+       for (i=0; i<IP_KEYLEN; i++) {
+               x = ip1_k[i] ^ t[i];
+               if (x == 0) {
+                       distance += 32;
+               } else {
+                       /* Count number of leading zeroes.
+                        * FIXME? This could be optimised...
+                        *
+                        * The mask is built from an unsigned 1:
+                        * shifting a signed 1 into the sign bit is
+                        * undefined behaviour.
+                        */
+                       while ((x & ((uint32_t)1 << 31)) == 0) {
+                               x <<= 1;
+                               distance += 1;
+                       }
+               }
+       }
+
+       return distance;
+}
+
+/* Sum of the squared IP distances between 'ip' and every entry of
+   'ips' that is assigned to node 'pnn'.  'ips' is generally (a tail
+   of) the all_ips list used in the main part of the algorithm.
+ */
+static uint32_t ip_distance_2_sum(ctdb_sock_addr *ip,
+                                 struct ctdb_public_ip_list *ips,
+                                 int pnn)
+{
+       uint32_t total = 0;
+       struct ctdb_public_ip_list *cur;
+
+       for (cur = ips; cur != NULL; cur = cur->next) {
+               uint32_t dist;
+
+               /* Only entries on the node in question count.  The
+                * address itself is skipped as well: the distance
+                * between an address and itself is never calculated,
+                * which lets callers work out the effect of removing
+                * an address from a node by simply summing its
+                * distances to the existing ones.  'ip' is assumed to
+                * point into the same list, so comparing pointers
+                * replaces a more expensive address comparison. */
+               if (cur->pnn != pnn || &(cur->addr) == ip) {
+                       continue;
+               }
+
+               dist = ip_distance(ip, &(cur->addr));
+               total += dist * dist;  /* Cheaper than pulling in math.h :-) */
+       }
+
+       return total;
+}
+
+/* LCP2 imbalance metric of node 'pnn': for every address assigned to
+   the node, add up its squared distances to the addresses of the same
+   node that follow it in 'all_ips'.
+ */
+static uint32_t lcp2_imbalance(struct ctdb_public_ip_list * all_ips, int pnn)
+{
+       uint32_t total = 0;
+       struct ctdb_public_ip_list *cur = all_ips;
+
+       while (cur != NULL) {
+               if (cur->pnn == pnn) {
+                       /* only the remainder of the list is passed
+                          on, not the whole of all_ips */
+                       total += ip_distance_2_sum(&(cur->addr), cur->next, pnn);
+               }
+               cur = cur->next;
+       }
+
+       return total;
+}
+
+/* Give every address in 'all_ips' that has no node yet (pnn == -1)
+ * to the node find_takeover_node() picks for it.  A warning is logged
+ * for addresses no node can take; they stay unassigned.
+ */
+static void basic_allocate_unassigned(struct ctdb_context *ctdb,
+                                     struct ctdb_ipflags *ipflags,
+                                     struct ctdb_public_ip_list *all_ips)
+{
+       struct ctdb_public_ip_list *cur;
+
+       for (cur = all_ips; cur != NULL; cur = cur->next) {
+               /* already covered by some node */
+               if (cur->pnn != -1) {
+                       continue;
+               }
+
+               if (find_takeover_node(ctdb, ipflags, cur, all_ips) != 0) {
+                       DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
+                               ctdb_addr_to_str(&cur->addr)));
+               }
+       }
+}
+
+/* Basic non-deterministic rebalancing algorithm.
+ *
+ * ipflags: talloc array with one entry per node, indexed by node number
+ * all_ips: list of all public addresses with their current assignment
+ * num_ips: used only to bound the number of reassignments (num_ips + 5)
+ *
+ * Whenever, for some assigned address, the nodes able to take it
+ * differ by two or more in the number of addresses they serve, one
+ * address of the most loaded node is handed to find_takeover_node()
+ * and the whole scan starts over.
+ */
+static void basic_failback(struct ctdb_context *ctdb,
+                          struct ctdb_ipflags *ipflags,
+                          struct ctdb_public_ip_list *all_ips,
+                          int num_ips)
+{
+       int i, numnodes;
+       int maxnode, maxnum, minnode, minnum, num, retries;
+       struct ctdb_public_ip_list *tmp_ip;
+
+       numnodes = talloc_array_length(ipflags);
+       retries = 0;
+
+       /* the scan is restarted from here after every reassignment */
+try_again:
+       maxnum=0;
+       minnum=0;
+
+       /* for each ip address, loop over all nodes that can serve
+          this ip and make sure that the difference between the node
+          serving the most and the node serving the least ip's are
+          not greater than 1.
+       */
+       for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+               /* unassigned addresses are not considered here */
+               if (tmp_ip->pnn == -1) {
+                       continue;
+               }
+
+               /* Get the highest and lowest number of ips's served by any
+                  valid node which can serve this ip.
+               */
+               maxnode = -1;
+               minnode = -1;
+               for (i=0; i<numnodes; i++) {
+                       /* only check nodes that can actually serve this ip */
+                       if (!can_node_takeover_ip(ctdb, i, ipflags[i], tmp_ip)) {
+                               /* no it couldnt   so skip to the next node */
+                               continue;
+                       }
+
+                       num = node_ip_coverage(ctdb, i, all_ips);
+                       if (maxnode == -1) {
+                               maxnode = i;
+                               maxnum  = num;
+                       } else {
+                               if (num > maxnum) {
+                                       maxnode = i;
+                                       maxnum  = num;
+                               }
+                       }
+                       if (minnode == -1) {
+                               minnode = i;
+                               minnum  = num;
+                       } else {
+                               if (num < minnum) {
+                                       minnode = i;
+                                       minnum  = num;
+                               }
+                       }
+               }
+               /* no node at all is able to take this address */
+               if (maxnode == -1) {
+                       DEBUG(DEBUG_WARNING,(__location__ " Could not find maxnode. May not be able to serve ip '%s'\n",
+                               ctdb_addr_to_str(&tmp_ip->addr)));
+
+                       continue;
+               }
+
+               /* if the spread between the smallest and largest coverage by
+                  a node is >=2 we steal one of the ips from the node with
+                  most coverage to even things out a bit.
+                  try to do this a limited number of times since we dont
+                  want to spend too much time balancing the ip coverage.
+               */
+               if ( (maxnum > minnum+1)
+                    && (retries < (num_ips + 5)) ){
+                       struct ctdb_public_ip_list *tmp;
+
+                       /* Reassign one of maxnode's VNNs */
+                       for (tmp=all_ips;tmp;tmp=tmp->next) {
+                               if (tmp->pnn == maxnode) {
+                                       /* the first address found on maxnode is moved;
+                                          the result of the search is ignored */
+                                       (void)find_takeover_node(ctdb, ipflags, tmp, all_ips);
+                                       retries++;
+                                       goto try_again;;
+                               }
+                       }
+               }
+       }
+}
+
+/* Build the per-node working state used by the LCP2 allocation passes.
+ *
+ * Two arrays are allocated on tmp_ctx, each with one entry per element
+ * of ipflags, and handed back through the out-parameters:
+ *   *lcp2_imbalances      - lcp2_imbalance(all_ips, node) for each node
+ *   *rebalance_candidates - whether the node may be used as a failback
+ *                           destination
+ *
+ * force_rebalance_nodes may be NULL; otherwise it is a talloc array of
+ * node numbers that are always marked as rebalance candidates.
+ * Allocation failure is handled by CTDB_NO_MEMORY_FATAL.
+ */
+static void lcp2_init(struct ctdb_context *tmp_ctx,
+                     struct ctdb_ipflags *ipflags,
+                     struct ctdb_public_ip_list *all_ips,
+                     uint32_t *force_rebalance_nodes,
+                     uint32_t **lcp2_imbalances,
+                     bool **rebalance_candidates)
+{
+       int i, numnodes;
+       struct ctdb_public_ip_list *tmp_ip;
+
+       numnodes = talloc_array_length(ipflags);
+
+       *rebalance_candidates = talloc_array(tmp_ctx, bool, numnodes);
+       CTDB_NO_MEMORY_FATAL(tmp_ctx, *rebalance_candidates);
+       *lcp2_imbalances = talloc_array(tmp_ctx, uint32_t, numnodes);
+       CTDB_NO_MEMORY_FATAL(tmp_ctx, *lcp2_imbalances);
+
+       for (i=0; i<numnodes; i++) {
+               (*lcp2_imbalances)[i] = lcp2_imbalance(all_ips, i);
+               /* First step: assume all nodes are candidates */
+               (*rebalance_candidates)[i] = true;
+       }
+
+       /* 2nd step: if a node has IPs assigned then it must have been
+        * healthy before, so we remove it from consideration.  This
+        * is overkill but is all we have because we don't maintain
+        * state between takeover runs.  An alternative would be to
+        * keep state and invalidate it every time the recovery master
+        * changes.
+        */
+       for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+               if (tmp_ip->pnn != -1) {
+                       (*rebalance_candidates)[tmp_ip->pnn] = false;
+               }
+       }
+
+       /* 3rd step: if a node is forced to re-balance then
+          we allow failback onto the node */
+       if (force_rebalance_nodes == NULL) {
+               return;
+       }
+       for (i = 0; i < talloc_array_length(force_rebalance_nodes); i++) {
+               uint32_t pnn = force_rebalance_nodes[i];
+               if (pnn >= numnodes) {
+                       /* Leading space keeps the message separate
+                        * from the __location__ prefix in the log.
+                        */
+                       DEBUG(DEBUG_ERR,
+                             (__location__ " unknown node %u\n", pnn));
+                       continue;
+               }
+
+               DEBUG(DEBUG_NOTICE,
+                     ("Forcing rebalancing of IPs to node %u\n", pnn));
+               (*rebalance_candidates)[pnn] = true;
+       }
+}
+
+/* Hand out every unassigned address (pnn == -1) using LCP2.
+ *
+ * Each pass examines all (unassigned IP, eligible node) pairs, picks
+ * the pair with the smallest ip_distance_2_sum() cost, assigns it and
+ * stores the node's new imbalance in lcp2_imbalances.  Passes repeat
+ * until nothing is left unassigned or a pass finds no eligible pair;
+ * in the latter case a warning is logged for each leftover address.
+ */
+static void lcp2_allocate_unassigned(struct ctdb_context *ctdb,
+                                    struct ctdb_ipflags *ipflags,
+                                    struct ctdb_public_ip_list *all_ips,
+                                    uint32_t *lcp2_imbalances)
+{
+       struct ctdb_public_ip_list *ip;
+       struct ctdb_public_ip_list *best_ip;
+       int node, best_node;
+       int numnodes = talloc_array_length(ipflags);
+       uint32_t cost, best_cost, new_imbl, best_imbl;
+       bool have_unassigned;
+
+       do {
+               DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+               DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES (UNASSIGNED)\n"));
+
+               best_node = -1;
+               best_cost = 0;
+               best_ip = NULL;
+
+               /* Score every unassigned address against every node. */
+               for (ip = all_ips; ip != NULL; ip = ip->next) {
+                       if (ip->pnn != -1) {
+                               continue;
+                       }
+
+                       for (node = 0; node < numnodes; node++) {
+                               /* Skip nodes that cannot take this address over. */
+                               if (!can_node_takeover_ip(ctdb, node,
+                                                         ipflags[node], ip)) {
+                                       continue;
+                               }
+
+                               cost = ip_distance_2_sum(&(ip->addr), all_ips, node);
+                               new_imbl = lcp2_imbalances[node] + cost;
+                               DEBUG(DEBUG_DEBUG,(" %s -> %d [+%d]\n",
+                                                  ctdb_addr_to_str(&(ip->addr)),
+                                                  node,
+                                                  new_imbl - lcp2_imbalances[node]));
+
+                               /* Keep the current best unless this one is
+                                * the first hit or strictly cheaper.
+                                */
+                               if ((best_node != -1) && (cost >= best_cost)) {
+                                       continue;
+                               }
+
+                               best_node = node;
+                               best_imbl = new_imbl;
+                               best_cost = cost;
+                               best_ip = ip;
+                       }
+               }
+
+               DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+
+               /* Commit the cheapest assignment found in this pass. */
+               if (best_node != -1) {
+                       best_ip->pnn = best_node;
+                       lcp2_imbalances[best_node] = best_imbl;
+                       DEBUG(DEBUG_INFO,(" %s -> %d [+%d]\n",
+                                         ctdb_addr_to_str(&(best_ip->addr)),
+                                         best_node,
+                                         best_cost));
+               }
+
+               /* Is there anything left to place? */
+               have_unassigned = false;
+               for (ip = all_ips; ip != NULL; ip = ip->next) {
+                       if (ip->pnn == -1) {
+                               have_unassigned = true;
+                               break;
+                       }
+               }
+
+               /* A pass that found no candidate cannot make progress. */
+       } while (have_unassigned && (best_node != -1));
+
+       /* Report whatever could not be placed. */
+       if (have_unassigned) {
+               for (ip = all_ips; ip != NULL; ip = ip->next) {
+                       if (ip->pnn == -1) {
+                               DEBUG(DEBUG_WARNING,("Failed to find node to cover ip %s\n",
+                                                    ctdb_addr_to_str(&ip->addr)));
+                       }
+               }
+       }
+}
+
+/* LCP2 algorithm for rebalancing the cluster.  Given a candidate node
+ * to move IPs from, determines the best IP/destination node
+ * combination to move from the source node.
+ *
+ * srcnode  - node to move an address away from
+ * candimbl - imbalance of srcnode as seen by the caller
+ * lcp2_imbalances      - per-node imbalances; the entries for srcnode
+ *                        and the chosen destination are updated when a
+ *                        move is made
+ * rebalance_candidates - per-node flags; only nodes flagged true are
+ *                        considered as destinations
+ *
+ * Returns true if an address was moved (its pnn is rewritten), false
+ * if no move was found that improves things.
+ */
+static bool lcp2_failback_candidate(struct ctdb_context *ctdb,
+                                   struct ctdb_ipflags *ipflags,
+                                   struct ctdb_public_ip_list *all_ips,
+                                   int srcnode,
+                                   uint32_t candimbl,
+                                   uint32_t *lcp2_imbalances,
+                                   bool *rebalance_candidates)
+{
+       int dstnode, mindstnode, numnodes;
+       uint32_t srcimbl, srcdsum, dstimbl, dstdsum;
+       uint32_t minsrcimbl, mindstimbl;
+       struct ctdb_public_ip_list *minip;
+       struct ctdb_public_ip_list *tmp_ip;
+
+       /* Find an IP and destination node that best reduces imbalance. */
+       srcimbl = 0;
+       minip = NULL;
+       minsrcimbl = 0;
+       mindstnode = -1;
+       mindstimbl = 0;
+
+       numnodes = talloc_array_length(ipflags);
+
+       DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+       DEBUG(DEBUG_DEBUG,(" CONSIDERING MOVES FROM %d [%d]\n", srcnode, candimbl));
+
+       for (tmp_ip=all_ips; tmp_ip; tmp_ip=tmp_ip->next) {
+               /* Only consider addresses on srcnode. */
+               if (tmp_ip->pnn != srcnode) {
+                       continue;
+               }
+
+               /* What is this IP address costing the source node? */
+               srcdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, srcnode);
+               srcimbl = candimbl - srcdsum;
+
+               /* Consider what this IP address would cost each
+                * potential destination node.  Destination nodes are
+                * limited to those that are newly healthy, since we
+                * don't want to do gratuitous failover of IPs just to
+                * make minor balance improvements.
+                */
+               for (dstnode=0; dstnode<numnodes; dstnode++) {
+                       if (!rebalance_candidates[dstnode]) {
+                               continue;
+                       }
+
+                       /* only check nodes that can actually takeover this ip */
+                       if (!can_node_takeover_ip(ctdb, dstnode,
+                                                 ipflags[dstnode], tmp_ip)) {
+                               /* no it couldn't, so skip to the next node */
+                               continue;
+                       }
+
+                       dstdsum = ip_distance_2_sum(&(tmp_ip->addr), all_ips, dstnode);
+                       dstimbl = lcp2_imbalances[dstnode] + dstdsum;
+                       DEBUG(DEBUG_DEBUG,(" %d [%d] -> %s -> %d [+%d]\n",
+                                          srcnode, srcimbl - lcp2_imbalances[srcnode],
+                                          ctdb_addr_to_str(&(tmp_ip->addr)),
+                                          dstnode, dstimbl - lcp2_imbalances[dstnode]));
+
+                       /* Accept the move only if the destination ends
+                        * up less imbalanced than the source was and
+                        * the address is cheaper there; among such
+                        * moves keep the one with the lowest combined
+                        * imbalance.
+                        */
+                       if ((dstimbl < candimbl) && (dstdsum < srcdsum) &&
+                           ((mindstnode == -1) ||
+                            ((srcimbl + dstimbl) < (minsrcimbl + mindstimbl)))) {
+
+                               minip = tmp_ip;
+                               minsrcimbl = srcimbl;
+                               mindstnode = dstnode;
+                               mindstimbl = dstimbl;
+                       }
+               }
+       }
+       DEBUG(DEBUG_DEBUG,(" ----------------------------------------\n"));
+
+       if (mindstnode != -1) {
+               /* We found a move that makes things better... */
+               DEBUG(DEBUG_INFO,("%d [%d] -> %s -> %d [+%d]\n",
+                                 srcnode, minsrcimbl - lcp2_imbalances[srcnode],
+                                 ctdb_addr_to_str(&(minip->addr)),
+                                 mindstnode, mindstimbl - lcp2_imbalances[mindstnode]));
+
+               /* Record the source imbalance that belongs to the
+                * chosen address.  srcimbl holds the value for the
+                * last address scanned on srcnode, which is not
+                * necessarily the one being moved.
+                */
+               lcp2_imbalances[srcnode] = minsrcimbl;
+               lcp2_imbalances[mindstnode] = mindstimbl;
+               minip->pnn = mindstnode;
+
+               return true;
+       }
+
+       return false;
+}
+
+/* One node's LCP2 imbalance paired with its node number, so that an
+ * array of these can be sorted by imbalance without losing track of
+ * which node each value belongs to.
+ */
+struct lcp2_imbalance_pnn {
+       uint32_t imbalance;
+       int pnn;
+};
+
+/* qsort() comparator for struct lcp2_imbalance_pnn: orders entries by
+ * descending imbalance.  Entries with equal imbalance compare equal.
+ */
+static int lcp2_cmp_imbalance_pnn(const void * a, const void * b)
+{
+       const struct lcp2_imbalance_pnn *lhs = a;
+       const struct lcp2_imbalance_pnn *rhs = b;
+
+       if (lhs->imbalance == rhs->imbalance) {
+               return 0;
+       }
+
+       /* Larger imbalance sorts first. */
+       return (lhs->imbalance > rhs->imbalance) ? -1 : 1;
+}
+
+/* LCP2 algorithm for rebalancing the cluster.  This finds the source
+ * node with the highest LCP2 imbalance, and then determines the best
+ * IP/destination node combination to move from the source node.
+ * The process repeats for as long as a move is made.
+ *
+ * lcp2_imbalances and rebalance_candidates are per-node arrays with
+ * one entry per element of ipflags.  Running out of memory for the
+ * temporary sort array is handled by CTDB_NO_MEMORY_FATAL.
+ */
+static void lcp2_failback(struct ctdb_context *ctdb,
+                         struct ctdb_ipflags *ipflags,
+                         struct ctdb_public_ip_list *all_ips,
+                         uint32_t *lcp2_imbalances,
+                         bool *rebalance_candidates)
+{
+       int i, num_rebalance_candidates, numnodes;
+       struct lcp2_imbalance_pnn * lips;
+       bool again;
+
+       numnodes = talloc_array_length(ipflags);
+
+       /* It is only worth continuing if we have suitable target
+        * nodes to transfer IPs to.  This check is much cheaper than
+        * continuing on...  The flags are only read below, so
+        * counting once up front is enough.
+        */
+       num_rebalance_candidates = 0;
+       for (i=0; i<numnodes; i++) {
+               if (rebalance_candidates[i]) {
+                       num_rebalance_candidates++;
+               }
+       }
+       if (num_rebalance_candidates == 0) {
+               return;
+       }
+
+       do {
+               /* Put the imbalances and nodes into an array, sort
+                * them and iterate through candidates.  Usually the
+                * 1st one will be used, so this doesn't cost much...
+                */
+               lips = talloc_array(ctdb, struct lcp2_imbalance_pnn, numnodes);
+               /* Don't write through a failed allocation. */
+               CTDB_NO_MEMORY_FATAL(ctdb, lips);
+               for (i=0; i<numnodes; i++) {
+                       lips[i].imbalance = lcp2_imbalances[i];
+                       lips[i].pnn = i;
+               }
+               qsort(lips, numnodes, sizeof(struct lcp2_imbalance_pnn),
+                     lcp2_cmp_imbalance_pnn);
+
+               again = false;
+               for (i=0; i<numnodes; i++) {
+                       /* This means that all nodes had 0 or 1
+                        * addresses, so can't be imbalanced.
+                        */
+                       if (lips[i].imbalance == 0) {
+                               break;
+                       }
+
+                       if (lcp2_failback_candidate(ctdb,
+                                                   ipflags,
+                                                   all_ips,
+                                                   lips[i].pnn,
+                                                   lips[i].imbalance,
+                                                   lcp2_imbalances,
+                                                   rebalance_candidates)) {
+                               /* A move changed the imbalances:
+                                * re-sort and start over.
+                                */
+                               again = true;
+                               break;
+                       }
+               }
+
+               talloc_free(lips);
+       } while (again);
+}
+
+/* Walk the address list and clear the assignment (pnn = -1) of every
+ * address whose current node fails the can_node_host_ip() check.
+ * Addresses that are already unassigned are left alone.
+ */
+static void unassign_unsuitable_ips(struct ctdb_context *ctdb,
+                                   struct ctdb_ipflags *ipflags,
+                                   struct ctdb_public_ip_list *all_ips)
+{
+       struct ctdb_public_ip_list *ip;
+
+       for (ip = all_ips; ip != NULL; ip = ip->next) {
+               /* Nothing to verify for an unassigned address. */
+               if (ip->pnn == -1) {
+                       continue;
+               }
+
+               /* The current owner is still fine: keep it. */
+               if (can_node_host_ip(ctdb, ip->pnn, ipflags[ip->pnn], ip)) {
+                       continue;
+               }
+
+               /* this node can not serve this ip. */
+               DEBUG(DEBUG_DEBUG,("Unassign IP: %s from %d\n",
+                                  ctdb_addr_to_str(&(ip->addr)),
+                                  ip->pnn));
+               ip->pnn = -1;
+       }
+}
+
+/* "Deterministic IPs" allocation: every address is first given to
+ * node (position in list) modulo (number of nodes), then unsuitable
+ * assignments are dropped and the leftovers are redistributed with
+ * basic_allocate_unassigned().  No failback pass is run.
+ */
+static void ip_alloc_deterministic_ips(struct ctdb_context *ctdb,
+                                      struct ctdb_ipflags *ipflags,
+                                      struct ctdb_public_ip_list *all_ips)
+{
+       struct ctdb_public_ip_list *ip;
+       int idx;
+       int numnodes = talloc_array_length(ipflags);
+
+       DEBUG(DEBUG_NOTICE,("Deterministic IPs enabled. Resetting all ip allocations\n"));
+
+       /* Modulo assignment gives the same layout every time for a
+        * given set of available/unavailable nodes.
+        */
+       idx = 0;
+       for (ip = all_ips; ip != NULL; ip = ip->next) {
+               ip->pnn = idx % numnodes;
+               idx++;
+       }
+
+       /* The modulo step already sends each address back to its
+        * "home" node, so a request to suppress failback cannot be
+        * honoured here - just tell the administrator.
+        */
+       if (ctdb->tunable.no_ip_failback == 1) {
+               DEBUG(DEBUG_WARNING, ("WARNING: 'NoIPFailback' set but ignored - incompatible with 'DeterministicIPs\n"));
+       }
+
+       unassign_unsuitable_ips(ctdb, ipflags, all_ips);
+
+       basic_allocate_unassigned(ctdb, ipflags, all_ips);
+
+       /* Deliberately no failback step. */
+}
+
+/* Default (non-deterministic, non-LCP2) allocation: drop unsuitable
+ * assignments, place unassigned addresses with
+ * basic_allocate_unassigned() and, unless the no_ip_failback tunable
+ * is 1, even out the distribution with basic_failback().
+ */
+static void ip_alloc_nondeterministic_ips(struct ctdb_context *ctdb,
+                                         struct ctdb_ipflags *ipflags,
+                                         struct ctdb_public_ip_list *all_ips)
+{
+       struct ctdb_public_ip_list *ip;
+       int ip_count = 0;
+
+       /* basic_failback() needs the list length; this count should
+        * really be pushed down into it.
+        */
+       for (ip = all_ips; ip != NULL; ip = ip->next) {
+               ip_count++;
+       }
+
+       unassign_unsuitable_ips(ctdb, ipflags, all_ips);
+
+       basic_allocate_unassigned(ctdb, ipflags, all_ips);
+
+       /* Rebalance only when failback has not been disabled. */
+       if (ctdb->tunable.no_ip_failback != 1) {
+               /* Try to spread the addresses evenly across nodes. */
+               basic_failback(ctdb, ipflags, all_ips, ip_count);
+       }
+}
+
+/* LCP2 allocation: drop unsuitable assignments, build the LCP2
+ * working state, place unassigned addresses and, unless the
+ * no_ip_failback tunable is 1, run the LCP2 failback pass.  The
+ * working state lives on a temporary talloc context that is freed
+ * before returning.
+ */
+static void ip_alloc_lcp2(struct ctdb_context *ctdb,
+                         struct ctdb_ipflags *ipflags,
+                         struct ctdb_public_ip_list *all_ips,
+                         uint32_t *force_rebalance_nodes)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       uint32_t *imbalances;
+       bool *candidates;
+
+       unassign_unsuitable_ips(ctdb, ipflags, all_ips);
+
+       lcp2_init(tmp_ctx, ipflags, all_ips,force_rebalance_nodes,
+                 &imbalances, &candidates);
+
+       lcp2_allocate_unassigned(ctdb, ipflags, all_ips, imbalances);
+
+       /* Rebalance only when failback has not been disabled. */
+       if (ctdb->tunable.no_ip_failback != 1) {
+               /* Try to spread the addresses evenly across nodes. */
+               lcp2_failback(ctdb, ipflags, all_ips,
+                             imbalances, candidates);
+       }
+
+       talloc_free(tmp_ctx);
+}
+
+/* Return true when no node in the map is completely healthy, i.e.
+ * every node has NODE_FLAGS_INACTIVE and/or NODE_FLAGS_DISABLED set
+ * (trivially true for an empty map).
+ */
+static bool all_nodes_are_disabled(struct ctdb_node_map *nodemap)
+{
+       int n;
+
+       for (n = 0; n < nodemap->num; n++) {
+               /* One healthy node is enough to answer "no". */
+               if ((nodemap->nodes[n].flags &
+                    (NODE_FLAGS_INACTIVE|NODE_FLAGS_DISABLED)) == 0) {
+                       return false;
+               }
+       }
+
+       return true;
+}
+
+/* The calculation part of the IP allocation algorithm.
+ *
+ * Builds the merged list of public addresses, stores it in *all_ips_p
+ * and runs exactly one allocation algorithm over it, chosen by
+ * tunables: LCP2 if lcp2_public_ip_assignment is 1, otherwise
+ * deterministic if deterministic_public_ips is 1, otherwise the
+ * default non-deterministic one.  force_rebalance_nodes is only passed
+ * to the LCP2 algorithm.
+ */
+static void ctdb_takeover_run_core(struct ctdb_context *ctdb,
+                                  struct ctdb_ipflags *ipflags,
+                                  struct ctdb_public_ip_list **all_ips_p,
+                                  uint32_t *force_rebalance_nodes)
+{
+       struct ctdb_public_ip_list *merged;
+
+       /* Each node only knows the public addresses it can serve
+        * itself, so no single node holds the complete set.  Merge
+        * the per-node views into one cluster-wide list.  (The
+        * original author notes that the tree of ips is kept around
+        * as ctdb->ip_tree.)
+        */
+       merged = create_merged_ip_list(ctdb);
+       *all_ips_p = merged;
+
+       if (ctdb->tunable.lcp2_public_ip_assignment == 1) {
+               ip_alloc_lcp2(ctdb, ipflags, merged, force_rebalance_nodes);
+       } else if (ctdb->tunable.deterministic_public_ips == 1) {
+               ip_alloc_deterministic_ips(ctdb, ipflags, merged);
+       } else {
+               ip_alloc_nondeterministic_ips(ctdb, ipflags, merged);
+       }
+
+       /* From here on ->pnn of each entry names the node that will
+        * own the address, or is -1 when no node can cover it.
+        */
+}
+
+/* State shared between get_tunable_from_nodes() and its per-node
+ * reply callbacks.
+ */
+struct get_tunable_callback_data {
+       /* name of the tunable being fetched; used in log messages */
+       const char *tunable;
+       /* talloc array of per-node values, indexed by node number */
+       uint32_t *out;
+       /* set by a callback when the whole fetch must be treated as failed */
+       bool fatal;
+};
+
+/* Success callback for the GET_TUNABLE control: stores the uint32_t
+ * returned by node pnn in the output array.  A reply of the wrong size
+ * marks the fetch as fatal; a reply from a node number beyond the
+ * array is logged and ignored.  Non-zero results are left to the fail
+ * callback.
+ */
+static void get_tunable_callback(struct ctdb_context *ctdb, uint32_t pnn,
+                                int32_t res, TDB_DATA outdata,
+                                void *callback)
+{
+       struct get_tunable_callback_data *state = callback;
+       int nslots;
+
+       /* Failures are already handled in the fail callback. */
+       if (res != 0) {
+               return;
+       }
+
+       if (outdata.dsize != sizeof(uint32_t)) {
+               DEBUG(DEBUG_ERR,("Wrong size of returned data when reading \"%s\" tunable from node %d. Expected %d bytes but received %d bytes\n",
+                                state->tunable, pnn, (int)sizeof(uint32_t),
+                                (int)outdata.dsize));
+               state->fatal = true;
+               return;
+       }
+
+       nslots = talloc_array_length(state->out);
+       if (pnn < nslots) {
+               state->out[pnn] = *(uint32_t *)outdata.dptr;
+               return;
+       }
+
+       DEBUG(DEBUG_ERR,("Got %s reply from node %d but nodemap only has %d entries\n",
+                        state->tunable, pnn, nslots));
+}
+
+/* Failure callback for the GET_TUNABLE control.  A timeout or an
+ * unexpected error marks the fetch as fatal; -EINVAL or -1 is taken to
+ * mean the node does not implement the tunable and only produces a
+ * warning.
+ */
+static void get_tunable_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
+                                      int32_t res, TDB_DATA outdata,
+                                      void *callback)
+{
+       struct get_tunable_callback_data *state = callback;
+
+       if (res == -ETIME) {
+               DEBUG(DEBUG_ERR,
+                     ("Timed out getting tunable \"%s\" from node %d\n",
+                      state->tunable, pnn));
+               state->fatal = true;
+       } else if ((res == -EINVAL) || (res == -1)) {
+               /* Not fatal: the default value stays in place. */
+               DEBUG(DEBUG_WARNING,
+                     ("Tunable \"%s\" not implemented on node %d\n",
+                      state->tunable, pnn));
+       } else {
+               DEBUG(DEBUG_ERR,
+                     ("Unexpected error getting tunable \"%s\" from node %d\n",
+                      state->tunable, pnn));
+               state->fatal = true;
+       }
+}
+
+/* Fetch the value of one tunable from the nodes returned by
+ * list_of_connected_nodes().
+ *
+ * Returns a talloc array on tmp_ctx with nodemap->num entries, indexed
+ * by node number.  Every entry starts as default_value and is
+ * overwritten when that node replies successfully.  Returns NULL on
+ * allocation failure, or when the control call reports failure and a
+ * callback flagged the error as fatal.
+ */
+static uint32_t *get_tunable_from_nodes(struct ctdb_context *ctdb,
+                                       TALLOC_CTX *tmp_ctx,
+                                       struct ctdb_node_map *nodemap,
+                                       const char *tunable,
+                                       uint32_t default_value)
+{
+       TDB_DATA data;
+       struct ctdb_control_get_tunable *t;
+       uint32_t *nodes;
+       uint32_t *tvals;
+       struct get_tunable_callback_data callback_data;
+       size_t namelen;
+       int i;
+
+       tvals = talloc_array(tmp_ctx, uint32_t, nodemap->num);
+       CTDB_NO_MEMORY_NULL(ctdb, tvals);
+       for (i=0; i<nodemap->num; i++) {
+               tvals[i] = default_value;
+       }
+
+       callback_data.out = tvals;
+       callback_data.tunable = tunable;
+       callback_data.fatal = false;
+
+       /* Request payload: fixed header followed by the
+        * NUL-terminated tunable name.
+        */
+       namelen = strlen(tunable) + 1;
+       data.dsize = offsetof(struct ctdb_control_get_tunable, name) + namelen;
+       data.dptr  = talloc_size(tmp_ctx, data.dsize);
+       /* Don't write through a failed allocation.  tvals stays
+        * parented to tmp_ctx on this path.
+        */
+       CTDB_NO_MEMORY_NULL(ctdb, data.dptr);
+       t = (struct ctdb_control_get_tunable *)data.dptr;
+       t->length = namelen;
+       memcpy(t->name, tunable, t->length);
+       nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_TUNABLE,
+                                     nodes, 0, TAKEOVER_TIMEOUT(),
+                                     false, data,
+                                     get_tunable_callback,
+                                     get_tunable_fail_callback,
+                                     &callback_data) != 0) {
+               /* Only errors flagged as fatal discard the result;
+                * otherwise the defaults remain for the nodes that
+                * did not answer.
+                */
+               if (callback_data.fatal) {
+                       talloc_free(tvals);
+                       tvals = NULL;
+               }
+       }
+       talloc_free(nodes);
+       talloc_free(data.dptr);
+
+       return tvals;
+}
+
+/* State shared between get_runstate_from_nodes() and its per-node
+ * reply callbacks.
+ */
+struct get_runstate_callback_data {
+       /* talloc array of per-node runstates, indexed by node number */
+       enum ctdb_runstate *out;
+       /* set by a callback when the whole fetch must be treated as failed */
+       bool fatal;
+};
+
+/* Success callback for the GET_RUNSTATE control: stores the runstate
+ * returned by node pnn in the output array.  A reply of the wrong size
+ * marks the fetch as fatal; a reply from a node number beyond the
+ * array is logged and ignored.  Non-zero results are left to the fail
+ * callback.
+ */
+static void get_runstate_callback(struct ctdb_context *ctdb, uint32_t pnn,
+                                 int32_t res, TDB_DATA outdata,
+                                 void *callback_data)
+{
+       struct get_runstate_callback_data *state = callback_data;
+       int nslots;
+
+       /* Failures are already handled in the fail callback. */
+       if (res != 0) {
+               return;
+       }
+
+       if (outdata.dsize != sizeof(uint32_t)) {
+               DEBUG(DEBUG_ERR,("Wrong size of returned data when getting runstate from node %d. Expected %d bytes but received %d bytes\n",
+                                pnn, (int)sizeof(uint32_t),
+                                (int)outdata.dsize));
+               state->fatal = true;
+               return;
+       }
+
+       nslots = talloc_array_length(state->out);
+       if (pnn < nslots) {
+               state->out[pnn] = (enum ctdb_runstate)*(uint32_t *)outdata.dptr;
+               return;
+       }
+
+       DEBUG(DEBUG_ERR,("Got reply from node %d but nodemap only has %d entries\n",
+                        pnn, nslots));
+}
+
+/* Failure callback for the GET_RUNSTATE control.  A timeout marks the
+ * fetch as fatal; any other error only produces a warning, on the
+ * assumption that the node does not support runstates.
+ */
+static void get_runstate_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
+                                      int32_t res, TDB_DATA outdata,
+                                      void *callback)
+{
+       struct get_runstate_callback_data *state = callback;
+
+       if (res == -ETIME) {
+               DEBUG(DEBUG_ERR,
+                     ("Timed out getting runstate from node %d\n", pnn));
+               state->fatal = true;
+               return;
+       }
+
+       /* Not fatal: the default runstate stays in place. */
+       DEBUG(DEBUG_WARNING,
+             ("Error getting runstate from node %d - assuming runstates not supported\n",
+              pnn));
+}
+
+/* Fetch the runstate of the nodes returned by
+ * list_of_connected_nodes().
+ *
+ * Returns a talloc array on tmp_ctx with nodemap->num entries, indexed
+ * by node number.  Every entry starts as default_value and is
+ * overwritten when that node replies successfully.  Returns NULL on
+ * allocation failure, or when the control call reports failure and a
+ * callback flagged the error as fatal.
+ */
+static enum ctdb_runstate * get_runstate_from_nodes(struct ctdb_context *ctdb,
+                                                   TALLOC_CTX *tmp_ctx,
+                                                   struct ctdb_node_map *nodemap,
+                                                   enum ctdb_runstate default_value)
+{
+       uint32_t *nodes;
+       enum ctdb_runstate *rs;
+       struct get_runstate_callback_data callback_data;
+       int i;
+
+       rs = talloc_array(tmp_ctx, enum ctdb_runstate, nodemap->num);
+       CTDB_NO_MEMORY_NULL(ctdb, rs);
+       for (i=0; i<nodemap->num; i++) {
+               rs[i] = default_value;
+       }
+
+       callback_data.out = rs;
+       callback_data.fatal = false;
+
+       nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_GET_RUNSTATE,
+                                     nodes, 0, TAKEOVER_TIMEOUT(),
+                                     true, tdb_null,
+                                     get_runstate_callback,
+                                     get_runstate_fail_callback,
+                                     &callback_data) != 0) {
+               if (callback_data.fatal) {
+                       /* rs came from talloc_array(), so it must be
+                        * released with talloc_free(), never free().
+                        */
+                       talloc_free(rs);
+                       rs = NULL;
+               }
+       }
+       talloc_free(nodes);
+
+       return rs;
+}
+
+/* Compute the per-node flags used by IP allocation.
+ *
+ * Returns a zero-initialised talloc array on tmp_ctx with
+ * nodemap->num entries, in which:
+ *   - noiptakeover is set where the node's NoIPTakeover value is
+ *     non-zero
+ *   - noiphost is set where the node is not in
+ *     CTDB_RUNSTATE_RUNNING or is INACTIVE
+ *   - if all nodes are disabled, noiphost is also set where the
+ *     node's NoIPHostOnAllDisabled value is non-zero
+ *   - otherwise noiphost is also set for every DISABLED node
+ *
+ * The three input arrays are indexed by node number.
+ */
+static struct ctdb_ipflags *
+set_ipflags_internal(struct ctdb_context *ctdb,
+                    TALLOC_CTX *tmp_ctx,
+                    struct ctdb_node_map *nodemap,
+                    uint32_t *tval_noiptakeover,
+                    uint32_t *tval_noiphostonalldisabled,
+                    enum ctdb_runstate *runstate)
+{
+       struct ctdb_ipflags *flags;
+       bool everyone_disabled;
+       int n;
+
+       /* talloc_zero_array leaves every flag cleared. */
+       flags = talloc_zero_array(tmp_ctx, struct ctdb_ipflags, nodemap->num);
+       CTDB_NO_MEMORY_NULL(ctdb, flags);
+
+       for (n = 0; n < nodemap->num; n++) {
+               /* NoIPTakeover nodes must not take addresses over. */
+               if (tval_noiptakeover[n] != 0) {
+                       flags[n].noiptakeover = true;
+               }
+
+               /* Nodes that are not RUNNING, or are INACTIVE, can
+                * not host addresses.
+                */
+               if ((runstate[n] != CTDB_RUNSTATE_RUNNING) ||
+                   (nodemap->nodes[n].flags & NODE_FLAGS_INACTIVE)) {
+                       flags[n].noiphost = true;
+               }
+       }
+
+       everyone_disabled = all_nodes_are_disabled(nodemap);
+       for (n = 0; n < nodemap->num; n++) {
+               /* With the whole cluster disabled only the
+                * NoIPHostOnAllDisabled tunable excludes a node;
+                * otherwise every DISABLED node is excluded.
+                */
+               bool exclude = everyone_disabled
+                       ? (tval_noiphostonalldisabled[n] != 0)
+                       : ((nodemap->nodes[n].flags & NODE_FLAGS_DISABLED) != 0);
+
+               if (exclude) {
+                       flags[n].noiphost = true;
+               }
+       }
+
+       return flags;
+}
+
+/* Gather the inputs for set_ipflags_internal() - the NoIPTakeover and
+ * NoIPHostOnAllDisabled tunables and the runstate of each node - and
+ * return the resulting per-node flag array (allocated on tmp_ctx).
+ *
+ * Returns NULL if any of the fetches fails; in that case whatever was
+ * already allocated is left on tmp_ctx for the caller to free.
+ */
+static struct ctdb_ipflags *set_ipflags(struct ctdb_context *ctdb,
+                                       TALLOC_CTX *tmp_ctx,
+                                       struct ctdb_node_map *nodemap)
+{
+       uint32_t *no_takeover;
+       uint32_t *no_host_all_disabled;
+       enum ctdb_runstate *states;
+       struct ctdb_ipflags *result;
+
+       no_takeover = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
+                                            "NoIPTakeover", 0);
+       if (no_takeover == NULL) {
+               return NULL;
+       }
+
+       no_host_all_disabled = get_tunable_from_nodes(ctdb, tmp_ctx, nodemap,
+                                                     "NoIPHostOnAllDisabled",
+                                                     0);
+       if (no_host_all_disabled == NULL) {
+               /* Caller frees tmp_ctx */
+               return NULL;
+       }
+
+       /* Nodes that do not support CTDB_CONTROL_GET_RUNSTATE keep
+        * the CTDB_RUNSTATE_RUNNING default, which gives reasonable
+        * behaviour on a mixed cluster during an upgrade.
+        */
+       states = get_runstate_from_nodes(ctdb, tmp_ctx, nodemap,
+                                        CTDB_RUNSTATE_RUNNING);
+       if (states == NULL) {
+               /* Caller frees tmp_ctx */
+               return NULL;
+       }
+
+       result = set_ipflags_internal(ctdb, tmp_ctx, nodemap,
+                                     no_takeover,
+                                     no_host_all_disabled,
+                                     states);
+
+       /* The inputs are no longer needed once the flags exist. */
+       talloc_free(no_takeover);
+       talloc_free(no_host_all_disabled);
+       talloc_free(states);
+
+       return result;
+}
+
+/*
+ * State handed to iprealloc_fail_callback() while ctdb_takeover_run()
+ * has the IPREALLOCATED control outstanding.
+ */
+struct iprealloc_callback_data {
+       /* Per-node "retry needed" flags; iprealloc_fail_callback()
+        * indexes this array by PNN */
+       bool *retry_nodes;
+       /* Incremented by iprealloc_fail_callback() when it flags a node */
+       int retry_count;
+       /* Caller-supplied callback, invoked when the control timed out */
+       client_async_callback fail_callback;
+       /* Opaque argument passed through to fail_callback */
+       void *fail_callback_data;
+       /* Node map used to check whether a failing node is inactive */
+       struct ctdb_node_map *nodemap;
+};
+
+/*
+ * Failure callback for the IPREALLOCATED control sent by
+ * ctdb_takeover_run().  "callback" is a struct iprealloc_callback_data.
+ *
+ * A timeout (-ETIME) is forwarded to the caller-supplied failure
+ * callback.  Any other failure flags the node in cd->retry_nodes
+ * (indexed by PNN) and bumps cd->retry_count, unless the PNN is out of
+ * range or the node is inactive, in which case it is only logged.
+ */
+static void iprealloc_fail_callback(struct ctdb_context *ctdb, uint32_t pnn,
+                                       int32_t res, TDB_DATA outdata,
+                                       void *callback)
+{
+       int numnodes;
+       struct iprealloc_callback_data *cd =
+               (struct iprealloc_callback_data *)callback;
+
+       switch (res) {
+       case -ETIME:
+               /* If the control timed out then that's a real error,
+                * so call the real fail callback
+                */
+               cd->fail_callback(ctdb, pnn, res, outdata,
+                                 cd->fail_callback_data);
+               break;
+       default:
+               /* If not a timeout then either the ipreallocated
+                * eventscript (or some setup) failed.  This might
+                * have failed because the IPREALLOCATED control isn't
+                * implemented - right now there is no way of knowing
+                * because the error codes are all folded down to -1.
+                * Consider retrying using EVENTSCRIPT control...
+                */
+
+               /* Valid indexes are 0 .. numnodes-1, so pnn == numnodes
+                * must be rejected as well.
+                */
+               numnodes = talloc_array_length(cd->retry_nodes);
+               if (pnn >= numnodes) {
+                       DEBUG(DEBUG_ERR,
+                             ("ipreallocated failure from node %d, but only %d nodes in nodemap\n",
+                              pnn, numnodes));
+                       return;
+               }
+
+               /* Can't run the "ipreallocated" event on a INACTIVE node */
+               if (cd->nodemap->nodes[pnn].flags & NODE_FLAGS_INACTIVE) {
+                       DEBUG(DEBUG_ERR,
+                             ("ipreallocated failure from node %d, but node is inactive - not flagging a retry\n",
+                              pnn));
+                       return;
+               }
+
+               DEBUG(DEBUG_WARNING,
+                     ("ipreallocated failure from node %d, flagging retry\n",
+                      pnn));
+
+               /* Count each node once only: the caller sizes its
+                * retry list from retry_count and fills one entry per
+                * flagged node.
+                */
+               if (!cd->retry_nodes[pnn]) {
+                       cd->retry_nodes[pnn] = true;
+                       cd->retry_count++;
+               }
+       }
+}
+
+/*
+ * State handed to takeover_run_fail_callback() while
+ * ctdb_takeover_run() has the RELEASE_IP controls outstanding.
+ */
+struct takeover_callback_data {
+       /* One flag per node map entry (same index as nodemap->nodes[]),
+        * set once a failure has been reported for that node */
+       bool *node_failed;
+       /* Caller-supplied callback, invoked on a node's first failure */
+       client_async_callback fail_callback;
+       /* Opaque argument passed through to fail_callback */
+       void *fail_callback_data;
+       /* Node map used to translate a PNN into an array index */
+       struct ctdb_node_map *nodemap;
+};
+
+/*
+ * Failure callback for the RELEASE_IP controls sent by
+ * ctdb_takeover_run().  Looks node_pnn up in the node map and calls
+ * the caller-supplied failure callback at most once per node, no
+ * matter how many controls to that node fail.
+ */
+static void takeover_run_fail_callback(struct ctdb_context *ctdb,
+                                      uint32_t node_pnn, int32_t res,
+                                      TDB_DATA outdata, void *callback_data)
+{
+       struct takeover_callback_data *cd =
+               talloc_get_type_abort(callback_data,
+                                     struct takeover_callback_data);
+       int idx = 0;
+
+       /* Find the node map slot that belongs to node_pnn */
+       while (idx < cd->nodemap->num &&
+              cd->nodemap->nodes[idx].pnn != node_pnn) {
+               idx++;
+       }
+
+       if (idx == cd->nodemap->num) {
+               DEBUG(DEBUG_ERR, (__location__ " invalid PNN %u\n", node_pnn));
+               return;
+       }
+
+       /* A failure has already been reported for this node */
+       if (cd->node_failed[idx]) {
+               return;
+       }
+
+       cd->node_failed[idx] = true;
+       cd->fail_callback(ctdb, node_pnn, res, outdata,
+                         cd->fail_callback_data);
+}
+
+/*
+  make any IP alias changes for public addresses that are necessary
+
+  Unless the disable_ip_failover tunable is set, this:
+   1. computes the per-node IP flags and runs ctdb_takeover_run_core()
+      to obtain the desired IP layout (all_ips);
+   2. sends RELEASE_IP (or RELEASE_IPv4) for every IP to every node
+      that is not DISCONNECTED and is not the IP's assigned node;
+   3. sends TAKEOVER_IP (or TAKEOVER_IPv4) for every assigned IP to
+      the node it is assigned to.
+  In all cases it then sends IPREALLOCATED to the connected nodes and,
+  if that fails, retries the flagged nodes with RUN_EVENTSCRIPTS.
+
+  fail_callback/callback_data are used to report failing nodes.
+  Returns -1 if steps 1-3 fail, otherwise the result of the last
+  ipreallocated-related async control.
+ */
+int ctdb_takeover_run(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap,
+                     uint32_t *force_rebalance_nodes,
+                     client_async_callback fail_callback, void *callback_data)
+{
+       int i, j, ret;
+       struct ctdb_public_ip ip;
+       struct ctdb_public_ipv4 ipv4;
+       uint32_t *nodes;
+       struct ctdb_public_ip_list *all_ips, *tmp_ip;
+       TDB_DATA data;
+       struct timeval timeout;
+       struct client_async_data *async_data;
+       struct ctdb_client_control_state *state;
+       /* NOTE(review): the result of talloc_new() is not checked */
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       struct ctdb_ipflags *ipflags;
+       struct takeover_callback_data *takeover_data;
+       struct iprealloc_callback_data iprealloc_data;
+       bool *retry_data;
+
+       /*
+        * ip failover is completely disabled, just send out the
+        * ipreallocated event.
+        */
+       if (ctdb->tunable.disable_ip_failover != 0) {
+               goto ipreallocated;
+       }
+
+       ipflags = set_ipflags(ctdb, tmp_ctx, nodemap);
+       if (ipflags == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to set IP flags - aborting takeover run\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       ZERO_STRUCT(ip);
+
+       /* Do the IP reassignment calculations */
+       ctdb_takeover_run_core(ctdb, ipflags, &all_ips, force_rebalance_nodes);
+
+       /* Now tell all nodes to release any public IPs they should not
+        * host.  This will be a NOOP on nodes that don't currently
+        * hold the given IP.
+        */
+       takeover_data = talloc_zero(tmp_ctx, struct takeover_callback_data);
+       CTDB_NO_MEMORY_FATAL(ctdb, takeover_data);
+
+       takeover_data->node_failed = talloc_zero_array(tmp_ctx,
+                                                      bool, nodemap->num);
+       CTDB_NO_MEMORY_FATAL(ctdb, takeover_data->node_failed);
+       takeover_data->fail_callback = fail_callback;
+       takeover_data->fail_callback_data = callback_data;
+       takeover_data->nodemap = nodemap;
+
+       async_data = talloc_zero(tmp_ctx, struct client_async_data);
+       CTDB_NO_MEMORY_FATAL(ctdb, async_data);
+
+       /* Route failures through takeover_run_fail_callback() so the
+        * caller's callback fires only once per node even though a
+        * node is sent one control per IP.
+        */
+       async_data->fail_callback = takeover_run_fail_callback;
+       async_data->callback_data = takeover_data;
+
+       for (i=0;i<nodemap->num;i++) {
+               /* don't talk to unconnected nodes, but do talk to banned nodes */
+               if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
+                       continue;
+               }
+
+               for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+                       if (tmp_ip->pnn == nodemap->nodes[i].pnn) {
+                               /* This node should be serving this
+                                  vnn so dont tell it to release the ip
+                               */
+                               continue;
+                       }
+                       /* IPv4 addresses go out with the IPv4-only
+                        * control and payload, everything else with
+                        * the generic one.
+                        */
+                       if (tmp_ip->addr.sa.sa_family == AF_INET) {
+                               ipv4.pnn = tmp_ip->pnn;
+                               ipv4.sin = tmp_ip->addr.ip;
+
+                               timeout = TAKEOVER_TIMEOUT();
+                               data.dsize = sizeof(ipv4);
+                               data.dptr  = (uint8_t *)&ipv4;
+                               state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
+                                               0, CTDB_CONTROL_RELEASE_IPv4, 0,
+                                               data, async_data,
+                                               &timeout, NULL);
+                       } else {
+                               ip.pnn  = tmp_ip->pnn;
+                               ip.addr = tmp_ip->addr;
+
+                               timeout = TAKEOVER_TIMEOUT();
+                               data.dsize = sizeof(ip);
+                               data.dptr  = (uint8_t *)&ip;
+                               state = ctdb_control_send(ctdb, nodemap->nodes[i].pnn,
+                                               0, CTDB_CONTROL_RELEASE_IP, 0,
+                                               data, async_data,
+                                               &timeout, NULL);
+                       }
+
+                       if (state == NULL) {
+                               DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_RELEASE_IP to node %u\n", nodemap->nodes[i].pnn));
+                               talloc_free(tmp_ctx);
+                               return -1;
+                       }
+               
+                       ctdb_client_async_add(async_data, state);
+               }
+       }
+       /* Wait for all the release controls queued above */
+       if (ctdb_client_async_wait(ctdb, async_data) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_RELEASE_IP failed\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+       talloc_free(async_data);
+
+
+       /* tell all nodes to get their own IPs */
+       async_data = talloc_zero(tmp_ctx, struct client_async_data);
+       CTDB_NO_MEMORY_FATAL(ctdb, async_data);
+
+       /* Each IP is sent to one node only, so the caller's callback
+        * is used directly here.
+        */
+       async_data->fail_callback = fail_callback;
+       async_data->callback_data = callback_data;
+
+       for (tmp_ip=all_ips;tmp_ip;tmp_ip=tmp_ip->next) {
+               if (tmp_ip->pnn == -1) {
+                       /* this IP won't be taken over */
+                       continue;
+               }
+
+               if (tmp_ip->addr.sa.sa_family == AF_INET) {
+                       ipv4.pnn = tmp_ip->pnn;
+                       ipv4.sin = tmp_ip->addr.ip;
+
+                       timeout = TAKEOVER_TIMEOUT();
+                       data.dsize = sizeof(ipv4);
+                       data.dptr  = (uint8_t *)&ipv4;
+                       state = ctdb_control_send(ctdb, tmp_ip->pnn,
+                                       0, CTDB_CONTROL_TAKEOVER_IPv4, 0,
+                                       data, async_data,
+                                       &timeout, NULL);
+               } else {
+                       ip.pnn  = tmp_ip->pnn;
+                       ip.addr = tmp_ip->addr;
+
+                       timeout = TAKEOVER_TIMEOUT();
+                       data.dsize = sizeof(ip);
+                       data.dptr  = (uint8_t *)&ip;
+                       state = ctdb_control_send(ctdb, tmp_ip->pnn,
+                                       0, CTDB_CONTROL_TAKEOVER_IP, 0,
+                                       data, async_data,
+                                       &timeout, NULL);
+               }
+               if (state == NULL) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to call async control CTDB_CONTROL_TAKEOVER_IP to node %u\n", tmp_ip->pnn));
+                       talloc_free(tmp_ctx);
+                       return -1;
+               }
+               
+               ctdb_client_async_add(async_data, state);
+       }
+       if (ctdb_client_async_wait(ctdb, async_data) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Async control CTDB_CONTROL_TAKEOVER_IP failed\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+ipreallocated:
+       /*
+        * Tell all nodes to run eventscripts to process the
+        * "ipreallocated" event.  This can do a lot of things,
+        * including restarting services to reconfigure them if public
+        * IPs have moved.  Once upon a time this event only used to
+        * update natgw.
+        */
+       retry_data = talloc_zero_array(tmp_ctx, bool, nodemap->num);
+       CTDB_NO_MEMORY_FATAL(ctdb, retry_data);
+       iprealloc_data.retry_nodes = retry_data;
+       iprealloc_data.retry_count = 0;
+       iprealloc_data.fail_callback = fail_callback;
+       iprealloc_data.fail_callback_data = callback_data;
+       iprealloc_data.nodemap = nodemap;
+
+       /* NOTE(review): the result of list_of_connected_nodes() is
+        * not checked before use.
+        */
+       nodes = list_of_connected_nodes(ctdb, nodemap, tmp_ctx, true);
+       ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_IPREALLOCATED,
+                                       nodes, 0, TAKEOVER_TIMEOUT(),
+                                       false, tdb_null,
+                                       NULL, iprealloc_fail_callback,
+                                       &iprealloc_data);
+       if (ret != 0) {
+               /* If the control failed then we should retry to any
+                * nodes flagged by iprealloc_fail_callback using the
+                * EVENTSCRIPT control.  This is a best-effort at
+                * backward compatibility when running a mixed cluster
+                * where some nodes have not yet been upgraded to
+                * support the IPREALLOCATED control.
+                *
+                * NOTE(review): this branch runs for any failure, even
+                * when retry_count is 0, and ret is then replaced by
+                * the result of the retry - confirm that is intended.
+                */
+               DEBUG(DEBUG_WARNING,
+                     ("Retry ipreallocated to some nodes using eventscript control\n"));
+
+               nodes = talloc_array(tmp_ctx, uint32_t,
+                                    iprealloc_data.retry_count);
+               CTDB_NO_MEMORY_FATAL(ctdb, nodes);
+
+               /* retry_nodes[] is indexed by PNN in
+                * iprealloc_fail_callback(), so the index of each set
+                * flag is the PNN to retry.
+                */
+               j = 0;
+               for (i=0; i<nodemap->num; i++) {
+                       if (iprealloc_data.retry_nodes[i]) {
+                               nodes[j] = i;
+                               j++;
+                       }
+               }
+
+               /* Payload is the event name, including its NUL */
+               data.dptr  = discard_const("ipreallocated");
+               data.dsize = strlen((char *)data.dptr) + 1; 
+               ret = ctdb_client_async_control(ctdb,
+                                               CTDB_CONTROL_RUN_EVENTSCRIPTS,
+                                               nodes, 0, TAKEOVER_TIMEOUT(),
+                                               false, data,
+                                               NULL, fail_callback,
+                                               callback_data);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, (__location__ " failed to send control to run eventscripts with \"ipreallocated\"\n"));
+               }
+       }
+
+       talloc_free(tmp_ctx);
+       return ret;
+}
+
+
+/*
+  talloc destructor for a ctdb_client_ip: log the address being
+  dropped and unlink the entry from ctdb->client_ip_list.
+  Always returns 0.
+ */
+static int ctdb_client_ip_destructor(struct ctdb_client_ip *ip)
+{
+       struct ctdb_context *ctdb = ip->ctdb;
+
+       DEBUG(DEBUG_DEBUG,
+             ("destroying client tcp for %s:%u (client_id %u)\n",
+              ctdb_addr_to_str(&ip->addr), ntohs(ip->addr.ip.sin_port),
+              ip->client_id));
+
+       DLIST_REMOVE(ctdb->client_ip_list, ip);
+
+       return 0;
+}
+
+/*
+  called by a client to inform us of a TCP connection that it is managing
+  and that should be tickled with an ACK when IP takeover is done.
+  we handle both the old ipv4 style of packets as well as the new ipv4/6
+  pdus.
+
+  Returns 0 on success, and also when the destination address is not
+  one of our public addresses (the request is then ignored).  Returns
+  -1 if indata has an unexpected size, the client cannot be found, the
+  public address is not held by this node, or the TCP_ADD broadcast
+  could not be sent.
+ */
+int32_t ctdb_control_tcp_client(struct ctdb_context *ctdb, uint32_t client_id,
+                               TDB_DATA indata)
+{
+       struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
+       struct ctdb_control_tcp *old_addr = NULL;
+       struct ctdb_control_tcp_addr new_addr;
+       struct ctdb_control_tcp_addr *tcp_sock = NULL;
+       struct ctdb_tcp_list *tcp;
+       struct ctdb_tcp_connection t;
+       int ret;
+       TDB_DATA data;
+       struct ctdb_client_ip *ip;
+       struct ctdb_vnn *vnn;
+       ctdb_sock_addr addr;
+
+       /* The payload size tells the old ipv4-only format from the new one */
+       switch (indata.dsize) {
+       case sizeof(struct ctdb_control_tcp):
+               /* Old format: convert into a local new-style structure */
+               old_addr = (struct ctdb_control_tcp *)indata.dptr;
+               ZERO_STRUCT(new_addr);
+               tcp_sock = &new_addr;
+               tcp_sock->src.ip  = old_addr->src;
+               tcp_sock->dest.ip = old_addr->dest;
+               break;
+       case sizeof(struct ctdb_control_tcp_addr):
+               tcp_sock = (struct ctdb_control_tcp_addr *)indata.dptr;
+               break;
+       default:
+               DEBUG(DEBUG_ERR,(__location__ " Invalid data structure passed "
+                                "to ctdb_control_tcp_client. size was %d but "
+                                "only allowed sizes are %lu and %lu\n",
+                                (int)indata.dsize,
+                                (long unsigned)sizeof(struct ctdb_control_tcp),
+                                (long unsigned)sizeof(struct ctdb_control_tcp_addr)));
+               return -1;
+       }
+
+       /* Canonicalize both endpoints via a temporary copy; the
+        * results are stored back into tcp_sock.
+        */
+       addr = tcp_sock->src;
+       ctdb_canonicalize_ip(&addr,  &tcp_sock->src);
+       addr = tcp_sock->dest;
+       ctdb_canonicalize_ip(&addr, &tcp_sock->dest);
+
+       ZERO_STRUCT(addr);
+       memcpy(&addr, &tcp_sock->dest, sizeof(addr));
+       vnn = find_public_ip_vnn(ctdb, &addr);
+       if (vnn == NULL) {
+               /* Not a public address: log it (except for IPv4
+                * loopback) and ignore the request.
+                */
+               switch (addr.sa.sa_family) {
+               case AF_INET:
+                       if (ntohl(addr.ip.sin_addr.s_addr) != INADDR_LOOPBACK) {
+                               DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public address.\n", 
+                                       ctdb_addr_to_str(&addr)));
+                       }
+                       break;
+               case AF_INET6:
+                       DEBUG(DEBUG_ERR,("Could not add client IP %s. This is not a public ipv6 address.\n", 
+                               ctdb_addr_to_str(&addr)));
+                       break;
+               default:
+                       DEBUG(DEBUG_ERR,(__location__ " Unknown family type %d\n", addr.sa.sa_family));
+               }
+
+               return 0;
+       }
+
+       /* Everything below dereferences client, so make sure the
+        * lookup by client_id actually found one.
+        */
+       if (client == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Could not find client for client_id %u\n",
+                       client_id));
+               return -1;
+       }
+
+       if (vnn->pnn != ctdb->pnn) {
+               DEBUG(DEBUG_ERR,("Attempt to register tcp client for IP %s we don't hold - failing (client_id %u pid %u)\n",
+                       ctdb_addr_to_str(&addr),
+                       client_id, client->pid));
+               /* failing this call will tell smbd to die */
+               return -1;
+       }
+
+       /* Record the public address used by this client; the entry is
+        * a talloc child of the client and unlinks itself from
+        * ctdb->client_ip_list in its destructor.
+        */
+       ip = talloc(client, struct ctdb_client_ip);
+       CTDB_NO_MEMORY(ctdb, ip);
+
+       ip->ctdb      = ctdb;
+       ip->addr      = addr;
+       ip->client_id = client_id;
+       talloc_set_destructor(ip, ctdb_client_ip_destructor);
+       DLIST_ADD(ctdb->client_ip_list, ip);
+
+       /* Remember the connection on the client's own list */
+       tcp = talloc(client, struct ctdb_tcp_list);
+       CTDB_NO_MEMORY(ctdb, tcp);
+
+       tcp->connection.src_addr = tcp_sock->src;
+       tcp->connection.dst_addr = tcp_sock->dest;
+
+       DLIST_ADD(client->tcp_list, tcp);
+
+       t.src_addr = tcp_sock->src;
+       t.dst_addr = tcp_sock->dest;
+
+       data.dptr = (uint8_t *)&t;
+       data.dsize = sizeof(t);
+
+       switch (addr.sa.sa_family) {
+       case AF_INET:
+               DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
+                       (unsigned)ntohs(tcp_sock->dest.ip.sin_port), 
+                       ctdb_addr_to_str(&tcp_sock->src),
+                       (unsigned)ntohs(tcp_sock->src.ip.sin_port), client_id, client->pid));
+               break;
+       case AF_INET6:
+               DEBUG(DEBUG_INFO,("registered tcp client for %u->%s:%u (client_id %u pid %u)\n",
+                       (unsigned)ntohs(tcp_sock->dest.ip6.sin6_port), 
+                       ctdb_addr_to_str(&tcp_sock->src),
+                       (unsigned)ntohs(tcp_sock->src.ip6.sin6_port), client_id, client->pid));
+               break;
+       default:
+               DEBUG(DEBUG_ERR,(__location__ " Unknown family %d\n", addr.sa.sa_family));
+       }
+
+
+       /* tell all nodes about this tcp connection */
+       ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
+                                      CTDB_CONTROL_TCP_ADD,
+                                      0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to send CTDB_CONTROL_TCP_ADD\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  search a tickle array for a connection whose source and destination
+  socket addresses both match those of *tcp.  Returns a pointer to the
+  matching array element, or NULL if array is NULL or nothing matches.
+ */
+static struct ctdb_tcp_connection *ctdb_tcp_find(struct ctdb_tcp_array *array, 
+                                          struct ctdb_tcp_connection *tcp)
+{
+       int n;
+
+       if (!array) {
+               return NULL;
+       }
+
+       for (n = 0; n < array->num; n++) {
+               struct ctdb_tcp_connection *cur = &array->connections[n];
+
+               if (!ctdb_same_sockaddr(&cur->src_addr, &tcp->src_addr)) {
+                       continue;
+               }
+               if (!ctdb_same_sockaddr(&cur->dst_addr, &tcp->dst_addr)) {
+                       continue;
+               }
+               return cur;
+       }
+
+       return NULL;
+}
+
+
+
+/*
+  called by a daemon to inform us of a TCP connection that one of its
+  clients is managing and that should be tickled with an ACK when IP
+  takeover is done.
+
+  indata carries a struct ctdb_tcp_connection.  Returns 0 when the
+  connection has been stored or was already known, and -1 when the
+  destination is not one of our public addresses.  Allocation failures
+  are handled by CTDB_NO_MEMORY().  When tcp_update_needed is true the
+  vnn is flagged with tcp_update_needed each time a new entry is stored.
+ */
+int32_t ctdb_control_tcp_add(struct ctdb_context *ctdb, TDB_DATA indata, bool tcp_update_needed)
+{
+       struct ctdb_tcp_connection *p = (struct ctdb_tcp_connection *)indata.dptr;
+       struct ctdb_tcp_array *tcparray;
+       struct ctdb_tcp_connection *grown;
+       struct ctdb_tcp_connection tcp;
+       struct ctdb_vnn *vnn;
+
+       vnn = find_public_ip_vnn(ctdb, &p->dst_addr);
+       if (vnn == NULL) {
+               DEBUG(DEBUG_INFO,(__location__ " got TCP_ADD control for an address which is not a public address '%s'\n",
+                       ctdb_addr_to_str(&p->dst_addr)));
+
+               return -1;
+       }
+
+
+       tcparray = vnn->tcp_array;
+
+       /* If this is the first tickle */
+       if (tcparray == NULL) {
+               tcparray = talloc_size(ctdb->nodes, 
+                       offsetof(struct ctdb_tcp_array, connections) +
+                       sizeof(struct ctdb_tcp_connection) * 1);
+               CTDB_NO_MEMORY(ctdb, tcparray);
+               vnn->tcp_array = tcparray;
+
+               tcparray->num = 0;
+               tcparray->connections = talloc_size(tcparray, sizeof(struct ctdb_tcp_connection));
+               CTDB_NO_MEMORY(ctdb, tcparray->connections);
+
+               tcparray->connections[tcparray->num].src_addr = p->src_addr;
+               tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
+               tcparray->num++;
+
+               if (tcp_update_needed) {
+                       vnn->tcp_update_needed = true;
+               }
+               return 0;
+       }
+
+
+       /* Do we already have this tickle ?*/
+       tcp.src_addr = p->src_addr;
+       tcp.dst_addr = p->dst_addr;
+       if (ctdb_tcp_find(vnn->tcp_array, &tcp) != NULL) {
+               DEBUG(DEBUG_DEBUG,("Already had tickle info for %s:%u for vnn:%u\n",
+                       ctdb_addr_to_str(&tcp.dst_addr),
+                       ntohs(tcp.dst_addr.ip.sin_port),
+                       vnn->pnn));
+               return 0;
+       }
+
+       /* A new tickle, we must add it to the array.  Grow through a
+        * temporary pointer: if the realloc fails the existing array
+        * must stay attached, otherwise connections would be NULL
+        * while num is still non-zero.
+        */
+       grown = talloc_realloc(tcparray, tcparray->connections,
+                              struct ctdb_tcp_connection,
+                              tcparray->num+1);
+       CTDB_NO_MEMORY(ctdb, grown);
+       tcparray->connections = grown;
+
+       tcparray->connections[tcparray->num].src_addr = p->src_addr;
+       tcparray->connections[tcparray->num].dst_addr = p->dst_addr;
+       tcparray->num++;
+
+       DEBUG(DEBUG_INFO,("Added tickle info for %s:%u from vnn %u\n",
+               ctdb_addr_to_str(&tcp.dst_addr),
+               ntohs(tcp.dst_addr.ip.sin_port),
+               vnn->pnn));
+
+       if (tcp_update_needed) {
+               vnn->tcp_update_needed = true;
+       }
+
+       return 0;
+}
+
+
+/*
+  drop a TCP connection from the tickle array of the public address it
+  was made to.  Apart from logging, nothing happens if the destination
+  is not a public address or the connection is not in the array.
+ */
+static void ctdb_remove_tcp_connection(struct ctdb_context *ctdb, struct ctdb_tcp_connection *conn)
+{
+       struct ctdb_tcp_array *array;
+       struct ctdb_tcp_connection *match;
+       struct ctdb_vnn *vnn;
+
+       vnn = find_public_ip_vnn(ctdb, &conn->dst_addr);
+       if (!vnn) {
+               DEBUG(DEBUG_ERR,(__location__ " unable to find public address %s\n",
+                       ctdb_addr_to_str(&conn->dst_addr)));
+               return;
+       }
+
+       /* No array at all means there is nothing to remove */
+       array = vnn->tcp_array;
+       if (!array) {
+               DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist (array is empty) %s:%u\n",
+                       ctdb_addr_to_str(&conn->dst_addr),
+                       ntohs(conn->dst_addr.ip.sin_port)));
+               return;
+       }
+
+       /* An unknown connection is not an error either */
+       match = ctdb_tcp_find(array, conn);
+       if (!match) {
+               DEBUG(DEBUG_INFO,("Trying to remove tickle that doesnt exist %s:%u\n",
+                       ctdb_addr_to_str(&conn->dst_addr),
+                       ntohs(conn->dst_addr.ip.sin_port)));
+               return;
+       }
+
+       /* Rather than building a new array, overwrite the doomed slot
+          with the last entry and shrink the element count by one.
+        */
+       array->num--;
+       *match = array->connections[array->num];
+
+       /* The last entry is gone, so drop the whole array as well */
+       if (array->num == 0) {
+               talloc_free(array);
+               vnn->tcp_array = NULL;
+       }
+
+       vnn->tcp_update_needed = true;
+
+       DEBUG(DEBUG_INFO,("Removed tickle info for %s:%u\n",
+               ctdb_addr_to_str(&conn->src_addr),
+               ntohs(conn->src_addr.ip.sin_port)));
+}
+
+
+/*
+  control handler: a daemon tells us that a TCP connection used by one
+  of its clients no longer needs an entry in the tickle database.
+  indata carries a struct ctdb_tcp_connection.  Always returns 0.
+ */
+int32_t ctdb_control_tcp_remove(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+       ctdb_remove_tcp_connection(ctdb,
+                                  (struct ctdb_tcp_connection *)indata.dptr);
+       return 0;
+}
+
+
+/*
+  called when a daemon restarts - intended to send all tickles for all
+  public addresses we are serving immediately to the new node.
+
+  Currently a stub: the vnn argument is unused and 0 is always returned.
+ */
+int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn)
+{
+/*XXX here we should send all tickles we are serving to the new node */
+       return 0;
+}
+
+
+/*
+  called when a client structure goes away - empties client->tcp_list,
+  removing each connection from the tickle array of its public address
+ */
+void ctdb_takeover_client_destructor_hook(struct ctdb_client *client)
+{
+       struct ctdb_tcp_list *entry;
+
+       /* Always take the current list head until the list is empty */
+       for (entry = client->tcp_list; entry != NULL; entry = client->tcp_list) {
+               DLIST_REMOVE(client->tcp_list, entry);
+               ctdb_remove_tcp_connection(client->ctdb, &entry->connection);
+       }
+}
+
+
+/*
+  release all IPs on shutdown
+
+  For every public address that ctdb_sys_have_ip() reports and that
+  has an interface assigned, run the "releaseip" event, kill the
+  clients using the address and unassign the interface.  The number
+  of addresses released is logged at the end.
+ */
+void ctdb_release_all_ips(struct ctdb_context *ctdb)
+{
+       struct ctdb_vnn *vnn;
+       int released = 0;
+
+       for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
+               if (!ctdb_sys_have_ip(&vnn->public_address)) {
+                       /* ctdb_sys_have_ip() says no: only drop the
+                        * interface assignment
+                        */
+                       ctdb_vnn_unassign_iface(ctdb, vnn);
+               } else if (vnn->iface) {
+                       DEBUG(DEBUG_INFO,("Release of IP %s/%u on interface %s node:-1\n",
+                                           ctdb_addr_to_str(&vnn->public_address),
+                                           vnn->public_netmask_bits,
+                                           ctdb_vnn_iface_string(vnn)));
+
+                       ctdb_event_script_args(ctdb, CTDB_EVENT_RELEASE_IP, "%s %s %u",
+                                         ctdb_vnn_iface_string(vnn),
+                                         ctdb_addr_to_str(&vnn->public_address),
+                                         vnn->public_netmask_bits);
+                       release_kill_clients(ctdb, &vnn->public_address);
+                       ctdb_vnn_unassign_iface(ctdb, vnn);
+                       released++;
+               }
+       }
+
+       DEBUG(DEBUG_NOTICE,(__location__ " Released %d public IPs\n", released));
+}
+
+
+/*
+  get list of public IPs
+
+  Builds a struct ctdb_all_public_ips in *outdata holding the pnn and
+  address of each public IP.  When the request carries
+  CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE, addresses for which
+  ctdb_vnn_available() is false are left out.  Returns 0 on success.
+ */
+int32_t ctdb_control_get_public_ips(struct ctdb_context *ctdb, 
+                                   struct ctdb_req_control *c, TDB_DATA *outdata)
+{
+       const bool only_available =
+               (c->flags & CTDB_PUBLIC_IP_FLAGS_ONLY_AVAILABLE) != 0;
+       struct ctdb_all_public_ips *ips;
+       struct ctdb_vnn *vnn;
+       int total = 0;
+       int used = 0;
+       int len;
+
+       /* Size the reply for the worst case: every public IP */
+       for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
+               total++;
+       }
+
+       len = offsetof(struct ctdb_all_public_ips, ips) + 
+               total*sizeof(struct ctdb_public_ip);
+       ips = talloc_zero_size(outdata, len);
+       CTDB_NO_MEMORY(ctdb, ips);
+
+       for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
+               if (only_available && !ctdb_vnn_available(ctdb, vnn)) {
+                       continue;
+               }
+               ips->ips[used].pnn  = vnn->pnn;
+               ips->ips[used].addr = vnn->public_address;
+               used++;
+       }
+       ips->num = used;
+
+       /* Report only the part of the buffer that was filled in */
+       len = offsetof(struct ctdb_all_public_ips, ips) +
+               used*sizeof(struct ctdb_public_ip);
+
+       outdata->dptr  = (uint8_t *)ips;
+       outdata->dsize = len;
+
+       return 0;
+}
+
+
+/*
+  get list of public IPs, old ipv4 style.  only returns ipv4 addresses
+
+  Builds a struct ctdb_all_public_ipsv4 in *outdata with one entry for
+  each public address whose family is AF_INET.  Returns 0 on success.
+ */
+int32_t ctdb_control_get_public_ipsv4(struct ctdb_context *ctdb, 
+                                   struct ctdb_req_control *c, TDB_DATA *outdata)
+{
+       struct ctdb_all_public_ipsv4 *ips;
+       struct ctdb_vnn *vnn;
+       int count = 0;
+       int slot = 0;
+       int len;
+
+       /* First pass: count the IPv4 public addresses */
+       for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
+               if (vnn->public_address.sa.sa_family == AF_INET) {
+                       count++;
+               }
+       }
+
+       len = offsetof(struct ctdb_all_public_ipsv4, ips) + 
+               count*sizeof(struct ctdb_public_ipv4);
+       ips = talloc_zero_size(outdata, len);
+       CTDB_NO_MEMORY(ctdb, ips);
+
+       ips->num = count;
+
+       /* Second pass: copy them into the reply */
+       for (vnn = ctdb->vnn; vnn != NULL; vnn = vnn->next) {
+               if (vnn->public_address.sa.sa_family == AF_INET) {
+                       ips->ips[slot].pnn = vnn->pnn;
+                       ips->ips[slot].sin = vnn->public_address.ip;
+                       slot++;
+               }
+       }
+
+       outdata->dptr  = (uint8_t *)ips;
+       outdata->dsize = len;
+
+       return 0;
+}
+
+/*
+  get information about one public IP and the interfaces it may use
+
+  indata carries a ctdb_sock_addr.  The address is looked up among the
+  public addresses and, failing that, compared with the 'single ip'.
+  On success *outdata receives a struct ctdb_control_public_ip_info
+  with one entry per configured interface; active_idx is the index of
+  the interface currently assigned, or 0xFFFFFFFF if there is none.
+
+  Returns 0 on success, -1 if the address is unknown or one of its
+  interfaces cannot be found.
+ */
+int32_t ctdb_control_get_public_ip_info(struct ctdb_context *ctdb,
+                                       struct ctdb_req_control *c,
+                                       TDB_DATA indata,
+                                       TDB_DATA *outdata)
+{
+       int i, num, len;
+       ctdb_sock_addr *addr;
+       struct ctdb_control_public_ip_info *info;
+       struct ctdb_vnn *vnn;
+
+       addr = (ctdb_sock_addr *)indata.dptr;
+
+       vnn = find_public_ip_vnn(ctdb, addr);
+       if (vnn == NULL) {
+               /* if it is not a public ip   it could be our 'single ip' */
+               if (ctdb->single_ip_vnn) {
+                       if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, addr)) {
+                               vnn = ctdb->single_ip_vnn;
+                       }
+               }
+       }
+       if (vnn == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Could not get public ip info, "
+                                "'%s'not a public address\n",
+                                ctdb_addr_to_str(addr)));
+               return -1;
+       }
+
+       /* count how many interfaces are configured for this address
+        * (vnn->ifaces is a NULL-terminated array)
+        */
+       num = 0;
+       for (;vnn->ifaces[num];) {
+               num++;
+       }
+
+       len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
+               num*sizeof(struct ctdb_control_iface_info);
+       info = talloc_zero_size(outdata, len);
+       CTDB_NO_MEMORY(ctdb, info);
+
+       info->ip.addr = vnn->public_address;
+       info->ip.pnn = vnn->pnn;
+       info->active_idx = 0xFFFFFFFF;
+
+       for (i=0; vnn->ifaces[i]; i++) {
+               struct ctdb_iface *cur;
+
+               cur = ctdb_find_iface(ctdb, vnn->ifaces[i]);
+               if (cur == NULL) {
+                       DEBUG(DEBUG_CRIT, (__location__ " internal error iface[%s] unknown\n",
+                                          vnn->ifaces[i]));
+                       /* Do not leave the partly built reply
+                        * attached to outdata
+                        */
+                       talloc_free(info);
+                       return -1;
+               }
+               if (vnn->iface == cur) {
+                       info->active_idx = i;
+               }
+               /* Bounded copy: name is a fixed-size field, so an
+                * over-long interface name is truncated instead of
+                * overrunning it.
+                */
+               strncpy(info->ifaces[i].name, cur->name,
+                       sizeof(info->ifaces[i].name) - 1);
+               info->ifaces[i].name[sizeof(info->ifaces[i].name) - 1] = '\0';
+               info->ifaces[i].link_state = cur->link_up;
+               info->ifaces[i].references = cur->references;
+       }
+       info->num = i;
+       len = offsetof(struct ctdb_control_public_ip_info, ifaces) +
+               i*sizeof(struct ctdb_control_iface_info);
+
+       outdata->dsize = len;
+       outdata->dptr  = (uint8_t *)info;
+
+       return 0;
+}
+
+/*
+  return the name, link state and reference count of every interface on
+  the ctdb->ifaces list.
+
+  on success outdata is filled with a struct ctdb_control_get_ifaces
+  allocated as a talloc child of outdata and 0 is returned.
+ */
+int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
+                               struct ctdb_req_control *c,
+                               TDB_DATA *outdata)
+{
+       int i, num, len;
+       struct ctdb_control_get_ifaces *ifaces;
+       struct ctdb_iface *cur;
+
+       /* count how many interfaces we have */
+       num = 0;
+       for (cur=ctdb->ifaces;cur;cur=cur->next) {
+               num++;
+       }
+
+       len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
+               num*sizeof(struct ctdb_control_iface_info);
+       ifaces = talloc_zero_size(outdata, len);
+       CTDB_NO_MEMORY(ctdb, ifaces);
+
+       i = 0;
+       for (cur=ctdb->ifaces;cur;cur=cur->next) {
+               /* bounded copy: the reply was zeroed by talloc_zero_size(), so
+                  leaving the last byte untouched keeps the name terminated
+                  and an over-long name can not run into the next entry */
+               strncpy(ifaces->ifaces[i].name, cur->name,
+                       sizeof(ifaces->ifaces[i].name) - 1);
+               ifaces->ifaces[i].link_state = cur->link_up;
+               ifaces->ifaces[i].references = cur->references;
+               i++;
+       }
+       ifaces->num = i;
+       len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
+               i*sizeof(struct ctdb_control_iface_info);
+
+       outdata->dsize = len;
+       outdata->dptr  = (uint8_t *)ifaces;
+
+       return 0;
+}
+
+/*
+  control handler: record a new link state for one local interface.
+
+  indata.dptr is read as a struct ctdb_control_iface_info.  the request is
+  rejected with -1 when the name is not terminated at index CTDB_IFACE_SIZE,
+  when link_state is neither 0 nor 1, when references is not 0, or when
+  ctdb_find_iface() does not know the name.  returns 0 otherwise, including
+  when the state is already the requested one.
+ */
+int32_t ctdb_control_set_iface_link(struct ctdb_context *ctdb,
+                                   struct ctdb_req_control *c,
+                                   TDB_DATA indata)
+{
+       struct ctdb_control_iface_info *info =
+               (struct ctdb_control_iface_info *)indata.dptr;
+       struct ctdb_iface *iface;
+       bool link_up = false;
+
+       if (info->name[CTDB_IFACE_SIZE] != '\0') {
+               /* only print the part of the name we know is inside the field */
+               int len = strnlen(info->name, CTDB_IFACE_SIZE);
+               DEBUG(DEBUG_ERR, (__location__ " name[%*.*s] not terminated\n",
+                                 len, len, info->name));
+               return -1;
+       }
+
+       /* only the two exact values 0 (down) and 1 (up) are accepted */
+       if (info->link_state == 0) {
+               link_up = false;
+       } else if (info->link_state == 1) {
+               link_up = true;
+       } else {
+               DEBUG(DEBUG_ERR, (__location__ " link_state[%u] invalid\n",
+                                 (unsigned int)info->link_state));
+               return -1;
+       }
+
+       if (info->references != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " references[%u] should be 0\n",
+                                 (unsigned int)info->references));
+               return -1;
+       }
+
+       iface = ctdb_find_iface(ctdb, info->name);
+       if (iface == NULL) {
+               return -1;
+       }
+
+       if (link_up != iface->link_up) {
+               /* a link going down is logged as an error, a link coming
+                  up only as a notice */
+               DEBUG(iface->link_up?DEBUG_ERR:DEBUG_NOTICE,
+                     ("iface[%s] has changed it's link status %s => %s\n",
+                      iface->name,
+                      iface->link_up?"up":"down",
+                      link_up?"up":"down"));
+
+               iface->link_up = link_up;
+       }
+
+       return 0;
+}
+
+
+/* 
+   structure containing the listening socket and the list of tcp connections
+   that the ctdb daemon is to kill.
+   one of these hangs off a vnn (vnn->killtcp) while connections on that
+   vnn are being killed.
+*/
+struct ctdb_kill_tcp {
+       /* the vnn this structure was created for */
+       struct ctdb_vnn *vnn;
+       struct ctdb_context *ctdb;
+       /* capture socket from ctdb_sys_open_capture_socket(), -1 until opened */
+       int capture_fd;
+       /* read event on capture_fd, handled by capture_tcp_handler() */
+       struct fd_event *fde;
+       /* tree of struct ctdb_killtcp_con, keyed by killtcp_key() */
+       trbt_tree_t *connections;
+       /* opaque value filled in by ctdb_sys_open_capture_socket() and handed
+          back to ctdb_sys_read_tcp_packet() */
+       void *private_data;
+};
+
+/*
+  a tcp connection that is to be killed
+ */
+struct ctdb_killtcp_con {
+       ctdb_sock_addr src_addr;
+       ctdb_sock_addr dst_addr;
+       /* number of tickles sent so far by tickle_connection_traverse();
+          the connection is given up on once this reaches 5 */
+       int count;
+       /* back pointer to the owning kill list */
+       struct ctdb_kill_tcp *killtcp;
+};
+
+/* this function is used to create a key to represent this socketpair
+   in the killtcp tree.
+   this key is used to insert and lookup matching socketpairs that are
+   to be tickled and RST.
+
+   the returned pointer refers to a static buffer that is overwritten by
+   every call, so the key must be consumed before the next call.
+   if the two addresses are of different families, or of a family other
+   than AF_INET/AF_INET6, an error is logged and an all-zero key is returned.
+*/
+#define KILLTCP_KEYLEN 10
+static uint32_t *killtcp_key(ctdb_sock_addr *src, ctdb_sock_addr *dst)
+{
+       static uint32_t key[KILLTCP_KEYLEN];
+
+       /* unused words stay zero, so an ipv4 key only fills key[0..3] */
+       bzero(key, sizeof(key));
+
+       if (src->sa.sa_family != dst->sa.sa_family) {
+               DEBUG(DEBUG_ERR, (__location__ " ERROR, different families passed :%u vs %u\n", src->sa.sa_family, dst->sa.sa_family));
+               return key;
+       }
+       
+       switch (src->sa.sa_family) {
+       case AF_INET:
+               key[0]  = dst->ip.sin_addr.s_addr;
+               key[1]  = src->ip.sin_addr.s_addr;
+               key[2]  = dst->ip.sin_port;
+               key[3]  = src->ip.sin_port;
+               break;
+       case AF_INET6: {
+               /* view each 16 byte address as four 32 bit words */
+               uint32_t *dst6_addr32 =
+                       (uint32_t *)&(dst->ip6.sin6_addr.s6_addr);
+               uint32_t *src6_addr32 =
+                       (uint32_t *)&(src->ip6.sin6_addr.s6_addr);
+               /* last address word first, dst and src interleaved */
+               key[0]  = dst6_addr32[3];
+               key[1]  = src6_addr32[3];
+               key[2]  = dst6_addr32[2];
+               key[3]  = src6_addr32[2];
+               key[4]  = dst6_addr32[1];
+               key[5]  = src6_addr32[1];
+               key[6]  = dst6_addr32[0];
+               key[7]  = src6_addr32[0];
+               key[8]  = dst->ip6.sin6_port;
+               key[9]  = src->ip6.sin6_port;
+               break;
+       }
+       default:
+               DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", src->sa.sa_family));
+               return key;
+       }
+
+       return key;
+}
+
+/*
+  fd event handler for the capture socket.
+  reads one packet; if it belongs to a connection on our kill list the
+  connection is sent a tcp reset and dropped from the list.
+ */
+static void capture_tcp_handler(struct event_context *ev, struct fd_event *fde, 
+                               uint16_t flags, void *private_data)
+{
+       struct ctdb_kill_tcp *kt = talloc_get_type(private_data, struct ctdb_kill_tcp);
+       struct ctdb_killtcp_con *victim;
+       ctdb_sock_addr pkt_src, pkt_dst;
+       uint32_t ack_seq, seq;
+       int rc;
+
+       /* only readable events are of interest */
+       if ((flags & EVENT_FD_READ) == 0) {
+               return;
+       }
+
+       rc = ctdb_sys_read_tcp_packet(kt->capture_fd,
+                                     kt->private_data,
+                                     &pkt_src, &pkt_dst,
+                                     &ack_seq, &seq);
+       if (rc != 0) {
+               /* probably a non-tcp ACK packet */
+               return;
+       }
+
+       /* is this socketpair one of the connections we were asked to kill? */
+       victim = trbt_lookuparray32(kt->connections,
+                                   KILLTCP_KEYLEN,
+                                   killtcp_key(&pkt_src, &pkt_dst));
+       if (victim == NULL) {
+               /* some unrelated packet, nothing to do */
+               return;
+       }
+
+       /* the tickle was answered: reset the connection and forget it */
+       DEBUG(DEBUG_INFO, ("sending a tcp reset to kill connection :%d -> %s:%d\n",
+               ntohs(victim->dst_addr.ip.sin_port),
+               ctdb_addr_to_str(&victim->src_addr),
+               ntohs(victim->src_addr.ip.sin_port)));
+
+       ctdb_sys_send_tcp(&victim->dst_addr, &victim->src_addr, ack_seq, seq, 1);
+       talloc_free(victim);
+}
+
+
+/* tree traverse callback, run once per connection we are trying to kill.
+   sends another tickle ack (so that the ack coming back can be captured
+   and answered with a RST) unless the connection has already been tickled
+   5 times, in which case it is handed to param for later deletion.
+   always returns 0.
+*/
+static int tickle_connection_traverse(void *param, void *data)
+{
+       struct ctdb_killtcp_con *entry = talloc_get_type(data, struct ctdb_killtcp_con);
+
+       if (entry->count < 5) {
+               /* still worth another attempt */
+               entry->count++;
+               ctdb_sys_send_tcp(&entry->dst_addr, &entry->src_addr, 0, 0, 0);
+       } else {
+               /* tried too many times, give up.
+                  can't delete in traverse: reparent to delete_cons */
+               talloc_steal(param, entry);
+       }
+
+       return 0;
+}
+
+
+/* 
+   timed event: tickle every connection on the kill list once more.
+   re-arms itself one second later while connections remain, otherwise
+   frees the whole killtcp structure.
+ */
+static void ctdb_tickle_sentenced_connections(struct event_context *ev, struct timed_event *te, 
+                                             struct timeval t, void *private_data)
+{
+       struct ctdb_kill_tcp *kt = talloc_get_type(private_data, struct ctdb_kill_tcp);
+       void *expired = talloc_new(NULL);
+
+       /* send a tickle ACK for each connection; the ones we give up on
+          get reparented onto 'expired' by the traverse callback */
+       trbt_traversearray32(kt->connections, KILLTCP_KEYLEN, tickle_connection_traverse, expired);
+
+       /* the traverse is over, so deleting is safe now */
+       talloc_free(expired);
+
+       if ( (kt->connections != NULL) &&
+            (kt->connections->root != NULL) ) {
+               /* still something left to kill, try again in a second */
+               event_add_timed(kt->ctdb->ev, kt, timeval_current_ofs(1, 0),
+                               ctdb_tickle_sentenced_connections, kt);
+               return;
+       }
+
+       /* nothing left to kill: the entire killtcp structure can go */
+       talloc_free(kt);
+}
+
+/*
+  talloc destructor for the killtcp structure.
+  clears vnn->killtcp, but only when the vnn is still on ctdb's vnn list
+  and still points at this structure.  always returns 0.
+ */
+static int ctdb_killtcp_destructor(struct ctdb_kill_tcp *killtcp)
+{
+       struct ctdb_vnn *cur;
+
+       for (cur = killtcp->ctdb->vnn; cur != NULL; cur = cur->next) {
+               if (cur != killtcp->vnn) {
+                       continue;
+               }
+
+               /* the vnn is still active; drop its reference to us
+                  unless it has moved on to another structure */
+               if (cur->killtcp == killtcp) {
+                       cur->killtcp = NULL;
+               }
+               break;
+       }
+
+       return 0;
+}
+
+
+/* nothing fancy here, just unconditionally replace any existing
+   connection structure with the new one.
+
+   don't even free the old one if it did exist, that one is talloc_stolen
+   by the same node in the tree anyway and will be deleted when the new data 
+   is deleted.
+
+   parm is the new connection structure, data the one already stored (if
+   any); the return value is what ends up stored in the tree.
+*/
+static void *add_killtcp_callback(void *parm, void *data)
+{
+       return parm;
+}
+
+/*
+  add a tcp socket to the list of connections we want to RST.
+
+  s and d are the two endpoints of the connection; one of them has to be a
+  public address of this node (or d our 'single ip').  the first
+  connection for a vnn also creates the kill list, opens the capture
+  socket and starts the once-a-second tickle timer.
+
+  returns 0 on success and -1 on failure.
+ */
+static int ctdb_killtcp_add_connection(struct ctdb_context *ctdb, 
+                                      ctdb_sock_addr *s,
+                                      ctdb_sock_addr *d)
+{
+       ctdb_sock_addr src, dst;
+       struct ctdb_kill_tcp *killtcp;
+       struct ctdb_killtcp_con *con;
+       struct ctdb_vnn *vnn;
+
+       ctdb_canonicalize_ip(s, &src);
+       ctdb_canonicalize_ip(d, &dst);
+
+       vnn = find_public_ip_vnn(ctdb, &dst);
+       if (vnn == NULL) {
+               vnn = find_public_ip_vnn(ctdb, &src);
+       }
+       if (vnn == NULL) {
+               /* if it is not a public ip   it could be our 'single ip' */
+               if (ctdb->single_ip_vnn) {
+                       if (ctdb_same_ip(&ctdb->single_ip_vnn->public_address, &dst)) {
+                               vnn = ctdb->single_ip_vnn;
+                       }
+               }
+       }
+       if (vnn == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Could not killtcp, not a public address\n")); 
+               return -1;
+       }
+
+       killtcp = vnn->killtcp;
+       
+       /* If this is the first connection to kill we must allocate
+          a new structure
+        */
+       if (killtcp == NULL) {
+               killtcp = talloc_zero(vnn, struct ctdb_kill_tcp);
+               CTDB_NO_MEMORY(ctdb, killtcp);
+
+               killtcp->vnn         = vnn;
+               killtcp->ctdb        = ctdb;
+               killtcp->capture_fd  = -1;
+               killtcp->connections = trbt_create(killtcp, 0);
+
+               vnn->killtcp         = killtcp;
+               talloc_set_destructor(killtcp, ctdb_killtcp_destructor);
+       }
+
+
+
+       /* create a structure that describes this connection we want to
+          RST and store it in killtcp->connections
+       */
+       con = talloc(killtcp, struct ctdb_killtcp_con);
+       CTDB_NO_MEMORY(ctdb, con);
+       con->src_addr = src;
+       con->dst_addr = dst;
+       con->count    = 0;
+       con->killtcp  = killtcp;
+
+
+       /* the key is built with the endpoints swapped because the packet
+          captured in capture_tcp_handler() travels in the other direction */
+       trbt_insertarray32_callback(killtcp->connections,
+                       KILLTCP_KEYLEN, killtcp_key(&con->dst_addr, &con->src_addr),
+                       add_killtcp_callback, con);
+
+       /* 
+          If we don't have a socket to listen on yet we must create it
+        */
+       if (killtcp->capture_fd == -1) {
+               const char *iface = ctdb_vnn_iface_string(vnn);
+               killtcp->capture_fd = ctdb_sys_open_capture_socket(iface, &killtcp->private_data);
+               if (killtcp->capture_fd == -1) {
+                       DEBUG(DEBUG_CRIT,(__location__ " Failed to open capturing "
+                                         "socket on iface '%s' for killtcp (%s)\n",
+                                         iface, strerror(errno)));
+                       goto failed;
+               }
+       }
+
+
+       if (killtcp->fde == NULL) {
+               killtcp->fde = event_add_fd(ctdb->ev, killtcp, killtcp->capture_fd, 
+                                           EVENT_FD_READ,
+                                           capture_tcp_handler, killtcp);
+               if (killtcp->fde == NULL) {
+                       /* must not hand a NULL fde to
+                          tevent_fd_set_auto_close().  drop only this
+                          connection; the kill list and its open capture
+                          socket stay on the vnn, so the next request
+                          gets to retry the registration */
+                       DEBUG(DEBUG_CRIT,(__location__ " Failed to add fd event "
+                                         "for killtcp capture socket\n"));
+                       talloc_free(con);
+                       return -1;
+               }
+               tevent_fd_set_auto_close(killtcp->fde);
+
+               /* We also need to set up some events to tickle all these connections
+                  until they are all reset
+               */
+               event_add_timed(ctdb->ev, killtcp, timeval_current_ofs(1, 0), 
+                               ctdb_tickle_sentenced_connections, killtcp);
+       }
+
+       /* tickle him once now */
+       ctdb_sys_send_tcp(
+               &con->dst_addr,
+               &con->src_addr,
+               0, 0, 0);
+
+       return 0;
+
+failed:
+       /* no capture socket: throw away the whole kill list for this vnn */
+       talloc_free(vnn->killtcp);
+       vnn->killtcp = NULL;
+       return -1;
+}
+
+/*
+  control handler: kill a TCP connection.
+  indata.dptr is read as a struct ctdb_control_killtcp; the result of
+  ctdb_killtcp_add_connection() is returned unchanged.
+ */
+int32_t ctdb_control_kill_tcp(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+       struct ctdb_control_killtcp *req;
+
+       req = (struct ctdb_control_killtcp *)indata.dptr;
+
+       return ctdb_killtcp_add_connection(ctdb, &req->src_addr, &req->dst_addr);
+}
+
+/*
+  called by a daemon to inform us of the entire list of TCP tickles for
+  a particular public address.
+  this control should only be sent by the node that is currently serving
+  that public address.
+
+  returns 0 when the list was stored, 1 when the address is not one of our
+  public addresses and -1 on malformed input or allocation failure.  the
+  previously stored list is only replaced once the new one has been built.
+ */
+int32_t ctdb_control_set_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+       struct ctdb_control_tcp_tickle_list *list = (struct ctdb_control_tcp_tickle_list *)indata.dptr;
+       struct ctdb_tcp_array *tcparray;
+       struct ctdb_vnn *vnn;
+
+       /* We must at least have tickles.num or else we can't verify the size
+          of the received data blob
+        */
+       if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
+                                       tickles.connections)) {
+               DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list. Not enough data for the tickle.num field\n"));
+               return -1;
+       }
+
+       /* verify that the size of data matches what we expect */
+       if (indata.dsize < offsetof(struct ctdb_control_tcp_tickle_list, 
+                               tickles.connections)
+                        + sizeof(struct ctdb_tcp_connection)
+                                * list->tickles.num) {
+               DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tcp_tickle_list\n"));
+               return -1;
+       }       
+
+       vnn = find_public_ip_vnn(ctdb, &list->addr);
+       if (vnn == NULL) {
+               DEBUG(DEBUG_INFO,(__location__ " Could not set tcp tickle list, '%s' is not a public address\n", 
+                       ctdb_addr_to_str(&list->addr)));
+
+               return 1;
+       }
+
+       /* build the new list first so that an allocation failure leaves
+          the list we already have untouched */
+       tcparray = talloc(ctdb->nodes, struct ctdb_tcp_array);
+       CTDB_NO_MEMORY(ctdb, tcparray);
+
+       tcparray->num = list->tickles.num;
+
+       tcparray->connections = talloc_array(tcparray, struct ctdb_tcp_connection, tcparray->num);
+       if (tcparray->connections == NULL) {
+               /* don't leave the half built array hanging off ctdb->nodes */
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory for tcp tickle connections\n"));
+               talloc_free(tcparray);
+               return -1;
+       }
+
+       memcpy(tcparray->connections, &list->tickles.connections[0], 
+              sizeof(struct ctdb_tcp_connection)*tcparray->num);
+
+       /* remove any old ticklelist we might have */
+       talloc_free(vnn->tcp_array);
+
+       /* We now have a new fresh tickle list array for this vnn */
+       vnn->tcp_array = talloc_steal(vnn, tcparray);
+       
+       return 0;
+}
+
+/*
+  called to return the full list of tickles for the public address passed
+  in indata.
+
+  returns 0 with outdata holding a struct ctdb_control_tcp_tickle_list
+  (an empty one when no tickle list is stored for the address), or 1 when
+  the address is not a public address.
+ */
+int32_t ctdb_control_get_tcp_tickle_list(struct ctdb_context *ctdb, TDB_DATA indata, TDB_DATA *outdata)
+{
+       ctdb_sock_addr *wanted = (ctdb_sock_addr *)indata.dptr;
+       struct ctdb_control_tcp_tickle_list *reply;
+       struct ctdb_tcp_array *stored;
+       struct ctdb_vnn *vnn;
+       int count = 0;
+
+       vnn = find_public_ip_vnn(ctdb, wanted);
+       if (vnn == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Could not get tcp tickle list, '%s' is not a public address\n", 
+                       ctdb_addr_to_str(wanted)));
+
+               return 1;
+       }
+
+       /* no stored array simply means zero connections */
+       stored = vnn->tcp_array;
+       if (stored != NULL) {
+               count = stored->num;
+       }
+
+       /* fixed header followed by 'count' connection entries */
+       outdata->dsize = offsetof(struct ctdb_control_tcp_tickle_list,
+                               tickles.connections)
+                       + sizeof(struct ctdb_tcp_connection) * count;
+
+       outdata->dptr  = talloc_size(outdata, outdata->dsize);
+       CTDB_NO_MEMORY(ctdb, outdata->dptr);
+       reply = (struct ctdb_control_tcp_tickle_list *)outdata->dptr;
+
+       reply->addr = *wanted;
+       reply->tickles.num = count;
+       if (count != 0) {
+               memcpy(&reply->tickles.connections[0], stored->connections,
+                       sizeof(struct ctdb_tcp_connection) * count);
+       }
+
+       return 0;
+}
+
+
+/*
+  set the list of all tcp tickles for a public address.
+
+  packs addr and tcparray (which may be NULL, meaning no connections) into
+  a struct ctdb_control_tcp_tickle_list and sends it as a no-reply
+  CTDB_CONTROL_SET_TCP_TICKLE_LIST control to CTDB_BROADCAST_CONNECTED.
+  the timeout and destnode arguments are not used by this function.
+
+  returns 0 on success and -1 on failure.
+ */
+static int ctdb_ctrl_set_tcp_tickles(struct ctdb_context *ctdb, 
+                             struct timeval timeout, uint32_t destnode, 
+                             ctdb_sock_addr *addr,
+                             struct ctdb_tcp_array *tcparray)
+{
+       int ret, num;
+       TDB_DATA data;
+       struct ctdb_control_tcp_tickle_list *list;
+
+       if (tcparray) {
+               num = tcparray->num;
+       } else {
+               num = 0;
+       }
+
+       data.dsize = offsetof(struct ctdb_control_tcp_tickle_list, 
+                               tickles.connections) +
+                       sizeof(struct ctdb_tcp_connection) * num;
+       data.dptr = talloc_size(ctdb, data.dsize);
+       CTDB_NO_MEMORY(ctdb, data.dptr);
+
+       list = (struct ctdb_control_tcp_tickle_list *)data.dptr;
+       list->addr = *addr;
+       list->tickles.num = num;
+       if (tcparray) {
+               memcpy(&list->tickles.connections[0], tcparray->connections, sizeof(struct ctdb_tcp_connection) * num);
+       }
+
+       ret = ctdb_daemon_send_control(ctdb, CTDB_BROADCAST_CONNECTED, 0, 
+                                      CTDB_CONTROL_SET_TCP_TICKLE_LIST,
+                                      0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
+
+       /* release our buffer whether or not the send worked, so that the
+          error return below does not leak it onto ctdb */
+       talloc_free(data.dptr);
+
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " ctdb_control for set tcp tickles failed\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+
+/*
+  perform tickle updates if required.
+
+  timed event: for every public address held by this node whose
+  tcp_update_needed flag is set, broadcast the tickle list and clear the
+  flag once the send has succeeded.  re-arms itself after
+  tunable.tickle_update_interval seconds.
+ */
+static void ctdb_update_tcp_tickles(struct event_context *ev, 
+                               struct timed_event *te, 
+                               struct timeval t, void *private_data)
+{
+       struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+       int ret;
+       struct ctdb_vnn *vnn;
+
+       for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
+               /* we only send out updates for public addresses that 
+                  we have taken over
+                */
+               if (ctdb->pnn != vnn->pnn) {
+                       continue;
+               }
+               /* We only send out the updates if we need to */
+               if (!vnn->tcp_update_needed) {
+                       continue;
+               }
+               ret = ctdb_ctrl_set_tcp_tickles(ctdb, 
+                               TAKEOVER_TIMEOUT(),
+                               CTDB_BROADCAST_CONNECTED,
+                               &vnn->public_address,
+                               vnn->tcp_array);
+               if (ret != 0) {
+                       /* leave the flag set so the update is retried on
+                          the next run */
+                       DEBUG(DEBUG_ERR,("Failed to send the tickle update for public address %s\n",
+                               ctdb_addr_to_str(&vnn->public_address)));
+               } else {
+                       /* the update is out; without this the same list
+                          would be broadcast again on every interval */
+                       vnn->tcp_update_needed = false;
+               }
+       }
+
+       event_add_timed(ctdb->ev, ctdb->tickle_update_context,
+                            timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0), 
+                            ctdb_update_tcp_tickles, ctdb);
+}              
+       
+
+/*
+  start periodic update of tcp tickles.
+  creates ctdb->tickle_update_context and schedules the first run of
+  ctdb_update_tcp_tickles() tunable.tickle_update_interval seconds from now.
+ */
+void ctdb_start_tcp_tickle_update(struct ctdb_context *ctdb)
+{
+       struct timeval first_run;
+
+       /* the timed event is allocated on this context */
+       ctdb->tickle_update_context = talloc_new(ctdb);
+
+       first_run = timeval_current_ofs(ctdb->tunable.tickle_update_interval, 0);
+       event_add_timed(ctdb->ev, ctdb->tickle_update_context,
+                       first_run,
+                       ctdb_update_tcp_tickles, ctdb);
+}
+
+
+
+
+/* state for one series of gratuitous arps, see send_gratious_arp() */
+struct control_gratious_arp {
+       struct ctdb_context *ctdb;
+       /* the address to announce */
+       ctdb_sock_addr addr;
+       /* name of the interface to send on */
+       const char *iface;
+       /* number of arps sent so far */
+       int count;
+};
+
+/*
+  timed event: send one gratuitous arp.
+  reschedules itself every CTDB_ARP_INTERVAL seconds and frees its state
+  once CTDB_ARP_REPEAT arps have been attempted.
+ */
+static void send_gratious_arp(struct event_context *ev, struct timed_event *te, 
+                                 struct timeval t, void *private_data)
+{
+       struct control_gratious_arp *arp = talloc_get_type(private_data, 
+                                                       struct control_gratious_arp);
+
+       /* a failed send is only logged, it still counts as an attempt */
+       if (ctdb_sys_send_arp(&arp->addr, arp->iface) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " sending of gratious arp on iface '%s' failed (%s)\n",
+                                arp->iface, strerror(errno)));
+       }
+
+       arp->count += 1;
+       if (arp->count != CTDB_ARP_REPEAT) {
+               /* more to send: come back after the arp interval */
+               event_add_timed(arp->ctdb->ev, arp,
+                               timeval_current_ofs(CTDB_ARP_INTERVAL, 0),
+                               send_gratious_arp, arp);
+               return;
+       }
+
+       /* that was the last one */
+       talloc_free(arp);
+}
+
+
+/*
+  control handler: send a gratuitous arp.
+
+  indata.dptr is read as a struct ctdb_control_gratious_arp whose iface
+  field carries 'len' bytes of interface name.  the arps themselves are
+  sent from a timed event, see send_gratious_arp().
+
+  returns 0 once the first arp has been scheduled and -1 on malformed
+  input or allocation failure.
+ */
+int32_t ctdb_control_send_gratious_arp(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+       struct ctdb_control_gratious_arp *gratious_arp = (struct ctdb_control_gratious_arp *)indata.dptr;
+       struct control_gratious_arp *arp;
+
+       /* verify the size of indata */
+       if (indata.dsize < offsetof(struct ctdb_control_gratious_arp, iface)) {
+               DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_gratious_arp structure. Got %u require %u bytes\n", 
+                                (unsigned)indata.dsize, 
+                                (unsigned)offsetof(struct ctdb_control_gratious_arp, iface)));
+               return -1;
+       }
+       if (indata.dsize != 
+               ( offsetof(struct ctdb_control_gratious_arp, iface)
+               + gratious_arp->len ) ){
+
+               DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
+                       "but should be %u bytes\n", 
+                        (unsigned)indata.dsize, 
+                        (unsigned)(offsetof(struct ctdb_control_gratious_arp, iface)+gratious_arp->len)));
+               return -1;
+       }
+
+
+       arp = talloc(ctdb, struct control_gratious_arp);
+       CTDB_NO_MEMORY(ctdb, arp);
+
+       arp->ctdb  = ctdb;
+       arp->addr   = gratious_arp->addr;
+       /* nothing above guarantees that the name is NUL terminated, so
+          never read more than the 'len' bytes that were validated */
+       arp->iface = talloc_strndup(arp, gratious_arp->iface, gratious_arp->len);
+       if (arp->iface == NULL) {
+               /* don't leave the half initialised state hanging off ctdb */
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory for gratious arp iface\n"));
+               talloc_free(arp);
+               return -1;
+       }
+       arp->count = 0;
+       
+       event_add_timed(arp->ctdb->ev, arp, 
+                       timeval_zero(), send_gratious_arp, arp);
+
+       return 0;
+}
+
+/*
+  control handler: add a public address.
+
+  indata.dptr is read as a struct ctdb_control_ip_iface whose iface field
+  carries 'len' bytes of interface name, which must include the
+  terminating NUL.
+
+  returns 0 on success and -1 on malformed input or if
+  ctdb_add_public_address() fails.
+ */
+int32_t ctdb_control_add_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+       struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
+       int ret;
+
+       /* verify the size of indata */
+       if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
+               DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
+               return -1;
+       }
+       if (indata.dsize != 
+               ( offsetof(struct ctdb_control_ip_iface, iface)
+               + pub->len ) ){
+
+               DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
+                       "but should be %u bytes\n", 
+                        (unsigned)indata.dsize, 
+                        (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
+               return -1;
+       }
+
+       /* the name is used as a C string below, so it has to be terminated
+          inside the 'len' bytes we were sent (this also rejects len 0) */
+       if (strnlen(&pub->iface[0], pub->len) == pub->len) {
+               DEBUG(DEBUG_ERR,(__location__ " Interface name in indata is not terminated\n"));
+               return -1;
+       }
+
+       DEBUG(DEBUG_NOTICE,("Add IP %s\n", ctdb_addr_to_str(&pub->addr)));
+
+       ret = ctdb_add_public_address(ctdb, &pub->addr, pub->mask, &pub->iface[0], true);
+
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to add public address\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  called when releaseip event finishes for del_public_address.
+  private_data is the temporary talloc context that
+  ctdb_control_del_public_address() moved the vnn onto; freeing it here
+  is what finally destroys the vnn.  status is ignored.
+ */
+static void delete_ip_callback(struct ctdb_context *ctdb, int status, 
+                               void *private_data)
+{
+       talloc_free(private_data);
+}
+
+/*
+  control handler: delete a public address.
+
+  indata.dptr is read as a struct ctdb_control_ip_iface; only its addr
+  field is used to pick the vnn.  the vnn is taken off ctdb->vnn at once.
+  if this node holds the address, a releaseip event is started and the
+  vnn is freed when that event finishes (see delete_ip_callback());
+  otherwise the vnn is freed immediately.
+
+  returns 0 on success and -1 on malformed input, unknown address,
+  allocation failure or failure to start the releaseip event.
+ */
+int32_t ctdb_control_del_public_address(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+       struct ctdb_control_ip_iface *pub = (struct ctdb_control_ip_iface *)indata.dptr;
+       struct ctdb_vnn *vnn;
+       int ret;
+
+       /* verify the size of indata */
+       if (indata.dsize < offsetof(struct ctdb_control_ip_iface, iface)) {
+               DEBUG(DEBUG_ERR,(__location__ " Too small indata to hold a ctdb_control_ip_iface structure\n"));
+               return -1;
+       }
+       if (indata.dsize != 
+               ( offsetof(struct ctdb_control_ip_iface, iface)
+               + pub->len ) ){
+
+               DEBUG(DEBUG_ERR,(__location__ " Wrong size of indata. Was %u bytes "
+                       "but should be %u bytes\n", 
+                        (unsigned)indata.dsize, 
+                        (unsigned)(offsetof(struct ctdb_control_ip_iface, iface)+pub->len)));
+               return -1;
+       }
+
+       DEBUG(DEBUG_NOTICE,("Delete IP %s\n", ctdb_addr_to_str(&pub->addr)));
+
+       /* walk over all public addresses until we find a match */
+       for (vnn=ctdb->vnn;vnn;vnn=vnn->next) {
+               if (ctdb_same_ip(&vnn->public_address, &pub->addr)) {
+                       TALLOC_CTX *mem_ctx = talloc_new(ctdb);
+
+                       /* bail out before the vnn list is touched */
+                       CTDB_NO_MEMORY(ctdb, mem_ctx);
+
+                       /* from here on the vnn is owned by mem_ctx */
+                       DLIST_REMOVE(ctdb->vnn, vnn);
+                       talloc_steal(mem_ctx, vnn);
+                       ctdb_remove_orphaned_ifaces(ctdb, vnn, mem_ctx);
+                       if (vnn->pnn != ctdb->pnn) {
+                               /* not held by us: nothing to release */
+                               if (vnn->iface != NULL) {
+                                       ctdb_vnn_unassign_iface(ctdb, vnn);
+                               }
+                               talloc_free(mem_ctx);
+                               return 0;
+                       }
+                       vnn->pnn = -1;
+
+                       ret = ctdb_event_script_callback(ctdb, 
+                                        mem_ctx, delete_ip_callback, mem_ctx,
+                                        false,
+                                        CTDB_EVENT_RELEASE_IP,
+                                        "%s %s %u",
+                                        ctdb_vnn_iface_string(vnn),
+                                        ctdb_addr_to_str(&vnn->public_address),
+                                        vnn->public_netmask_bits);
+                       if (vnn->iface != NULL) {
+                               ctdb_vnn_unassign_iface(ctdb, vnn);
+                       }
+                       if (ret != 0) {
+                               /* the event was not started, so nothing
+                                  else is going to free mem_ctx and the
+                                  vnn hanging off it */
+                               talloc_free(mem_ctx);
+                               return -1;
+                       }
+                       return 0;
+               }
+       }
+
+       return -1;
+}
+
+
+/* state kept while the "ipreallocated" event runs */
+struct ipreallocated_callback_state {
+       /* the control request that is replied to when the event finishes */
+       struct ctdb_req_control *c;
+};
+
+/*
+  called when the "ipreallocated" event has finished.
+  logs a failure, bans this node when the status is -ETIME, then sends
+  the status back as the reply to the stored control and frees the state.
+ */
+static void ctdb_ipreallocated_callback(struct ctdb_context *ctdb,
+                                       int status, void *p)
+{
+       struct ipreallocated_callback_state *cb =
+               talloc_get_type(p, struct ipreallocated_callback_state);
+       bool failed = (status != 0);
+
+       if (failed) {
+               DEBUG(DEBUG_ERR,
+                     (" \"ipreallocated\" event script failed (status %d)\n",
+                      status));
+       }
+       if (failed && status == -ETIME) {
+               ctdb_ban_self(ctdb);
+       }
+
+       /* the reply goes out whether the event succeeded or not */
+       ctdb_request_control_reply(ctdb, cb->c, NULL, status, NULL);
+       talloc_free(cb);
+}
+
+/* A control to run the ipreallocated event.
+   on success *async_reply is set to true and the reply is sent later from
+   ctdb_ipreallocated_callback(); returns 0 then, or -1 if the event
+   could not be started.
+*/
+int32_t ctdb_control_ipreallocated(struct ctdb_context *ctdb,
+                                  struct ctdb_req_control *c,
+                                  bool *async_reply)
+{
+       struct ipreallocated_callback_state *state;
+
+       state = talloc(ctdb, struct ipreallocated_callback_state);
+       CTDB_NO_MEMORY(ctdb, state);
+
+       DEBUG(DEBUG_INFO,(__location__ " Running \"ipreallocated\" event\n"));
+
+       if (ctdb_event_script_callback(ctdb, state,
+                                      ctdb_ipreallocated_callback, state,
+                                      false, CTDB_EVENT_IPREALLOCATED,
+                                      "%s", "") != 0) {
+               DEBUG(DEBUG_ERR,("Failed to run \"ipreallocated\" event \n"));
+               talloc_free(state);
+               return -1;
+       }
+
+       /* hold on to the request; the reply is sent from the callback */
+       state->c    = talloc_steal(state, c);
+       *async_reply = true;
+
+       return 0;
+}
+
+
+/* This function is called from the recovery daemon to verify that a remote
+   node has the expected ip allocation.
+   This is verified against ctdb->ip_tree.
+
+   returns 0 when every address reported in ips agrees with the tree (or
+   when there is no tree or no list to check), -1 on the first address that
+   is unknown or held by a different node than expected.
+*/
+int verify_remote_ip_allocation(struct ctdb_context *ctdb,
+                               struct ctdb_all_public_ips *ips,
+                               uint32_t pnn)
+{
+       struct ctdb_public_ip_list *expected;
+       int i;
+
+       /* without a tree we don't know the expected allocation yet and
+          assume the remote node is correct; without a list there is
+          nothing to check */
+       if (ctdb->ip_tree == NULL || ips == NULL) {
+               return 0;
+       }
+
+       for (i=0; i<ips->num; i++) {
+               expected = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN, ip_key(&ips->ips[i].addr));
+               if (expected == NULL) {
+                       DEBUG(DEBUG_ERR,("Node %u has new or unknown public IP %s\n", pnn, ctdb_addr_to_str(&ips->ips[i].addr)));
+                       return -1;
+               }
+
+               /* an address that is unassigned on either side, or assigned
+                  to the same node on both, is fine */
+               if (expected->pnn == -1 || ips->ips[i].pnn == -1 ||
+                   expected->pnn == ips->ips[i].pnn) {
+                       continue;
+               }
+
+               DEBUG(DEBUG_ERR,
+                     ("Inconsistent IP allocation - node %u thinks %s is held by node %u while it is assigned to node %u\n",
+                      pnn,
+                      ctdb_addr_to_str(&ips->ips[i].addr),
+                      ips->ips[i].pnn, expected->pnn));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  Record in ctdb->ip_tree that the public address ip->addr is now assigned
+  to node ip->pnn.
+
+  Returns 0 on success, -1 if there is no ip_tree yet or the address has no
+  entry in it.
+ */
+int update_ip_assignment_tree(struct ctdb_context *ctdb, struct ctdb_public_ip *ip)
+{
+       struct ctdb_public_ip_list *entry;
+
+       if (ctdb->ip_tree == NULL) {
+               DEBUG(DEBUG_ERR,("No ctdb->ip_tree yet. Failed to update ip assignment\n"));
+               return -1;
+       }
+
+       entry = trbt_lookuparray32(ctdb->ip_tree, IP_KEYLEN,
+                                  ip_key(&ip->addr));
+       if (entry == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Could not find record for address %s, update ip\n", ctdb_addr_to_str(&ip->addr)));
+               return -1;
+       }
+
+       /* log the old owner before it is overwritten */
+       DEBUG(DEBUG_NOTICE,("Updated ip assignment tree for ip : %s from node %u to node %u\n", ctdb_addr_to_str(&ip->addr), entry->pnn, ip->pnn));
+       entry->pnn = ip->pnn;
+
+       return 0;
+}
+
+
+/*
+  State of one in-flight "reload public ips" request.  The talloc
+  destructor installed on it answers the pending control and kills the
+  child process.
+ */
+struct ctdb_reloadips_handle {
+       /* daemon context the request belongs to */
+       struct ctdb_context *ctdb;
+       /* control to answer when the handle is destroyed, or NULL */
+       struct ctdb_req_control *c;
+       /* status sent in the reply; starts at -1, set from the child's result */
+       int status;
+       /* pipe: the parent reads fd[0], the child writes its result to fd[1] */
+       int fd[2];
+       /* pid of the forked reloadips child */
+       pid_t child;
+       /* read event on fd[0] */
+       struct fd_event *fde;
+};
+
+/*
+  talloc destructor for a reloadips handle: clears ctdb->reload_ips if it
+  refers to this handle, answers the pending control (if any) with the
+  recorded status, and kills the child process.
+ */
+static int ctdb_reloadips_destructor(struct ctdb_reloadips_handle *h)
+{
+       struct ctdb_context *ctdb = h->ctdb;
+
+       if (ctdb->reload_ips == h) {
+               ctdb->reload_ips = NULL;
+       }
+
+       if (h->c != NULL) {
+               ctdb_request_control_reply(ctdb, h->c, NULL, h->status, NULL);
+               h->c = NULL;
+       }
+
+       ctdb_kill(ctdb, h->child, SIGKILL);
+       return 0;
+}
+
+/*
+  Timer callback: gives up on a reloadips request by freeing the handle
+  passed in as private_data.
+ */
+static void ctdb_reloadips_timeout_event(struct event_context *ev,
+                               struct timed_event *te,
+                               struct timeval t, void *private_data)
+{
+       struct ctdb_reloadips_handle *handle;
+
+       handle = talloc_get_type(private_data, struct ctdb_reloadips_handle);
+       talloc_free(handle);
+}
+
+/*
+  fd event handler for the read end of the reloadips pipe.  Reads the
+  one-byte result written by the child; anything other than a successfully
+  read 0 is logged and recorded as status 1.  The handle is freed
+  afterwards.
+ */
+static void ctdb_reloadips_child_handler(struct event_context *ev, struct fd_event *fde,
+                            uint16_t flags, void *private_data)
+{
+       struct ctdb_reloadips_handle *h = talloc_get_type(private_data, struct ctdb_reloadips_handle);
+       char result;
+       int nread = read(h->fd[0], &result, 1);
+
+       /* result is only inspected when a byte was actually read */
+       if (nread < 1 || result != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Reloadips child process returned error\n"));
+               result = 1;
+       }
+
+       h->status = result;
+       talloc_free(h);
+}
+
+/*
+  Body of the reloadips child process: reconcile the public IPs known to
+  the local daemon with the contents of the public addresses file.
+
+  The current list is fetched from the local node, the addresses file is
+  re-read into ctdb->vnn, and then
+   - CTDB_CONTROL_DEL_PUBLIC_IP is sent for every address the node has
+     that is no longer in the file, and
+   - CTDB_CONTROL_ADD_PUBLIC_IP is sent for every address in the file that
+     the node does not have yet.
+  Before the first addition a CTDB_SRVID_REBALANCE_NODE message carrying
+  our pnn is broadcast to the connected nodes.
+
+  Returns 0 once all queued controls have completed, -1 on failure.
+ */
+static int ctdb_reloadips_child(struct ctdb_context *ctdb)
+{
+       TALLOC_CTX *mem_ctx = talloc_new(NULL);
+       struct ctdb_all_public_ips *ips;
+       struct ctdb_vnn *vnn;
+       struct client_async_data *async_data;
+       struct timeval timeout;
+       TDB_DATA data;
+       struct ctdb_client_control_state *state;
+       bool first_add;
+       int i, ret;
+
+       CTDB_NO_MEMORY(ctdb, mem_ctx);
+
+       /* Read IPs from local node */
+       ret = ctdb_ctrl_get_public_ips(ctdb, TAKEOVER_TIMEOUT(),
+                                      CTDB_CURRENT_NODE, mem_ctx, &ips);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,
+                     ("Unable to fetch public IPs from local node\n"));
+               talloc_free(mem_ctx);
+               return -1;
+       }
+
+       /* Read IPs file - this is safe since this is a child process */
+       ctdb->vnn = NULL;
+       if (ctdb_set_public_addresses(ctdb, false) != 0) {
+               DEBUG(DEBUG_ERR,("Failed to re-read public addresses file\n"));
+               talloc_free(mem_ctx);
+               return -1;
+       }
+
+       async_data = talloc_zero(mem_ctx, struct client_async_data);
+       CTDB_NO_MEMORY(ctdb, async_data);
+
+       /* Compare IPs between node and file for IPs to be deleted */
+       for (i = 0; i < ips->num; i++) {
+               /* look for ips->ips[i] in the freshly read file */
+               for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
+                       if (ctdb_same_ip(&vnn->public_address,
+                                        &ips->ips[i].addr)) {
+                               /* IP is still in file */
+                               break;
+                       }
+               }
+
+               if (vnn == NULL) {
+                       /* Delete IP ips->ips[i] */
+                       struct ctdb_control_ip_iface *pub;
+
+                       DEBUG(DEBUG_NOTICE,
+                             ("IP %s no longer configured, deleting it\n",
+                              ctdb_addr_to_str(&ips->ips[i].addr)));
+
+                       pub = talloc_zero(mem_ctx,
+                                         struct ctdb_control_ip_iface);
+                       CTDB_NO_MEMORY(ctdb, pub);
+
+                       /* a delete request carries no interface list */
+                       pub->addr  = ips->ips[i].addr;
+                       pub->mask  = 0;
+                       pub->len   = 0;
+
+                       timeout = TAKEOVER_TIMEOUT();
+
+                       data.dsize = offsetof(struct ctdb_control_ip_iface,
+                                             iface) + pub->len;
+                       data.dptr = (uint8_t *)pub;
+
+                       state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
+                                                 CTDB_CONTROL_DEL_PUBLIC_IP,
+                                                 0, data, async_data,
+                                                 &timeout, NULL);
+                       if (state == NULL) {
+                               DEBUG(DEBUG_ERR,
+                                     (__location__
+                                      " failed sending CTDB_CONTROL_DEL_PUBLIC_IP\n"));
+                               goto failed;
+                       }
+
+               }
+       }
+
+       /* Compare IPs between node and file for IPs to be added */
+       first_add = true;
+       for (vnn = ctdb->vnn; vnn; vnn = vnn->next) {
+               for (i = 0; i < ips->num; i++) {
+                       if (ctdb_same_ip(&vnn->public_address,
+                                        &ips->ips[i].addr)) {
+                               /* IP already on node */
+                               break;
+                       }
+               }
+               if (i == ips->num) {
+                       /* Add IP vnn->public_address */
+                       struct ctdb_control_ip_iface *pub;
+                       const char *ifaces = NULL;
+                       uint32_t len;
+                       int iface = 0;
+
+                       DEBUG(DEBUG_NOTICE,
+                             ("New IP %s configured, adding it\n",
+                              ctdb_addr_to_str(&vnn->public_address)));
+                       if (first_add) {
+                               uint32_t pnn = ctdb_get_pnn(ctdb);
+
+                               data.dsize = sizeof(pnn);
+                               data.dptr  = (uint8_t *)&pnn;
+
+                               ret = ctdb_client_send_message(
+                                       ctdb,
+                                       CTDB_BROADCAST_CONNECTED,
+                                       CTDB_SRVID_REBALANCE_NODE,
+                                       data);
+                               if (ret != 0) {
+                                       DEBUG(DEBUG_WARNING,
+                                             ("Failed to send message to force node reallocation - IPs may be unbalanced\n"));
+                               }
+
+                               first_add = false;
+                       }
+
+                       /* An empty interface list would make the loop
+                          below read past the NULL terminator and hand
+                          NULL to strlen(). */
+                       if (vnn->ifaces[0] == NULL) {
+                               DEBUG(DEBUG_ERR,
+                                     (__location__
+                                      " no interfaces configured for IP %s\n",
+                                      ctdb_addr_to_str(&vnn->public_address)));
+                               goto failed;
+                       }
+
+                       /* build the comma separated interface list */
+                       ifaces = vnn->ifaces[0];
+                       iface = 1;
+                       while (vnn->ifaces[iface] != NULL) {
+                               ifaces = talloc_asprintf(vnn, "%s,%s", ifaces,
+                                                        vnn->ifaces[iface]);
+                               if (ifaces == NULL) {
+                                       /* talloc_asprintf() failed */
+                                       DEBUG(DEBUG_ERR,
+                                             (__location__
+                                              " out of memory building interface list\n"));
+                                       goto failed;
+                               }
+                               iface++;
+                       }
+
+                       /* len includes the terminating NUL */
+                       len   = strlen(ifaces) + 1;
+                       pub = talloc_zero_size(mem_ctx,
+                                              offsetof(struct ctdb_control_ip_iface, iface) + len);
+                       CTDB_NO_MEMORY(ctdb, pub);
+
+                       pub->addr  = vnn->public_address;
+                       pub->mask  = vnn->public_netmask_bits;
+                       pub->len   = len;
+                       memcpy(&pub->iface[0], ifaces, pub->len);
+
+                       timeout = TAKEOVER_TIMEOUT();
+
+                       data.dsize = offsetof(struct ctdb_control_ip_iface,
+                                             iface) + pub->len;
+                       data.dptr = (uint8_t *)pub;
+
+                       state = ctdb_control_send(ctdb, CTDB_CURRENT_NODE, 0,
+                                                 CTDB_CONTROL_ADD_PUBLIC_IP,
+                                                 0, data, async_data,
+                                                 &timeout, NULL);
+                       if (state == NULL) {
+                               DEBUG(DEBUG_ERR,
+                                     (__location__
+                                      " failed sending CTDB_CONTROL_ADD_PUBLIC_IP\n"));
+                               goto failed;
+                       }
+               }
+       }
+
+       /* wait for all the add/delete controls queued above */
+       if (ctdb_client_async_wait(ctdb, async_data) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Add/delete IPs failed\n"));
+               goto failed;
+       }
+
+       talloc_free(mem_ctx);
+       return 0;
+
+failed:
+       talloc_free(mem_ctx);
+       return -1;
+}
+
+/* This control is sent to force the node to re-read the public addresses file
+   and drop any addresses we should no longer host, and add new addresses
+   that we are now able to host.
+
+   The work is done in a forked child (ctdb_reloadips_child) which reports
+   a one-byte result over a pipe.  The control is answered asynchronously
+   from the handle's destructor, either when the child reports back or
+   when the 120 second timer fires.  A reload that is still in flight is
+   cancelled when a new request arrives.
+
+   Returns 0 with *async_reply set to true when the child was started,
+   -1 if setting up the pipe, the child or the fd event failed.
+*/
+int32_t ctdb_control_reload_public_ips(struct ctdb_context *ctdb, struct ctdb_req_control *c, bool *async_reply)
+{
+       struct ctdb_reloadips_handle *h;
+       /* sampled before the fork so the child can watch for our death */
+       pid_t parent = getpid();
+
+       /* cancel a reload that is still running */
+       if (ctdb->reload_ips != NULL) {
+               talloc_free(ctdb->reload_ips);
+               ctdb->reload_ips = NULL;
+       }
+
+       h = talloc(ctdb, struct ctdb_reloadips_handle);
+       CTDB_NO_MEMORY(ctdb, h);
+       h->ctdb     = ctdb;
+       h->c        = NULL;
+       h->status   = -1;
+
+       if (pipe(h->fd) == -1) {
+               DEBUG(DEBUG_ERR,("Failed to create pipe for reloadips\n"));
+               talloc_free(h);
+               return -1;
+       }
+
+       h->child = ctdb_fork(ctdb);
+       if (h->child == (pid_t)-1) {
+               DEBUG(DEBUG_ERR, ("Failed to fork a child for reloadips\n"));
+               close(h->fd[0]);
+               close(h->fd[1]);
+               talloc_free(h);
+               return -1;
+       }
+
+       /* child process */
+       if (h->child == 0) {
+               signed char res = 0;
+
+               close(h->fd[0]);
+               debug_extra = talloc_asprintf(NULL, "reloadips:");
+
+               ctdb_set_process_name("ctdb_reloadips");
+               if (switch_from_server_to_client(ctdb, "reloadips-child") != 0) {
+                       DEBUG(DEBUG_CRIT,("ERROR: Failed to switch reloadips child into client mode\n"));
+                       res = -1;
+               } else {
+                       res = ctdb_reloadips_child(ctdb);
+                       if (res != 0) {
+                               DEBUG(DEBUG_ERR,("Failed to reload ips on local node\n"));
+                       }
+               }
+
+               /* report the result: 0 means success */
+               write(h->fd[1], &res, 1);
+               /* make sure we die when our parent dies */
+               while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
+                       sleep(5);
+               }
+               _exit(0);
+       }
+
+       close(h->fd[1]);
+       set_close_on_exec(h->fd[0]);
+
+       /* from here on freeing h also kills the child */
+       talloc_set_destructor(h, ctdb_reloadips_destructor);
+
+       h->fde = event_add_fd(ctdb->ev, h, h->fd[0],
+                       EVENT_FD_READ, ctdb_reloadips_child_handler,
+                       (void *)h);
+       if (h->fde == NULL) {
+               /* h->c is still NULL, so the destructor does not reply and
+                  the caller reports the failure for this control */
+               DEBUG(DEBUG_ERR, ("Failed to add fd event for reloadips child\n"));
+               close(h->fd[0]);
+               talloc_free(h);
+               return -1;
+       }
+       tevent_fd_set_auto_close(h->fde);
+
+       /* the reply is now sent from the destructor of h */
+       h->c             = talloc_steal(h, c);
+
+       /* remember the request so a later one can cancel it; the
+          destructor clears this pointer again */
+       ctdb->reload_ips = h;
+
+       event_add_timed(ctdb->ev, h,
+                       timeval_current_ofs(120, 0),
+                       ctdb_reloadips_timeout_event, h);
+
+       /* we reply later */
+       *async_reply = true;
+       return 0;
+}
diff --git a/ctdb/server/ctdb_traverse.c b/ctdb/server/ctdb_traverse.c
new file mode 100644 (file)
index 0000000..99e7e8f
--- /dev/null
@@ -0,0 +1,784 @@
+/* 
+   efficient async ctdb traverse
+
+   Copyright (C) Andrew Tridgell  2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "db_wrap.h"
+#include "tdb.h"
+#include "../include/ctdb_private.h"
+#include "lib/util/dlinklist.h"
+
+/*
+  callback type used by the traverse code; it is invoked with tdb_null for
+  both key and data to signal the end of a traverse
+ */
+typedef void (*ctdb_traverse_fn_t)(void *private_data, TDB_DATA key, TDB_DATA data);
+
+/*
+  handle returned to caller - freeing this handle will kill the child and
+  terminate the traverse
+ */
+struct ctdb_traverse_local_handle {
+       /* links in the ctdb_db->traverse list */
+       struct ctdb_traverse_local_handle *next, *prev;
+       /* database being traversed */
+       struct ctdb_db_context *ctdb_db;
+       /* pipe: the parent reads the final record count from fd[0], the
+          child writes it to fd[1] */
+       int fd[2];
+       /* pid of the forked traverse child */
+       pid_t child;
+       /* srvid, client_reqid, reqid and srcnode are copied from the
+          traverse_all_state that started the traverse; records are sent
+          to srcnode tagged with reqid */
+       uint64_t srvid;
+       uint32_t client_reqid;
+       uint32_t reqid;
+       int srcnode;
+       /* argument for callback (the traverse_all_state) */
+       void *private_data;
+       /* called in the parent once the child has reported its result */
+       ctdb_traverse_fn_t callback;
+       /* when false, records holding nothing but the ltdb header are
+          skipped for non-persistent databases */
+       bool withemptyrecords;
+       /* read event on fd[0] */
+       struct tevent_fd *fde;
+       /* counters maintained by the per-record function in the child */
+       int records_failed;
+       int records_sent;
+};
+
+/*
+ * fd event handler for the pipe to the traverse child; called when the
+ * child has written its result or the pipe was closed.
+ *
+ * The child sends the number of records as an int, negated if the
+ * traverse failed.  Whatever the outcome, the completion callback is
+ * invoked with tdb_null for key and data.
+ */
+static void ctdb_traverse_child_handler(struct tevent_context *ev, struct tevent_fd *fde,
+                                       uint16_t flags, void *private_data)
+{
+       struct ctdb_traverse_local_handle *h = talloc_get_type(private_data,
+                                                       struct ctdb_traverse_local_handle);
+       /* taken from h up front, before the result is read */
+       ctdb_traverse_fn_t done_fn = h->callback;
+       void *done_arg = h->private_data;
+       ssize_t nread;
+       int count;
+
+       /* Read the number of records sent by traverse child */
+       nread = read(h->fd[0], &count, sizeof(count));
+       if (nread != (ssize_t)sizeof(count)) {
+               /* Traverse child failed */
+               DEBUG(DEBUG_ERR, ("Local traverse failed db:%s reqid:%d\n",
+                                 h->ctdb_db->db_name, h->reqid));
+       } else if (count >= 0) {
+               DEBUG(DEBUG_INFO, ("Local traverse end db:%s reqid:%d records:%d\n",
+                                  h->ctdb_db->db_name, h->reqid, count));
+       } else {
+               /* Traverse failed */
+               count = -count;
+               DEBUG(DEBUG_ERR, ("Local traverse failed db:%s reqid:%d records:%d\n",
+                                 h->ctdb_db->db_name, h->reqid, count));
+       }
+
+       done_fn(done_arg, tdb_null, tdb_null);
+}
+
+/*
+  talloc destructor for an in-flight local traverse: unlinks the handle
+  from the database's traverse list and kills the child process
+ */
+static int traverse_local_destructor(struct ctdb_traverse_local_handle *h)
+{
+       struct ctdb_db_context *db = h->ctdb_db;
+
+       DLIST_REMOVE(db->traverse, h);
+       ctdb_kill(db->ctdb, h->child, SIGKILL);
+       return 0;
+}
+
+/*
+  per-record callback from tdb_traverse_read(), run in the traverse child
+
+  For non-persistent databases, records that hold nothing but the ltdb
+  header are skipped unless withemptyrecords is set, and so are records
+  whose dmaster is not this node.  Every other record is marshalled and
+  sent to the requesting node as CTDB_CONTROL_TRAVERSE_DATA.
+
+  Returns 0 to continue the traverse, -1 to abort it after a failure.
+ */
+static int ctdb_traverse_local_fn(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *p)
+{
+       struct ctdb_traverse_local_handle *h = talloc_get_type(p,
+                                                              struct ctdb_traverse_local_handle);
+       struct ctdb_ltdb_header *hdr = (struct ctdb_ltdb_header *)data.dptr;
+       struct ctdb_rec_data *d;
+       TDB_DATA outdata;
+       int res, status;
+
+       if (h->ctdb_db->persistent == 0) {
+               bool header_only = (data.dsize <= sizeof(struct ctdb_ltdb_header));
+
+               /* filter out zero-length records */
+               if (header_only && !h->withemptyrecords) {
+                       return 0;
+               }
+
+               /* filter out non-authoritative records */
+               if (hdr->dmaster != h->ctdb_db->ctdb->pnn) {
+                       return 0;
+               }
+       }
+
+       d = ctdb_marshall_record(h, h->reqid, key, NULL, data);
+       if (d == NULL) {
+               /* error handling is tricky in this child code .... */
+               h->records_failed++;
+               return -1;
+       }
+
+       outdata.dsize = d->length;
+       outdata.dptr = (uint8_t *)d;
+
+       res = ctdb_control(h->ctdb_db->ctdb, h->srcnode, 0, CTDB_CONTROL_TRAVERSE_DATA,
+                          CTDB_CTRL_FLAG_NOREPLY, outdata, NULL, NULL, &status, NULL, NULL);
+       if (res != 0 || status != 0) {
+               h->records_failed++;
+               return -1;
+       }
+
+       h->records_sent++;
+       return 0;
+}
+
+/*
+  state of a local traverse started on behalf of a
+  CTDB_CONTROL_TRAVERSE_ALL / CTDB_CONTROL_TRAVERSE_ALL_EXT request
+ */
+struct traverse_all_state {
+       struct ctdb_context *ctdb;
+       /* handle of the running local traverse */
+       struct ctdb_traverse_local_handle *h;
+       /* reqid, srcnode (the request's pnn), client_reqid and srvid are
+          copied from the request */
+       uint32_t reqid;
+       uint32_t srcnode;
+       uint32_t client_reqid;
+       uint64_t srvid;
+       /* taken from the _ext request, false for the plain request */
+       bool withemptyrecords;
+};
+
+/*
+  setup a non-blocking traverse of a local ltdb. A child process is
+  forked which walks the ltdb and sends every matching record to
+  all_state->srcnode as a CTDB_CONTROL_TRAVERSE_DATA control, followed by
+  an empty record that marks the end. To stop the traverse,
+  talloc_free() the traverse_handle.
+
+  The traverse is finished when the callback is called with tdb_null for
+  key and data; this happens in the parent once the child has reported
+  its result over the pipe.
+
+  Returns the handle (a talloc child of all_state), or NULL on failure.
+ */
+static struct ctdb_traverse_local_handle *ctdb_traverse_local(struct ctdb_db_context *ctdb_db,
+                                                             ctdb_traverse_fn_t callback,
+                                                             struct traverse_all_state *all_state)
+{
+       struct ctdb_traverse_local_handle *h;
+       /* Must be sampled before forking: inside the child getpid()
+          returns the child's own pid, so the "wait for parent to die"
+          loop below would be watching itself. */
+       pid_t parent = getpid();
+       int ret;
+
+       h = talloc_zero(all_state, struct ctdb_traverse_local_handle);
+       if (h == NULL) {
+               return NULL;
+       }
+
+       ret = pipe(h->fd);
+
+       if (ret != 0) {
+               talloc_free(h);
+               return NULL;
+       }
+
+       h->child = ctdb_fork(ctdb_db->ctdb);
+
+       if (h->child == (pid_t)-1) {
+               close(h->fd[0]);
+               close(h->fd[1]);
+               talloc_free(h);
+               return NULL;
+       }
+
+       /* filled in after the fork, so both processes have their own copy */
+       h->callback = callback;
+       h->private_data = all_state;
+       h->ctdb_db = ctdb_db;
+       h->client_reqid = all_state->client_reqid;
+       h->reqid = all_state->reqid;
+       h->srvid = all_state->srvid;
+       h->srcnode = all_state->srcnode;
+       h->withemptyrecords = all_state->withemptyrecords;
+
+       if (h->child == 0) {
+               /* start the traverse in the child */
+               int res, status;
+               struct ctdb_context *ctdb = ctdb_db->ctdb;
+               struct ctdb_rec_data *d;
+               TDB_DATA outdata;
+
+               close(h->fd[0]);
+
+               ctdb_set_process_name("ctdb_traverse");
+               if (switch_from_server_to_client(ctdb, "traverse_local-%s:",
+                                                ctdb_db->db_name) != 0) {
+                       DEBUG(DEBUG_CRIT, ("Failed to switch traverse child into client mode\n"));
+                       _exit(0);
+               }
+
+               /* prepare the empty end-of-traverse record up front */
+               d = ctdb_marshall_record(h, h->reqid, tdb_null, NULL, tdb_null);
+               if (d == NULL) {
+                       res = 0;
+                       write(h->fd[1], &res, sizeof(int));
+                       _exit(0);
+               }
+
+               res = tdb_traverse_read(ctdb_db->ltdb->tdb, ctdb_traverse_local_fn, h);
+               if (res == -1 || h->records_failed > 0) {
+                       /* traverse failed */
+                       res = -(h->records_sent);
+               } else {
+                       res = h->records_sent;
+               }
+
+               /* Wait till all the data is flushed from output queue */
+               while (ctdb_queue_length(ctdb->daemon.queue) > 0) {
+                       tevent_loop_once(ctdb->ev);
+               }
+
+               /* End traverse by sending empty record */
+               outdata.dptr = (uint8_t *)d;
+               outdata.dsize = d->length;
+               ret = ctdb_control(ctdb, h->srcnode, 0,
+                                  CTDB_CONTROL_TRAVERSE_DATA,
+                                  CTDB_CTRL_FLAG_NOREPLY, outdata,
+                                  NULL, NULL, &status, NULL, NULL);
+               if (ret == -1 || status == -1) {
+                       /* report failure as a negated record count */
+                       if (res > 0) {
+                               res = -res;
+                       }
+               }
+
+               write(h->fd[1], &res, sizeof(res));
+
+               /* make sure we die when our parent dies */
+               while (ctdb_kill(ctdb, parent, 0) == 0 || errno != ESRCH) {
+                       sleep(5);
+               }
+               _exit(0);
+       }
+
+       close(h->fd[1]);
+       set_close_on_exec(h->fd[0]);
+
+       /* from here on freeing h unlinks it and kills the child */
+       talloc_set_destructor(h, traverse_local_destructor);
+
+       DLIST_ADD(ctdb_db->traverse, h);
+
+       h->fde = tevent_add_fd(ctdb_db->ctdb->ev, h, h->fd[0], EVENT_FD_READ,
+                              ctdb_traverse_child_handler, h);
+       if (h->fde == NULL) {
+               close(h->fd[0]);
+               talloc_free(h);
+               return NULL;
+       }
+       tevent_fd_set_auto_close(h->fde);
+
+       return h;
+}
+
+
+/*
+  state of a traverse_all operation on the node that started it
+ */
+struct ctdb_traverse_all_handle {
+       struct ctdb_context *ctdb;
+       struct ctdb_db_context *ctdb_db;
+       /* request id allocated with ctdb_reqid_new(); incoming traverse
+          data records are matched against it */
+       uint32_t reqid;
+       /* called for every received record and at the end of the traverse */
+       ctdb_traverse_fn_t callback;
+       void *private_data;
+       /* number of empty (end of traverse) records received so far */
+       uint32_t null_count;
+       /* set when the traverse timeout fired */
+       bool timedout;
+};
+
+/*
+  talloc destructor for a traverse_all handle: releases its request id
+ */
+static int ctdb_traverse_all_destructor(struct ctdb_traverse_all_handle *handle)
+{
+       struct ctdb_context *ctdb = handle->ctdb;
+
+       ctdb_reqid_remove(ctdb, handle->reqid);
+       return 0;
+}
+
+/*
+  payload of a CTDB_CONTROL_TRAVERSE_ALL control; it is sent as the raw
+  bytes of this struct, so its layout is part of the protocol
+ */
+struct ctdb_traverse_all {
+       /* database to traverse */
+       uint32_t db_id;
+       /* reqid of the traverse_all handle on the originating node */
+       uint32_t reqid;
+       /* pnn of the originating node, where the records are sent */
+       uint32_t pnn;
+       uint32_t client_reqid;
+       uint64_t srvid;
+};
+
+/*
+  payload of a CTDB_CONTROL_TRAVERSE_ALL_EXT control: same fields as
+  struct ctdb_traverse_all plus the withemptyrecords flag; it is also sent
+  as the raw bytes of the struct
+ */
+struct ctdb_traverse_all_ext {
+       /* database to traverse */
+       uint32_t db_id;
+       /* reqid of the traverse_all handle on the originating node */
+       uint32_t reqid;
+       /* pnn of the originating node, where the records are sent */
+       uint32_t pnn;
+       uint32_t client_reqid;
+       uint64_t srvid;
+       /* also send records that hold nothing but the ltdb header */
+       bool withemptyrecords;
+};
+
+/* timer callback for a traverse that took too long: logs the timeout,
+   bumps the traverse timeout statistic, marks the handle as timed out and
+   ends the traverse by calling the callback with tdb_null key and data */
+static void ctdb_traverse_all_timeout(struct event_context *ev, struct timed_event *te,
+                                     struct timeval t, void *private_data)
+{
+       struct ctdb_traverse_all_handle *handle;
+
+       handle = talloc_get_type(private_data, struct ctdb_traverse_all_handle);
+
+       DEBUG(DEBUG_ERR,(__location__ " Traverse all timeout on database:%s\n", handle->ctdb_db->db_name));
+       CTDB_INCREMENT_STAT(handle->ctdb, timeouts.traverse);
+
+       handle->timedout = true;
+       handle->callback(handle->private_data, tdb_null, tdb_null);
+}
+
+
+/*
+  state handed to ctdb_daemon_traverse_all() by the code that starts a
+  cluster-wide traverse; it is the talloc parent of the traverse_all handle
+  and the private_data passed to the traverse callback
+ */
+struct traverse_start_state {
+       struct ctdb_context *ctdb;
+       /* handle of the running traverse_all */
+       struct ctdb_traverse_all_handle *h;
+       uint32_t srcnode;
+       /* sent to the other nodes as client_reqid */
+       uint32_t reqid;
+       uint32_t db_id;
+       uint64_t srvid;
+       /* selects the _EXT variant of the traverse control */
+       bool withemptyrecords;
+       /* NOTE(review): presumably counts records passed on to the client;
+          its users are outside this part of the file */
+       int num_records;
+};
+
+
+/*
+  setup a cluster-wide non-blocking traverse of a ctdb. A
+  CTDB_CONTROL_TRAVERSE_ALL (or _EXT, when empty records are wanted)
+  control is sent to all nodes in the vnnmap, or for a persistent
+  database to a single node. The callback function is called for the
+  records that come back. To stop the traverse, talloc_free() the
+  traverse_handle.
+
+  The traverse is finished when the callback is called with tdb_null
+  for key and data
+
+  Returns the handle (a talloc child of start_state), or NULL on failure.
+ */
+static struct ctdb_traverse_all_handle *ctdb_daemon_traverse_all(struct ctdb_db_context *ctdb_db,
+                                                                ctdb_traverse_fn_t callback,
+                                                                struct traverse_start_state *start_state)
+{
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+       struct ctdb_traverse_all_handle *state;
+       struct ctdb_traverse_all r;
+       struct ctdb_traverse_all_ext r_ext;
+       TDB_DATA data;
+       uint32_t destination;
+       uint32_t opcode;
+       int ret;
+
+       state = talloc(start_state, struct ctdb_traverse_all_handle);
+       if (state == NULL) {
+               return NULL;
+       }
+
+       state->ctdb         = ctdb;
+       state->ctdb_db      = ctdb_db;
+       state->reqid        = ctdb_reqid_new(ctdb_db->ctdb, state);
+       state->callback     = callback;
+       state->private_data = start_state;
+       state->null_count   = 0;
+       state->timedout     = false;
+
+       talloc_set_destructor(state, ctdb_traverse_all_destructor);
+
+       /* build the request; the _EXT variant is only used when empty
+        * records were asked for
+        */
+       if (start_state->withemptyrecords) {
+               r_ext.db_id = ctdb_db->db_id;
+               r_ext.reqid = state->reqid;
+               r_ext.pnn   = ctdb->pnn;
+               r_ext.client_reqid = start_state->reqid;
+               r_ext.srvid = start_state->srvid;
+               r_ext.withemptyrecords = start_state->withemptyrecords;
+
+               data.dptr = (uint8_t *)&r_ext;
+               data.dsize = sizeof(r_ext);
+               opcode = CTDB_CONTROL_TRAVERSE_ALL_EXT;
+       } else {
+               r.db_id = ctdb_db->db_id;
+               r.reqid = state->reqid;
+               r.pnn   = ctdb->pnn;
+               r.client_reqid = start_state->reqid;
+               r.srvid = start_state->srvid;
+
+               data.dptr = (uint8_t *)&r;
+               data.dsize = sizeof(r);
+               opcode = CTDB_CONTROL_TRAVERSE_ALL;
+       }
+
+       if (ctdb_db->persistent == 0) {
+               /* normal database, traverse all nodes */
+               destination = CTDB_BROADCAST_VNNMAP;
+       } else {
+               /* persistent database, traverse one node, preferably
+                * the local one
+                */
+               bool in_vnnmap = false;
+               int i;
+
+               /* check we are in the vnnmap */
+               for (i=0; i < ctdb->vnn_map->size; i++) {
+                       if (ctdb->vnn_map->map[i] == ctdb->pnn) {
+                               in_vnnmap = true;
+                               break;
+                       }
+               }
+
+               /* if we are not in the vnn map we just pick the first
+                * node instead
+                */
+               if (in_vnnmap) {
+                       destination = ctdb->pnn;
+               } else {
+                       destination = ctdb->vnn_map->map[0];
+               }
+       }
+
+       /* tell all the nodes in the cluster to start sending records to this
+        * node, or if it is a persistent database, just tell the one
+        * node picked above
+        */
+       ret = ctdb_daemon_send_control(ctdb, destination, 0,
+                                      opcode,
+                                      0, CTDB_CTRL_FLAG_NOREPLY, data, NULL, NULL);
+       if (ret != 0) {
+               talloc_free(state);
+               return NULL;
+       }
+
+       DEBUG(DEBUG_NOTICE,("Starting traverse on DB %s (id %d)\n",
+                           ctdb_db->db_name, state->reqid));
+
+       /* timeout the traverse */
+       event_add_timed(ctdb->ev, state,
+                       timeval_current_ofs(ctdb->tunable.traverse_timeout, 0),
+                       ctdb_traverse_all_timeout, state);
+
+       return state;
+}
+
+/*
+  called when a local traverse ends; frees the traverse_all_state passed
+  in as p
+ */
+static void traverse_all_callback(void *p, TDB_DATA key, TDB_DATA data)
+{
+       struct traverse_all_state *all_state;
+
+       all_state = talloc_get_type(p, struct traverse_all_state);
+
+       /* we're done */
+       talloc_free(all_state);
+}
+
+/*
+ * called when a CTDB_CONTROL_TRAVERSE_ALL_EXT control comes in: the
+ * extended version of the traverse_all control, which takes the
+ * "withemptyrecords" parameter from the request
+ *
+ * Starts a traverse of the local ltdb. Returns 0 when the traverse was
+ * started, -1 if the request is malformed, the database is unknown or
+ * unhealthy (and unhealthy reads are not allowed), or the traverse could
+ * not be set up.
+ */
+int32_t ctdb_control_traverse_all_ext(struct ctdb_context *ctdb, TDB_DATA data, TDB_DATA *outdata)
+{
+       struct ctdb_traverse_all_ext *req;
+       struct ctdb_db_context *ctdb_db;
+       struct traverse_all_state *state;
+
+       if (data.dsize != sizeof(struct ctdb_traverse_all_ext)) {
+               DEBUG(DEBUG_ERR,(__location__ " Invalid size in ctdb_control_traverse_all_ext\n"));
+               return -1;
+       }
+       req = (struct ctdb_traverse_all_ext *)data.dptr;
+
+       ctdb_db = find_ctdb_db(ctdb, req->db_id);
+       if (ctdb_db == NULL) {
+               return -1;
+       }
+
+       if (ctdb_db->unhealthy_reason != NULL) {
+               /* refuse unless reading unhealthy databases is allowed */
+               if (ctdb->tunable.allow_unhealthy_db_read == 0) {
+                       DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_control_traverse_all: %s\n",
+                                       ctdb_db->db_name, ctdb_db->unhealthy_reason));
+                       return -1;
+               }
+               DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in ctdb_control_traverse_all: %s\n",
+                                    ctdb_db->db_name, ctdb_db->unhealthy_reason));
+       }
+
+       state = talloc(ctdb_db, struct traverse_all_state);
+       if (state == NULL) {
+               return -1;
+       }
+
+       state->ctdb = ctdb;
+       state->reqid = req->reqid;
+       state->srcnode = req->pnn;
+       state->client_reqid = req->client_reqid;
+       state->srvid = req->srvid;
+       state->withemptyrecords = req->withemptyrecords;
+
+       state->h = ctdb_traverse_local(ctdb_db, traverse_all_callback, state);
+       if (state->h == NULL) {
+               talloc_free(state);
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  called when a CTDB_CONTROL_TRAVERSE_ALL control comes in. We then
+  setup a traverse of our local ltdb, sending the records as
+  CTDB_CONTROL_TRAVERSE_DATA records back to the originator
+
+  Empty records are never sent for this variant of the control. Returns 0
+  when the traverse was started, -1 if the request is malformed, the
+  database is unknown or unhealthy (and unhealthy reads are not allowed),
+  or the traverse could not be set up.
+ */
+int32_t ctdb_control_traverse_all(struct ctdb_context *ctdb, TDB_DATA data, TDB_DATA *outdata)
+{
+       struct ctdb_traverse_all *req;
+       struct ctdb_db_context *ctdb_db;
+       struct traverse_all_state *state;
+
+       if (data.dsize != sizeof(struct ctdb_traverse_all)) {
+               DEBUG(DEBUG_ERR,(__location__ " Invalid size in ctdb_control_traverse_all\n"));
+               return -1;
+       }
+       req = (struct ctdb_traverse_all *)data.dptr;
+
+       ctdb_db = find_ctdb_db(ctdb, req->db_id);
+       if (ctdb_db == NULL) {
+               return -1;
+       }
+
+       if (ctdb_db->unhealthy_reason != NULL) {
+               /* refuse unless reading unhealthy databases is allowed */
+               if (ctdb->tunable.allow_unhealthy_db_read == 0) {
+                       DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_control_traverse_all: %s\n",
+                                       ctdb_db->db_name, ctdb_db->unhealthy_reason));
+                       return -1;
+               }
+               DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in ctdb_control_traverse_all: %s\n",
+                                    ctdb_db->db_name, ctdb_db->unhealthy_reason));
+       }
+
+       state = talloc(ctdb_db, struct traverse_all_state);
+       if (state == NULL) {
+               return -1;
+       }
+
+       state->ctdb = ctdb;
+       state->reqid = req->reqid;
+       state->srcnode = req->pnn;
+       state->client_reqid = req->client_reqid;
+       state->srvid = req->srvid;
+       state->withemptyrecords = false;
+
+       state->h = ctdb_traverse_local(ctdb_db, traverse_all_callback, state);
+       if (state->h == NULL) {
+               talloc_free(state);
+               return -1;
+       }
+
+       return 0;
+}
+
+
+/*
+  called when a CTDB_CONTROL_TRAVERSE_DATA control comes in. We then
+  call the traverse_all callback with the record
+
+  The payload is a struct ctdb_rec_data; its reqid selects the
+  ctdb_traverse_all_handle the record belongs to.  A record with an
+  empty key and empty data is counted in null_count; for non-persistent
+  databases such a record is only handed to the callback once
+  null_count equals ctdb_get_num_active_nodes().
+
+  Returns 0 when the record was accepted (including a swallowed empty
+  record) and -1 if the size is inconsistent or no matching traverse
+  handle is registered.
+
+  NOTE(review): only d->length is compared with data.dsize here;
+  keylen and datalen are not checked against the buffer - confirm the
+  sender is trusted or that the layout is validated elsewhere.
+ */
+int32_t ctdb_control_traverse_data(struct ctdb_context *ctdb, TDB_DATA data, TDB_DATA *outdata)
+{
+       struct ctdb_rec_data *d = (struct ctdb_rec_data *)data.dptr;
+       struct ctdb_traverse_all_handle *state;
+       TDB_DATA key;
+       ctdb_traverse_fn_t callback;
+       void *private_data;
+
+       if (data.dsize < sizeof(uint32_t) || data.dsize != d->length) {
+               DEBUG(DEBUG_ERR,("Bad record size in ctdb_control_traverse_data\n"));
+               return -1;
+       }
+
+       state = ctdb_reqid_find(ctdb, d->reqid, struct ctdb_traverse_all_handle);
+       if (state == NULL || d->reqid != state->reqid) {
+               /* traverse might have been terminated already */
+               return -1;
+       }
+
+       key.dsize = d->keylen;
+       key.dptr  = &d->data[0];
+       data.dsize = d->datalen; /* from here on 'data' describes the record value, not the control payload */
+       data.dptr = &d->data[d->keylen]; /* the value follows the key inside d->data */
+
+       if (key.dsize == 0 && data.dsize == 0) {
+               state->null_count++;
+               /* Persistent databases are only scanned on one node (the local
+                * node)
+                */
+               if (state->ctdb_db->persistent == 0) {
+                       if (state->null_count != ctdb_get_num_active_nodes(ctdb)) {
+                               return 0;
+                       }
+               }
+       }
+
+       callback = state->callback;
+       private_data = state->private_data;
+
+       callback(private_data, key, data);
+       return 0;
+}      
+
+/*
+  kill an in-progress traverse, used when a client disconnects
+
+  data carries a struct ctdb_traverse_start naming the database and the
+  reqid/srvid of the traverse to stop.  The first local traverse handle
+  of that database whose client_reqid and srvid match is freed.
+
+  Returns 0 if the database is known (whether or not a matching
+  traverse was found) and -1 for a truncated request or an unknown
+  database.
+ */
+int32_t ctdb_control_traverse_kill(struct ctdb_context *ctdb, TDB_DATA data, 
+                                  TDB_DATA *outdata, uint32_t srcnode)
+{
+       struct ctdb_traverse_start *d = (struct ctdb_traverse_start *)data.dptr;
+       struct ctdb_db_context *ctdb_db;
+       struct ctdb_traverse_local_handle *t;
+
+       /* never read request fields out of a buffer that is too short */
+       if (data.dptr == NULL || data.dsize < sizeof(*d)) {
+               DEBUG(DEBUG_ERR,(__location__ " Invalid size in ctdb_control_traverse_kill\n"));
+               return -1;
+       }
+
+       ctdb_db = find_ctdb_db(ctdb, d->db_id);
+       if (ctdb_db == NULL) {
+               return -1;
+       }
+
+       for (t=ctdb_db->traverse; t; t=t->next) {
+               if (t->client_reqid == d->reqid &&
+                   t->srvid == d->srvid) {
+                       /* t is gone after this, so stop before t->next is read */
+                       talloc_free(t);
+                       break;
+               }
+       }
+
+       return 0;
+}
+
+
+/*
+  talloc destructor of a traverse_start_state whose traverse is still
+  active, e.g. because the client disconnected during the traverse.
+  We need to notify all the nodes taking part in the search that they
+  should kill their traverse children, so a no-reply
+  CTDB_CONTROL_TRAVERSE_KILL is broadcast to the connected nodes.
+  Always returns 0.
+ */
+static int ctdb_traverse_start_destructor(struct traverse_start_state *state)
+{
+       struct ctdb_traverse_start kill_req;
+       TDB_DATA payload;
+
+       DEBUG(DEBUG_ERR,(__location__ " Traverse cancelled by client disconnect for database:0x%08x\n", state->db_id));
+
+       /* identify the traverse by database, request id and service id */
+       kill_req.srvid = state->srvid;
+       kill_req.reqid = state->reqid;
+       kill_req.db_id = state->db_id;
+
+       payload.dsize = sizeof(kill_req);
+       payload.dptr  = (uint8_t *)&kill_req;
+
+       ctdb_daemon_send_control(state->ctdb, CTDB_BROADCAST_CONNECTED, 0,
+                                CTDB_CONTROL_TRAVERSE_KILL,
+                                0, CTDB_CTRL_FLAG_NOREPLY, payload, NULL, NULL);
+       return 0;
+}
+
+/*
+  callback which sends records as messages to the client
+
+  Every record is marshalled and dispatched to state->srvid.  A record
+  with an empty key and empty data marks the end of the traverse: the
+  state is freed, and the destructor (which broadcasts TRAVERSE_KILL)
+  is only left in place if the traverse timed out.
+
+  NOTE(review): the marshalled record is allocated on 'state' and not
+  freed here, so it appears to live until the state is freed - confirm
+  whether ctdb_dispatch_message keeps a reference before changing this.
+ */
+static void traverse_start_callback(void *p, TDB_DATA key, TDB_DATA data)
+{
+       struct traverse_start_state *state = talloc_get_type(p, struct traverse_start_state);
+       bool end_marker = (key.dsize == 0 && data.dsize == 0);
+       struct ctdb_rec_data *rec;
+       TDB_DATA msg;
+
+       rec = ctdb_marshall_record(state, state->reqid, key, NULL, data);
+       if (rec == NULL) {
+               return;
+       }
+
+       msg.dsize = rec->length;
+       msg.dptr  = (uint8_t *)rec;
+
+       ctdb_dispatch_message(state->ctdb, state->srvid, msg);
+
+       if (!end_marker) {
+               state->num_records++;
+               return;
+       }
+
+       DEBUG(DEBUG_NOTICE, ("Ending traverse on DB %s (id %d), records %d\n",
+                            state->h->ctdb_db->db_name, state->h->reqid,
+                            state->num_records));
+
+       if (!state->h->timedout) {
+               /* regular end of traverse: nothing left to kill remotely */
+               talloc_set_destructor(state, NULL);
+       }
+       /* after a timeout the destructor still runs and sends TRAVERSE_KILL */
+       talloc_free(state);
+}
+
+
+/**
+ * start a traverse_all - called as a control from a client.
+ * extended version to take the "withemptyrecords" parameter.
+ *
+ * data must be exactly one struct ctdb_traverse_start_ext and
+ * client_id must resolve to a registered struct ctdb_client.
+ * The traverse state is allocated as a talloc child of that client
+ * and records are delivered through traverse_start_callback.
+ *
+ * Returns 0 when the traverse was started, -1 if the client is
+ * unknown, the request size is wrong, the database is unknown or
+ * unhealthy (and unhealthy reads are not allowed), or the traverse
+ * could not be set up.
+ *
+ * The destructor is installed only after ctdb_daemon_traverse_all()
+ * succeeded, so the failure path frees the state without broadcasting
+ * a TRAVERSE_KILL.
+ */
+int32_t ctdb_control_traverse_start_ext(struct ctdb_context *ctdb,
+                                       TDB_DATA data,
+                                       TDB_DATA *outdata,
+                                       uint32_t srcnode,
+                                       uint32_t client_id)
+{
+       struct ctdb_traverse_start_ext *d = (struct ctdb_traverse_start_ext *)data.dptr;
+       struct traverse_start_state *state;
+       struct ctdb_db_context *ctdb_db;
+       struct ctdb_client *client = ctdb_reqid_find(ctdb, client_id, struct ctdb_client);
+
+       if (client == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " No client found\n"));
+               return -1;              
+       }
+
+       if (data.dsize != sizeof(*d)) {
+               DEBUG(DEBUG_ERR,("Bad record size in ctdb_control_traverse_start\n"));
+               return -1;
+       }
+
+       ctdb_db = find_ctdb_db(ctdb, d->db_id);
+       if (ctdb_db == NULL) {
+               return -1;
+       }
+
+       if (ctdb_db->unhealthy_reason) {
+               if (ctdb->tunable.allow_unhealthy_db_read == 0) {
+                       DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_control_traverse_start: %s\n",
+                                       ctdb_db->db_name, ctdb_db->unhealthy_reason));
+                       return -1;
+               }
+               DEBUG(DEBUG_WARNING,("warn: db(%s) unhealty in ctdb_control_traverse_start: %s\n",
+                                    ctdb_db->db_name, ctdb_db->unhealthy_reason));
+       }
+
+       state = talloc(client, struct traverse_start_state); /* talloc child of the requesting client */
+       if (state == NULL) {
+               return -1;
+       }
+       
+       state->srcnode = srcnode;
+       state->reqid = d->reqid;
+       state->srvid = d->srvid;
+       state->db_id = d->db_id;
+       state->ctdb = ctdb;
+       state->withemptyrecords = d->withemptyrecords;
+       state->num_records = 0;
+
+       state->h = ctdb_daemon_traverse_all(ctdb_db, traverse_start_callback, state);
+       if (state->h == NULL) {
+               talloc_free(state); /* no destructor yet: nothing is broadcast */
+               return -1;
+       }
+
+       talloc_set_destructor(state, ctdb_traverse_start_destructor); /* armed only once the traverse is running */
+
+       return 0;
+}
+
+/**
+ * start a traverse_all - called as a control from a client.
+ *
+ * Compatibility entry point: the request is converted into a
+ * struct ctdb_traverse_start_ext with withemptyrecords = false and
+ * handed to ctdb_control_traverse_start_ext(), whose result is
+ * returned unchanged.
+ */
+int32_t ctdb_control_traverse_start(struct ctdb_context *ctdb,
+                                   TDB_DATA data,
+                                   TDB_DATA *outdata,
+                                   uint32_t srcnode,
+                                   uint32_t client_id)
+{
+       struct ctdb_traverse_start *legacy = (struct ctdb_traverse_start *)data.dptr;
+       struct ctdb_traverse_start_ext ext;
+       TDB_DATA ext_data;
+
+       ZERO_STRUCT(ext);
+       ext.withemptyrecords = false;
+       ext.srvid = legacy->srvid;
+       ext.reqid = legacy->reqid;
+       ext.db_id = legacy->db_id;
+
+       ext_data.dptr = (uint8_t *)&ext;
+       ext_data.dsize = sizeof(ext);
+
+       return ctdb_control_traverse_start_ext(ctdb, ext_data, outdata, srcnode, client_id);
+}
diff --git a/ctdb/server/ctdb_tunables.c b/ctdb/server/ctdb_tunables.c
new file mode 100644 (file)
index 0000000..f760cb5
--- /dev/null
@@ -0,0 +1,214 @@
+/* 
+   ctdb tunables code
+
+   Copyright (C) Andrew Tridgell  2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include "includes.h"
+#include "../include/ctdb_private.h"
+
+/*
+  table of all tunables: maps the external name to the uint32_t member
+  of struct ctdb_tunable that stores the value
+ */
+static const struct {
+       const char *name;       /* external name, matched case-insensitively by get/set */
+       uint32_t default_v;     /* value written by ctdb_tunables_set_defaults() */
+       size_t offset;          /* byte offset of the member inside struct ctdb_tunable */
+       bool obsolete;          /* true: left out of ctdb_control_list_tunables() output */
+} tunable_map[] = {
+       { "MaxRedirectCount",     3,  offsetof(struct ctdb_tunable, max_redirect_count), false },
+       { "SeqnumInterval",      1000,  offsetof(struct ctdb_tunable, seqnum_interval), false },
+       { "ControlTimeout",      60, offsetof(struct ctdb_tunable, control_timeout), false },
+       { "TraverseTimeout",     20, offsetof(struct ctdb_tunable, traverse_timeout), false },
+       { "KeepaliveInterval",    5,  offsetof(struct ctdb_tunable, keepalive_interval), false },
+       { "KeepaliveLimit",       5,  offsetof(struct ctdb_tunable, keepalive_limit), false },
+       { "RecoverTimeout",     120,  offsetof(struct ctdb_tunable, recover_timeout), false },
+       { "RecoverInterval",      1,  offsetof(struct ctdb_tunable, recover_interval), false },
+       { "ElectionTimeout",      3,  offsetof(struct ctdb_tunable, election_timeout), false },
+       { "TakeoverTimeout",      9,  offsetof(struct ctdb_tunable, takeover_timeout), false },
+       { "MonitorInterval",     15,  offsetof(struct ctdb_tunable, monitor_interval), false },
+       { "TickleUpdateInterval",20,  offsetof(struct ctdb_tunable, tickle_update_interval), false },
+       { "EventScriptTimeout",  30,  offsetof(struct ctdb_tunable, script_timeout), false },
+       { "EventScriptTimeoutCount", 20,  offsetof(struct ctdb_tunable, script_timeout_count), false },
+       { "EventScriptUnhealthyOnTimeout", 0, offsetof(struct ctdb_tunable, script_unhealthy_on_timeout), true },
+       { "RecoveryGracePeriod", 120,  offsetof(struct ctdb_tunable, recovery_grace_period), false },
+       { "RecoveryBanPeriod",  300,  offsetof(struct ctdb_tunable, recovery_ban_period), false },
+       { "DatabaseHashSize", 100001, offsetof(struct ctdb_tunable, database_hash_size), false },
+       { "DatabaseMaxDead",      5,  offsetof(struct ctdb_tunable, database_max_dead), false },
+       { "RerecoveryTimeout",   10,  offsetof(struct ctdb_tunable, rerecovery_timeout), false },
+       { "EnableBans",           1,  offsetof(struct ctdb_tunable, enable_bans), false },
+       { "DeterministicIPs",     0,  offsetof(struct ctdb_tunable, deterministic_public_ips), false },
+       { "LCP2PublicIPs",        1,  offsetof(struct ctdb_tunable, lcp2_public_ip_assignment), false },
+       { "ReclockPingPeriod",   60,  offsetof(struct ctdb_tunable,  reclock_ping_period), false },
+       { "NoIPFailback",         0,  offsetof(struct ctdb_tunable, no_ip_failback), false },
+       { "DisableIPFailover",    0,  offsetof(struct ctdb_tunable, disable_ip_failover), false },
+       { "VerboseMemoryNames",   0,  offsetof(struct ctdb_tunable, verbose_memory_names), false },
+       { "RecdPingTimeout",     60,  offsetof(struct ctdb_tunable, recd_ping_timeout), false },
+       { "RecdFailCount",       10,  offsetof(struct ctdb_tunable, recd_ping_failcount), false },
+       { "LogLatencyMs",         0,  offsetof(struct ctdb_tunable, log_latency_ms), false },
+       { "RecLockLatencyMs",  1000,  offsetof(struct ctdb_tunable, reclock_latency_ms), false },
+       { "RecoveryDropAllIPs", 120,  offsetof(struct ctdb_tunable, recovery_drop_all_ips), false },
+       { "VerifyRecoveryLock",   1,  offsetof(struct ctdb_tunable, verify_recovery_lock), false },
+       { "VacuumInterval",   10,  offsetof(struct ctdb_tunable, vacuum_interval), false },
+       { "VacuumMaxRunTime",     120,  offsetof(struct ctdb_tunable, vacuum_max_run_time), false },
+       { "RepackLimit",      10000,  offsetof(struct ctdb_tunable, repack_limit), false },
+       { "VacuumLimit",       5000,  offsetof(struct ctdb_tunable, vacuum_limit), false },
+       { "VacuumFastPathCount", 60, offsetof(struct ctdb_tunable, vacuum_fast_path_count), false },
+       { "MaxQueueDropMsg",  1000000, offsetof(struct ctdb_tunable, max_queue_depth_drop_msg), false },
+       { "UseStatusEvents",     0,  offsetof(struct ctdb_tunable, use_status_events_for_monitoring), false },
+       { "AllowUnhealthyDBRead", 0,  offsetof(struct ctdb_tunable, allow_unhealthy_db_read), false },
+       { "StatHistoryInterval",  1,  offsetof(struct ctdb_tunable, stat_history_interval), false },
+       { "DeferredAttachTO",  120,  offsetof(struct ctdb_tunable, deferred_attach_timeout), false },
+       { "AllowClientDBAttach", 1, offsetof(struct ctdb_tunable, allow_client_db_attach), false },
+       { "RecoverPDBBySeqNum",  1, offsetof(struct ctdb_tunable, recover_pdb_by_seqnum), false },
+       /* the next five rows used to rely on implicit zero-initialisation of 'obsolete' */
+       { "DeferredRebalanceOnNodeAdd", 300, offsetof(struct ctdb_tunable, deferred_rebalance_on_node_add), false },
+       { "FetchCollapse",       1, offsetof(struct ctdb_tunable, fetch_collapse), false },
+       { "HopcountMakeSticky",   50,  offsetof(struct ctdb_tunable, hopcount_make_sticky), false },
+       { "StickyDuration",      600,  offsetof(struct ctdb_tunable, sticky_duration), false },
+       { "StickyPindown",       200,  offsetof(struct ctdb_tunable, sticky_pindown), false },
+       { "NoIPTakeover",         0,  offsetof(struct ctdb_tunable, no_ip_takeover), false },
+       { "DBRecordCountWarn",    100000,  offsetof(struct ctdb_tunable, db_record_count_warn), false },
+       { "DBRecordSizeWarn",   10000000,  offsetof(struct ctdb_tunable, db_record_size_warn), false },
+       { "DBSizeWarn",        100000000,  offsetof(struct ctdb_tunable, db_size_warn), false },
+       { "PullDBPreallocation", 10*1024*1024,  offsetof(struct ctdb_tunable, pulldb_preallocation_size), false },
+       { "NoIPHostOnAllDisabled",    0,  offsetof(struct ctdb_tunable, no_ip_host_on_all_disabled), false },
+       { "Samba3AvoidDeadlocks", 0, offsetof(struct ctdb_tunable, samba3_hack), false },
+};
+
+/*
+  set all tunables to defaults
+
+  walks tunable_map and stores each entry's default_v into the
+  uint32_t found at the entry's offset inside ctdb->tunable
+ */
+void ctdb_tunables_set_defaults(struct ctdb_context *ctdb)
+{
+       uint8_t *base = (uint8_t *)&ctdb->tunable;
+       int idx;
+
+       for (idx = 0; idx < ARRAY_SIZE(tunable_map); idx++) {
+               uint32_t *slot = (uint32_t *)(base + tunable_map[idx].offset);
+
+               *slot = tunable_map[idx].default_v;
+       }
+}
+
+
+/*
+  get a tunable
+
+  indata is a struct ctdb_control_get_tunable holding the tunable name
+  (length bytes, matched case-insensitively).  On success outdata
+  receives a freshly allocated uint32_t with the current value.
+
+  Returns 0 on success, -EINVAL for an unknown name and -1 for a
+  malformed request.
+ */
+int32_t ctdb_control_get_tunable(struct ctdb_context *ctdb, TDB_DATA indata, 
+                                TDB_DATA *outdata)
+{
+       struct ctdb_control_get_tunable *req =
+               (struct ctdb_control_get_tunable *)indata.dptr;
+       uint8_t *base = (uint8_t *)&ctdb->tunable;
+       char *name;
+       int idx;
+
+       if (indata.dsize < sizeof(*req) ||
+           req->length > indata.dsize - offsetof(struct ctdb_control_get_tunable, name)) {
+               DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_get_tunable\n"));
+               return -1;
+       }
+
+       /* private, NUL-terminated copy of the name for strcasecmp */
+       name = talloc_strndup(ctdb, (char*)req->name, req->length);
+       CTDB_NO_MEMORY(ctdb, name);
+
+       idx = 0;
+       while (idx < ARRAY_SIZE(tunable_map) &&
+              strcasecmp(name, tunable_map[idx].name) != 0) {
+               idx++;
+       }
+       talloc_free(name);
+
+       if (idx == ARRAY_SIZE(tunable_map)) {
+               return -EINVAL;
+       }
+
+       outdata->dptr = (uint8_t *)talloc(outdata, uint32_t);
+       CTDB_NO_MEMORY(ctdb, outdata->dptr);
+
+       outdata->dsize = sizeof(uint32_t);
+       *(uint32_t *)outdata->dptr = *(uint32_t *)(base + tunable_map[idx].offset);
+
+       return 0;
+}
+
+
+/*
+  set a tunable
+
+  indata is a struct ctdb_control_set_tunable holding the tunable name
+  (length bytes, matched case-insensitively) and the new value.
+
+  Returns 0 on success and -1 for a malformed request, an unknown
+  name, or an attempt to enable VerifyRecoveryLock while no recovery
+  lock file is configured.
+ */
+int32_t ctdb_control_set_tunable(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+       struct ctdb_control_set_tunable *t = 
+               (struct ctdb_control_set_tunable *)indata.dptr;
+       char *name;
+       int i;
+
+       if (indata.dsize < sizeof(*t) ||
+           t->length > indata.dsize - offsetof(struct ctdb_control_set_tunable, name)) {
+               DEBUG(DEBUG_ERR,("Bad indata in ctdb_control_set_tunable\n"));
+               return -1;
+       }
+
+       name = talloc_strndup(ctdb, (char *)t->name, t->length);
+       CTDB_NO_MEMORY(ctdb, name);
+
+       for (i=0;i<ARRAY_SIZE(tunable_map);i++) {
+               if (strcasecmp(name, tunable_map[i].name) == 0) break;
+       }
+
+       /*
+        * The lookup above ignores case, so this guard has to as well:
+        * with strcmp() a request for e.g. "verifyrecoverylock" would
+        * skip the check and still set the tunable.
+        */
+       if (strcasecmp(name, "VerifyRecoveryLock") == 0 && t->value != 0
+           && ctdb->recovery_lock_file == NULL) {
+               DEBUG(DEBUG_ERR,("Can not activate tunable \"VerifyRecoveryLock\" since there is no recovery lock file set.\n"));
+               talloc_free(name);
+               return -1;
+       }
+
+       talloc_free(name);
+       
+       if (i == ARRAY_SIZE(tunable_map)) {
+               return -1;
+       }
+
+       *(uint32_t *)(tunable_map[i].offset + (uint8_t*)&ctdb->tunable) = t->value;
+
+       return 0;
+}
+
+/*
+  list tunables
+
+  builds a ':'-separated string of tunable names (entries after the
+  first that are flagged obsolete are left out) and returns it in
+  outdata as a struct ctdb_control_list_tunable whose length includes
+  the terminating NUL.
+
+  Returns 0 on success.
+ */
+int32_t ctdb_control_list_tunables(struct ctdb_context *ctdb, TDB_DATA *outdata)
+{
+       struct ctdb_control_list_tunable *reply;
+       char *list = NULL;
+       size_t list_len;
+       int idx;
+
+       /* the first name starts the list, the others are appended with ':' */
+       list = talloc_strdup(outdata, tunable_map[0].name);
+       CTDB_NO_MEMORY(ctdb, list);
+
+       for (idx = 1; idx < ARRAY_SIZE(tunable_map); idx++) {
+               if (!tunable_map[idx].obsolete) {
+                       list = talloc_asprintf_append(list, ":%s", tunable_map[idx].name);
+                       CTDB_NO_MEMORY(ctdb, list);
+               }
+       }
+
+       list_len = strlen(list) + 1;
+
+       outdata->dsize = offsetof(struct ctdb_control_list_tunable, data) + list_len;
+       outdata->dptr = talloc_size(outdata, outdata->dsize);
+       CTDB_NO_MEMORY(ctdb, outdata->dptr);
+
+       reply = (struct ctdb_control_list_tunable *)outdata->dptr;
+       reply->length = list_len;
+
+       memcpy(reply->data, list, reply->length);
+       talloc_free(list);
+
+       return 0;
+}
diff --git a/ctdb/server/ctdb_update_record.c b/ctdb/server/ctdb_update_record.c
new file mode 100644 (file)
index 0000000..7bfa08a
--- /dev/null
@@ -0,0 +1,355 @@
+/* 
+   implementation of the update record control
+
+   Copyright (C) Andrew Tridgell  2007
+   Copyright (C) Ronnie Sahlberg  2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "db_wrap.h"
+#include "tdb.h"
+#include "ctdb_private.h"
+
+struct ctdb_persistent_write_state {
+       struct ctdb_db_context *ctdb_db; /* database the records are written to */
+       struct ctdb_marshall_buffer *m; /* marshalled records to store */
+       struct ctdb_req_control *c; /* control request that receives the reply */
+       uint32_t flags; /* UPDATE_FLAGS_* bits */
+};
+
+/* don't create/update records that do not exist locally */
+#define UPDATE_FLAGS_REPLACE_ONLY      1
+
+/*
+  called from a child process to write the data
+
+  Stores every record of state->m into the local tdb inside one tdb
+  transaction.  With UPDATE_FLAGS_REPLACE_ONLY set, records for which
+  tdb_fetch() returns no data are skipped.  If the stored copy of a
+  record has an RSN that is not lower than the new one while the data
+  differs, the whole transaction is cancelled.
+
+  Returns 0 after a successful commit and -1 on any failure.
+
+  NOTE(review): in the REPLACE_ONLY path trec.dptr is not freed when
+  trec.dsize == 0; that is only leak-free if tdb_fetch() returns a
+  NULL dptr in that case - confirm.
+ */
+static int ctdb_persistent_store(struct ctdb_persistent_write_state *state)
+{
+       int ret, i;
+       struct ctdb_rec_data *rec = NULL;
+       struct ctdb_marshall_buffer *m = state->m;
+
+       ret = tdb_transaction_start(state->ctdb_db->ltdb->tdb);
+       if (ret == -1) {
+               DEBUG(DEBUG_ERR,("Failed to start transaction for db_id 0x%08x in ctdb_persistent_store\n",
+                                state->ctdb_db->db_id));
+               return -1;
+       }
+
+       for (i=0;i<m->count;i++) {
+               struct ctdb_ltdb_header oldheader;
+               struct ctdb_ltdb_header header;
+               TDB_DATA key, data, olddata;
+               TALLOC_CTX *tmp_ctx = talloc_new(state); /* per-record scratch context, freed on every path below */
+
+               rec = ctdb_marshall_loop_next(m, rec, NULL, &header, &key, &data);
+
+               if (rec == NULL) {
+                       DEBUG(DEBUG_ERR,("Failed to get next record %d for db_id 0x%08x in ctdb_persistent_store\n",
+                                        i, state->ctdb_db->db_id));
+                       talloc_free(tmp_ctx);
+                       goto failed;
+               }
+
+               /* we must check if the record exists or not because
+                  ctdb_ltdb_fetch will unconditionally create a record
+                */
+               if (state->flags & UPDATE_FLAGS_REPLACE_ONLY) {
+                       TDB_DATA trec;
+                       trec = tdb_fetch(state->ctdb_db->ltdb->tdb, key);
+                       if (trec.dsize == 0) {
+                               talloc_free(tmp_ctx);
+                               continue;
+                       }
+                       free(trec.dptr);
+               }
+
+               /* fetch the old header and ensure the rsn is less than the new rsn */
+               ret = ctdb_ltdb_fetch(state->ctdb_db, key, &oldheader, tmp_ctx, &olddata);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to fetch old record for db_id 0x%08x in ctdb_persistent_store\n",
+                                        state->ctdb_db->db_id));
+                       talloc_free(tmp_ctx);
+                       goto failed;
+               }
+
+               if (oldheader.rsn >= header.rsn &&
+                   (olddata.dsize != data.dsize ||
+                    memcmp(olddata.dptr, data.dptr, data.dsize) != 0)) { /* identical data is let through even with an old RSN */
+                       DEBUG(DEBUG_CRIT,("existing header for db_id 0x%08x has larger RSN %llu than new RSN %llu in ctdb_persistent_store\n",
+                                         state->ctdb_db->db_id,
+                                         (unsigned long long)oldheader.rsn, (unsigned long long)header.rsn));
+                       talloc_free(tmp_ctx);
+                       goto failed;
+               }
+
+               talloc_free(tmp_ctx);
+
+               ret = ctdb_ltdb_store(state->ctdb_db, key, &header, data);
+               if (ret != 0) {
+                       DEBUG(DEBUG_CRIT,("Failed to store record for db_id 0x%08x in ctdb_persistent_store\n",
+                                         state->ctdb_db->db_id));
+                       goto failed;
+               }
+       }
+
+       ret = tdb_transaction_commit(state->ctdb_db->ltdb->tdb);
+       if (ret == -1) {
+               DEBUG(DEBUG_ERR,("Failed to commit transaction for db_id 0x%08x in ctdb_persistent_store\n",
+                                state->ctdb_db->db_id));
+               return -1;
+       }
+
+       return 0;
+
+failed:
+       tdb_transaction_cancel(state->ctdb_db->ltdb->tdb); /* drop everything stored so far in this transaction */
+       return -1;
+}
+
+
+/*
+  called when the child has completed the persistent write
+  on our behalf: the child's status is sent as the reply to the
+  pending control and the write state is released
+ */
+static void ctdb_persistent_write_callback(int status, void *private_data)
+{
+       struct ctdb_persistent_write_state *wstate;
+
+       wstate = talloc_get_type(private_data, struct ctdb_persistent_write_state);
+
+       ctdb_request_control_reply(wstate->ctdb_db->ctdb, wstate->c, NULL, status, NULL);
+       talloc_free(wstate);
+}
+
+/*
+  called if our lockwait child times out: the pending control is
+  answered with status -1 and a timeout message, then the write state
+  is released
+ */
+static void ctdb_persistent_lock_timeout(struct event_context *ev, struct timed_event *te,
+                                        struct timeval t, void *private_data)
+{
+       struct ctdb_persistent_write_state *wstate;
+       struct ctdb_context *ctdb;
+
+       wstate = talloc_get_type(private_data, struct ctdb_persistent_write_state);
+       ctdb = wstate->ctdb_db->ctdb;
+
+       ctdb_request_control_reply(ctdb, wstate->c, NULL, -1, "timeout in ctdb_persistent_lock");
+       talloc_free(wstate);
+}
+
+struct childwrite_handle {
+       struct ctdb_context *ctdb; /* daemon context, used for statistics and ctdb_kill() */
+       struct ctdb_db_context *ctdb_db; /* database the child writes to */
+       struct fd_event *fde; /* read event on fd[0] */
+       int fd[2]; /* pipe: the parent reads fd[0], the child writes its status byte to fd[1] */
+       pid_t child; /* pid of the forked writer */
+       void *private_data; /* passed back to callback */
+       void (*callback)(int, void *); /* called with the status byte and private_data */
+       struct timeval start_time; /* start of the write, used for the latency statistic */
+};
+
+/*
+  talloc destructor of a childwrite_handle that is still pending:
+  take it out of the pending statistic and kill the writer child
+ */
+static int childwrite_destructor(struct childwrite_handle *handle)
+{
+       struct ctdb_context *ctdb = handle->ctdb;
+
+       CTDB_DECREMENT_STAT(ctdb, pending_childwrite_calls);
+       ctdb_kill(ctdb, handle->child, SIGKILL);
+
+       return 0;
+}
+
+/* called when the child process has finished writing the record to the
+   database
+
+   Reads the one-byte status from the pipe (a failed or short read is
+   reported as status 1), passes it to the stored callback and then
+   kills the child and frees the handle.  Callback, private data and
+   child pid are copied out of the handle before anything is freed.
+*/
+static void childwrite_handler(struct event_context *ev, struct fd_event *fde,
+                            uint16_t flags, void *private_data)
+{
+       struct childwrite_handle *h = talloc_get_type(private_data,
+                                                    struct childwrite_handle);
+       void *p = h->private_data;
+       void (*callback)(int, void *) = h->callback;
+       pid_t child = h->child;
+       TALLOC_CTX *tmp_ctx = talloc_new(ev);
+       int ret;
+       char c;
+
+       CTDB_UPDATE_LATENCY(h->ctdb, h->ctdb_db, "persistent", childwrite_latency, h->start_time);
+       CTDB_DECREMENT_STAT(h->ctdb, pending_childwrite_calls);
+
+       /* the handle needs to go away when the context is gone - when
+          the handle goes away this implicitly closes the pipe, which
+          kills the child */
+       talloc_steal(tmp_ctx, h);
+
+       talloc_set_destructor(h, NULL); /* the stat was already decremented above and the child is killed below */
+
+       ret = read(h->fd[0], &c, 1);
+       if (ret < 1) {
+               DEBUG(DEBUG_ERR, (__location__ " Read returned %d. Childwrite failed\n", ret));
+               c = 1;
+       }
+
+       callback(c, p);
+
+       ctdb_kill(h->ctdb, child, SIGKILL); /* h is still valid here: it now belongs to tmp_ctx */
+       talloc_free(tmp_ctx);
+}
+
+/* this creates a child process which will take out a tdb transaction
+   and write the record to the database.
+
+   The child runs ctdb_persistent_store() and reports the result as a
+   single byte (0 = success, 1 = failure) on a pipe.  The parent
+   watches the read end; childwrite_handler passes the byte to
+   callback together with state.
+
+   Returns the handle (a talloc child of state) or NULL on failure.
+   On every failure path pending_childwrite_calls is decremented
+   exactly once: explicitly while no destructor is installed, and by
+   childwrite_destructor afterwards.
+*/
+static struct childwrite_handle *ctdb_childwrite(
+                               struct ctdb_db_context *ctdb_db,
+                               void (*callback)(int, void *private_data),
+                               struct ctdb_persistent_write_state *state)
+{
+       struct childwrite_handle *result;
+       int ret;
+       pid_t parent = getpid();
+
+       CTDB_INCREMENT_STAT(ctdb_db->ctdb, childwrite_calls);
+       CTDB_INCREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);
+
+       if (!(result = talloc_zero(state, struct childwrite_handle))) {
+               CTDB_DECREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);
+               return NULL;
+       }
+
+       ret = pipe(result->fd);
+
+       if (ret != 0) {
+               talloc_free(result);
+               CTDB_DECREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);
+               return NULL;
+       }
+
+       result->child = ctdb_fork(ctdb_db->ctdb);
+
+       if (result->child == (pid_t)-1) {
+               close(result->fd[0]);
+               close(result->fd[1]);
+               talloc_free(result);
+               CTDB_DECREMENT_STAT(ctdb_db->ctdb, pending_childwrite_calls);
+               return NULL;
+       }
+
+       result->callback = callback;
+       result->private_data = state;
+       result->ctdb = ctdb_db->ctdb;
+       result->ctdb_db = ctdb_db;
+
+       if (result->child == 0) {
+               /* child: do the write, report one status byte, then linger */
+               char c = 0;
+
+               close(result->fd[0]);
+               ctdb_set_process_name("ctdb_write_persistent");
+               debug_extra = talloc_asprintf(NULL, "childwrite-%s:", ctdb_db->db_name);
+               ret = ctdb_persistent_store(state);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, (__location__ " Failed to write persistent data\n"));
+                       c = 1;
+               }
+
+               write(result->fd[1], &c, 1);
+
+               /* make sure we die when our parent dies */
+               while (ctdb_kill(ctdb_db->ctdb, parent, 0) == 0 || errno != ESRCH) {
+                       sleep(5);
+               }
+               _exit(0);
+       }
+
+       /* parent: keep only the read end of the pipe */
+       close(result->fd[1]);
+       set_close_on_exec(result->fd[0]);
+
+       talloc_set_destructor(result, childwrite_destructor);
+
+       DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d for ctdb_childwrite\n", result->fd[0]));
+
+       result->fde = event_add_fd(ctdb_db->ctdb->ev, result, result->fd[0],
+                                  EVENT_FD_READ, childwrite_handler,
+                                  (void *)result);
+       if (result->fde == NULL) {
+               /*
+                * No fd event owns the read end yet, so close it by hand.
+                * The destructor installed above already decrements
+                * pending_childwrite_calls and kills the child, so the
+                * statistic must not be decremented a second time here.
+                */
+               close(result->fd[0]);
+               talloc_free(result);
+               return NULL;
+       }
+       tevent_fd_set_auto_close(result->fde);
+
+       result->start_time = timeval_current();
+
+       return result;
+}
+
+/*
+   update a record on this node if the new record has a higher rsn than the
+   current record
+
+   recdata is a struct ctdb_marshall_buffer naming the database and
+   carrying the records.  The write is done by a child process (see
+   ctdb_childwrite); on success *async_reply is set to true and the
+   reply is sent later, either from ctdb_persistent_write_callback or
+   from ctdb_persistent_lock_timeout after control_timeout seconds.
+   For non-persistent databases UPDATE_FLAGS_REPLACE_ONLY is set.
+
+   Returns 0 when the child was started and -1 if recovery is active,
+   the database is unknown or unhealthy, or the child could not be set
+   up.
+
+   NOTE(review): recdata.dsize is not checked before m->db_id is read,
+   and the result of event_add_timed() is not checked - confirm that
+   callers validate the size and that a missing timer is acceptable.
+ */
+int32_t ctdb_control_update_record(struct ctdb_context *ctdb,
+                                  struct ctdb_req_control *c, TDB_DATA recdata,
+                                  bool *async_reply)
+{
+       struct ctdb_db_context *ctdb_db;
+       struct ctdb_persistent_write_state *state;
+       struct childwrite_handle *handle;
+       struct ctdb_marshall_buffer *m = (struct ctdb_marshall_buffer *)recdata.dptr;
+
+       if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
+               DEBUG(DEBUG_INFO,("rejecting ctdb_control_update_record when recovery active\n"));
+               return -1;
+       }
+
+       ctdb_db = find_ctdb_db(ctdb, m->db_id);
+       if (ctdb_db == NULL) {
+               DEBUG(DEBUG_ERR,("Unknown database 0x%08x in ctdb_control_update_record\n", m->db_id));
+               return -1;
+       }
+
+       if (ctdb_db->unhealthy_reason) {
+               DEBUG(DEBUG_ERR,("db(%s) unhealty in ctdb_control_update_record: %s\n",
+                                ctdb_db->db_name, ctdb_db->unhealthy_reason));
+               return -1;
+       }
+
+       state = talloc(ctdb, struct ctdb_persistent_write_state);
+       CTDB_NO_MEMORY(ctdb, state);
+
+       state->ctdb_db = ctdb_db;
+       state->c       = c;
+       state->m       = m;
+       state->flags   = 0;
+       if (!ctdb_db->persistent) {
+               state->flags   = UPDATE_FLAGS_REPLACE_ONLY; /* non-persistent dbs: only replace records that exist locally */
+       }
+
+       /* create a child process to take out a transaction and
+          write the data.
+       */
+       handle = ctdb_childwrite(ctdb_db, ctdb_persistent_write_callback, state);
+       if (handle == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to setup childwrite handler in ctdb_control_update_record\n"));
+               talloc_free(state);
+               return -1;
+       }
+
+       /* we need to wait for the replies */
+       *async_reply = true;
+
+       /* need to keep the control structure around */
+       talloc_steal(state, c);
+
+       /* but we won't wait forever */
+       event_add_timed(ctdb->ev, state, timeval_current_ofs(ctdb->tunable.control_timeout, 0),
+                       ctdb_persistent_lock_timeout, state); /* the timer is a talloc child of state */
+
+       return 0;
+}
+
diff --git a/ctdb/server/ctdb_uptime.c b/ctdb/server/ctdb_uptime.c
new file mode 100644 (file)
index 0000000..b45ea80
--- /dev/null
@@ -0,0 +1,45 @@
+/* 
+   ctdb uptime code
+
+   Copyright (C) Ronnie Sahlberg 2008
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "../include/ctdb_private.h"
+#include "system/syslog.h"
+#include "system/time.h"
+#include "system/filesys.h"
+
+/*
+   Control handler that reports the ctdb uptime.
+
+   A zeroed struct ctdb_uptime is allocated with outdata as its talloc
+   parent, filled with the current time plus the start/recovery
+   timestamps kept in the ctdb context, and handed back via outdata.
+
+   Returns 0 on success; CTDB_NO_MEMORY() handles allocation failure.
+*/
+int32_t ctdb_control_uptime(struct ctdb_context *ctdb, TDB_DATA *outdata)
+{
+       struct ctdb_uptime *uptime = talloc_zero(outdata, struct ctdb_uptime);
+
+       CTDB_NO_MEMORY(ctdb, uptime);
+
+       /* timestamps recorded by the daemon itself */
+       uptime->last_recovery_finished = ctdb->last_recovery_finished;
+       uptime->last_recovery_started  = ctdb->last_recovery_started;
+       uptime->ctdbd_start_time       = ctdb->ctdbd_start_time;
+
+       /* "now", as seen by this node */
+       gettimeofday(&uptime->current_time, NULL);
+
+       outdata->dptr  = (uint8_t *)uptime;
+       outdata->dsize = sizeof(*uptime);
+
+       return 0;
+}
diff --git a/ctdb/server/ctdb_vacuum.c b/ctdb/server/ctdb_vacuum.c
new file mode 100644 (file)
index 0000000..d07afd4
--- /dev/null
@@ -0,0 +1,1935 @@
+/*
+   ctdb vacuuming events
+
+   Copyright (C) Ronnie Sahlberg  2009
+   Copyright (C) Michael Adam 2010-2013
+   Copyright (C) Stefan Metzmacher 2010-2011
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "tdb.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "system/dir.h"
+#include "../include/ctdb_private.h"
+#include "db_wrap.h"
+#include "lib/util/dlinklist.h"
+#include "../include/ctdb_private.h"
+#include "../common/rb_tree.h"
+
+#define TIMELIMIT() timeval_current_ofs(10, 0)
+
+/* status values stored in ctdb_vacuum_child_context.status */
+enum vacuum_child_status {
+       VACUUM_RUNNING,
+       VACUUM_OK,
+       VACUUM_ERROR,
+       VACUUM_TIMEOUT
+};
+
+/*
+ * Book-keeping for one vacuuming child process.
+ * NOTE(review): field meanings below are taken from the names and the
+ * one original comment; the code using them is outside this excerpt.
+ */
+struct ctdb_vacuum_child_context {
+       /* presumably dlinklist linkage (lib/util/dlinklist.h is included) */
+       struct ctdb_vacuum_child_context *next, *prev;
+       /* handle this child belongs to */
+       struct ctdb_vacuum_handle *vacuum_handle;
+       /* fd child writes status to */
+       int fd[2];
+       /* pid of the child process */
+       pid_t child_pid;
+       /* one of enum vacuum_child_status */
+       enum vacuum_child_status status;
+       /* presumably the time the child was started -- TODO confirm */
+       struct timeval start_time;
+};
+
+/*
+ * Per-database vacuuming handle.
+ * NOTE(review): usage is outside this excerpt; comments follow the names.
+ */
+struct ctdb_vacuum_handle {
+       /* database this handle vacuums */
+       struct ctdb_db_context *ctdb_db;
+       /* child context associated with this handle */
+       struct ctdb_vacuum_child_context *child_ctx;
+       /*
+        * presumably counts fast-path runs since the last full run
+        * (cf. tunable VacuumFastPathCount) -- TODO confirm
+        */
+       uint32_t fast_path_count;
+};
+
+
+/*
+ * State of one vacuuming run on a database: the lists of records to
+ * possibly delete, plus statistics counters for the individual phases.
+ */
+struct vacuum_data {
+       uint32_t vacuum_limit;
+       uint32_t repack_limit;
+       struct ctdb_context *ctdb;
+       struct ctdb_db_context *ctdb_db;
+       struct tdb_context *dest_db;
+       /* tree of struct delete_record_data, indexed by ctdb_hash(key) */
+       trbt_tree_t *delete_list;
+       /* number of records added to delete_list */
+       uint32_t delete_count;
+       /* one marshall buffer per node, indexed by the record's lmaster */
+       struct ctdb_marshall_buffer **vacuum_fetch_list;
+       struct timeval start;
+       /* set when adding a record to a vacuum fetch list failed */
+       bool traverse_error;
+       bool vacuum;
+       /* number of records added to the vacuum fetch lists */
+       uint32_t total;
+       uint32_t vacuumed;
+       uint32_t copied;
+       /* fast_*: statistics of the delete_queue traverse (fast path) */
+       uint32_t fast_added_to_vacuum_fetch_list;
+       uint32_t fast_added_to_delete_list;
+       uint32_t fast_deleted;
+       uint32_t fast_skipped;
+       uint32_t fast_error;
+       uint32_t fast_total;
+       /* full_*: statistics of the full database traverse */
+       uint32_t full_added_to_vacuum_fetch_list;
+       uint32_t full_added_to_delete_list;
+       uint32_t full_skipped;
+       uint32_t full_error;
+       uint32_t full_total;
+       /* delete_*: statistics of processing the delete list */
+       /* starts at delete_count, decremented per locally deleted record */
+       uint32_t delete_left;
+       uint32_t delete_remote_error;
+       uint32_t delete_local_error;
+       uint32_t delete_deleted;
+       uint32_t delete_skipped;
+};
+
+/*
+ * this structure contains the information for one record to be deleted
+ *
+ * It is allocated with room for the whole key behind the struct
+ * (offsetof(keydata) + key size), and key.dptr points at keydata, so
+ * the entry owns a private copy of the key.
+ */
+struct delete_record_data {
+       struct ctdb_context *ctdb;
+       struct ctdb_db_context *ctdb_db;
+       /* copy of the record header taken when the entry was created */
+       struct ctdb_ltdb_header hdr;
+       /* key of the record; dptr refers to keydata below */
+       TDB_DATA key;
+       /* first byte of the inline key storage (variable length) */
+       uint8_t keydata[1];
+};
+
+/*
+ * Parameter block for the delete_marshall_traverse*() callbacks:
+ * the blob being built and the vacuum run it belongs to.
+ */
+struct delete_records_list {
+       /* marshall buffer the traversed records are appended to */
+       struct ctdb_marshall_buffer *records;
+       /* owning vacuum run; used for the delete_* statistics */
+       struct vacuum_data *vdata;
+};
+
+/**
+ * Create a delete_record_data entry holding a private copy of the key
+ * and of the header, and insert it into the given tree under the
+ * 32 bit hash of the key.
+ *
+ * The entry is a talloc child of the tree.
+ *
+ * Returns 0 on success, -1 if the entry could not be allocated.
+ */
+static int insert_delete_record_data_into_tree(struct ctdb_context *ctdb,
+                                              struct ctdb_db_context *ctdb_db,
+                                              trbt_tree_t *tree,
+                                              const struct ctdb_ltdb_header *hdr,
+                                              TDB_DATA key)
+{
+       /* struct up to keydata, followed by the key bytes themselves */
+       size_t alloc_len = offsetof(struct delete_record_data, keydata) + key.dsize;
+       struct delete_record_data *entry;
+
+       entry = (struct delete_record_data *)talloc_size(tree, alloc_len);
+       if (entry == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+               return -1;
+       }
+       /* name the chunk so that talloc_get_type() accepts it later */
+       talloc_set_name_const(entry, "struct delete_record_data");
+
+       entry->ctdb    = ctdb;
+       entry->ctdb_db = ctdb_db;
+       entry->hdr     = *hdr;
+
+       /* keep our own copy of the key inside the entry */
+       memcpy(entry->keydata, key.dptr, key.dsize);
+       entry->key.dptr  = entry->keydata;
+       entry->key.dsize = key.dsize;
+
+       trbt_insert32(tree, ctdb_hash(&key), entry);
+
+       return 0;
+}
+
+/**
+ * Queue a record (key + header) on vdata->delete_list.
+ *
+ * The list is indexed by key hash only: if an entry with the same hash
+ * is already present the record is silently left out (return 0).
+ *
+ * Returns 0 on success or skip, -1 if the entry could not be inserted.
+ */
+static int add_record_to_delete_list(struct vacuum_data *vdata, TDB_DATA key,
+                                    struct ctdb_ltdb_header *hdr)
+{
+       uint32_t key_hash = ctdb_hash(&key);
+
+       if (trbt_lookup32(vdata->delete_list, key_hash) != NULL) {
+               DEBUG(DEBUG_INFO, (__location__ " Hash collission when vacuuming, skipping this record.\n"));
+               return 0;
+       }
+
+       if (insert_delete_record_data_into_tree(vdata->ctdb, vdata->ctdb_db,
+                                               vdata->delete_list,
+                                               hdr, key) != 0) {
+               return -1;
+       }
+
+       vdata->delete_count++;
+
+       return 0;
+}
+
+/**
+ * Add a record to the list of records to be sent
+ * to their lmaster with VACUUM_FETCH.
+ *
+ * The key is marshalled (without header and data) and appended to the
+ * marshall buffer vdata->vacuum_fetch_list[lmaster].
+ *
+ * Returns 0 on success. On failure -1 is returned and
+ * vdata->traverse_error is set; the existing buffer is left untouched.
+ */
+static int add_record_to_vacuum_fetch_list(struct vacuum_data *vdata,
+                                          TDB_DATA key)
+{
+       struct ctdb_context *ctdb = vdata->ctdb;
+       struct ctdb_rec_data *rec;
+       uint32_t lmaster;
+       size_t old_size;
+       struct ctdb_marshall_buffer *vfl;
+
+       lmaster = ctdb_lmaster(ctdb, &key);
+
+       /*
+        * lmaster is used as an array index below. Not every caller
+        * validates it (the delete_queue traverse does not), so check
+        * it here instead of reading past the end of the array.
+        */
+       if (lmaster >= ctdb->num_nodes) {
+               DEBUG(DEBUG_CRIT, (__location__
+                                  " lmaster[%u] >= ctdb->num_nodes[%u] for key"
+                                  " with hash[%u]!\n",
+                                  (unsigned)lmaster,
+                                  (unsigned)ctdb->num_nodes,
+                                  (unsigned)ctdb_hash(&key)));
+               vdata->traverse_error = true;
+               return -1;
+       }
+
+       vfl = vdata->vacuum_fetch_list[lmaster];
+
+       rec = ctdb_marshall_record(vfl, ctdb->pnn, key, NULL, tdb_null);
+       if (rec == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+               vdata->traverse_error = true;
+               return -1;
+       }
+
+       old_size = talloc_get_size(vfl);
+       vfl = talloc_realloc_size(NULL, vfl, old_size + rec->length);
+       if (vfl == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to expand\n"));
+               /*
+                * The old buffer is still referenced by
+                * vdata->vacuum_fetch_list[lmaster]; only drop the
+                * temporary record so it does not pile up there.
+                */
+               talloc_free(rec);
+               vdata->traverse_error = true;
+               return -1;
+       }
+       vdata->vacuum_fetch_list[lmaster] = vfl;
+
+       /* append the marshalled record behind the old contents */
+       vfl->count++;
+       memcpy(old_size+(uint8_t *)vfl, rec, rec->length);
+       talloc_free(rec);
+
+       vdata->total++;
+
+       return 0;
+}
+
+
+static void ctdb_vacuum_event(struct event_context *ev, struct timed_event *te,
+                             struct timeval t, void *private_data);
+
+
+/*
+ * tdb traverse callback of the full vacuuming run: collect the records
+ * that can be deleted.
+ *
+ * Only records that consist of nothing but a ctdb header and whose
+ * dmaster is this node are considered; everything else is counted as
+ * skipped. A candidate goes to the delete list if this node is also
+ * its lmaster, otherwise to the vacuum fetch list of its lmaster.
+ *
+ * Returns 0 to continue; a non-zero value is returned when the lmaster
+ * is out of range or the record could not be added to its list.
+ */
+static int vacuum_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private)
+{
+       struct vacuum_data *vdata = talloc_get_type(private, struct vacuum_data);
+       struct ctdb_context *ctdb = vdata->ctdb;
+       struct ctdb_ltdb_header *hdr;
+       uint32_t lmaster;
+       int ret;
+
+       vdata->full_total++;
+
+       lmaster = ctdb_lmaster(ctdb, &key);
+       if (lmaster >= ctdb->num_nodes) {
+               vdata->full_error++;
+               DEBUG(DEBUG_CRIT, (__location__
+                                  " lmaster[%u] >= ctdb->num_nodes[%u] for key"
+                                  " with hash[%u]!\n",
+                                  (unsigned)lmaster,
+                                  (unsigned)ctdb->num_nodes,
+                                  (unsigned)ctdb_hash(&key)));
+               return -1;
+       }
+
+       /* anything that is not exactly one header is not a deleted record */
+       if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
+               vdata->full_skipped++;
+               return 0;
+       }
+
+       /* we only vacuum records we are dmaster for */
+       hdr = (struct ctdb_ltdb_header *)data.dptr;
+       if (hdr->dmaster != ctdb->pnn) {
+               vdata->full_skipped++;
+               return 0;
+       }
+
+       if (lmaster != ctdb->pnn) {
+               /*
+                * Some other node is lmaster: put the record into the
+                * blob that will be sent to that node.
+                */
+               ret = add_record_to_vacuum_fetch_list(vdata, key);
+               if (ret == 0) {
+                       vdata->full_added_to_vacuum_fetch_list++;
+               } else {
+                       vdata->full_error++;
+               }
+               return ret;
+       }
+
+       /*
+        * lmaster and dmaster are both this node and the record is
+        * empty, so it is a candidate for deletion.
+        */
+       ret = add_record_to_delete_list(vdata, key, hdr);
+       if (ret == 0) {
+               vdata->full_added_to_delete_list++;
+       } else {
+               vdata->full_error++;
+       }
+
+       return ret;
+}
+
+/*
+ * traverse the tree of records to delete and marshall them into
+ * a blob
+ *
+ * param is the struct delete_records_list holding the blob, data is
+ * the struct delete_record_data of the traversed tree node.
+ *
+ * Always returns 0 so that the traverse continues. A record that
+ * cannot be marshalled or appended is logged and left out of the
+ * blob; the blob built so far stays valid in that case.
+ */
+static int delete_marshall_traverse(void *param, void *data)
+{
+       struct delete_record_data *dd = talloc_get_type(data, struct delete_record_data);
+       struct delete_records_list *recs = talloc_get_type(param, struct delete_records_list);
+       struct ctdb_marshall_buffer *grown;
+       struct ctdb_rec_data *rec;
+       size_t old_size;
+
+       rec = ctdb_marshall_record(dd, recs->records->db_id, dd->key, &dd->hdr, tdb_null);
+       if (rec == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " failed to marshall record\n"));
+               return 0;
+       }
+
+       old_size = talloc_get_size(recs->records);
+
+       /*
+        * Grow into a temporary pointer: if the realloc fails the old
+        * buffer is still valid and must stay in recs->records, since
+        * the next invocation dereferences recs->records again.
+        */
+       grown = talloc_realloc_size(NULL, recs->records, old_size + rec->length);
+       if (grown == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to expand\n"));
+               talloc_free(rec);
+               return 0;
+       }
+       recs->records = grown;
+
+       /* append the marshalled record behind the old contents */
+       recs->records->count++;
+       memcpy(old_size+(uint8_t *)(recs->records), rec, rec->length);
+
+       /* the temporary copy hangs off dd; release it right away */
+       talloc_free(rec);
+
+       return 0;
+}
+
+/**
+ * Variant of delete_marshall_traverse() that bumps the
+ * RSN of each traversed record in the database.
+ *
+ * This is needed to ensure that when rolling out our
+ * empty record copy before remote deletion, we as the
+ * record's dmaster keep a higher RSN than the non-dmaster
+ * nodes. This is needed to prevent old copies from
+ * resurrection in recoveries.
+ *
+ * param is the struct delete_records_list, data the traversed
+ * struct delete_record_data. Under the record's chainlock the
+ * on-disk record is re-validated; if any check fails, or the
+ * chainlock or the store fails, the entry is freed with
+ * talloc_free() and counted in delete_skipped. Otherwise the record
+ * is stored again and then handed to delete_marshall_traverse().
+ *
+ * Returns 0 for skipped entries, otherwise the result of
+ * delete_marshall_traverse().
+ */
+static int delete_marshall_traverse_first(void *param, void *data)
+{
+       struct delete_record_data *dd = talloc_get_type(data, struct delete_record_data);
+       struct delete_records_list *recs = talloc_get_type(param, struct delete_records_list);
+       struct ctdb_db_context *ctdb_db = dd->ctdb_db;
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+       struct ctdb_ltdb_header *header;
+       TDB_DATA tdb_data, ctdb_data;
+       uint32_t lmaster;
+       uint32_t hash = ctdb_hash(&(dd->key));
+       int res;
+
+       res = tdb_chainlock(ctdb_db->ltdb->tdb, dd->key);
+       if (res != 0) {
+               DEBUG(DEBUG_ERR,
+                     (__location__ " Error getting chainlock on record with "
+                      "key hash [0x%08x] on database db[%s].\n",
+                      hash, ctdb_db->db_name));
+               /* nothing was locked or fetched yet: drop the entry directly */
+               recs->vdata->delete_skipped++;
+               talloc_free(dd);
+               return 0;
+       }
+
+       /*
+        * Verify that the record is still empty, its RSN has not
+        * changed and that we are still its lmaster and dmaster.
+        */
+
+       tdb_data = tdb_fetch(ctdb_db->ltdb->tdb, dd->key);
+       if (tdb_data.dsize < sizeof(struct ctdb_ltdb_header)) {
+               DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] "
+                                  "on database db[%s] does not exist or is not"
+                                  " a ctdb-record.  skipping.\n",
+                                  hash, ctdb_db->db_name));
+               goto skip;
+       }
+
+       if (tdb_data.dsize > sizeof(struct ctdb_ltdb_header)) {
+               DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] "
+                                  "on database db[%s] has been recycled. "
+                                  "skipping.\n",
+                                  hash, ctdb_db->db_name));
+               goto skip;
+       }
+
+       /* from here on the record is exactly one header, no payload */
+       header = (struct ctdb_ltdb_header *)tdb_data.dptr;
+
+       if (header->flags & CTDB_REC_RO_FLAGS) {
+               DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] "
+                                  "on database db[%s] has read-only flags. "
+                                  "skipping.\n",
+                                  hash, ctdb_db->db_name));
+               goto skip;
+       }
+
+       if (header->dmaster != ctdb->pnn) {
+               DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] "
+                                  "on database db[%s] has been migrated away. "
+                                  "skipping.\n",
+                                  hash, ctdb_db->db_name));
+               goto skip;
+       }
+
+       /* dd->hdr is the header copy taken when the entry was queued */
+       if (header->rsn != dd->hdr.rsn) {
+               DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] "
+                                  "on database db[%s] seems to have been "
+                                  "migrated away and back again (with empty "
+                                  "data). skipping.\n",
+                                  hash, ctdb_db->db_name));
+               goto skip;
+       }
+
+       lmaster = ctdb_lmaster(ctdb_db->ctdb, &dd->key);
+
+       if (lmaster != ctdb->pnn) {
+               DEBUG(DEBUG_INFO, (__location__ ": not lmaster for record in "
+                                  "delete list (key hash [0x%08x], db[%s]). "
+                                  "Strange! skipping.\n",
+                                  hash, ctdb_db->db_name));
+               goto skip;
+       }
+
+       /*
+        * Increment the record's RSN to ensure the dmaster (i.e. the current
+        * node) has the highest RSN of the record in the cluster.
+        * This is to prevent old record copies from resurrecting in recoveries
+        * if something should fail during the deletion process.
+        * Note that ctdb_ltdb_store_server() increments the RSN if called
+        * on the record's dmaster.
+        *
+        * NOTE(review): the call below is ctdb_ltdb_store(); that it ends
+        * up bumping the RSN is taken from the comment above -- confirm.
+        */
+
+       /* payload behind the header; its size is 0 after the checks above */
+       ctdb_data.dptr = tdb_data.dptr + sizeof(struct ctdb_ltdb_header);
+       ctdb_data.dsize = tdb_data.dsize - sizeof(struct ctdb_ltdb_header);
+
+       res = ctdb_ltdb_store(ctdb_db, dd->key, header, ctdb_data);
+       if (res != 0) {
+               DEBUG(DEBUG_ERR, (__location__ ": Failed to store record with "
+                                 "key hash [0x%08x] on database db[%s].\n",
+                                 hash, ctdb_db->db_name));
+               goto skip;
+       }
+
+       tdb_chainunlock(ctdb_db->ltdb->tdb, dd->key);
+
+       goto done;
+
+skip:
+       /* unlock before freeing dd: the unlock still needs dd->key */
+       tdb_chainunlock(ctdb_db->ltdb->tdb, dd->key);
+
+       recs->vdata->delete_skipped++;
+       talloc_free(dd);
+       dd = NULL;
+
+done:
+       /* release the buffer returned by tdb_fetch() */
+       if (tdb_data.dptr != NULL) {
+               free(tdb_data.dptr);
+       }
+
+       /* dd == NULL marks the skip path: nothing to marshall */
+       if (dd == NULL) {
+               return 0;
+       }
+
+       /* marshalls dd->key and dd->hdr, i.e. the header copy held in dd */
+       return delete_marshall_traverse(param, data);
+}
+
+/**
+ * traverse function for the traversal of the delete_queue,
+ * the fast-path vacuuming list.
+ *
+ *  - If the record has been migrated off the node
+ *    or has been revived (filled with data) on the node,
+ *    then skip the record.
+ *
+ *  - If the current node is the record's lmaster and it is
+ *    a record that has never been migrated with data, then
+ *    delete the record from the local tdb.
+ *
+ *  - If the current node is the record's lmaster and it has
+ *    been migrated with data, then schedule it for the normal
+ *    vacuuming procedure (i.e. add it to the delete_list).
+ *
+ *  - If the current node is NOT the record's lmaster then
+ *    add it to the list of records that are to be sent to
+ *    the lmaster with the VACUUM_FETCH message.
+ *
+ * param is the struct vacuum_data of the run, data the traversed
+ * struct delete_record_data. The outcome is recorded in the fast_*
+ * counters of vdata. Always returns 0, so the traverse continues
+ * after errors.
+ */
+static int delete_queue_traverse(void *param, void *data)
+{
+       struct delete_record_data *dd =
+               talloc_get_type(data, struct delete_record_data);
+       struct vacuum_data *vdata = talloc_get_type(param, struct vacuum_data);
+       struct ctdb_db_context *ctdb_db = dd->ctdb_db;
+       struct ctdb_context *ctdb = ctdb_db->ctdb; /* or dd->ctdb ??? */
+       int res;
+       struct ctdb_ltdb_header *header;
+       TDB_DATA tdb_data;
+       uint32_t lmaster;
+       uint32_t hash = ctdb_hash(&(dd->key));
+
+       vdata->fast_total++;
+
+       res = tdb_chainlock(ctdb_db->ltdb->tdb, dd->key);
+       if (res != 0) {
+               DEBUG(DEBUG_ERR,
+                     (__location__ " Error getting chainlock on record with "
+                      "key hash [0x%08x] on database db[%s].\n",
+                      hash, ctdb_db->db_name));
+               vdata->fast_error++;
+               return 0;
+       }
+
+       /* re-read the record under the chainlock */
+       tdb_data = tdb_fetch(ctdb_db->ltdb->tdb, dd->key);
+       if (tdb_data.dsize < sizeof(struct ctdb_ltdb_header)) {
+               /* Does not exist or not a ctdb record. Skip. */
+               goto skipped;
+       }
+
+       if (tdb_data.dsize > sizeof(struct ctdb_ltdb_header)) {
+               /* The record has been recycled (filled with data). Skip. */
+               goto skipped;
+       }
+
+       header = (struct ctdb_ltdb_header *)tdb_data.dptr;
+
+       /*
+        * NOTE(review): unlike the delete_list traversals, this function
+        * does not look at CTDB_REC_RO_FLAGS before deleting -- confirm
+        * that this is intended.
+        */
+
+       if (header->dmaster != ctdb->pnn) {
+               /* The record has been migrated off the node. Skip. */
+               goto skipped;
+       }
+
+       if (header->rsn != dd->hdr.rsn) {
+               /*
+                * The record has been migrated off the node and back again.
+                * But not requeued for deletion. Skip it.
+                */
+               goto skipped;
+       }
+
+       /*
+        * We are dmaster, and the record has no data, and it has
+        * not been migrated after it has been queued for deletion.
+        *
+        * At this stage, the record could still have been revived locally
+        * and last been written with empty data. This can only be
+        * fixed with the addition of an active or delete flag. (TODO)
+        */
+
+       lmaster = ctdb_lmaster(ctdb_db->ctdb, &dd->key);
+
+       if (lmaster != ctdb->pnn) {
+               /* not lmaster: hand the record over to its lmaster */
+               res = add_record_to_vacuum_fetch_list(vdata, dd->key);
+
+               if (res != 0) {
+                       DEBUG(DEBUG_ERR,
+                             (__location__ " Error adding record to list "
+                              "of records to send to lmaster.\n"));
+                       vdata->fast_error++;
+               } else {
+                       vdata->fast_added_to_vacuum_fetch_list++;
+               }
+               goto done;
+       }
+
+       /* use header->flags or dd->hdr.flags ?? */
+       if (dd->hdr.flags & CTDB_REC_FLAG_MIGRATED_WITH_DATA) {
+               /* was migrated with data: needs the full delete procedure */
+               res = add_record_to_delete_list(vdata, dd->key, &dd->hdr);
+
+               if (res != 0) {
+                       DEBUG(DEBUG_ERR,
+                             (__location__ " Error adding record to list "
+                              "of records for deletion on lmaster.\n"));
+                       vdata->fast_error++;
+               } else {
+                       vdata->fast_added_to_delete_list++;
+               }
+       } else {
+               /* never migrated with data: delete it locally right away */
+               res = tdb_delete(ctdb_db->ltdb->tdb, dd->key);
+
+               if (res != 0) {
+                       DEBUG(DEBUG_ERR,
+                             (__location__ " Error deleting record with key "
+                              "hash [0x%08x] from local data base db[%s].\n",
+                              hash, ctdb_db->db_name));
+                       vdata->fast_error++;
+               } else {
+                       DEBUG(DEBUG_DEBUG,
+                             (__location__ " Deleted record with key hash "
+                              "[0x%08x] from local data base db[%s].\n",
+                              hash, ctdb_db->db_name));
+                       vdata->fast_deleted++;
+               }
+       }
+
+       goto done;
+
+skipped:
+       vdata->fast_skipped++;
+
+done:
+       /* release the buffer returned by tdb_fetch(), then the chainlock */
+       if (tdb_data.dptr != NULL) {
+               free(tdb_data.dptr);
+       }
+       tdb_chainunlock(ctdb_db->ltdb->tdb, dd->key);
+
+       return 0;
+}
+
+/**
+ * Delete the records that we are lmaster and dmaster for and
+ * that could be deleted on all other nodes via the TRY_DELETE_RECORDS
+ * control.
+ *
+ * param is the struct vacuum_data of the run, data the traversed
+ * struct delete_record_data. Under the record's chainlock the on-disk
+ * record is re-validated and, if all checks pass, deleted from the
+ * local tdb. The outcome is recorded in the delete_* counters of
+ * vdata. Always returns 0, so the traverse continues after errors.
+ */
+static int delete_record_traverse(void *param, void *data)
+{
+       struct delete_record_data *dd =
+               talloc_get_type(data, struct delete_record_data);
+       struct vacuum_data *vdata = talloc_get_type(param, struct vacuum_data);
+       struct ctdb_db_context *ctdb_db = dd->ctdb_db;
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+       int res;
+       struct ctdb_ltdb_header *header;
+       TDB_DATA tdb_data;
+       uint32_t lmaster;
+       bool deleted = false;
+       uint32_t hash = ctdb_hash(&(dd->key));
+
+       res = tdb_chainlock(ctdb_db->ltdb->tdb, dd->key);
+       if (res != 0) {
+               DEBUG(DEBUG_ERR,
+                     (__location__ " Error getting chainlock on record with "
+                      "key hash [0x%08x] on database db[%s].\n",
+                      hash, ctdb_db->db_name));
+               vdata->delete_local_error++;
+               return 0;
+       }
+
+       /*
+        * Verify that the record is still empty, its RSN has not
+        * changed and that we are still its lmaster and dmaster.
+        */
+
+       tdb_data = tdb_fetch(ctdb_db->ltdb->tdb, dd->key);
+       if (tdb_data.dsize < sizeof(struct ctdb_ltdb_header)) {
+               DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] "
+                                  "on database db[%s] does not exist or is not"
+                                  " a ctdb-record.  skipping.\n",
+                                  hash, ctdb_db->db_name));
+               vdata->delete_skipped++;
+               goto done;
+       }
+
+       if (tdb_data.dsize > sizeof(struct ctdb_ltdb_header)) {
+               DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] "
+                                  "on database db[%s] has been recycled. "
+                                  "skipping.\n",
+                                  hash, ctdb_db->db_name));
+               vdata->delete_skipped++;
+               goto done;
+       }
+
+       /* from here on the record is exactly one header, no payload */
+       header = (struct ctdb_ltdb_header *)tdb_data.dptr;
+
+       if (header->flags & CTDB_REC_RO_FLAGS) {
+               DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] "
+                                  "on database db[%s] has read-only flags. "
+                                  "skipping.\n",
+                                  hash, ctdb_db->db_name));
+               vdata->delete_skipped++;
+               goto done;
+       }
+
+       if (header->dmaster != ctdb->pnn) {
+               DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] "
+                                  "on database db[%s] has been migrated away. "
+                                  "skipping.\n",
+                                  hash, ctdb_db->db_name));
+               vdata->delete_skipped++;
+               goto done;
+       }
+
+       /* expect exactly one bump relative to the header copy in dd */
+       if (header->rsn != dd->hdr.rsn + 1) {
+               /*
+                * The record has been migrated off the node and back again.
+                * But not requeued for deletion. Skip it.
+                * (Note that the first marshall traverse has bumped the RSN
+                *  on disk.)
+                */
+               DEBUG(DEBUG_INFO, (__location__ ": record with hash [0x%08x] "
+                                  "on database db[%s] seems to have been "
+                                  "migrated away and back again (with empty "
+                                  "data). skipping.\n",
+                                  hash, ctdb_db->db_name));
+               vdata->delete_skipped++;
+               goto done;
+       }
+
+       lmaster = ctdb_lmaster(ctdb_db->ctdb, &dd->key);
+
+       if (lmaster != ctdb->pnn) {
+               DEBUG(DEBUG_INFO, (__location__ ": not lmaster for record in "
+                                  "delete list (key hash [0x%08x], db[%s]). "
+                                  "Strange! skipping.\n",
+                                  hash, ctdb_db->db_name));
+               vdata->delete_skipped++;
+               goto done;
+       }
+
+       /* all checks passed: remove the record from the local tdb */
+       res = tdb_delete(ctdb_db->ltdb->tdb, dd->key);
+
+       if (res != 0) {
+               DEBUG(DEBUG_ERR,
+                     (__location__ " Error deleting record with key hash "
+                      "[0x%08x] from local data base db[%s].\n",
+                      hash, ctdb_db->db_name));
+               vdata->delete_local_error++;
+               goto done;
+       }
+
+       deleted = true;
+
+       DEBUG(DEBUG_DEBUG,
+             (__location__ " Deleted record with key hash [0x%08x] from "
+              "local data base db[%s].\n", hash, ctdb_db->db_name));
+
+done:
+       /* release the buffer returned by tdb_fetch() */
+       if (tdb_data.dptr != NULL) {
+               free(tdb_data.dptr);
+       }
+
+       /* unlock while dd (and with it dd->key) is still alive */
+       tdb_chainunlock(ctdb_db->ltdb->tdb, dd->key);
+
+       if (deleted) {
+               /*
+                * successfully deleted the record locally.
+                * remove it from the list and update statistics.
+                */
+               talloc_free(dd);
+               vdata->delete_deleted++;
+               vdata->delete_left--;
+       }
+
+       return 0;
+}
+
+/**
+ * Fast vacuuming run:
+ * Walk the database's delete_queue with delete_queue_traverse().
+ * This fills the same lists as the database traverse.
+ *
+ * The traverse statistics are logged if the queue held any entries.
+ */
+static void ctdb_vacuum_db_fast(struct ctdb_db_context *ctdb_db,
+                               struct vacuum_data *vdata)
+{
+       trbt_traversearray32(ctdb_db->delete_queue, 1, delete_queue_traverse, vdata);
+
+       if (vdata->fast_total == 0) {
+               /* nothing was traversed, nothing to report */
+               return;
+       }
+
+       DEBUG(DEBUG_INFO,
+             (__location__
+              " fast vacuuming delete_queue traverse statistics: "
+              "db[%s] total[%u] del[%u] skp[%u] err[%u] adl[%u] avf[%u]\n",
+              ctdb_db->db_name,
+              (unsigned)vdata->fast_total,
+              (unsigned)vdata->fast_deleted,
+              (unsigned)vdata->fast_skipped,
+              (unsigned)vdata->fast_error,
+              (unsigned)vdata->fast_added_to_delete_list,
+              (unsigned)vdata->fast_added_to_vacuum_fetch_list));
+}
+
+/**
+ * Full vacuum run:
+ * read-only traverse of the database with vacuum_traverse(), looking
+ * for records that might be able to be vacuumed.
+ *
+ * This is not done each time but only every tunable
+ * VacuumFastPathCount times; the caller signals this with
+ * full_vacuum_run.
+ *
+ * Returns 0 if no full run was requested or the traverse went fine,
+ * -1 if the traverse failed or vdata->traverse_error is set.
+ */
+static int ctdb_vacuum_db_full(struct ctdb_db_context *ctdb_db,
+                              struct vacuum_data *vdata,
+                              bool full_vacuum_run)
+{
+       int traverse_ret;
+
+       if (!full_vacuum_run) {
+               return 0;
+       }
+
+       traverse_ret = tdb_traverse_read(ctdb_db->ltdb->tdb, vacuum_traverse, vdata);
+       if (vdata->traverse_error || traverse_ret == -1) {
+               DEBUG(DEBUG_ERR, (__location__ " Traverse error in vacuuming "
+                                 "'%s'\n", ctdb_db->db_name));
+               return -1;
+       }
+
+       if (vdata->full_total == 0) {
+               /* empty database, no statistics to log */
+               return 0;
+       }
+
+       DEBUG(DEBUG_INFO,
+             (__location__
+              " full vacuuming db traverse statistics: "
+              "db[%s] total[%u] skp[%u] err[%u] adl[%u] avf[%u]\n",
+              ctdb_db->db_name,
+              (unsigned)vdata->full_total,
+              (unsigned)vdata->full_skipped,
+              (unsigned)vdata->full_error,
+              (unsigned)vdata->full_added_to_delete_list,
+              (unsigned)vdata->full_added_to_vacuum_fetch_list));
+
+       return 0;
+}
+
+/**
+ * Process the vacuum fetch lists:
+ * For records for which we are not the lmaster, tell the lmaster to
+ * fetch the record.
+ *
+ * Every non-empty per-node list is sent as one CTDB_SRVID_VACUUM_FETCH
+ * message to that node; the list slot of the local node is ignored.
+ *
+ * Returns 0 on success, -1 as soon as one message could not be sent
+ * (the remaining lists are then not processed).
+ */
+static int ctdb_process_vacuum_fetch_lists(struct ctdb_db_context *ctdb_db,
+                                          struct vacuum_data *vdata)
+{
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+       int i;
+
+       for (i = 0; i < ctdb->num_nodes; i++) {
+               struct ctdb_marshall_buffer *vfl = vdata->vacuum_fetch_list[i];
+               TDB_DATA msg;
+               int send_ret;
+
+               /* skip ourselves and nodes we have nothing for */
+               if (ctdb->nodes[i]->pnn == ctdb->pnn || vfl->count == 0) {
+                       continue;
+               }
+
+               DEBUG(DEBUG_INFO, ("Found %u records for lmaster %u in '%s'\n",
+                                  vfl->count, ctdb->nodes[i]->pnn,
+                                  ctdb_db->db_name));
+
+               /* the whole marshall buffer is the message payload */
+               msg.dptr  = (void *)vfl;
+               msg.dsize = talloc_get_size(vfl);
+
+               send_ret = ctdb_client_send_message(ctdb, ctdb->nodes[i]->pnn,
+                                                   CTDB_SRVID_VACUUM_FETCH,
+                                                   msg);
+               if (send_ret != 0) {
+                       DEBUG(DEBUG_ERR, (__location__ " Failed to send vacuum "
+                                         "fetch message to %u\n",
+                                         ctdb->nodes[i]->pnn));
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+/**
+ * Process the delete list:
+ *
+ * This is the last step of vacuuming that consistently deletes
+ * those records that have been migrated with data and can hence
+ * not be deleted when leaving a node.
+ *
+ * In this step, the lmaster does the final deletion of those empty
+ * records that it is also dmaster for. It has usually received
+ * at least some of these records previously from the former dmasters
+ * with the vacuum fetch message.
+ *
+ * This last step is implemented as a 3-phase process to protect from
+ * races leading to data corruption:
+ *
+ *  1) Send the lmaster's copy to all other active nodes with the
+ *     RECEIVE_RECORDS control: The remote nodes store the lmaster's copy.
+ *  2) Send the records that could successfully be stored remotely
+ *     in step #1 to all active nodes with the TRY_DELETE_RECORDS
+ *     control. The remote notes delete their local copy.
+ *  3) The lmaster locally deletes its copies of all records that
+ *     could successfully be deleted remotely in step #2.
+ */
+static int ctdb_process_delete_list(struct ctdb_db_context *ctdb_db,
+                                   struct vacuum_data *vdata)
+{
+       int ret, i;
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+       struct delete_records_list *recs;
+       TDB_DATA indata;
+       struct ctdb_node_map *nodemap;
+       uint32_t *active_nodes;
+       int num_active_nodes;
+       TALLOC_CTX *tmp_ctx;
+
+       if (vdata->delete_count == 0) {
+               return 0;
+       }
+
+       tmp_ctx = talloc_new(vdata);
+       if (tmp_ctx == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+               return 0;
+       }
+
+       vdata->delete_left = vdata->delete_count;
+
+       /*
+        * get the list of currently active nodes
+        */
+
+       ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(),
+                                  CTDB_CURRENT_NODE,
+                                  tmp_ctx,
+                                  &nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " unable to get node map\n"));
+               ret = -1;
+               goto done;
+       }
+
+       active_nodes = list_of_active_nodes(ctdb, nodemap,
+                                           nodemap, /* talloc context */
+                                           false /* include self */);
+       /* yuck! ;-) */
+       num_active_nodes = talloc_get_size(active_nodes)/sizeof(*active_nodes);
+
+       /*
+        * Now delete the records all active nodes in a three-phase process:
+        * 1) send all active remote nodes the current empty copy with this
+        *    node as DMASTER
+        * 2) if all nodes could store the new copy,
+        *    tell all the active remote nodes to delete all their copy
+        * 3) if all remote nodes deleted their record copy, delete it locally
+        */
+
+       /*
+        * Step 1:
+        * Send currently empty record copy to all active nodes for storing.
+        */
+
+       recs = talloc_zero(tmp_ctx, struct delete_records_list);
+       if (recs == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+               ret = -1;
+               goto done;
+       }
+       recs->records = (struct ctdb_marshall_buffer *)
+               talloc_zero_size(recs,
+                                offsetof(struct ctdb_marshall_buffer, data));
+       if (recs->records == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+               ret = -1;
+               goto done;
+       }
+       recs->records->db_id = ctdb_db->db_id;
+       recs->vdata = vdata;
+
+       /*
+        * traverse the tree of all records we want to delete and
+        * create a blob we can send to the other nodes.
+        *
+        * We call delete_marshall_traverse_first() to bump the
+        * records' RSNs in the database, to ensure we (as dmaster)
+        * keep the highest RSN of the records in the cluster.
+        */
+       trbt_traversearray32(vdata->delete_list, 1,
+                            delete_marshall_traverse_first, recs);
+
+       indata.dsize = talloc_get_size(recs->records);
+       indata.dptr  = (void *)recs->records;
+
+       for (i = 0; i < num_active_nodes; i++) {
+               struct ctdb_marshall_buffer *records;
+               struct ctdb_rec_data *rec;
+               int32_t res;
+               TDB_DATA outdata;
+
+               ret = ctdb_control(ctdb, active_nodes[i], 0,
+                               CTDB_CONTROL_RECEIVE_RECORDS, 0,
+                               indata, recs, &outdata, &res,
+                               NULL, NULL);
+               if (ret != 0 || res != 0) {
+                       DEBUG(DEBUG_ERR, ("Error storing record copies on "
+                                         "node %u: ret[%d] res[%d]\n",
+                                         active_nodes[i], ret, res));
+                       ret = -1;
+                       goto done;
+               }
+
+               /*
+                * outdata contains the list of records coming back
+                * from the node: These are the records that the
+                * remote node could not store. We remove these from
+                * the list to process further.
+                */
+               records = (struct ctdb_marshall_buffer *)outdata.dptr;
+               rec = (struct ctdb_rec_data *)&records->data[0];
+               while (records->count-- > 1) {
+                       TDB_DATA reckey, recdata;
+                       struct ctdb_ltdb_header *rechdr;
+                       struct delete_record_data *dd;
+
+                       reckey.dptr = &rec->data[0];
+                       reckey.dsize = rec->keylen;
+                       recdata.dptr = &rec->data[reckey.dsize];
+                       recdata.dsize = rec->datalen;
+
+                       if (recdata.dsize < sizeof(struct ctdb_ltdb_header)) {
+                               DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
+                               ret = -1;
+                               goto done;
+                       }
+                       rechdr = (struct ctdb_ltdb_header *)recdata.dptr;
+                       recdata.dptr += sizeof(*rechdr);
+                       recdata.dsize -= sizeof(*rechdr);
+
+                       dd = (struct delete_record_data *)trbt_lookup32(
+                                       vdata->delete_list,
+                                       ctdb_hash(&reckey));
+                       if (dd != NULL) {
+                               /*
+                                * The other node could not store the record
+                                * copy and it is the first node that failed.
+                                * So we should remove it from the tree and
+                                * update statistics.
+                                */
+                               talloc_free(dd);
+                               vdata->delete_remote_error++;
+                               vdata->delete_left--;
+                       }
+
+                       rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);
+               }
+       }
+
+       if (vdata->delete_left == 0) {
+               goto success;
+       }
+
+       /*
+        * Step 2:
+        * Send the remaining records to all active nodes for deletion.
+        *
+        * The lmaster's (i.e. our) copies of these records have been stored
+        * successfully on the other nodes.
+        */
+
+       /*
+        * Create a marshall blob from the remaining list of records to delete.
+        */
+
+       talloc_free(recs->records);
+
+       recs->records = (struct ctdb_marshall_buffer *)
+               talloc_zero_size(recs,
+                                offsetof(struct ctdb_marshall_buffer, data));
+       if (recs->records == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+               ret = -1;
+               goto done;
+       }
+       recs->records->db_id = ctdb_db->db_id;
+
+       trbt_traversearray32(vdata->delete_list, 1,
+                            delete_marshall_traverse, recs);
+
+       indata.dsize = talloc_get_size(recs->records);
+       indata.dptr  = (void *)recs->records;
+
+       for (i = 0; i < num_active_nodes; i++) {
+               struct ctdb_marshall_buffer *records;
+               struct ctdb_rec_data *rec;
+               int32_t res;
+               TDB_DATA outdata;
+
+               ret = ctdb_control(ctdb, active_nodes[i], 0,
+                               CTDB_CONTROL_TRY_DELETE_RECORDS, 0,
+                               indata, recs, &outdata, &res,
+                               NULL, NULL);
+               if (ret != 0 || res != 0) {
+                       DEBUG(DEBUG_ERR, ("Failed to delete records on "
+                                         "node %u: ret[%d] res[%d]\n",
+                                         active_nodes[i], ret, res));
+                       ret = -1;
+                       goto done;
+               }
+
+               /*
+                * outdata contains the list of records coming back
+                * from the node: These are the records that the
+                * remote node could not delete. We remove these from
+                * the list to delete locally.
+                */
+               records = (struct ctdb_marshall_buffer *)outdata.dptr;
+               rec = (struct ctdb_rec_data *)&records->data[0];
+               while (records->count-- > 1) {
+                       TDB_DATA reckey, recdata;
+                       struct ctdb_ltdb_header *rechdr;
+                       struct delete_record_data *dd;
+
+                       reckey.dptr = &rec->data[0];
+                       reckey.dsize = rec->keylen;
+                       recdata.dptr = &rec->data[reckey.dsize];
+                       recdata.dsize = rec->datalen;
+
+                       if (recdata.dsize < sizeof(struct ctdb_ltdb_header)) {
+                               DEBUG(DEBUG_CRIT,(__location__ " bad ltdb record\n"));
+                               ret = -1;
+                               goto done;
+                       }
+                       rechdr = (struct ctdb_ltdb_header *)recdata.dptr;
+                       recdata.dptr += sizeof(*rechdr);
+                       recdata.dsize -= sizeof(*rechdr);
+
+                       dd = (struct delete_record_data *)trbt_lookup32(
+                                       vdata->delete_list,
+                                       ctdb_hash(&reckey));
+                       if (dd != NULL) {
+                               /*
+                                * The other node could not delete the
+                                * record and it is the first node that
+                                * failed. So we should remove it from
+                                * the tree and update statistics.
+                                */
+                               talloc_free(dd);
+                               vdata->delete_remote_error++;
+                               vdata->delete_left--;
+                       }
+
+                       rec = (struct ctdb_rec_data *)(rec->length + (uint8_t *)rec);
+               }
+       }
+
+       if (vdata->delete_left == 0) {
+               goto success;
+       }
+
+       /*
+        * Step 3:
+        * Delete the remaining records locally.
+        *
+        * These records have successfully been deleted on all
+        * active remote nodes.
+        */
+
+       trbt_traversearray32(vdata->delete_list, 1,
+                            delete_record_traverse, vdata);
+
+success:
+
+       if (vdata->delete_count > 0) {
+               DEBUG(DEBUG_INFO,
+                     (__location__
+                      " vacuum delete list statistics: "
+                      "db[%s] "
+                      "coll[%u] "
+                      "rem.err[%u] "
+                      "loc.err[%u] "
+                      "skip[%u] "
+                      "del[%u] "
+                      "left[%u]\n",
+                      ctdb_db->db_name,
+                      (unsigned)vdata->delete_count,
+                      (unsigned)vdata->delete_remote_error,
+                      (unsigned)vdata->delete_local_error,
+                      (unsigned)vdata->delete_skipped,
+                      (unsigned)vdata->delete_deleted,
+                      (unsigned)vdata->delete_left));
+       }
+
+       ret = 0;
+
+done:
+       talloc_free(tmp_ctx);
+
+       return ret;
+}
+
+/**
+ * Prepare vdata for a vacuuming run.
+ *
+ * Zeroes all fast-path, full-traverse and delete-list statistics and
+ * allocates vdata->vacuum_fetch_list: an array with one entry per node
+ * (ctdb->num_nodes), each entry being an empty marshall buffer that
+ * carries the database id.
+ *
+ * Returns 0 on success, -1 if an allocation fails.
+ */
+static int ctdb_vacuum_init_vacuum_data(struct ctdb_db_context *ctdb_db,
+                                       struct vacuum_data *vdata)
+{
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+       int node;
+
+       /* statistics of the fast (delete queue) traverse */
+       vdata->fast_added_to_delete_list =
+       vdata->fast_added_to_vacuum_fetch_list = 0;
+       vdata->fast_deleted = vdata->fast_skipped = 0;
+       vdata->fast_error = vdata->fast_total = 0;
+
+       /* statistics of the full database traverse */
+       vdata->full_added_to_delete_list =
+       vdata->full_added_to_vacuum_fetch_list = 0;
+       vdata->full_skipped = vdata->full_error = vdata->full_total = 0;
+
+       /* statistics of the delete list processing */
+       vdata->delete_count = vdata->delete_left = 0;
+       vdata->delete_remote_error = vdata->delete_local_error = 0;
+       vdata->delete_skipped = vdata->delete_deleted = 0;
+
+       /* one (initially empty) fetch list per node */
+       vdata->vacuum_fetch_list = talloc_zero_array(vdata,
+                                               struct ctdb_marshall_buffer *,
+                                               ctdb->num_nodes);
+       if (!vdata->vacuum_fetch_list) {
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+               return -1;
+       }
+
+       for (node = 0; node < ctdb->num_nodes; node++) {
+               struct ctdb_marshall_buffer *list;
+
+               /* header only: records get appended later */
+               list = (struct ctdb_marshall_buffer *)
+                       talloc_zero_size(vdata->vacuum_fetch_list,
+                                        offsetof(struct ctdb_marshall_buffer, data));
+               vdata->vacuum_fetch_list[node] = list;
+               if (!list) {
+                       DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+                       return -1;
+               }
+               list->db_id = ctdb_db->db_id;
+       }
+
+       return 0;
+}
+
+/**
+ * Vacuum a DB:
+ *  - The fast vacuuming run is always done. It traverses the
+ *    in-memory delete queue, i.e. the records that have been
+ *    scheduled for deletion.
+ *  - ctdb_vacuum_db_full() is passed full_vacuum_run; a full run
+ *    traverses the database and applies the traditional heuristics
+ *    on empty records. Full runs are requested only every
+ *    VacuumFastPathCount'th vacuuming run.
+ *
+ * The traverse runs fill two lists:
+ *
+ * - The delete_list:
+ *   The empty records the current node is lmaster and dmaster for.
+ *   These records are later deleted first on the other nodes and
+ *   then locally.
+ *
+ *   The fast vacuuming run has a short cut for those records
+ *   that have never been migrated with data: these records
+ *   are immediately deleted locally, since they have left
+ *   no trace on other nodes.
+ *
+ * - The vacuum_fetch lists
+ *   (one for each other lmaster node):
+ *   The records in these lists are sent for deletion to
+ *   their lmaster in a bulk VACUUM_FETCH message.
+ *
+ *   The lmaster then migrates all these records to itself
+ *   so that they can be vacuumed there.
+ *
+ * This executes in the child context.
+ *
+ * Returns 0 on success, otherwise the non-zero result of the step
+ * that failed (-1 if the local pnn cannot be determined).
+ */
+static int ctdb_vacuum_db(struct ctdb_db_context *ctdb_db,
+                         struct vacuum_data *vdata,
+                         bool full_vacuum_run)
+{
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+       int local_pnn;
+       int rc;
+
+       DEBUG(DEBUG_INFO, (__location__ " Entering %s vacuum run for db "
+                          "%s db_id[0x%08x]\n",
+                          full_vacuum_run ? "full" : "fast",
+                          ctdb_db->db_name, ctdb_db->db_id));
+
+       /* refresh our view of the vnn map and of our own pnn */
+       rc = ctdb_ctrl_getvnnmap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &ctdb->vnn_map);
+       if (rc != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get vnnmap from local node\n"));
+               return rc;
+       }
+
+       local_pnn = ctdb_ctrl_getpnn(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE);
+       if (local_pnn == -1) {
+               DEBUG(DEBUG_ERR, ("Unable to get pnn from local node\n"));
+               return -1;
+       }
+       ctdb->pnn = local_pnn;
+
+       rc = ctdb_vacuum_init_vacuum_data(ctdb_db, vdata);
+       if (rc != 0) {
+               return rc;
+       }
+
+       /* collect the candidates: fast path first, then the full run */
+       ctdb_vacuum_db_fast(ctdb_db, vdata);
+       rc = ctdb_vacuum_db_full(ctdb_db, vdata, full_vacuum_run);
+
+       /* each later step only runs if everything before it succeeded */
+       if (rc == 0) {
+               rc = ctdb_process_vacuum_fetch_lists(ctdb_db, vdata);
+       }
+       if (rc == 0) {
+               rc = ctdb_process_delete_list(ctdb_db, vdata);
+       }
+       if (rc != 0) {
+               return rc;
+       }
+
+       /* this ensures we run our event queue */
+       ctdb_ctrl_getpnn(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE);
+
+       return 0;
+}
+
+
+/*
+ * Decide whether a record seen during the repack traverse may be dropped.
+ *
+ * This is the case if the record is in vdata->delete_list and has not
+ * changed since it was put there: same key (the tree is keyed by hash,
+ * so the keys are compared to rule out collisions), only a ltdb header
+ * as data, this node still dmaster and lmaster, and an unchanged rsn.
+ */
+static bool repack_record_vacuumable(struct vacuum_data *vdata,
+                                    TDB_DATA key, TDB_DATA data)
+{
+       uint32_t hash = ctdb_hash(&key);
+       struct delete_record_data *entry;
+       struct ctdb_ltdb_header *hdr;
+
+       entry = (struct delete_record_data *)trbt_lookup32(vdata->delete_list, hash);
+       if (!entry) {
+               return false;
+       }
+
+       /* hash collisions are possible: make sure it really is this key */
+       if (entry->key.dsize != key.dsize) {
+               return false;
+       }
+       if (memcmp(entry->key.dptr, key.dptr, key.dsize) != 0) {
+               return false;
+       }
+
+       /* the size check must come first, hdr is only valid after it */
+       if (data.dsize != sizeof(struct ctdb_ltdb_header)) {
+               return false;
+       }
+       hdr = (struct ctdb_ltdb_header *)data.dptr;
+
+       if (hdr->dmaster != entry->ctdb->pnn) {
+               return false;
+       }
+       if (ctdb_lmaster(entry->ctdb, &(entry->key)) != entry->ctdb->pnn) {
+               return false;
+       }
+
+       return entry->hdr.rsn == hdr->rsn;
+}
+
+/*
+ * traverse function for repacking
+ *
+ * Copies the record into vdata->dest_db. While vdata->vacuum is set,
+ * records that can safely be vacuumed are counted in vdata->vacuumed
+ * and not copied. A failing store sets vdata->traverse_error and
+ * returns -1.
+ */
+static int repack_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private)
+{
+       struct vacuum_data *vdata = (struct vacuum_data *)private;
+
+       if (vdata->vacuum && repack_record_vacuumable(vdata, key, data)) {
+               vdata->vacuumed++;
+               return 0;
+       }
+
+       if (tdb_store(vdata->dest_db, key, data, TDB_INSERT) == 0) {
+               vdata->copied++;
+               return 0;
+       }
+
+       vdata->traverse_error = true;
+       return -1;
+}
+
+/*
+ * repack a tdb
+ *
+ * Inside one transaction on tdb: copy all records that are still needed
+ * into an internal temporary tdb (vacuuming on the fly), wipe tdb and
+ * copy the records back. Any failure before the commit cancels the
+ * transaction.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int ctdb_repack_tdb(struct tdb_context *tdb, TALLOC_CTX *mem_ctx, struct vacuum_data *vdata)
+{
+       struct tdb_context *scratch = NULL;
+
+       if (tdb_transaction_start(tdb) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to start transaction\n"));
+               return -1;
+       }
+
+       scratch = tdb_open("tmpdb", tdb_hash_size(tdb),
+                          TDB_INTERNAL|TDB_DISALLOW_NESTING,
+                          O_RDWR|O_CREAT, 0);
+       if (scratch == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to create tmp_db\n"));
+               goto fail;
+       }
+
+       /*
+        * first pass: tdb -> scratch
+        * repack and vacuum on-the-fly by not writing the records that are
+        * no longer needed
+        */
+       vdata->dest_db = scratch;
+       vdata->vacuum = true;
+       vdata->traverse_error = false;
+       vdata->vacuumed = 0;
+       vdata->copied = 0;
+
+       if (tdb_traverse_read(tdb, repack_traverse, vdata) == -1) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to traverse copying out\n"));
+               goto fail;
+       }
+
+       DEBUG(DEBUG_INFO,(__location__ " %u records vacuumed\n", vdata->vacuumed));
+
+       if (vdata->traverse_error) {
+               DEBUG(DEBUG_ERR,(__location__ " Error during traversal\n"));
+               goto fail;
+       }
+
+       if (tdb_wipe_all(tdb) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to wipe database\n"));
+               goto fail;
+       }
+
+       /* second pass: scratch -> tdb, plain copy without vacuuming */
+       vdata->dest_db = tdb;
+       vdata->vacuum = false;
+       vdata->traverse_error = false;
+       vdata->copied = 0;
+
+       if (tdb_traverse_read(scratch, repack_traverse, vdata) == -1) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to traverse copying back\n"));
+               goto fail;
+       }
+
+       if (vdata->traverse_error) {
+               DEBUG(DEBUG_ERR,(__location__ " Error during second traversal\n"));
+               goto fail;
+       }
+
+       tdb_close(scratch);
+
+       /* a failed commit is reported without cancelling the transaction */
+       if (tdb_transaction_commit(tdb) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to commit\n"));
+               return -1;
+       }
+       DEBUG(DEBUG_INFO,(__location__ " %u records copied\n", vdata->copied));
+
+       return 0;
+
+fail:
+       /* common failure path: drop the transaction, then the scratch db */
+       tdb_transaction_cancel(tdb);
+       if (scratch != NULL) {
+               tdb_close(scratch);
+       }
+       return -1;
+}
+
+/*
+ * repack and vacuum a db
+ * called from the child context
+ *
+ * Runs a vacuuming pass (a failure there is only logged) and then
+ * repacks the local tdb if the freelist has reached the repack_limit
+ * tunable or the number of records left to delete has reached the
+ * vacuum_limit tunable.
+ *
+ * Returns 0 on success or if no repack was needed, -1 on failure.
+ */
+static int ctdb_vacuum_and_repack_db(struct ctdb_db_context *ctdb_db,
+                                    TALLOC_CTX *mem_ctx,
+                                    bool full_vacuum_run)
+{
+       const char *name = ctdb_db->db_name;
+       uint32_t repack_limit = ctdb_db->ctdb->tunable.repack_limit;
+       uint32_t vacuum_limit = ctdb_db->ctdb->tunable.vacuum_limit;
+       struct vacuum_data *vdata;
+       int freelist_size;
+       int status = 0;
+
+       /* sampled before vacuuming changes the freelist */
+       freelist_size = tdb_freelist_size(ctdb_db->ltdb->tdb);
+       if (freelist_size == -1) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to get freelist size for '%s'\n", name));
+               return -1;
+       }
+
+       vdata = talloc_zero(mem_ctx, struct vacuum_data);
+       if (vdata == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+               return -1;
+       }
+
+       vdata->ctdb = ctdb_db->ctdb;
+       vdata->ctdb_db = ctdb_db;
+       vdata->vacuum_limit = vacuum_limit;
+       vdata->repack_limit = repack_limit;
+
+       vdata->delete_list = trbt_create(vdata, 0);
+       if (vdata->delete_list == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+               talloc_free(vdata);
+               return -1;
+       }
+
+       vdata->start = timeval_current();
+
+       /*
+        * gather all records that can be deleted in vdata;
+        * a failed vacuum run does not prevent the repack below
+        */
+       if (ctdb_vacuum_db(ctdb_db, vdata, full_vacuum_run) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to vacuum '%s'\n", name));
+       }
+
+       /*
+        * repack only if one of the limits has been reached
+        */
+       if (freelist_size >= repack_limit || vdata->delete_left >= vacuum_limit)
+       {
+               DEBUG(DEBUG_INFO,("Repacking %s with %u freelist entries and %u records to delete\n", 
+                               name, freelist_size, vdata->delete_left));
+
+               /*
+                * repack and implicitly get rid of the records we can delete
+                */
+               if (ctdb_repack_tdb(ctdb_db->ltdb->tdb, mem_ctx, vdata) != 0) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to repack '%s'\n", name));
+                       status = -1;
+               }
+       }
+
+       talloc_free(vdata);
+
+       return status;
+}
+
+/*
+ * Return the vacuum_interval tunable of the ctdb context that ctdb_db
+ * belongs to. The callers in this file use it as the number of seconds
+ * until the next vacuuming event.
+ */
+static uint32_t get_vacuum_interval(struct ctdb_db_context *ctdb_db)
+{
+       return ctdb_db->ctdb->tunable.vacuum_interval;
+}
+
+/*
+ * talloc destructor of a vacuum child context.
+ *
+ * Logs how long the vacuuming took, kills the child process if it has
+ * not been marked as finished (child_pid != -1), otherwise counts the
+ * run in fast_path_count. The context is removed from ctdb->vacuumers
+ * and the next vacuuming event is scheduled.
+ *
+ * Always returns 0, so the free is never refused.
+ */
+static int vacuum_child_destructor(struct ctdb_vacuum_child_context *child_ctx)
+{
+       double elapsed = timeval_elapsed(&child_ctx->start_time);
+       struct ctdb_vacuum_handle *handle = child_ctx->vacuum_handle;
+       struct ctdb_db_context *ctdb_db = handle->ctdb_db;
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+
+       DEBUG(DEBUG_INFO,("Vacuuming took %.3f seconds for database %s\n", elapsed, ctdb_db->db_name));
+
+       if (child_ctx->child_pid == -1) {
+               /* Bump the number of successful fast-path runs. */
+               handle->fast_path_count++;
+       } else {
+               /* the child is still around: get rid of it */
+               ctdb_kill(ctdb, child_ctx->child_pid, SIGKILL);
+       }
+
+       DLIST_REMOVE(ctdb->vacuumers, child_ctx);
+
+       /* schedule the next vacuuming run for this database */
+       event_add_timed(ctdb->ev, handle,
+                       timeval_current_ofs(get_vacuum_interval(ctdb_db), 0), 
+                       ctdb_vacuum_event, handle);
+
+       return 0;
+}
+
+/*
+ * Timed event handler: the vacuum child process took too long.
+ *
+ * Marks the child context as VACUUM_TIMEOUT and frees it.
+ * child_pid is left untouched here.
+ */
+static void vacuum_child_timeout(struct event_context *ev, struct timed_event *te,
+                                        struct timeval t, void *private_data)
+{
+       struct ctdb_vacuum_child_context *child;
+
+       child = talloc_get_type(private_data, struct ctdb_vacuum_child_context);
+
+       DEBUG(DEBUG_ERR,("Vacuuming child process timed out for db %s\n", child->vacuum_handle->ctdb_db->db_name));
+
+       child->status = VACUUM_TIMEOUT;
+       talloc_free(child);
+}
+
+
+/*
+ * Fd event handler: the pipe to the vacuum child became readable,
+ * i.e. the child has completed.
+ *
+ * Reads the one-byte result the child wrote. Exactly one byte with
+ * value 0 means VACUUM_OK, everything else is VACUUM_ERROR. child_pid
+ * is set to -1 before the context is freed.
+ */
+static void vacuum_child_handler(struct event_context *ev, struct fd_event *fde,
+                            uint16_t flags, void *private_data)
+{
+       struct ctdb_vacuum_child_context *child;
+       char result = 0;
+       int nread;
+
+       child = talloc_get_type(private_data, struct ctdb_vacuum_child_context);
+
+       DEBUG(DEBUG_INFO,("Vacuuming child process %d finished for db %s\n", child->child_pid, child->vacuum_handle->ctdb_db->db_name));
+
+       /* mark the child as finished */
+       child->child_pid = -1;
+
+       nread = read(child->fd[0], &result, 1);
+       if (nread == 1 && result == 0) {
+               child->status = VACUUM_OK;
+       } else {
+               child->status = VACUUM_ERROR;
+               DEBUG(DEBUG_ERR, ("A vacuum child process failed with an error for database %s. ret=%d c=%d\n", child->vacuum_handle->ctdb_db->db_name, nread, result));
+       }
+
+       talloc_free(child);
+}
+
+/*
+ * this event is called every time we need to start a new vacuum process
+ *
+ * Unless the node is in recovery or the database is frozen, a child
+ * process is forked that vacuums (and possibly repacks) the database
+ * and reports a one-byte result over a pipe. The parent registers a
+ * timeout and an fd event on the pipe for the child and clears the
+ * fast-path delete queue.
+ *
+ * Whenever no child could be started, or the child cannot be
+ * monitored, the next vacuuming event is scheduled instead.
+ */
+static void
+ctdb_vacuum_event(struct event_context *ev, struct timed_event *te,
+                              struct timeval t, void *private_data)
+{
+       struct ctdb_vacuum_handle *vacuum_handle = talloc_get_type(private_data, struct ctdb_vacuum_handle);
+       struct ctdb_db_context *ctdb_db = vacuum_handle->ctdb_db;
+       struct ctdb_context *ctdb = ctdb_db->ctdb;
+       struct ctdb_vacuum_child_context *child_ctx;
+       struct tevent_fd *fde;
+       int ret;
+
+       /* we dont vacuum if we are in recovery mode, or db frozen */
+       if (ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE ||
+           ctdb->freeze_mode[ctdb_db->priority] != CTDB_FREEZE_NONE) {
+               DEBUG(DEBUG_INFO, ("Not vacuuming %s (%s)\n", ctdb_db->db_name,
+                                  ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE ? "in recovery"
+                                  : ctdb->freeze_mode[ctdb_db->priority] == CTDB_FREEZE_PENDING
+                                  ? "freeze pending"
+                                  : "frozen"));
+               event_add_timed(ctdb->ev, vacuum_handle,
+                       timeval_current_ofs(get_vacuum_interval(ctdb_db), 0),
+                       ctdb_vacuum_event, vacuum_handle);
+               return;
+       }
+
+       child_ctx = talloc(vacuum_handle, struct ctdb_vacuum_child_context);
+       if (child_ctx == NULL) {
+               DEBUG(DEBUG_CRIT, (__location__ " Failed to allocate child context for vacuuming of %s\n", ctdb_db->db_name));
+               ctdb_fatal(ctdb, "Out of memory when crating vacuum child context. Shutting down\n");
+       }
+
+       /*
+        * Set the back pointer right away: vacuum_child_destructor()
+        * dereferences it, so it has to be valid before the destructor
+        * is installed further down.
+        */
+       child_ctx->vacuum_handle = vacuum_handle;
+
+       ret = pipe(child_ctx->fd);
+       if (ret != 0) {
+               talloc_free(child_ctx);
+               DEBUG(DEBUG_ERR, ("Failed to create pipe for vacuum child process.\n"));
+               event_add_timed(ctdb->ev, vacuum_handle,
+                       timeval_current_ofs(get_vacuum_interval(ctdb_db), 0),
+                       ctdb_vacuum_event, vacuum_handle);
+               return;
+       }
+
+       /* wrap around: a count of 0 requests a full run in the child */
+       if (vacuum_handle->fast_path_count > ctdb->tunable.vacuum_fast_path_count) {
+               vacuum_handle->fast_path_count = 0;
+       }
+
+       child_ctx->child_pid = ctdb_fork(ctdb);
+       if (child_ctx->child_pid == (pid_t)-1) {
+               close(child_ctx->fd[0]);
+               close(child_ctx->fd[1]);
+               talloc_free(child_ctx);
+               DEBUG(DEBUG_ERR, ("Failed to fork vacuum child process.\n"));
+               event_add_timed(ctdb->ev, vacuum_handle,
+                       timeval_current_ofs(get_vacuum_interval(ctdb_db), 0),
+                       ctdb_vacuum_event, vacuum_handle);
+               return;
+       }
+
+
+       if (child_ctx->child_pid == 0) {
+               /* child: vacuum, write the result byte and exit */
+               char cc = 0;
+               bool full_vacuum_run = false;
+               close(child_ctx->fd[0]);
+
+               DEBUG(DEBUG_INFO,("Vacuuming child process %d for db %s started\n", getpid(), ctdb_db->db_name));
+               ctdb_set_process_name("ctdb_vacuum");
+               if (switch_from_server_to_client(ctdb, "vacuum-%s", ctdb_db->db_name) != 0) {
+                       DEBUG(DEBUG_CRIT, (__location__ "ERROR: failed to switch vacuum daemon into client mode. Shutting down.\n"));
+                       _exit(1);
+               }
+
+               /* 
+                * repack the db
+                */
+               if ((ctdb->tunable.vacuum_fast_path_count > 0) &&
+                   (vacuum_handle->fast_path_count == 0))
+               {
+                       full_vacuum_run = true;
+               }
+               cc = ctdb_vacuum_and_repack_db(ctdb_db, child_ctx,
+                                              full_vacuum_run);
+
+               /*
+                * If the write fails the parent sees EOF on the pipe,
+                * which vacuum_child_handler() treats as an error.
+                */
+               write(child_ctx->fd[1], &cc, 1);
+               _exit(0);
+       }
+
+       /* parent: keep only the read end of the pipe */
+       set_close_on_exec(child_ctx->fd[0]);
+       close(child_ctx->fd[1]);
+
+       child_ctx->status = VACUUM_RUNNING;
+       child_ctx->start_time = timeval_current();
+
+       DLIST_ADD(ctdb->vacuumers, child_ctx);
+       talloc_set_destructor(child_ctx, vacuum_child_destructor);
+
+       /*
+        * Clear the fastpath vacuuming list in the parent.
+        */
+       talloc_free(ctdb_db->delete_queue);
+       ctdb_db->delete_queue = trbt_create(ctdb_db, 0);
+       if (ctdb_db->delete_queue == NULL) {
+               /* fatal here? ... */
+               ctdb_fatal(ctdb, "Out of memory when re-creating vacuum tree "
+                                "in parent context. Shutting down\n");
+       }
+
+       event_add_timed(ctdb->ev, child_ctx,
+               timeval_current_ofs(ctdb->tunable.vacuum_max_run_time, 0),
+               vacuum_child_timeout, child_ctx);
+
+       DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to child vacuum process\n", child_ctx->fd[0]));
+
+       fde = event_add_fd(ctdb->ev, child_ctx, child_ctx->fd[0],
+                          EVENT_FD_READ, vacuum_child_handler, child_ctx);
+       if (fde == NULL) {
+               /*
+                * Without the fd event nobody would collect the child's
+                * result, and the read end of the pipe would never be
+                * closed because auto-close is not armed. Close it here
+                * and free the context: the destructor kills the child
+                * and schedules the next vacuuming run.
+                */
+               DEBUG(DEBUG_ERR, (__location__ " Failed to add fd event for "
+                                 "vacuum child process of %s\n",
+                                 ctdb_db->db_name));
+               close(child_ctx->fd[0]);
+               talloc_free(child_ctx);
+               return;
+       }
+       tevent_fd_set_auto_close(fde);
+
+       vacuum_handle->child_ctx = child_ctx;
+}
+
+/*
+ * Abort all vacuuming runs that are currently in progress.
+ *
+ * Every entry of ctdb->vacuumers is freed. The loop relies on
+ * vacuum_child_destructor to kill the child process and to unlink
+ * the entry from the list, so the list head advances on every free.
+ */
+void ctdb_stop_vacuuming(struct ctdb_context *ctdb)
+{
+       struct ctdb_vacuum_child_context *child;
+
+       for (child = ctdb->vacuumers; child != NULL; child = ctdb->vacuumers) {
+               DEBUG(DEBUG_INFO, ("Aborting vacuuming for %s (%i)\n",
+                          child->vacuum_handle->ctdb_db->db_name,
+                          (int)child->child_pid));
+               /* vacuum_child_destructor kills it, removes from list */
+               talloc_free(child);
+       }
+}
+
+/* this function initializes the vacuuming context for a database
+ * starts the vacuuming events
+ */
+int ctdb_vacuum_init(struct ctdb_db_context *ctdb_db)
+{
+       if (ctdb_db->persistent != 0) {
+               DEBUG(DEBUG_ERR,("Vacuuming is disabled for persistent database %s\n", ctdb_db->db_name));
+               return 0;
+       }
+
+       ctdb_db->vacuum_handle = talloc(ctdb_db, struct ctdb_vacuum_handle);
+       CTDB_NO_MEMORY(ctdb_db->ctdb, ctdb_db->vacuum_handle);
+
+       ctdb_db->vacuum_handle->ctdb_db         = ctdb_db;
+       ctdb_db->vacuum_handle->fast_path_count = 0;
+
+       event_add_timed(ctdb_db->ctdb->ev, ctdb_db->vacuum_handle, 
+                       timeval_current_ofs(get_vacuum_interval(ctdb_db), 0), 
+                       ctdb_vacuum_event, ctdb_db->vacuum_handle);
+
+       return 0;
+}
+
+/*
+ * Drop the delete-queue entry for key, if one is queued under its hash.
+ * An entry stored under the same hash for a different key is left alone.
+ */
+static void remove_record_from_delete_queue(struct ctdb_db_context *ctdb_db,
+                                           const struct ctdb_ltdb_header *hdr,
+                                           const TDB_DATA key)
+{
+       struct delete_record_data *kd;
+       uint32_t hash = (uint32_t)ctdb_hash(&key);
+
+       DEBUG(DEBUG_DEBUG, (__location__
+                           " remove_record_from_delete_queue: "
+                           "db[%s] "
+                           "db_id[0x%08x] "
+                           "key_hash[0x%08x] "
+                           "lmaster[%u] "
+                           "migrated_with_data[%s]\n",
+                            ctdb_db->db_name, ctdb_db->db_id,
+                            hash,
+                            ctdb_lmaster(ctdb_db->ctdb, &key),
+                            hdr->flags & CTDB_REC_FLAG_MIGRATED_WITH_DATA ? "yes" : "no"));
+
+       kd = (struct delete_record_data *)trbt_lookup32(ctdb_db->delete_queue, hash);
+       if (kd == NULL) {
+               DEBUG(DEBUG_DEBUG, (__location__
+                                   " remove_record_from_delete_queue: "
+                                   "record not in queue (hash[0x%08x]).\n",
+                                   hash));
+               return;
+       }
+
+       if ((kd->key.dsize != key.dsize) ||
+           (memcmp(kd->key.dptr, key.dptr, key.dsize) != 0))
+       {
+               DEBUG(DEBUG_DEBUG, (__location__
+                                   " remove_record_from_delete_queue: "
+                                   "hash collision for key with hash[0x%08x] "
+                                   "in db[%s] - skipping\n",
+                                   hash, ctdb_db->db_name));
+               return;
+       }
+
+       DEBUG(DEBUG_DEBUG, (__location__
+                           " remove_record_from_delete_queue: "
+                           "removing key with hash[0x%08x]\n", hash));
+
+       /* NOTE(review): freeing kd presumably also unlinks it from the tree - confirm */
+       talloc_free(kd);
+}
+
+/**
+ * Insert a record into the ctdb_db context's delete queue,
+ * handling hash collisions.
+ */
+static int insert_record_into_delete_queue(struct ctdb_db_context *ctdb_db,
+                                          const struct ctdb_ltdb_header *hdr,
+                                          TDB_DATA key)
+{
+       struct delete_record_data *kd;
+       uint32_t hash;
+       int ret;
+
+       hash = (uint32_t)ctdb_hash(&key);
+
+       DEBUG(DEBUG_INFO, (__location__ " schedule for deletion: db[%s] "
+                          "db_id[0x%08x] "
+                          "key_hash[0x%08x] "
+                          "lmaster[%u] "
+                          "migrated_with_data[%s]\n",
+                           ctdb_db->db_name, ctdb_db->db_id,
+                           hash,
+                           ctdb_lmaster(ctdb_db->ctdb, &key),
+                           hdr->flags & CTDB_REC_FLAG_MIGRATED_WITH_DATA ? "yes" : "no"));
+
+       kd = (struct delete_record_data *)trbt_lookup32(ctdb_db->delete_queue, hash);
+       if (kd != NULL) {
+               if ((kd->key.dsize != key.dsize) ||
+                   (memcmp(kd->key.dptr, key.dptr, key.dsize) != 0))
+               {
+                       DEBUG(DEBUG_INFO,
+                             (__location__ " schedule for deletion: "
+                              "hash collision for key hash [0x%08x]. "
+                              "Skipping the record.\n", hash));
+                       return 0;
+               } else {
+                       DEBUG(DEBUG_DEBUG,
+                             (__location__ " schedule for deletion: "
+                              "updating entry for key with hash [0x%08x].\n",
+                              hash));
+               }
+       }
+
+       ret = insert_delete_record_data_into_tree(ctdb_db->ctdb, ctdb_db,
+                                                 ctdb_db->delete_queue,
+                                                 hdr, key);
+       if (ret != 0) {
+               DEBUG(DEBUG_INFO,
+                     (__location__ " schedule for deletion: error "
+                      "inserting key with hash [0x%08x] into delete queue\n",
+                      hash));
+               return -1;
+       }
+
+       return 0;
+}
+
+/**
+ * Schedule a record for deletion.
+ * Called from the parent context.
+ */
+int32_t ctdb_control_schedule_for_deletion(struct ctdb_context *ctdb,
+                                          TDB_DATA indata)
+{
+       struct ctdb_control_schedule_for_deletion *dd;
+       struct ctdb_db_context *ctdb_db;
+       int ret;
+       TDB_DATA key;
+
+       dd = (struct ctdb_control_schedule_for_deletion *)indata.dptr;
+
+       ctdb_db = find_ctdb_db(ctdb, dd->db_id);
+       if (ctdb_db == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " Unknown db id 0x%08x\n",
+                                 dd->db_id));
+               return -1;
+       }
+
+       key.dsize = dd->keylen;
+       key.dptr = dd->key;
+
+       ret = insert_record_into_delete_queue(ctdb_db, &dd->hdr, key);
+
+       return ret;
+}
+
+/*
+ * Queue a record for deletion from whichever process we are running in:
+ * the main daemon inserts it directly, any other process asks the main
+ * daemon via a SCHEDULE_FOR_DELETION control.  Returns 0 on success.
+ */
+int32_t ctdb_local_schedule_for_deletion(struct ctdb_db_context *ctdb_db,
+                                        const struct ctdb_ltdb_header *hdr,
+                                        TDB_DATA key)
+{
+       int ret;
+       struct ctdb_control_schedule_for_deletion *dd;
+       TDB_DATA indata;
+       /* start at "no error": status is tested below even if never filled in */
+       int32_t status = 0;
+
+       if (ctdb_db->ctdb->ctdbd_pid == getpid()) {
+               /* main daemon - directly queue */
+               return insert_record_into_delete_queue(ctdb_db, hdr, key);
+       }
+
+       /* without a connection to the daemon we can not send a control,
+          e.g. sometimes from an update_record control child process */
+       if (!ctdb_db->ctdb->can_send_controls) {
+               return -1;
+       }
+
+       /* child process: send the main daemon a control */
+       indata.dsize = offsetof(struct ctdb_control_schedule_for_deletion, key) + key.dsize;
+       indata.dptr = talloc_zero_array(ctdb_db, uint8_t, indata.dsize);
+       if (indata.dptr == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " out of memory\n"));
+               return -1;
+       }
+       dd = (struct ctdb_control_schedule_for_deletion *)(void *)indata.dptr;
+       dd->db_id = ctdb_db->db_id;
+       dd->hdr = *hdr;
+       dd->keylen = key.dsize;
+       memcpy(dd->key, key.dptr, key.dsize);
+
+       ret = ctdb_control(ctdb_db->ctdb,
+                          CTDB_CURRENT_NODE,
+                          ctdb_db->db_id,
+                          CTDB_CONTROL_SCHEDULE_FOR_DELETION,
+                          CTDB_CTRL_FLAG_NOREPLY, /* flags */
+                          indata,
+                          NULL, /* mem_ctx */
+                          NULL, /* outdata */
+                          &status,
+                          NULL, /* timeout : NULL == wait forever */
+                          NULL); /* error message */
+
+       talloc_free(indata.dptr);
+
+       if (ret != 0 || status != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Error sending "
+                                 "SCHEDULE_FOR_DELETION " "control.\n"));
+               if (status != 0) {
+                       ret = -1;
+               }
+       }
+
+       return ret;
+}
+
+void ctdb_local_remove_from_delete_queue(struct ctdb_db_context *ctdb_db,
+                                        const struct ctdb_ltdb_header *hdr,
+                                        const TDB_DATA key)
+{
+       if (ctdb_db->ctdb->ctdbd_pid != getpid()) {
+               /*
+                * Only remove the record from the delete queue if called
+                * in the main daemon.
+                */
+               return;
+       }
+
+       remove_record_from_delete_queue(ctdb_db, hdr, key);
+
+       return;
+}
diff --git a/ctdb/server/ctdbd.c b/ctdb/server/ctdbd.c
new file mode 100644 (file)
index 0000000..5bf48a1
--- /dev/null
@@ -0,0 +1,324 @@
+/* 
+   standalone ctdb daemon
+
+   Copyright (C) Andrew Tridgell  2006
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "popt.h"
+#include "system/time.h"
+#include "system/wait.h"
+#include "system/network.h"
+#include "cmdline.h"
+#include "../include/ctdb_private.h"
+
+static struct {
+       const char *nlist;
+       const char *transport;
+       const char *myaddress;
+       const char *public_address_list;
+       const char *event_script_dir;
+       const char *notification_script;
+       const char *logfile;
+       const char *recovery_lock_file;
+       const char *db_dir;
+       const char *db_dir_persistent;
+       const char *db_dir_state;
+       const char *public_interface;
+       const char *single_public_ip;
+       int         valgrinding;
+       int         nosetsched;
+       int         use_syslog;
+       int         start_as_disabled;
+       int         start_as_stopped;
+       int         no_lmaster;
+       int         no_recmaster;
+       int         lvs;
+       int         script_log_level;
+       int         no_publicipcheck;
+       int         max_persistent_check_errors;
+} options = {
+       .nlist = NULL,
+       .public_address_list = NULL,
+       .transport = "tcp",
+       .event_script_dir = NULL,
+       .logfile = LOGDIR "/log.ctdb",
+       .db_dir = CTDB_VARDIR,
+       .db_dir_persistent = CTDB_VARDIR "/persistent",
+       .db_dir_state = CTDB_VARDIR "/state",
+       .script_log_level = DEBUG_ERR,
+};
+
+int script_log_level;
+bool fast_start;
+
+/*
+  called by the transport layer when a packet comes in
+*/
+static void ctdb_recv_pkt(struct ctdb_context *ctdb, uint8_t *data, uint32_t length)
+{
+       struct ctdb_req_header *hdr = (struct ctdb_req_header *)data;
+
+       CTDB_INCREMENT_STAT(ctdb, node_packets_recv);
+
+       /* up the counter for this source node, so we know its alive */
+       if (ctdb_validate_pnn(ctdb, hdr->srcnode)) {
+               /* as a special case, redirected calls don't increment the rx_cnt */
+               if (hdr->operation != CTDB_REQ_CALL ||
+                   ((struct ctdb_req_call *)hdr)->hopcount == 0) {
+                       ctdb->nodes[hdr->srcnode]->rx_cnt++;
+               }
+       }
+
+       ctdb_input_pkt(ctdb, hdr);
+}
+
+static const struct ctdb_upcalls ctdb_upcalls = {
+       .recv_pkt       = ctdb_recv_pkt,
+       .node_dead      = ctdb_node_dead,
+       .node_connected = ctdb_node_connected
+};
+
+
+
+/*
+  main program: parse options, fill in the ctdb context, start the daemon
+*/
+int main(int argc, const char *argv[])
+{
+       struct ctdb_context *ctdb;
+       int interactive = 0;
+       int sloppy_start = 0; /* POPT_ARG_NONE stores an int; fast_start is a bool */
+       struct poptOption popt_options[] = {
+               POPT_AUTOHELP
+               POPT_CTDB_CMDLINE
+               { "interactive", 'i', POPT_ARG_NONE, &interactive, 0, "don't fork", NULL },
+               { "public-addresses", 0, POPT_ARG_STRING, &options.public_address_list, 0, "public address list file", "filename" },
+               { "public-interface", 0, POPT_ARG_STRING, &options.public_interface, 0, "public interface", "interface"},
+               { "single-public-ip", 0, POPT_ARG_STRING, &options.single_public_ip, 0, "single public ip", "ip-address"},
+               { "event-script-dir", 0, POPT_ARG_STRING, &options.event_script_dir, 0, "event script directory", "dirname" },
+               { "logfile", 0, POPT_ARG_STRING, &options.logfile, 0, "log file location", "filename" },
+               { "nlist", 0, POPT_ARG_STRING, &options.nlist, 0, "node list file", "filename" },
+               { "notification-script", 0, POPT_ARG_STRING, &options.notification_script, 0, "notification script", "filename" },
+               { "listen", 0, POPT_ARG_STRING, &options.myaddress, 0, "address to listen on", "address" },
+               { "transport", 0, POPT_ARG_STRING, &options.transport, 0, "protocol transport", NULL },
+               { "dbdir", 0, POPT_ARG_STRING, &options.db_dir, 0, "directory for the tdb files", NULL },
+               { "dbdir-persistent", 0, POPT_ARG_STRING, &options.db_dir_persistent, 0, "directory for persistent tdb files", NULL },
+               { "dbdir-state", 0, POPT_ARG_STRING, &options.db_dir_state, 0, "directory for internal state tdb files", NULL },
+               { "reclock", 0, POPT_ARG_STRING, &options.recovery_lock_file, 0, "location of recovery lock file", "filename" },
+               { "pidfile", 0, POPT_ARG_STRING, &ctdbd_pidfile, 0, "location of PID file", "filename" },
+               { "valgrinding", 0, POPT_ARG_NONE, &options.valgrinding, 0, "disable setscheduler SCHED_FIFO call, use mmap for tdbs", NULL },
+               { "nosetsched", 0, POPT_ARG_NONE, &options.nosetsched, 0, "disable setscheduler SCHED_FIFO call", NULL },
+               { "syslog", 0, POPT_ARG_NONE, &options.use_syslog, 0, "log messages to syslog", NULL },
+               { "start-as-disabled", 0, POPT_ARG_NONE, &options.start_as_disabled, 0, "Node starts in disabled state", NULL },
+               { "start-as-stopped", 0, POPT_ARG_NONE, &options.start_as_stopped, 0, "Node starts in stopped state", NULL },
+               { "no-lmaster", 0, POPT_ARG_NONE, &options.no_lmaster, 0, "disable lmaster role on this node", NULL },
+               { "no-recmaster", 0, POPT_ARG_NONE, &options.no_recmaster, 0, "disable recmaster role on this node", NULL },
+               { "lvs", 0, POPT_ARG_NONE, &options.lvs, 0, "lvs is enabled on this node", NULL },
+               { "script-log-level", 0, POPT_ARG_INT, &options.script_log_level, 0, "log level of event script output", NULL },
+               { "nopublicipcheck", 0, POPT_ARG_NONE, &options.no_publicipcheck, 0, "don't check we have/don't have the correct public ip addresses", NULL },
+               { "max-persistent-check-errors", 0, POPT_ARG_INT,
+                 &options.max_persistent_check_errors, 0,
+                 "max allowed persistent check errors (default 0)", NULL },
+               { "log-ringbuf-size", 0, POPT_ARG_INT, &log_ringbuf_size, 0, "Number of log messages we can store in the memory ringbuffer", NULL },
+               { "sloppy-start", 0, POPT_ARG_NONE, &sloppy_start, 0, "Do not perform full recovery on start", NULL },
+               POPT_TABLEEND
+       };
+       int opt, ret;
+       const char **extra_argv;
+       int extra_argc = 0;
+       poptContext pc;
+       struct event_context *ev;
+
+       pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+       while ((opt = poptGetNextOpt(pc)) != -1) {
+               switch (opt) {
+               default:
+                       fprintf(stderr, "Invalid option %s: %s\n", 
+                               poptBadOption(pc, 0), poptStrerror(opt));
+                       exit(1);
+               }
+       }
+       fast_start = (sloppy_start != 0);
+       /* setup the remaining options for the main program to use */
+       extra_argv = poptGetArgs(pc);
+       if (extra_argv) {
+               extra_argv++;
+               while (extra_argv[extra_argc]) extra_argc++;
+       }
+
+       talloc_enable_null_tracking();
+
+       ctdb_block_signal(SIGPIPE);
+       fault_setup("ctdbd");
+
+       ev = event_context_init(NULL);
+       tevent_loop_allow_nesting(ev);
+
+       ctdb = ctdb_cmdline_init(ev);
+
+       ctdb->start_as_disabled = options.start_as_disabled;
+       ctdb->start_as_stopped  = options.start_as_stopped;
+
+       script_log_level = options.script_log_level;
+
+       ret = ctdb_set_logfile(ctdb, options.logfile, options.use_syslog);
+       if (ret == -1) {
+               printf("ctdb_set_logfile to %s failed - %s\n", 
+                      options.use_syslog?"syslog":options.logfile, ctdb_errstr(ctdb));
+               exit(1);
+       }
+
+       DEBUG(DEBUG_NOTICE,("CTDB starting on node\n"));
+
+       gettimeofday(&ctdb->ctdbd_start_time, NULL);
+       gettimeofday(&ctdb->last_recovery_started, NULL);
+       gettimeofday(&ctdb->last_recovery_finished, NULL);
+       ctdb->recovery_mode    = CTDB_RECOVERY_NORMAL;
+       ctdb->recovery_master  = (uint32_t)-1;
+       ctdb->upcalls          = &ctdb_upcalls;
+       ctdb->idr              = idr_init(ctdb);
+       ctdb->recovery_lock_fd = -1;
+
+       ctdb_tunables_set_defaults(ctdb);
+
+       ret = ctdb_set_recovery_lock_file(ctdb, options.recovery_lock_file);
+       if (ret == -1) {
+               DEBUG(DEBUG_ALERT,("ctdb_set_recovery_lock_file failed - %s\n", ctdb_errstr(ctdb)));
+               exit(1);
+       }
+
+       ret = ctdb_set_transport(ctdb, options.transport);
+       if (ret == -1) {
+               DEBUG(DEBUG_ALERT,("ctdb_set_transport failed - %s\n", ctdb_errstr(ctdb)));
+               exit(1);
+       }
+
+       /* tell ctdb what address to listen on */
+       if (options.myaddress) {
+               ret = ctdb_set_address(ctdb, options.myaddress);
+               if (ret == -1) {
+                       DEBUG(DEBUG_ALERT,("ctdb_set_address failed - %s\n", ctdb_errstr(ctdb)));
+                       exit(1);
+               }
+       }
+
+       /* set ctdbd capabilities */
+       ctdb->capabilities = 0;
+       if (options.no_lmaster == 0) {
+               ctdb->capabilities |= CTDB_CAP_LMASTER;
+       }
+       if (options.no_recmaster == 0) {
+               ctdb->capabilities |= CTDB_CAP_RECMASTER;
+       }
+       if (options.lvs != 0) {
+               ctdb->capabilities |= CTDB_CAP_LVS;
+       }
+
+       /* Initialise this node's PNN to the unknown value.  This will
+        * be set to the correct value by either ctdb_add_node() as
+        * part of loading the nodes file or by
+        * ctdb_tcp_listen_automatic() when the transport is
+        * initialised.  At some point we should de-optimise this and
+        * pull it out into ctdb_start_daemon() so it is done clearly
+        * and only in one place.
+        */
+       ctdb->pnn = -1;
+
+       /* Default value for CTDB_BASE - don't override */
+       setenv("CTDB_BASE", ETCDIR "/ctdb", 0);
+
+       /* tell ctdb what nodes are available */
+       if (options.nlist != NULL) {
+               ctdb->nodes_file = options.nlist;
+       } else {
+               ctdb->nodes_file =
+                       talloc_asprintf(ctdb, "%s/nodes", getenv("CTDB_BASE"));
+               if (ctdb->nodes_file == NULL) {
+                       DEBUG(DEBUG_ALERT,(__location__ " Out of memory\n"));
+                       exit(1);
+               }
+       }
+       ctdb_load_nodes_file(ctdb);
+
+       ctdb->db_directory = options.db_dir;
+       ctdb_mkdir_p_or_die(ctdb, ctdb->db_directory, 0700);
+
+       ctdb->db_directory_persistent = options.db_dir_persistent;
+       ctdb_mkdir_p_or_die(ctdb, ctdb->db_directory_persistent, 0700);
+
+       ctdb->db_directory_state = options.db_dir_state;
+       ctdb_mkdir_p_or_die(ctdb, ctdb->db_directory_state, 0700);
+
+       if (options.public_interface) {
+               ctdb->default_public_interface = talloc_strdup(ctdb, options.public_interface);
+               CTDB_NO_MEMORY(ctdb, ctdb->default_public_interface);
+       }
+
+       if (options.single_public_ip) {
+               if (options.public_interface == NULL) {
+                       DEBUG(DEBUG_ALERT,("--single-public-ip used but --public-interface is not specified. You must specify the public interface when using single public ip. Exiting\n"));
+                       exit(10);
+               }
+
+               ret = ctdb_set_single_public_ip(ctdb, options.public_interface,
+                                               options.single_public_ip);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ALERT,("Invalid --single-public-ip argument : %s . This is not a valid ip address. Exiting.\n", options.single_public_ip));
+                       exit(10);
+               }
+       }
+
+       if (options.event_script_dir != NULL) {
+               ctdb->event_script_dir = options.event_script_dir;
+       } else {
+               ctdb->event_script_dir = talloc_asprintf(ctdb, "%s/events.d",
+                                                        getenv("CTDB_BASE"));
+               if (ctdb->event_script_dir == NULL) {
+                       DEBUG(DEBUG_ALERT,(__location__ " Out of memory\n"));
+                       exit(1);
+               }
+       }
+
+       if (options.notification_script != NULL) {
+               ret = ctdb_set_notification_script(ctdb, options.notification_script);
+               if (ret == -1) {
+                       DEBUG(DEBUG_ALERT,("Unable to setup notification script\n"));
+                       exit(1);
+               }
+       }
+
+       ctdb->valgrinding = options.valgrinding;
+       if (options.valgrinding || options.nosetsched) {
+               ctdb->do_setsched = 0;
+       } else {
+               ctdb->do_setsched = 1;
+       }
+
+       ctdb->public_addresses_file = options.public_address_list;
+       ctdb->do_checkpublicip = !options.no_publicipcheck;
+
+       if (options.max_persistent_check_errors < 0) {
+               ctdb->max_persistent_check_errors = 0xFFFFFFFFFFFFFFFFLL;
+       } else {
+               ctdb->max_persistent_check_errors = (uint64_t)options.max_persistent_check_errors;
+       }
+
+       /* start the protocol running (as a child) */
+       return ctdb_start_daemon(ctdb, interactive?false:true, options.use_syslog);
+}
diff --git a/ctdb/server/eventscript.c b/ctdb/server/eventscript.c
new file mode 100644 (file)
index 0000000..d13e944
--- /dev/null
@@ -0,0 +1,1193 @@
+/* 
+   event script handling
+
+   Copyright (C) Andrew Tridgell  2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include <time.h>
+#include "system/filesys.h"
+#include "system/wait.h"
+#include "system/dir.h"
+#include "system/locale.h"
+#include "../include/ctdb_private.h"
+#include "../common/rb_tree.h"
+#include "lib/util/dlinklist.h"
+
+static void ctdb_event_script_timeout(struct event_context *ev, struct timed_event *te, struct timeval t, void *p);
+
+/*
+  ctdbd sends us a SIGTERM when we should die.
+ */
+static void sigterm(int sig)
+{
+       pid_t pid;
+
+       /* all the child processes will be running in the same process group */
+       pid = getpgrp();
+       if (pid == -1) {
+               kill(-getpid(), SIGKILL);
+       } else {
+               kill(-pid, SIGKILL);
+       }
+       _exit(1);
+}
+
+/* This is attached to the event script state. */
+struct event_script_callback {
+       struct event_script_callback *next, *prev;
+       struct ctdb_context *ctdb;
+
+       /* Warning: this can free us! */
+       void (*fn)(struct ctdb_context *, int, void *);
+       void *private_data;
+};
+       
+
+struct ctdb_event_script_state {
+       struct ctdb_context *ctdb;
+       struct event_script_callback *callback;
+       pid_t child;
+       int fd[2];
+       bool from_user;
+       enum ctdb_eventscript_call call;
+       const char *options;
+       struct timeval timeout;
+       
+       unsigned int current;
+       struct ctdb_scripts_wire *scripts;
+};
+
+static struct ctdb_script_wire *get_current_script(struct ctdb_event_script_state *state)
+{
+       return &state->scripts->scripts[state->current];
+}
+
+/* called from ctdb_logging when we have received output on STDERR from
+ * one of the eventscripts
+ */
+static void log_event_script_output(const char *str, uint16_t len, void *p)
+{
+       struct ctdb_event_script_state *state
+               = talloc_get_type(p, struct ctdb_event_script_state);
+       struct ctdb_script_wire *current;
+       unsigned int slen, min;
+
+       /* We may have been aborted to run something else.  Discard */
+       if (state->scripts == NULL) {
+               return;
+       }
+
+       current = get_current_script(state);
+
+       /* Append, but don't overfill buffer.  It starts zero-filled. */
+       slen = strlen(current->output);
+       min = MIN(len, sizeof(current->output) - slen - 1);
+
+       memcpy(current->output + slen, str, min);
+}
+
+/* Return the saved status blob of the last run of call_type (not a copy). */
+int32_t ctdb_control_get_event_script_status(struct ctdb_context *ctdb,
+                                            uint32_t call_type,
+                                            TDB_DATA *outdata)
+{
+       if (call_type >= CTDB_EVENT_MAX) {
+               return -1;
+       }
+       /* If it's never been run, return nothing so they can tell. */
+       outdata->dsize = 0;
+       outdata->dptr  = NULL;
+       if (ctdb->last_status[call_type] != NULL) {
+               outdata->dsize = talloc_get_size(ctdb->last_status[call_type]);
+               outdata->dptr  = (uint8_t *)ctdb->last_status[call_type];
+       }
+       return 0;
+}
+
+struct ctdb_script_tree_item {
+       const char *name;
+       int error;
+};
+
+/* Return true if dir/name can be stat()ed and is owner-executable; otherwise
+ * return false with errno set (ENOMEM, the stat() error, or ENOEXEC). */
+static bool check_executable(const char *dir, const char *name)
+{
+       struct stat st;
+       int err = 0;
+       char *full = talloc_asprintf(NULL, "%s/%s", dir, name);
+
+       if (full == NULL) {
+               /* the caller records errno on failure: never leave it stale */
+               errno = ENOMEM;
+               return false;
+       }
+       if (stat(full, &st) != 0) {
+               err = errno;
+               DEBUG(DEBUG_ERR,("Could not stat event script %s: %s\n",
+                                full, strerror(err)));
+       } else if (!(st.st_mode & S_IXUSR)) {
+               err = ENOEXEC;
+               DEBUG(DEBUG_DEBUG,("Event script %s is not executable. Ignoring this event script\n", full));
+       }
+       talloc_free(full);
+       if (err != 0) {
+               errno = err;    /* logging and free may have clobbered it */
+       }
+       return err == 0;
+}
+
+/* List the "NN.name" entries of ctdb->event_script_dir as a ctdb_scripts_wire
+ * owned by mem_ctx (NULL on failure).  Entries failing check_executable()
+ * are still listed, with status set to the negated errno. */
+static struct ctdb_scripts_wire *ctdb_get_script_list(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx)
+{
+       DIR *dir;
+       struct dirent *de;
+       struct stat st;
+       trbt_tree_t *tree;
+       struct ctdb_scripts_wire *scripts;
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       struct ctdb_script_tree_item *tree_item;
+       int count;
+
+       /* the service specific event scripts */
+       if (stat(ctdb->event_script_dir, &st) != 0 &&
+           errno == ENOENT) {
+               DEBUG(DEBUG_CRIT,("No event script directory found at '%s'\n", ctdb->event_script_dir));
+               talloc_free(tmp_ctx);
+               return NULL;
+       }
+
+       /* create a tree to store all the script names in */
+       tree = trbt_create(tmp_ctx, 0);
+
+       /* scan all directory entries and insert all valid scripts into the
+          tree */
+       dir = opendir(ctdb->event_script_dir);
+       if (dir == NULL) {
+               DEBUG(DEBUG_CRIT,("Failed to open event script directory '%s'\n", ctdb->event_script_dir));
+               talloc_free(tmp_ctx);
+               return NULL;
+       }
+
+       count = 0;
+       while ((de=readdir(dir)) != NULL) {
+               int namlen;
+               unsigned num;
+
+               namlen = strlen(de->d_name);
+
+               if (namlen < 3) {
+                       continue;
+               }
+
+               if (de->d_name[namlen-1] == '~') {
+                       /* skip files emacs left behind */
+                       continue;
+               }
+
+               if (de->d_name[2] != '.') {
+                       continue;
+               }
+
+               if (sscanf(de->d_name, "%02u.", &num) != 1) {
+                       continue;
+               }
+
+               if (strlen(de->d_name) > MAX_SCRIPT_NAME) {
+                       DEBUG(DEBUG_ERR,("Script name %s too long! %u chars max\n",
+                                        de->d_name, MAX_SCRIPT_NAME));
+                       continue;
+               }
+
+               tree_item = talloc(tree, struct ctdb_script_tree_item);
+               if (tree_item == NULL) {
+                       DEBUG(DEBUG_ERR, (__location__ " Failed to allocate new tree item\n"));
+                       closedir(dir);
+                       talloc_free(tmp_ctx);
+                       return NULL;
+               }
+
+               tree_item->error = 0;
+               if (!check_executable(ctdb->event_script_dir, de->d_name)) {
+                       tree_item->error = errno;
+               }
+
+               tree_item->name = talloc_strdup(tree_item, de->d_name);
+               if (tree_item->name == NULL) {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to allocate script name.\n"));
+                       closedir(dir);
+                       talloc_free(tmp_ctx);
+                       return NULL;
+               }
+
+               /* store the event script in the tree */
+               trbt_insert32(tree, (num<<16)|count++, tree_item);
+       }
+       closedir(dir);
+
+       /* Overallocates by one, but that's OK */
+       scripts = talloc_zero_size(tmp_ctx,
+                                  sizeof(*scripts)
+                                  + sizeof(scripts->scripts[0]) * count);
+       if (scripts == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " Failed to allocate scripts\n"));
+               talloc_free(tmp_ctx);
+               return NULL;
+       }
+       scripts->num_scripts = count;
+
+       for (count = 0; count < scripts->num_scripts; count++) {
+               tree_item = trbt_findfirstarray32(tree, 1);
+
+               strcpy(scripts->scripts[count].name, tree_item->name);
+               scripts->scripts[count].status = -tree_item->error;
+
+               /* remove this script from the tree */
+               talloc_free(tree_item);
+       }
+
+       talloc_steal(mem_ctx, scripts);
+       talloc_free(tmp_ctx);
+       return scripts;
+}
+
+/*
+  Prepare a freshly forked eventscript child: move it into its own
+  process group and install the sigterm handler for SIGTERM.
+  Returns 0 on success, or -errno if setpgid() fails.
+ */
+static int child_setup(struct ctdb_context *ctdb)
+{
+       int saved_errno;
+
+       if (setpgid(0, 0) == 0) {
+               signal(SIGTERM, sigterm);
+               return 0;
+       }
+
+       /* Take one copy of errno so the log text and the result agree. */
+       saved_errno = errno;
+       DEBUG(DEBUG_ERR,("Failed to create process group for event scripts - %s\n",
+                strerror(saved_errno)));
+       return -saved_errno;
+}
+
+/*
+  Build the shell command line used to run one event script.
+
+  The command is "<event_script_dir>/<scriptname> <event> <options>",
+  prefixed with "CTDB_CALLED_BY_USER=1 " when from_user is set.
+  The string is allocated on ctx; NULL is returned if allocation fails.
+ */
+static char *child_command_string(struct ctdb_context *ctdb,
+                                      TALLOC_CTX *ctx,
+                                      bool from_user,
+                                      const char *scriptname,
+                                      enum ctdb_eventscript_call call,
+                                      const char *options)
+{
+       const char *env_prefix = "";
+       bool run_as_status;
+
+       if (from_user) {
+               env_prefix = "CTDB_CALLED_BY_USER=1 ";
+       }
+
+       /* The monitor event can be configured to run from an external
+          source; in that case an internally triggered monitor call is
+          replaced by a "status" event that just picks up the actual
+          status of the event asynchronously.
+       */
+       run_as_status = !from_user
+               && (call == CTDB_EVENT_MONITOR)
+               && (ctdb->tunable.use_status_events_for_monitoring != 0);
+
+       if (!run_as_status) {
+               return talloc_asprintf(ctx, "%s%s/%s %s %s",
+                                      env_prefix,
+                                      ctdb->event_script_dir,
+                                      scriptname,
+                                      ctdb_eventscript_call_names[call],
+                                      options);
+       }
+
+       return talloc_asprintf(ctx, "%s%s/%s %s",
+                              env_prefix,
+                              ctdb->event_script_dir,
+                              scriptname, "status");
+}
+
+/*
+  Run one event script command through system() and translate the result
+  into the status reported for that script.
+
+  Returns the exit code of the command, -errno if system() itself failed,
+  -EINTR if the command was terminated by a signal, or the negated errno
+  left by check_executable() when an exit code of 126/127 is explained by
+  the script no longer being runnable.
+ */
+static int child_run_one(struct ctdb_context *ctdb,
+                        const char *scriptname, const char *cmdstr)
+{
+       int ret;
+
+       ret = system(cmdstr);
+       if (ret == -1) {
+               /* system() could not create or wait for the child. */
+               return -errno;
+       }
+
+       if (WIFSIGNALED(ret)) {
+               /* WEXITSTATUS() is only meaningful after a normal exit;
+                  without this check a command killed by a signal would
+                  be decoded as exit code 0, i.e. success. */
+               DEBUG(DEBUG_ERR,("Script %s terminated by signal %d\n",
+                                cmdstr, WTERMSIG(ret)));
+               return -EINTR;
+       }
+
+       /* translate the wait status into the return code from the command */
+       ret = WEXITSTATUS(ret);
+
+       /* 127 could mean it does not exist, 126 non-executable. */
+       if (ret == 127 || ret == 126) {
+               /* Re-check it... */
+               if (!check_executable(ctdb->event_script_dir, scriptname)) {
+                       /* Save errno before logging, which may change it. */
+                       int saved_errno = errno;
+
+                       DEBUG(DEBUG_ERR,("Script %s returned status %u. Someone just deleted it?\n",
+                                        cmdstr, ret));
+                       ret = -saved_errno;
+               }
+       }
+       return ret;
+}
+
+/*
+  Actually run one event script
+  this function is called and run in the context of a forked child
+  which allows it to do blocking calls such as system()
+
+  Returns the status to report for the script: the error from
+  child_setup(), -ENOMEM if the command string cannot be built, the
+  pre-recorded current->status if that is non-zero (the script is then
+  not executed), or otherwise the result of child_run_one().
+ */
+static int child_run_script(struct ctdb_context *ctdb,
+                           bool from_user,
+                           enum ctdb_eventscript_call call,
+                           const char *options,
+                           struct ctdb_script_wire *current)
+{
+       char *cmdstr;
+       int ret;
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+
+       ret = child_setup(ctdb);
+       if (ret != 0)
+               goto out;
+
+       cmdstr = child_command_string(ctdb, tmp_ctx, from_user,
+                                     current->name, call, options);
+       if (cmdstr == NULL) {
+               /* Leave through "out" so that tmp_ctx is released on
+                  this path as well. */
+               DEBUG(DEBUG_ERR, (__location__ " Out of memory building event script command\n"));
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       DEBUG(DEBUG_DEBUG,("Executing event script %s\n",cmdstr));
+
+       /* A non-zero status recorded beforehand means the script is not
+          to be run; report that status unchanged. */
+       if (current->status) {
+               ret = current->status;
+               goto out;
+       }
+
+       ret = child_run_one(ctdb, current->name, cmdstr);
+out:
+       talloc_free(tmp_ctx);
+       return ret;
+}
+
+static void ctdb_event_script_handler(struct event_context *ev, struct fd_event *fde,
+                                     uint16_t flags, void *p);
+
+/*
+  Fork a child that runs the current script of this run and arrange for
+  ctdb_event_script_handler() to be called when the child reports back.
+
+  The child writes its int result to a pipe and exits; the parent keeps
+  the read end in state->fd[0] and watches it for readability.
+
+  Returns 0 on success or a negative errno value on failure.  If the
+  failure happens after the fork, state->child is left set so that the
+  caller's talloc_free(state) terminates the child via the destructor.
+ */
+static int fork_child_for_script(struct ctdb_context *ctdb,
+                                struct ctdb_event_script_state *state)
+{
+       int r;
+       struct tevent_fd *fde;
+       struct ctdb_script_wire *current = get_current_script(state);
+
+       current->start = timeval_current();
+
+       r = pipe(state->fd);
+       if (r != 0) {
+               /* Save errno before logging, which may change it. */
+               r = -errno;
+               DEBUG(DEBUG_ERR, (__location__ " pipe failed for child eventscript process\n"));
+               return r;
+       }
+
+       if (!ctdb_fork_with_logging(state, ctdb, current->name, log_event_script_output,
+                                   state, &state->child)) {
+               r = -errno;
+               close(state->fd[0]);
+               close(state->fd[1]);
+               return r;
+       }
+
+       /* If we are the child, do the work. */
+       if (state->child == 0) {
+               int rt;
+
+               debug_extra = talloc_asprintf(NULL, "eventscript-%s-%s:",
+                                             current->name,
+                                             ctdb_eventscript_call_names[state->call]);
+               close(state->fd[0]);
+               set_close_on_exec(state->fd[1]);
+               ctdb_set_process_name("ctdb_eventscript");
+
+               rt = child_run_script(ctdb, state->from_user, state->call, state->options, current);
+               /* We must be able to write PIPEBUF bytes at least; if this
+                  somehow fails, the read in ctdb_event_script_handler()
+                  will be short and is reported there as -EIO. */
+               write(state->fd[1], &rt, sizeof(rt));
+               close(state->fd[1]);
+               _exit(rt);
+       }
+
+       close(state->fd[1]);
+       set_close_on_exec(state->fd[0]);
+
+       DEBUG(DEBUG_DEBUG, (__location__ " Created PIPE FD:%d to child eventscript process\n", state->fd[0]));
+
+       /* Set ourselves up to be called when that's done. */
+       fde = event_add_fd(ctdb->ev, state, state->fd[0], EVENT_FD_READ,
+                          ctdb_event_script_handler, state);
+       if (fde == NULL) {
+               /* Without the fd event nobody would ever collect the
+                  result; close our end and report the failure. */
+               DEBUG(DEBUG_ERR, (__location__ " Failed to add fd event for child eventscript process\n"));
+               close(state->fd[0]);
+               return -ENOMEM;
+       }
+       tevent_fd_set_auto_close(fde);
+
+       return 0;
+}
+
+/*
+ Summarize status of this run of scripts.
+
+ Returns the status of the first script that counts as a failure, or 0
+ if there is none.  A status of 0, -ENOENT or -ENOEXEC (script missing
+ or disabled) does not count as a failure.
+ */
+static int script_status(struct ctdb_scripts_wire *scripts)
+{
+       unsigned int idx;
+
+       for (idx = 0; idx < scripts->num_scripts; idx++) {
+               if (scripts->scripts[idx].status == 0) {
+                       /* No problem. */
+                       continue;
+               }
+               if (scripts->scripts[idx].status == -ENOENT ||
+                   scripts->scripts[idx].status == -ENOEXEC) {
+                       /* Disabled or missing; that's OK. */
+                       continue;
+               }
+               return scripts->scripts[idx].status;
+       }
+
+       /* All OK! */
+       return 0;
+}
+
+/*
+  called when child is finished
+
+  fd event handler for the read end of the pipe set up in
+  fork_child_for_script().  Reads the int status written by the child,
+  records it against the current script, and then either ends the run
+  (freeing state) or forks the child for the next script.
+ */
+static void ctdb_event_script_handler(struct event_context *ev, struct fd_event *fde, 
+                                     uint16_t flags, void *p)
+{
+       struct ctdb_event_script_state *state = 
+               talloc_get_type(p, struct ctdb_event_script_state);
+       struct ctdb_script_wire *current = get_current_script(state);
+       struct ctdb_context *ctdb = state->ctdb;
+       int r, status;
+
+       if (ctdb == NULL) {
+               DEBUG(DEBUG_ERR,("Eventscript finished but ctdb is NULL\n"));
+               return;
+       }
+
+       /* A failed read is recorded as -errno, a short read as -EIO. */
+       r = read(state->fd[0], &current->status, sizeof(current->status));
+       if (r < 0) {
+               current->status = -errno;
+       } else if (r != sizeof(current->status)) {
+               current->status = -EIO;
+       }
+
+       current->finished = timeval_current();
+       /* valgrind gets overloaded if we run next script as it's still doing
+        * post-execution analysis, so kill finished child here. */
+       if (ctdb->valgrinding) {
+               ctdb_kill(ctdb, state->child, SIGKILL);
+       }
+
+       /* Clear the pid so that freeing state does not signal this child. */
+       state->child = 0;
+
+       status = script_status(state->scripts);
+
+       /* Aborted or finished all scripts?  We're done. */
+       if (status != 0 || state->current+1 == state->scripts->num_scripts) {
+               DEBUG(DEBUG_INFO,(__location__ " Eventscript %s %s finished with state %d\n",
+                                 ctdb_eventscript_call_names[state->call], state->options, status));
+
+               ctdb->event_script_timeouts = 0;
+               talloc_free(state);
+               return;
+       }
+
+       /* Forget about that old fd. */
+       talloc_free(fde);
+
+       /* Next script! */
+       state->current++;
+       current++;
+       current->status = fork_child_for_script(ctdb, state);
+       if (current->status != 0) {
+               /* This calls the callback. */
+               talloc_free(state);
+       }
+}
+
+/* What is remembered about a timed-out event script while the debug
+ * script runs; freeing it kills the hung script (see the destructor). */
+struct debug_hung_script_state {
+       struct ctdb_context *ctdb;              /* context passed to ctdb_kill() */
+       pid_t child;                            /* pid of the hung child; 0 means nothing to kill */
+       enum ctdb_eventscript_call call;        /* event that was running, passed by name to the debug script */
+};
+
+/*
+  talloc destructor: send SIGKILL to the recorded hung child, if any.
+  Always returns 0.
+ */
+static int debug_hung_script_state_destructor(struct debug_hung_script_state *state)
+{
+       if (!state->child) {
+               /* No pid recorded, nothing to kill. */
+               return 0;
+       }
+
+       ctdb_kill(state->ctdb, state->child, SIGKILL);
+       return 0;
+}
+
+/*
+  Timer handler registered by ctdb_run_debug_hung_script(): frees the
+  debug state passed in p.
+ */
+static void debug_hung_script_timeout(struct tevent_context *ev, struct tevent_timer *te,
+                                     struct timeval t, void *p)
+{
+       struct debug_hung_script_state *debug_state;
+
+       debug_state = talloc_get_type(p, struct debug_hung_script_state);
+       talloc_free(debug_state);
+}
+
+/*
+  fd handler registered by ctdb_run_debug_hung_script() on the read end
+  of its pipe: frees the debug state passed in p.
+ */
+static void debug_hung_script_done(struct tevent_context *ev, struct tevent_fd *fde,
+                                  uint16_t flags, void *p)
+{
+       struct debug_hung_script_state *debug_state;
+
+       debug_state = talloc_get_type(p, struct debug_hung_script_state);
+       talloc_free(debug_state);
+}
+
+/*
+  Fork a child that runs the debug-hung-script helper against the hung
+  event script recorded in state.
+
+  The helper is ETCDIR "/ctdb/debug-hung-script.sh" unless the
+  CTDB_DEBUG_HUNG_SCRIPT environment variable names another one; it is
+  given the hung pid and the event name as arguments.
+
+  state is freed (its destructor kills the hung script) when the read end
+  of the pipe shared with the child becomes readable, or after
+  ctdb->tunable.script_timeout seconds.  If the helper cannot be started
+  or tracked, state is freed straight away so that the hung script is
+  still killed instead of being left running.
+ */
+static void ctdb_run_debug_hung_script(struct ctdb_context *ctdb, struct debug_hung_script_state *state)
+{
+       pid_t pid;
+       const char * debug_hung_script = ETCDIR "/ctdb/debug-hung-script.sh";
+       int fd[2];
+       struct tevent_timer *ttimer;
+       struct tevent_fd *tfd;
+
+       if (pipe(fd) < 0) {
+               DEBUG(DEBUG_ERR,("Failed to create pipe fd for debug hung script\n"));
+               goto fail;
+       }
+
+       if (!ctdb_fork_with_logging(ctdb, ctdb, "Hung script", NULL, NULL, &pid)) {
+               DEBUG(DEBUG_ERR,("Failed to fork a child process with logging to track hung event script\n"));
+               close(fd[0]);
+               close(fd[1]);
+               goto fail;
+       }
+       if (pid == -1) {
+               DEBUG(DEBUG_ERR,("Fork for debug script failed : %s\n",
+                                strerror(errno)));
+               close(fd[0]);
+               close(fd[1]);
+               goto fail;
+       }
+       if (pid == 0) {
+               char *buf;
+               const char *env_script;
+
+               ctdb_set_process_name("ctdb_debug_hung_script");
+               env_script = getenv("CTDB_DEBUG_HUNG_SCRIPT");
+               if (env_script != NULL) {
+                       debug_hung_script = env_script;
+               }
+
+               /* Keep fd[1] open: it is closed when this child exits,
+                  which makes the parent's read end readable. */
+               close(fd[0]);
+
+               buf = talloc_asprintf(NULL, "%s %d %s",
+                                     debug_hung_script, state->child,
+                                     ctdb_eventscript_call_names[state->call]);
+               if (buf != NULL) {
+                       system(buf);
+                       talloc_free(buf);
+               }
+
+               _exit(0);
+       }
+
+       close(fd[1]);
+
+       ttimer = tevent_add_timer(ctdb->ev, state,
+                                 timeval_current_ofs(ctdb->tunable.script_timeout, 0),
+                                 debug_hung_script_timeout, state);
+       if (ttimer == NULL) {
+               close(fd[0]);
+               goto fail;
+       }
+
+       tfd = tevent_add_fd(ctdb->ev, state, fd[0], EVENT_FD_READ,
+                           debug_hung_script_done, state);
+       if (tfd == NULL) {
+               /* ttimer hangs off state and is released with it below. */
+               close(fd[0]);
+               goto fail;
+       }
+       tevent_fd_set_auto_close(tfd);
+       return;
+
+fail:
+       /* Nothing will free state later on these paths, so do it now;
+          the destructor kills the hung event script. */
+       talloc_free(state);
+}
+
+/*
+  called when child times out
+
+  Timer handler armed by ctdb_event_script_callback_v().  Records the
+  outcome for the current script (0 for the events listed below, -ETIME
+  otherwise), hands the hung pid over to a debug_hung_script_state, frees
+  the eventscript state and starts the debug-hung-script helper.
+ */
+static void ctdb_event_script_timeout(struct event_context *ev, struct timed_event *te, 
+                                     struct timeval t, void *p)
+{
+       struct ctdb_event_script_state *state = talloc_get_type(p, struct ctdb_event_script_state);
+       struct ctdb_context *ctdb = state->ctdb;
+       struct ctdb_script_wire *current = get_current_script(state);
+       struct debug_hung_script_state *debug_state;
+
+       DEBUG(DEBUG_ERR,("Event script '%s %s %s' timed out after %.1fs, count: %u, pid: %d\n",
+                        current->name, ctdb_eventscript_call_names[state->call], state->options,
+                        timeval_elapsed(&current->start),
+                        ctdb->event_script_timeouts, state->child));
+
+       /* ignore timeouts for these events */
+       switch (state->call) {
+       case CTDB_EVENT_START_RECOVERY:
+       case CTDB_EVENT_RECOVERED:
+       case CTDB_EVENT_TAKE_IP:
+       case CTDB_EVENT_RELEASE_IP:
+       case CTDB_EVENT_STATUS:
+               state->scripts->scripts[state->current].status = 0;
+               DEBUG(DEBUG_ERR,("Ignoring hung script for %s call %d\n", state->options, state->call));
+               break;
+        default:
+               state->scripts->scripts[state->current].status = -ETIME;
+       }
+
+       debug_state = talloc_zero(ctdb, struct debug_hung_script_state);
+       if (debug_state == NULL) {
+               /* state->child is still set here, so the eventscript
+                * destructor sends SIGTERM to the hung child itself. */
+               talloc_free(state);
+               return;
+       }
+
+       /* Save information useful for running debug hung script, so
+        * eventscript state can be freed.
+        */
+       debug_state->ctdb = ctdb;
+       debug_state->child = state->child;
+       debug_state->call = state->call;
+
+       /* This destructor will actually kill the hung event script */
+       talloc_set_destructor(debug_state, debug_hung_script_state_destructor);
+
+       /* Clear the pid so that freeing state does not signal the child;
+        * that is now left to debug_state's destructor. */
+       state->child = 0;
+       talloc_free(state);
+
+       ctdb_run_debug_hung_script(ctdb, debug_state);
+}
+
+/*
+  destroy an event script: kill it if ->child != 0.
+
+  talloc destructor for struct ctdb_event_script_state.  Besides sending
+  SIGTERM to a still-running child it stores the script results in
+  ctdb->last_status[call] and invokes the completion callback with the
+  summarized status, provided the callback is still registered.
+  Always returns 0.
+ */
+static int event_script_destructor(struct ctdb_event_script_state *state)
+{
+       int status;
+       struct event_script_callback *callback;
+
+       if (state->child) {
+               DEBUG(DEBUG_ERR,(__location__ " Sending SIGTERM to child pid:%d\n", state->child));
+
+               if (ctdb_kill(state->ctdb, state->child, SIGTERM) != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to kill child process for eventscript, errno %s(%d)\n", strerror(errno), errno));
+               }
+       }
+
+       /* If we were the current monitor, we no longer are. */
+       if (state->ctdb->current_monitor == state) {
+               state->ctdb->current_monitor = NULL;
+       }
+
+       /* Save our scripts as the last executed status, if we have them.
+        * See ctdb_event_script_callback_v where we abort monitor event. */
+       if (state->scripts) {
+               talloc_free(state->ctdb->last_status[state->call]);
+               state->ctdb->last_status[state->call] = state->scripts;
+               /* Keep only the entries up to and including the script
+                * that was current when the run ended. */
+               if (state->current < state->ctdb->last_status[state->call]->num_scripts) {
+                       state->ctdb->last_status[state->call]->num_scripts = state->current+1;
+               }
+       }
+
+       /* Use last status as result, or "OK" if none. */
+       if (state->ctdb->last_status[state->call]) {
+               status = script_status(state->ctdb->last_status[state->call]);
+       } else {
+               status = 0;
+       }
+
+       /* This is allowed to free us; talloc will prevent double free anyway,
+        * but beware if you call this outside the destructor!
+        * The callback hangs off a different context, so we walk the list
+        * of "active" callbacks until we find the one state points to.
+        * If we can't find it, it means the callback has been removed.
+        */
+       for (callback = state->ctdb->script_callbacks; callback != NULL; callback = callback->next) {
+               if (callback == state->callback) {
+                       break;
+               }
+       }
+       
+       state->callback = NULL;
+
+       if (callback) {
+               /* Make sure destructor doesn't free itself! */
+               talloc_steal(NULL, callback);
+               callback->fn(state->ctdb, status, callback->private_data);
+               talloc_free(callback);
+       }
+
+       return 0;
+}
+
+/*
+  Count the words in options, where words are separated by any run of
+  spaces and/or tabs.  Leading and trailing blanks are ignored, so an
+  empty or all-blank string has 0 words.
+ */
+static unsigned int count_words(const char *options)
+{
+       static const char blanks[] = " \t";
+       const char *cursor = options + strspn(options, blanks);
+       unsigned int total;
+
+       for (total = 0; *cursor != '\0'; total++) {
+               /* Step over one word, then over the blanks after it. */
+               cursor += strcspn(cursor, blanks);
+               cursor += strspn(cursor, blanks);
+       }
+       return total;
+}
+
+/*
+  Check that options holds the number of whitespace-separated arguments
+  expected for the given event.  Returns true if the count matches and
+  false if it does not or if the event is not known.
+ */
+static bool check_options(enum ctdb_eventscript_call call, const char *options)
+{
+       switch (call) {
+       /* These all take no arguments. */
+       case CTDB_EVENT_INIT:
+       case CTDB_EVENT_SETUP:
+       case CTDB_EVENT_STARTUP:
+       case CTDB_EVENT_START_RECOVERY:
+       case CTDB_EVENT_RECOVERED:
+       case CTDB_EVENT_MONITOR:
+       case CTDB_EVENT_STATUS:
+       case CTDB_EVENT_SHUTDOWN:
+       case CTDB_EVENT_RELOAD:
+       case CTDB_EVENT_IPREALLOCATED:
+               return count_words(options) == 0;
+
+       case CTDB_EVENT_TAKE_IP: /* interface, IP address, netmask bits. */
+       case CTDB_EVENT_RELEASE_IP:
+               return count_words(options) == 3;
+
+       case CTDB_EVENT_UPDATE_IP: /* old interface, new interface, IP address, netmask bits. */
+               return count_words(options) == 4;
+
+       default:
+               /* Leading space separates the text from the location
+                  prefix, as in the other messages of this file. */
+               DEBUG(DEBUG_ERR,(__location__ " Unknown ctdb_eventscript_call %u\n", call));
+               return false;
+       }
+}
+
+/*
+  talloc destructor for struct event_script_callback (installed in
+  ctdb_event_script_callback_v): unlinks the callback from
+  ctdb->script_callbacks.  Always returns 0.
+ */
+static int remove_callback(struct event_script_callback *callback)
+{
+       DLIST_REMOVE(callback->ctdb->script_callbacks, callback);
+       return 0;
+}
+
+/*
+  run the event script in the background, calling the callback when 
+  finished
+
+  The callback record is allocated on mem_ctx and registered in
+  ctdb->script_callbacks.  options are built from fmt/ap and must hold
+  the number of arguments expected for the event (see check_options).
+
+  Returns -1 if the run could not be set up; the callback is then not
+  called.  Returns 0 otherwise.  Note that with a return of 0 the
+  callback may already have been called before this function returns:
+  that happens when there are no scripts to run or when forking the
+  first script fails.
+ */
+static int ctdb_event_script_callback_v(struct ctdb_context *ctdb,
+                                       const void *mem_ctx,
+                                       void (*callback)(struct ctdb_context *, int, void *),
+                                       void *private_data,
+                                       bool from_user,
+                                       enum ctdb_eventscript_call call,
+                                       const char *fmt, va_list ap)
+{
+       struct ctdb_event_script_state *state;
+
+       if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
+               /* we guarantee that only some specifically allowed event scripts are run
+                  while in recovery */
+               const enum ctdb_eventscript_call allowed_calls[] = {
+                       CTDB_EVENT_INIT,
+                       CTDB_EVENT_SETUP,
+                       CTDB_EVENT_START_RECOVERY,
+                       CTDB_EVENT_SHUTDOWN,
+                       CTDB_EVENT_RELEASE_IP,
+                       CTDB_EVENT_IPREALLOCATED,
+               };
+               int i;
+               for (i=0;i<ARRAY_SIZE(allowed_calls);i++) {
+                       if (call == allowed_calls[i]) break;
+               }
+               if (i == ARRAY_SIZE(allowed_calls)) {
+                       DEBUG(DEBUG_ERR,("Refusing to run event scripts call '%s' while in recovery\n",
+                                ctdb_eventscript_call_names[call]));
+                       return -1;
+               }
+       }
+
+       /* Kill off any running monitor events to run this event. */
+       if (ctdb->current_monitor) {
+               struct ctdb_event_script_state *ms = talloc_get_type(ctdb->current_monitor, struct ctdb_event_script_state);
+
+               /* Cancel current monitor callback state only if monitoring
+                * context ctdb->monitor->monitor_context has not been freed */
+               if (ms->callback != NULL && !ctdb_stopped_monitoring(ctdb)) {
+                       ms->callback->fn(ctdb, -ECANCELED, ms->callback->private_data);
+                       talloc_free(ms->callback);
+               }
+
+               /* Discard script status so we don't save to last_status */
+               talloc_free(ctdb->current_monitor->scripts);
+               ctdb->current_monitor->scripts = NULL;
+               talloc_free(ctdb->current_monitor);
+               ctdb->current_monitor = NULL;
+       }
+
+       state = talloc(ctdb->event_script_ctx, struct ctdb_event_script_state);
+       CTDB_NO_MEMORY(ctdb, state);
+
+       /* The callback isn't done if the context is freed. */
+       state->callback = talloc(mem_ctx, struct event_script_callback);
+       if (state->callback == NULL) {
+               /* Release state as well instead of leaving it behind. */
+               DEBUG(DEBUG_ERR, (__location__ " could not allocate state->callback\n"));
+               talloc_free(state);
+               return -1;
+       }
+       DLIST_ADD(ctdb->script_callbacks, state->callback);
+       talloc_set_destructor(state->callback, remove_callback);
+       state->callback->ctdb         = ctdb;
+       state->callback->fn           = callback;
+       state->callback->private_data = private_data;
+
+       state->ctdb = ctdb;
+       state->from_user = from_user;
+       state->call = call;
+       state->options = talloc_vasprintf(state, fmt, ap);
+       state->timeout = timeval_set(ctdb->tunable.script_timeout, 0);
+       state->scripts = NULL;
+
+       /* Until the destructor is installed further down, freeing state
+          neither calls nor frees the callback record (it lives on
+          mem_ctx), so the failure paths below release it explicitly;
+          its own destructor unlinks it from ctdb->script_callbacks. */
+       if (state->options == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " could not allocate state->options\n"));
+               talloc_free(state->callback);
+               talloc_free(state);
+               return -1;
+       }
+       if (!check_options(state->call, state->options)) {
+               DEBUG(DEBUG_ERR, ("Bad eventscript options '%s' for %s\n",
+                                 state->options,
+                                 ctdb_eventscript_call_names[state->call]));
+               talloc_free(state->callback);
+               talloc_free(state);
+               return -1;
+       }
+
+       DEBUG(DEBUG_INFO,(__location__ " Starting eventscript %s %s\n",
+                         ctdb_eventscript_call_names[state->call],
+                         state->options));
+
+       /* This is not a child of state, since we save it in destructor. */
+       state->scripts = ctdb_get_script_list(ctdb, ctdb);
+       if (state->scripts == NULL) {
+               talloc_free(state->callback);
+               talloc_free(state);
+               return -1;
+       }
+       state->current = 0;
+       state->child = 0;
+
+       if (!from_user && (call == CTDB_EVENT_MONITOR || call == CTDB_EVENT_STATUS)) {
+               ctdb->current_monitor = state;
+       }
+
+       /* From here on freeing state reports the result via the callback. */
+       talloc_set_destructor(state, event_script_destructor);
+
+       /* Nothing to do? */
+       if (state->scripts->num_scripts == 0) {
+               talloc_free(state);
+               return 0;
+       }
+
+       state->scripts->scripts[0].status = fork_child_for_script(ctdb, state);
+       if (state->scripts->scripts[0].status != 0) {
+               /* Callback is called from destructor, with fail result. */
+               talloc_free(state);
+               return 0;
+       }
+
+       if (!timeval_is_zero(&state->timeout)) {
+               event_add_timed(ctdb->ev, state, timeval_current_ofs(state->timeout.tv_sec, state->timeout.tv_usec), ctdb_event_script_timeout, state);
+       } else {
+               DEBUG(DEBUG_ERR, (__location__ " eventscript %s %s called with no timeout\n",
+                                 ctdb_eventscript_call_names[state->call],
+                                 state->options));
+       }
+
+       return 0;
+}
+
+
+/*
+  Variadic front end to ctdb_event_script_callback_v(): run the event
+  script in the background and call the callback when finished.
+  If mem_ctx is freed, callback will never be called.
+  Returns whatever ctdb_event_script_callback_v() returns.
+ */
+int ctdb_event_script_callback(struct ctdb_context *ctdb, 
+                              TALLOC_CTX *mem_ctx,
+                              void (*callback)(struct ctdb_context *, int, void *),
+                              void *private_data,
+                              bool from_user,
+                              enum ctdb_eventscript_call call,
+                              const char *fmt, ...)
+{
+       int result;
+       va_list args;
+
+       /* Hand the variable arguments over as a va_list. */
+       va_start(args, fmt);
+       result = ctdb_event_script_callback_v(ctdb, mem_ctx,
+                                             callback, private_data,
+                                             from_user, call,
+                                             fmt, args);
+       va_end(args);
+
+       return result;
+}
+
+
+/* Result slot filled in by event_script_callback() for callers that
+ * wait for an event script run to complete. */
+struct callback_status {
+       bool done;      /* set to true once the callback has run */
+       int status;     /* status passed to the callback */
+};
+
+/*
+  Completion callback used by ctdb_event_script_args(): stores the final
+  status in the struct callback_status passed as private_data and marks
+  it as done.
+ */
+static void event_script_callback(struct ctdb_context *ctdb, int status, void *private_data)
+{
+       struct callback_status *result = private_data;
+
+       result->status = status;
+       result->done = true;
+}
+
+/*
+  run the event script, waiting for it to complete. Used when the caller
+  doesn't want to continue till the event script has finished.
+
+  Returns the non-zero result of ctdb_event_script_callback_v() if the
+  run could not be started, otherwise the status delivered to the
+  completion callback (-1 if the event loop failed before the callback
+  ran).  On -ETIME this node bans itself, except for the init and
+  shutdown events.
+ */
+int ctdb_event_script_args(struct ctdb_context *ctdb, enum ctdb_eventscript_call call,
+                          const char *fmt, ...)
+{
+       va_list ap;
+       int ret;
+       struct callback_status status;
+
+       /* This must be set up before starting the run:
+          ctdb_event_script_callback_v() calls the callback before it
+          returns when there are no scripts or the first fork fails.
+          Initialising afterwards would overwrite that result and leave
+          the loop below waiting for a callback that has already run. */
+       status.status = -1;
+       status.done = false;
+
+       va_start(ap, fmt);
+       ret = ctdb_event_script_callback_v(ctdb, ctdb,
+                       event_script_callback, &status, false, call, fmt, ap);
+       va_end(ap);
+       if (ret != 0) {
+               return ret;
+       }
+
+       while (status.done == false && event_loop_once(ctdb->ev) == 0) /* noop */;
+
+       if (status.status == -ETIME) {
+               DEBUG(DEBUG_ERR, (__location__ " eventscript for '%s' timedout."
+                                 " Immediately banning ourself for %d seconds\n",
+                                 ctdb_eventscript_call_names[call],
+                                 ctdb->tunable.recovery_ban_period));
+
+               /* Don't ban self if CTDB is starting up or shutting down */
+               if (call != CTDB_EVENT_INIT && call != CTDB_EVENT_SHUTDOWN) {
+                       ctdb_ban_self(ctdb);
+               }
+       }
+
+       return status.status;
+}
+
+/*
+  Run the event scripts for an event without arguments and wait for
+  them to complete.  Returns the result of ctdb_event_script_args().
+ */
+int ctdb_event_script(struct ctdb_context *ctdb, enum ctdb_eventscript_call call)
+{
+       int result;
+
+       /* GCC complains about empty format string, so use %s and "". */
+       result = ctdb_event_script_args(ctdb, call, "%s", "");
+       return result;
+}
+
+/* State kept for a forced eventscript run until its reply is sent. */
+struct eventscript_callback_state {
+       struct ctdb_req_control *c;     /* request to reply to; made a talloc child of this state in ctdb_run_eventscripts() */
+};
+
+/*
+  Completion callback for a forced eventscript run: turns monitoring
+  back on, sends the control reply carrying the run's status and frees
+  the callback state.
+ */
+static void run_eventscripts_callback(struct ctdb_context *ctdb, int status, 
+                                void *private_data)
+{
+       struct eventscript_callback_state *cb_state;
+
+       cb_state = talloc_get_type(private_data, struct eventscript_callback_state);
+
+       ctdb_enable_monitoring(ctdb);
+
+       if (status != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to run eventscripts\n"));
+       }
+
+       ctdb_request_control_reply(ctdb, cb_state->c, NULL, status, NULL);
+
+       /* The reply is out, so the state is no longer needed. */
+       talloc_free(cb_state);
+}
+
+
+/*
+  Parse an event name from the start of p, after any leading spaces/tabs.
+
+  On a match *call is set to the event and a pointer to the text just
+  after the name is returned.  The name must be followed by a space, a
+  tab or the end of the string.  Returns NULL if no event name matches;
+  *call is then CTDB_EVENT_MAX.
+ */
+static const char *get_call(const char *p, enum ctdb_eventscript_call *call)
+{
+       /* Skip any initial whitespace. */
+       const char *cursor = p + strspn(p, " \t");
+
+       /* Try each known event name in turn. */
+       for (*call = 0; *call < CTDB_EVENT_MAX; (*call)++) {
+               const char *name = ctdb_eventscript_call_names[*call];
+               size_t name_len = strlen(name);
+               char next;
+
+               if (strncmp(cursor, name, name_len) != 0) {
+                       continue;
+               }
+
+               /* Only a whole word counts as a match. */
+               next = cursor[name_len];
+               if (next == '\0' || next == ' ' || next == '\t') {
+                       return cursor + name_len;
+               }
+       }
+       return NULL;
+}
+
+/*
+  A control to force running of the eventscripts from the ctdb client tool
+
+  indata must hold a NUL-terminated string "<event> [options]".
+  Monitoring is disabled while the scripts run and enabled again when
+  they finish or when starting them fails.
+
+  Returns 0 with *async_reply set to true when the run was started; the
+  reply is then sent from run_eventscripts_callback().  Returns -1 on
+  failure.
+*/
+int32_t ctdb_run_eventscripts(struct ctdb_context *ctdb,
+               struct ctdb_req_control *c,
+               TDB_DATA indata, bool *async_reply)
+{
+       int ret;
+       struct eventscript_callback_state *state;
+       const char *options;
+       enum ctdb_eventscript_call call;
+
+       /* The data is parsed and logged as a C string below, so make
+          sure it is one, like the enable/disable script controls do. */
+       if (indata.dsize == 0 || indata.dptr == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " No event specified.\n"));
+               return -1;
+       }
+       if (indata.dptr[indata.dsize - 1] != '\0') {
+               DEBUG(DEBUG_ERR,(__location__ " String is not null terminated.\n"));
+               return -1;
+       }
+
+       /* Figure out what call they want. */
+       options = get_call((const char *)indata.dptr, &call);
+       if (!options) {
+               DEBUG(DEBUG_ERR, (__location__ " Invalid event name \"%s\"\n", (const char *)indata.dptr));
+               return -1;
+       }
+
+       if (ctdb->recovery_mode != CTDB_RECOVERY_NORMAL) {
+               DEBUG(DEBUG_ERR, (__location__ " Aborted running eventscript \"%s\" while in RECOVERY mode\n", indata.dptr));
+               return -1;
+       }
+
+       state = talloc(ctdb->event_script_ctx, struct eventscript_callback_state);
+       CTDB_NO_MEMORY(ctdb, state);
+
+       state->c = talloc_steal(state, c);
+
+       DEBUG(DEBUG_NOTICE,("Running eventscripts with arguments %s\n", indata.dptr));
+
+       ctdb_disable_monitoring(ctdb);
+
+       ret = ctdb_event_script_callback(ctdb, 
+                        state, run_eventscripts_callback, state,
+                        true, call, "%s", options);
+
+       if (ret != 0) {
+               ctdb_enable_monitoring(ctdb);
+               DEBUG(DEBUG_ERR,(__location__ " Failed to run eventscripts with arguments %s\n", indata.dptr));
+               /* NOTE(review): state owns c after the talloc_steal()
+                  above, so this frees c as well while *async_reply is
+                  left untouched - confirm the caller does not use c
+                  after a synchronous failure. */
+               talloc_free(state);
+               return -1;
+       }
+
+       /* tell ctdb_control.c that we will be replying asynchronously */
+       *async_reply = true;
+
+       return 0;
+}
+
+
+
+/*
+  Control handler: enable an event script by adding the owner execute
+  bit (S_IXUSR) to <event_script_dir>/<script>.
+
+  indata must hold the NUL-terminated script name, which may not contain
+  a '/'.  Returns 0 on success and -1 on any failure.
+ */
+int32_t ctdb_control_enable_script(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       const char *script = (char *)indata.dptr;
+       struct stat st;
+       char *filename;
+       int32_t result = -1;
+
+       /* Validate the request data first. */
+       if (indata.dsize == 0) {
+               DEBUG(DEBUG_ERR,(__location__ " No script specified.\n"));
+               goto done;
+       }
+       if (indata.dptr[indata.dsize - 1] != '\0') {
+               DEBUG(DEBUG_ERR,(__location__ " String is not null terminated.\n"));
+               goto done;
+       }
+       if (index(script,'/') != NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Script name contains '/'. Failed to enable script %s\n", script));
+               goto done;
+       }
+
+       /* Only a missing script directory is reported at this point. */
+       if (stat(ctdb->event_script_dir, &st) != 0 && 
+           errno == ENOENT) {
+               DEBUG(DEBUG_CRIT,("No event script directory found at '%s'\n", ctdb->event_script_dir));
+               goto done;
+       }
+
+       filename = talloc_asprintf(tmp_ctx, "%s/%s", ctdb->event_script_dir, script);
+       if (filename == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to create script path\n"));
+               goto done;
+       }
+
+       if (stat(filename, &st) != 0) {
+               DEBUG(DEBUG_ERR,("Could not stat event script %s. Failed to enable script.\n", filename));
+               goto done;
+       }
+
+       if (chmod(filename, st.st_mode | S_IXUSR) == -1) {
+               DEBUG(DEBUG_ERR,("Could not chmod %s. Failed to enable script.\n", filename));
+               goto done;
+       }
+
+       result = 0;
+done:
+       talloc_free(tmp_ctx);
+       return result;
+}
+
+/*
+  Control handler: disable an event script by clearing all execute bits
+  (owner, group and other) on <event_script_dir>/<script>.
+
+  indata must hold the NUL-terminated script name, which may not contain
+  a '/'.  Returns 0 on success and -1 on any failure.
+ */
+int32_t ctdb_control_disable_script(struct ctdb_context *ctdb, TDB_DATA indata)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       const char *script = (char *)indata.dptr;
+       struct stat st;
+       char *filename;
+       int32_t result = -1;
+
+       /* Validate the request data first. */
+       if (indata.dsize == 0) {
+               DEBUG(DEBUG_ERR,(__location__ " No script specified.\n"));
+               goto done;
+       }
+       if (indata.dptr[indata.dsize - 1] != '\0') {
+               DEBUG(DEBUG_ERR,(__location__ " String is not null terminated.\n"));
+               goto done;
+       }
+       if (index(script,'/') != NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Script name contains '/'. Failed to disable script %s\n", script));
+               goto done;
+       }
+
+       /* Only a missing script directory is reported at this point. */
+       if (stat(ctdb->event_script_dir, &st) != 0 && 
+           errno == ENOENT) {
+               DEBUG(DEBUG_CRIT,("No event script directory found at '%s'\n", ctdb->event_script_dir));
+               goto done;
+       }
+
+       filename = talloc_asprintf(tmp_ctx, "%s/%s", ctdb->event_script_dir, script);
+       if (filename == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to create script path\n"));
+               goto done;
+       }
+
+       if (stat(filename, &st) != 0) {
+               DEBUG(DEBUG_ERR,("Could not stat event script %s. Failed to disable script.\n", filename));
+               goto done;
+       }
+
+       if (chmod(filename, st.st_mode & ~(S_IXUSR|S_IXGRP|S_IXOTH)) == -1) {
+               DEBUG(DEBUG_ERR,("Could not chmod %s. Failed to disable script.\n", filename));
+               goto done;
+       }
+
+       result = 0;
+done:
+       talloc_free(tmp_ctx);
+       return result;
+}
diff --git a/ctdb/tcp/ctdb_tcp.h b/ctdb/tcp/ctdb_tcp.h
new file mode 100644 (file)
index 0000000..5b6b651
--- /dev/null
@@ -0,0 +1,60 @@
+/* 
+   ctdb database library
+
+   Copyright (C) Andrew Tridgell  2006
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CTDB_TCP_H
+#define _CTDB_TCP_H
+/*
+  ctdb_tcp main state
+  allocated by ctdb_tcp_init() and stored in ctdb->private_data
+*/
+struct ctdb_tcp {
+       struct ctdb_context *ctdb;      /* back pointer to the owning context */
+       int listen_fd;                  /* listening socket, -1 while there is none */
+};
+
+/*
+  state associated with an incoming connection
+  created by ctdb_listen_event() for each accepted connection
+*/
+struct ctdb_incoming {
+       struct ctdb_context *ctdb;      /* context the connection belongs to */
+       int fd;                         /* socket returned by accept() */
+       struct ctdb_queue *queue;       /* read queue delivering packets to ctdb_tcp_read_cb() */
+};
+
+/*
+  state associated with one tcp node
+  hung off node->private_data by ctdb_tcp_add_node()
+*/
+struct ctdb_tcp_node {
+       /* socket of a connect attempt that is in progress; -1 when there is
+          none or once the socket has been handed over to out_queue */
+       int fd;
+       /* outgoing packet queue used by ctdb_tcp_queue_pkt() */
+       struct ctdb_queue *out_queue;
+       /* fd event that calls ctdb_node_connect_write() for the connecting socket */
+       struct fd_event *connect_fde;
+       /* timer that calls ctdb_tcp_node_connect() to (re)try the connection */
+       struct timed_event *connect_te;
+};
+
+
+/* prototypes internal to tcp transport */
+
+/* queue a packet on the node's outgoing queue */
+int ctdb_tcp_queue_pkt(struct ctdb_node *node, uint8_t *data, uint32_t length);
+/* start listening for connections from other nodes; 0 on success, -1 on failure */
+int ctdb_tcp_listen(struct ctdb_context *ctdb);
+/* timed event handler: (re)try the outgoing connection to a node */
+void ctdb_tcp_node_connect(struct event_context *ev, struct timed_event *te, 
+                          struct timeval t, void *private_data);
+/* queue callback for incoming connections: validate a packet and pass it up */
+void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args);
+/* queue callback for the outgoing socket: tear down and schedule a reconnect */
+void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data);
+/* stop any established or pending outgoing connection to a node */
+void ctdb_tcp_stop_connection(struct ctdb_node *node);
+
+/* packet sizes are rounded up to, and checked against, this many bytes */
+#define CTDB_TCP_ALIGNMENT 8
+
+#endif /* _CTDB_TCP_H */
diff --git a/ctdb/tcp/tcp_connect.c b/ctdb/tcp/tcp_connect.c
new file mode 100644 (file)
index 0000000..9df3300
--- /dev/null
@@ -0,0 +1,485 @@
+/* 
+   ctdb over TCP
+
+   Copyright (C) Andrew Tridgell  2006
+   Copyright (C) Ronnie Sahlberg  2008
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "tdb.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "../include/ctdb_private.h"
+#include "ctdb_tcp.h"
+
+/*
+  stop any connection (established or pending) to a node: reset the fd of
+  the outgoing queue to -1, cancel the connect timer and fd event, and close
+  a socket that tnode still holds
+ */
+void ctdb_tcp_stop_connection(struct ctdb_node *node)
+{
+       struct ctdb_tcp_node *tnode = talloc_get_type(
+               node->private_data, struct ctdb_tcp_node);
+
+       ctdb_queue_set_fd(tnode->out_queue, -1);
+
+       /* cancel the pending (re)connect timer */
+       talloc_free(tnode->connect_te);
+       tnode->connect_te = NULL;
+
+       /* stop waiting for a non-blocking connect to complete */
+       talloc_free(tnode->connect_fde);
+       tnode->connect_fde = NULL;
+
+       /* nothing left to do unless tnode still holds a socket */
+       if (tnode->fd == -1) {
+               return;
+       }
+       close(tnode->fd);
+       tnode->fd = -1;
+}
+
+
+/*
+  queue callback for the outgoing socket to a node
+
+  No packets are expected on this socket, so this normally only runs when
+  the other side closes the connection with RST or FIN.  A NULL packet is
+  reported to the upper layer as a dead node; in every case the connection
+  is torn down and a reconnect is scheduled three seconds later.
+ */
+void ctdb_tcp_tnode_cb(uint8_t *data, size_t cnt, void *private_data)
+{
+       struct ctdb_node *node = talloc_get_type(private_data, struct ctdb_node);
+       struct ctdb_tcp_node *tnode;
+
+       tnode = talloc_get_type(node->private_data, struct ctdb_tcp_node);
+
+       if (!data) {
+               node->ctdb->upcalls->node_dead(node);
+       }
+
+       ctdb_tcp_stop_connection(node);
+
+       tnode->connect_te = event_add_timed(node->ctdb->ev, tnode,
+                                           timeval_current_ofs(3, 0),
+                                           ctdb_tcp_node_connect, node);
+}
+
+/*
+  fd event handler for a non-blocking connect
+
+  Reads SO_ERROR to find out whether the connect succeeded.  On failure the
+  connection is torn down and retried in one second; on success the socket
+  is given to the node's outgoing queue.
+*/
+static void ctdb_node_connect_write(struct event_context *ev, struct fd_event *fde,
+                                   uint16_t flags, void *private_data)
+{
+       struct ctdb_node *node = talloc_get_type(private_data,
+                                                struct ctdb_node);
+       struct ctdb_tcp_node *tnode = talloc_get_type(node->private_data,
+                                                     struct ctdb_tcp_node);
+       struct ctdb_context *ctdb = node->ctdb;
+       int so_error = 0;
+       socklen_t so_error_len = sizeof(so_error);
+       int enable = 1;
+       int rc;
+
+       /* the connect attempt has produced an event, so drop its timeout */
+       talloc_free(tnode->connect_te);
+       tnode->connect_te = NULL;
+
+       rc = getsockopt(tnode->fd, SOL_SOCKET, SO_ERROR, &so_error, &so_error_len);
+       if (rc != 0 || so_error != 0) {
+               /* connect failed: start over in one second */
+               ctdb_tcp_stop_connection(node);
+               tnode->connect_te = event_add_timed(ctdb->ev, tnode,
+                                                   timeval_current_ofs(1, 0),
+                                                   ctdb_tcp_node_connect, node);
+               return;
+       }
+
+       /* connected: this handler is not needed any more */
+       talloc_free(tnode->connect_fde);
+       tnode->connect_fde = NULL;
+
+       setsockopt(tnode->fd, IPPROTO_TCP, TCP_NODELAY, (char *)&enable, sizeof(enable));
+       setsockopt(tnode->fd, SOL_SOCKET, SO_KEEPALIVE, (char *)&enable, sizeof(enable));
+
+       ctdb_queue_set_fd(tnode->out_queue, tnode->fd);
+
+       /* the queue subsystem now owns this fd */
+       tnode->fd = -1;
+}
+
+
+/*
+  parse the textual address into addr using parse_ip()
+  returns 0 on success, or -1 after logging when the string is rejected
+*/
+static int ctdb_tcp_get_address(struct ctdb_context *ctdb,
+                               const char *address, ctdb_sock_addr *addr)
+{
+       if (parse_ip(address, NULL, 0, addr) != 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_CRIT, (__location__ " Unparsable address : %s.\n", address));
+       return -1;
+}
+
+/*
+  called when we should try and establish a tcp connection to a node
+*/
+void ctdb_tcp_node_connect(struct event_context *ev, struct timed_event *te, 
+                          struct timeval t, void *private_data)
+{
+       struct ctdb_node *node = talloc_get_type(private_data,
+                                                struct ctdb_node);
+       struct ctdb_tcp_node *tnode = talloc_get_type(node->private_data, 
+                                                     struct ctdb_tcp_node);
+       struct ctdb_context *ctdb = node->ctdb;
+        ctdb_sock_addr sock_in;
+       int sockin_size;
+       int sockout_size;
+        ctdb_sock_addr sock_out;
+
+       ctdb_tcp_stop_connection(node);
+
+       ZERO_STRUCT(sock_out);
+#ifdef HAVE_SOCK_SIN_LEN
+       sock_out.ip.sin_len = sizeof(sock_out);
+#endif
+       if (ctdb_tcp_get_address(ctdb, node->address.address, &sock_out) != 0) {
+               return;
+       }
+       switch (sock_out.sa.sa_family) {
+       case AF_INET:
+               sock_out.ip.sin_port = htons(node->address.port);
+               break;
+       case AF_INET6:
+               sock_out.ip6.sin6_port = htons(node->address.port);
+               break;
+       default:
+               DEBUG(DEBUG_ERR, (__location__ " unknown family %u\n",
+                       sock_out.sa.sa_family));
+               return;
+       }
+
+       tnode->fd = socket(sock_out.sa.sa_family, SOCK_STREAM, IPPROTO_TCP);
+       if (tnode->fd == -1) {
+               DEBUG(DEBUG_ERR, (__location__ "Failed to create socket\n"));
+               return;
+       }
+       set_nonblocking(tnode->fd);
+       set_close_on_exec(tnode->fd);
+
+       DEBUG(DEBUG_DEBUG, (__location__ " Created TCP SOCKET FD:%d\n", tnode->fd));
+
+       /* Bind our side of the socketpair to the same address we use to listen
+        * on incoming CTDB traffic.
+        * We must specify this address to make sure that the address we expose to
+        * the remote side is actually routable in case CTDB traffic will run on
+        * a dedicated non-routeable network.
+        */
+       ZERO_STRUCT(sock_in);
+       if (ctdb_tcp_get_address(ctdb, ctdb->address.address, &sock_in) != 0) {
+               DEBUG(DEBUG_ERR, (__location__ " Failed to find our address. Failing bind.\n"));
+               close(tnode->fd);
+               return;
+       }
+
+       /* AIX libs check to see if the socket address and length
+          arguments are consistent with each other on calls like
+          connect().   Can not get by with just sizeof(sock_in),
+          need sizeof(sock_in.ip).
+       */
+       switch (sock_in.sa.sa_family) {
+       case AF_INET:
+               sockin_size = sizeof(sock_in.ip);
+               sockout_size = sizeof(sock_out.ip);
+               break;
+       case AF_INET6:
+               sockin_size = sizeof(sock_in.ip6);
+               sockout_size = sizeof(sock_out.ip6);
+               break;
+       default:
+               DEBUG(DEBUG_ERR, (__location__ " unknown family %u\n",
+                       sock_in.sa.sa_family));
+               close(tnode->fd);
+               return;
+       }
+#ifdef HAVE_SOCK_SIN_LEN
+       sock_in.ip.sin_len = sockin_size;
+       sock_out.ip.sin_len = sockout_size;
+#endif
+       if (bind(tnode->fd, (struct sockaddr *)&sock_in, sockin_size) == -1) {
+               DEBUG(DEBUG_ERR, (__location__ "Failed to bind socket %s(%d)\n",
+                                 strerror(errno), errno));
+               close(tnode->fd);
+               return;
+       }
+
+       if (connect(tnode->fd, (struct sockaddr *)&sock_out, sockout_size) != 0 &&
+           errno != EINPROGRESS) {
+               ctdb_tcp_stop_connection(node);
+               tnode->connect_te = event_add_timed(ctdb->ev, tnode, 
+                                                   timeval_current_ofs(1, 0),
+                                                   ctdb_tcp_node_connect, node);
+               return;
+       }
+
+       /* non-blocking connect - wait for write event */
+       tnode->connect_fde = event_add_fd(node->ctdb->ev, tnode, tnode->fd, 
+                                         EVENT_FD_WRITE|EVENT_FD_READ, 
+                                         ctdb_node_connect_write, node);
+
+       /* don't give it long to connect - retry in one second. This ensures
+          that we find a node is up quickly (tcp normally backs off a syn reply
+          delay by quite a lot) */
+       tnode->connect_te = event_add_timed(ctdb->ev, tnode, timeval_current_ofs(1, 0), 
+                                           ctdb_tcp_node_connect, node);
+}
+
+/*
+  called when we get contacted by another node
+
+  The peer address is mapped to a node id with ctdb_ip_to_nodeid();
+  connections from addresses that do not map to a node are refused.  An
+  accepted connection gets a ctdb_incoming (owned by the tcp transport
+  state) whose read queue delivers packets to ctdb_tcp_read_cb().
+*/
+static void ctdb_listen_event(struct event_context *ev, struct fd_event *fde, 
+                             uint16_t flags, void *private_data)
+{
+       struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+       struct ctdb_tcp *ctcp = talloc_get_type(ctdb->private_data, struct ctdb_tcp);
+       ctdb_sock_addr addr;
+       socklen_t len;
+       int fd, nodeid;
+       struct ctdb_incoming *in;
+       int one = 1;
+       const char *incoming_node;
+
+       memset(&addr, 0, sizeof(addr));
+       len = sizeof(addr);
+       fd = accept(ctcp->listen_fd, (struct sockaddr *)&addr, &len);
+       if (fd == -1) return;
+
+       incoming_node = ctdb_addr_to_str(&addr);
+       nodeid = ctdb_ip_to_nodeid(ctdb, incoming_node);
+
+       if (nodeid == -1) {
+               DEBUG(DEBUG_ERR, ("Refused connection from unknown node %s\n", incoming_node));
+               close(fd);
+               return;
+       }
+
+       in = talloc_zero(ctcp, struct ctdb_incoming);
+       if (in == NULL) {
+               /* out of memory: drop this connection rather than
+                  dereferencing a NULL pointer below */
+               DEBUG(DEBUG_ERR, (__location__ " Out of memory, dropping connection from %s\n",
+                                 incoming_node));
+               close(fd);
+               return;
+       }
+       in->fd = fd;
+       in->ctdb = ctdb;
+
+       set_nonblocking(in->fd);
+       set_close_on_exec(in->fd);
+
+       DEBUG(DEBUG_DEBUG, (__location__ " Created SOCKET FD:%d to incoming ctdb connection\n", fd));
+
+       setsockopt(in->fd,SOL_SOCKET,SO_KEEPALIVE,(char *)&one,sizeof(one));
+
+       in->queue = ctdb_queue_setup(ctdb, in, in->fd, CTDB_TCP_ALIGNMENT, 
+                                    ctdb_tcp_read_cb, in, "ctdbd-%s", incoming_node);
+}
+
+
+/*
+  automatically find which address to listen on
+*/
+static int ctdb_tcp_listen_automatic(struct ctdb_context *ctdb)
+{
+       struct ctdb_tcp *ctcp = talloc_get_type(ctdb->private_data,
+                                               struct ctdb_tcp);
+        ctdb_sock_addr sock;
+       int lock_fd, i;
+       const char *lock_path = VARDIR "/run/ctdb/.socket_lock";
+       struct flock lock;
+       int one = 1;
+       int sock_size;
+       struct tevent_fd *fde;
+
+       /* If there are no nodes, then it won't be possible to find
+        * the first one.  Log a failure and short circuit the whole
+        * process.
+        */
+       if (ctdb->num_nodes == 0) {
+               DEBUG(DEBUG_CRIT,("No nodes available to attempt bind to - is the nodes file empty?\n"));
+               return -1;
+       }
+
+       /* in order to ensure that we don't get two nodes with the
+          same adddress, we must make the bind() and listen() calls
+          atomic. The SO_REUSEADDR setsockopt only prevents double
+          binds if the first socket is in LISTEN state  */
+       lock_fd = open(lock_path, O_RDWR|O_CREAT, 0666);
+       if (lock_fd == -1) {
+               DEBUG(DEBUG_CRIT,("Unable to open %s\n", lock_path));
+               return -1;
+       }
+
+       lock.l_type = F_WRLCK;
+       lock.l_whence = SEEK_SET;
+       lock.l_start = 0;
+       lock.l_len = 1;
+       lock.l_pid = 0;
+
+       if (fcntl(lock_fd, F_SETLKW, &lock) != 0) {
+               DEBUG(DEBUG_CRIT,("Unable to lock %s\n", lock_path));
+               close(lock_fd);
+               return -1;
+       }
+
+       for (i=0; i < ctdb->num_nodes; i++) {
+               if (ctdb->nodes[i]->flags & NODE_FLAGS_DELETED) {
+                       continue;
+               }
+               ZERO_STRUCT(sock);
+               if (ctdb_tcp_get_address(ctdb,
+                               ctdb->nodes[i]->address.address, 
+                               &sock) != 0) {
+                       continue;
+               }
+       
+               switch (sock.sa.sa_family) {
+               case AF_INET:
+                       sock.ip.sin_port = htons(ctdb->nodes[i]->address.port);
+                       sock_size = sizeof(sock.ip);
+                       break;
+               case AF_INET6:
+                       sock.ip6.sin6_port = htons(ctdb->nodes[i]->address.port);
+                       sock_size = sizeof(sock.ip6);
+                       break;
+               default:
+                       DEBUG(DEBUG_ERR, (__location__ " unknown family %u\n",
+                               sock.sa.sa_family));
+                       continue;
+               }
+#ifdef HAVE_SOCK_SIN_LEN
+               sock.ip.sin_len = sock_size;
+#endif
+
+               ctcp->listen_fd = socket(sock.sa.sa_family, SOCK_STREAM, IPPROTO_TCP);
+               if (ctcp->listen_fd == -1) {
+                       ctdb_set_error(ctdb, "socket failed\n");
+                       continue;
+               }
+
+               set_close_on_exec(ctcp->listen_fd);
+
+               setsockopt(ctcp->listen_fd,SOL_SOCKET,SO_REUSEADDR,(char *)&one,sizeof(one));
+
+               if (bind(ctcp->listen_fd, (struct sockaddr * )&sock, sock_size) == 0) {
+                       break;
+               }
+
+               if (errno == EADDRNOTAVAIL) {
+                       DEBUG(DEBUG_DEBUG,(__location__ " Failed to bind() to socket. %s(%d)\n",
+                                       strerror(errno), errno));
+               } else {
+                       DEBUG(DEBUG_ERR,(__location__ " Failed to bind() to socket. %s(%d)\n",
+                                       strerror(errno), errno));
+               }
+       }
+       
+       if (i == ctdb->num_nodes) {
+               DEBUG(DEBUG_CRIT,("Unable to bind to any of the node addresses - giving up\n"));
+               goto failed;
+       }
+       ctdb->address.address = talloc_strdup(ctdb, ctdb->nodes[i]->address.address);
+       ctdb->address.port    = ctdb->nodes[i]->address.port;
+       ctdb->name = talloc_asprintf(ctdb, "%s:%u", 
+                                    ctdb->address.address, 
+                                    ctdb->address.port);
+       ctdb->pnn = ctdb->nodes[i]->pnn;
+       DEBUG(DEBUG_INFO,("ctdb chose network address %s:%u pnn %u\n", 
+                ctdb->address.address, 
+                ctdb->address.port, 
+                ctdb->pnn));
+       
+       if (listen(ctcp->listen_fd, 10) == -1) {
+               goto failed;
+       }
+
+       fde = event_add_fd(ctdb->ev, ctcp, ctcp->listen_fd, EVENT_FD_READ,
+                          ctdb_listen_event, ctdb);
+       tevent_fd_set_auto_close(fde);
+
+       close(lock_fd);
+
+       return 0;
+       
+failed:
+       close(lock_fd);
+       close(ctcp->listen_fd);
+       ctcp->listen_fd = -1;
+       return -1;
+}
+
+
+/*
+  listen on our own address
+*/
+int ctdb_tcp_listen(struct ctdb_context *ctdb)
+{
+       struct ctdb_tcp *ctcp = talloc_get_type(ctdb->private_data,
+                                               struct ctdb_tcp);
+        ctdb_sock_addr sock;
+       int sock_size;
+       int one = 1;
+       struct tevent_fd *fde;
+
+       /* we can either auto-bind to the first available address, or we can
+          use a specified address */
+       if (!ctdb->address.address) {
+               return ctdb_tcp_listen_automatic(ctdb);
+       }
+
+       ZERO_STRUCT(sock);
+       if (ctdb_tcp_get_address(ctdb, ctdb->address.address, 
+                                &sock) != 0) {
+               goto failed;
+       }
+       
+       switch (sock.sa.sa_family) {
+       case AF_INET:
+               sock.ip.sin_port = htons(ctdb->address.port);
+               sock_size = sizeof(sock.ip);
+               break;
+       case AF_INET6:
+               sock.ip6.sin6_port = htons(ctdb->address.port);
+               sock_size = sizeof(sock.ip6);
+               break;
+       default:
+               DEBUG(DEBUG_ERR, (__location__ " unknown family %u\n",
+                       sock.sa.sa_family));
+               goto failed;
+       }
+#ifdef HAVE_SOCK_SIN_LEN
+       sock.ip.sin_len = sock_size;
+#endif
+
+       ctcp->listen_fd = socket(sock.sa.sa_family, SOCK_STREAM, IPPROTO_TCP);
+       if (ctcp->listen_fd == -1) {
+               ctdb_set_error(ctdb, "socket failed\n");
+               return -1;
+       }
+
+       set_close_on_exec(ctcp->listen_fd);
+
+        setsockopt(ctcp->listen_fd,SOL_SOCKET,SO_REUSEADDR,(char *)&one,sizeof(one));
+
+       if (bind(ctcp->listen_fd, (struct sockaddr * )&sock, sock_size) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to bind() to socket. %s(%d)\n", strerror(errno), errno));
+               goto failed;
+       }
+
+       if (listen(ctcp->listen_fd, 10) == -1) {
+               goto failed;
+       }
+
+       fde = event_add_fd(ctdb->ev, ctcp, ctcp->listen_fd, EVENT_FD_READ,
+                    ctdb_listen_event, ctdb);  
+       tevent_fd_set_auto_close(fde);
+
+       return 0;
+
+failed:
+       if (ctcp->listen_fd != -1) {
+               close(ctcp->listen_fd);
+       }
+       ctcp->listen_fd = -1;
+       return -1;
+}
+
diff --git a/ctdb/tcp/tcp_init.c b/ctdb/tcp/tcp_init.c
new file mode 100644 (file)
index 0000000..a65e732
--- /dev/null
@@ -0,0 +1,201 @@
+/* 
+   ctdb over TCP
+
+   Copyright (C) Andrew Tridgell  2006
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "tdb.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "../include/ctdb_private.h"
+#include "ctdb_tcp.h"
+
+/*
+  talloc destructor for a ctdb_tcp_node: close the socket if tnode still
+  holds one
+*/
+static int tnode_destructor(struct ctdb_tcp_node *tnode)
+{
+       if (tnode->fd == -1) {
+               return 0;
+       }
+
+       close(tnode->fd);
+       tnode->fd = -1;
+       return 0;
+}
+
+/*
+  initialise tcp portion of a ctdb node 
+
+  Allocates the ctdb_tcp_node as a talloc child of node, stores it in
+  node->private_data and creates the outgoing queue with no fd attached.
+  Returns 0 on success and -1 on failure; on failure node->private_data is
+  reset to NULL.
+*/
+static int ctdb_tcp_add_node(struct ctdb_node *node)
+{
+       struct ctdb_tcp_node *tnode;
+       tnode = talloc_zero(node, struct ctdb_tcp_node);
+       CTDB_NO_MEMORY(node->ctdb, tnode);
+
+       /* no socket yet; the destructor closes one if it is still held */
+       tnode->fd = -1;
+       node->private_data = tnode;
+       talloc_set_destructor(tnode, tnode_destructor);
+
+       tnode->out_queue = ctdb_queue_setup(node->ctdb, node, tnode->fd, CTDB_TCP_ALIGNMENT,
+                                           ctdb_tcp_tnode_cb, node, "to-node-%s", node->name);
+       if (tnode->out_queue == NULL) {
+               /* the rest of the transport passes tnode->out_queue to the
+                  queue API unconditionally, so a node without a queue
+                  must not be registered (a NULL return is treated as
+                  failure) */
+               DEBUG(DEBUG_ERR, (__location__ " Failed to set up outgoing queue for node %s\n",
+                                 node->name));
+               node->private_data = NULL;
+               talloc_free(tnode);
+               return -1;
+       }
+       
+       return 0;
+}
+
+/*
+  initialise transport structures
+
+  Starts listening on our own address (exiting the process if that fails)
+  and sets up the tcp state of every node that is not flagged as deleted.
+  Returns 0 on success, -1 if a node could not be set up.
+*/
+static int ctdb_tcp_initialise(struct ctdb_context *ctdb)
+{
+       int idx;
+
+       /* listen on our own address */
+       if (ctdb_tcp_listen(ctdb) != 0) {
+               DEBUG(DEBUG_CRIT, (__location__ " Failed to start listening on the CTDB socket\n"));
+               exit(1);
+       }
+
+       for (idx = 0; idx < ctdb->num_nodes; idx++) {
+               struct ctdb_node *node = ctdb->nodes[idx];
+
+               /* deleted nodes get no tcp state */
+               if (node->flags & NODE_FLAGS_DELETED) {
+                       continue;
+               }
+               if (ctdb_tcp_add_node(node) == 0) {
+                       continue;
+               }
+
+               DEBUG(DEBUG_CRIT, ("methods->add_node failed at %d\n", idx));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  start the protocol going
+
+  Schedules a connection attempt to the node for the next event loop
+  iteration, unless the node's address is our own.  Always returns 0.
+*/
+static int ctdb_tcp_connect_node(struct ctdb_node *node)
+{
+       struct ctdb_context *ctdb = node->ctdb;
+       struct ctdb_tcp_node *tnode;
+
+       /* never connect to ourselves */
+       if (ctdb_same_address(&ctdb->address, &node->address)) {
+               return 0;
+       }
+
+       /* startup connection to the other server - will happen on
+          next event loop */
+       tnode = talloc_get_type(node->private_data, struct ctdb_tcp_node);
+       tnode->connect_te = event_add_timed(ctdb->ev, tnode,
+                                           timeval_zero(),
+                                           ctdb_tcp_node_connect, node);
+       return 0;
+}
+
+/*
+  tear down the connection to a node after it has been disconnected and
+  schedule an immediate reconnect attempt
+*/
+static void ctdb_tcp_restart(struct ctdb_node *node)
+{
+       struct ctdb_tcp_node *tnode;
+
+       tnode = talloc_get_type(node->private_data, struct ctdb_tcp_node);
+
+       DEBUG(DEBUG_NOTICE,("Tearing down connection to dead node :%d\n", node->pnn));
+
+       ctdb_tcp_stop_connection(node);
+
+       /* a zero timeout makes the reconnect run on the next event loop */
+       tnode->connect_te = event_add_timed(node->ctdb->ev, tnode,
+                                           timeval_zero(),
+                                           ctdb_tcp_node_connect, node);
+}
+
+
+/*
+  shutdown the transport: free the tcp state hanging off the context
+*/
+static void ctdb_tcp_shutdown(struct ctdb_context *ctdb)
+{
+       talloc_free(talloc_get_type(ctdb->private_data, struct ctdb_tcp));
+       ctdb->private_data = NULL;
+}
+
+/*
+  start the transport: kick off a connection to every node that is not
+  flagged as deleted.  Always returns 0.
+*/
+static int ctdb_tcp_start(struct ctdb_context *ctdb)
+{
+       int idx;
+
+       for (idx = 0; idx < ctdb->num_nodes; idx++) {
+               struct ctdb_node *node = ctdb->nodes[idx];
+
+               if (!(node->flags & NODE_FLAGS_DELETED)) {
+                       ctdb_tcp_connect_node(node);
+               }
+       }
+
+       return 0;
+}
+
+
+/*
+  transport packet allocator - allows transport to control memory for packets
+
+  The requested size is rounded up to the next multiple of
+  CTDB_TCP_ALIGNMENT so that a length header and 64 bit structure elements
+  can be used.
+*/
+static void *ctdb_tcp_allocate_pkt(TALLOC_CTX *mem_ctx, size_t size)
+{
+       const size_t align_mask = CTDB_TCP_ALIGNMENT - 1;
+       size_t padded = (size + align_mask) & ~align_mask;
+
+       return talloc_size(mem_ctx, padded);
+}
+
+
+/*
+  method table of the tcp transport; ctdb_tcp_init() installs it as
+  ctdb->methods
+*/
+static const struct ctdb_methods ctdb_tcp_methods = {
+       .initialise   = ctdb_tcp_initialise,    /* listen and set up per-node state */
+       .start        = ctdb_tcp_start,         /* begin connecting to the nodes */
+       .queue_pkt    = ctdb_tcp_queue_pkt,     /* send a packet to a node */
+       .add_node     = ctdb_tcp_add_node,      /* set up tcp state for one node */
+       .connect_node = ctdb_tcp_connect_node,  /* schedule a connect to one node */
+       .allocate_pkt = ctdb_tcp_allocate_pkt,  /* alignment-aware packet allocator */
+       .shutdown     = ctdb_tcp_shutdown,      /* free the transport state */
+       .restart      = ctdb_tcp_restart,       /* drop and re-establish a node connection */
+};
+
+/*
+  talloc destructor for the tcp transport state: detach the transport from
+  the ctdb context so no stale pointers remain
+*/
+static int tcp_ctcp_destructor(struct ctdb_tcp *ctcp)
+{
+       struct ctdb_context *ctdb = ctcp->ctdb;
+
+       ctdb->methods = NULL;
+       ctdb->private_data = NULL;
+
+       return 0;
+}
+
+               
+/*
+  initialise tcp portion of ctdb 
+
+  Allocates the transport state as a talloc child of ctdb and installs it,
+  together with the tcp method table, on the context.
+*/
+int ctdb_tcp_init(struct ctdb_context *ctdb)
+{
+       struct ctdb_tcp *ctcp = talloc_zero(ctdb, struct ctdb_tcp);
+
+       CTDB_NO_MEMORY(ctdb, ctcp);
+
+       /* no listening socket yet */
+       ctcp->ctdb      = ctdb;
+       ctcp->listen_fd = -1;
+       talloc_set_destructor(ctcp, tcp_ctcp_destructor);
+
+       ctdb->methods = &ctdb_tcp_methods;
+       ctdb->private_data = ctcp;
+
+       return 0;
+}
+
diff --git a/ctdb/tcp/tcp_io.c b/ctdb/tcp/tcp_io.c
new file mode 100644 (file)
index 0000000..5111195
--- /dev/null
@@ -0,0 +1,88 @@
+/* 
+   ctdb over TCP
+
+   Copyright (C) Andrew Tridgell  2006
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "lib/util/dlinklist.h"
+#include "tdb.h"
+#include "system/network.h"
+#include "system/filesys.h"
+#include "../include/ctdb_private.h"
+#include "ctdb_tcp.h"
+
+
+/*
+  called when a complete packet has come in
+
+  The packet is checked (minimum size, alignment, header length, magic and
+  version) and then passed to the recv_pkt upcall.  A NULL packet or a
+  failed check frees the incoming connection state.
+ */
+void ctdb_tcp_read_cb(uint8_t *data, size_t cnt, void *args)
+{
+       struct ctdb_incoming *conn = talloc_get_type(args, struct ctdb_incoming);
+       struct ctdb_req_header *hdr;
+
+       /* incoming socket has died */
+       if (data == NULL) {
+               goto failed;
+       }
+       hdr = (struct ctdb_req_header *)data;
+
+       /* too short to even hold a header */
+       if (cnt < sizeof(*hdr)) {
+               DEBUG(DEBUG_ALERT,(__location__ " Bad packet length %u\n", (unsigned)cnt));
+               goto failed;
+       }
+
+       if ((cnt & (CTDB_TCP_ALIGNMENT-1)) != 0) {
+               DEBUG(DEBUG_ALERT,(__location__ " Length 0x%x not multiple of alignment\n",
+                        (unsigned)cnt));
+               goto failed;
+       }
+
+       /* the length in the header must match what was received */
+       if (hdr->length != cnt) {
+               DEBUG(DEBUG_ALERT,(__location__ " Bad header length %u expected %u\n",
+                        (unsigned)hdr->length, (unsigned)cnt));
+               goto failed;
+       }
+
+       if (hdr->ctdb_magic != CTDB_MAGIC) {
+               DEBUG(DEBUG_ALERT,(__location__ " Non CTDB packet 0x%x rejected\n",
+                        hdr->ctdb_magic));
+               goto failed;
+       }
+
+       if (hdr->ctdb_version != CTDB_VERSION) {
+               DEBUG(DEBUG_ALERT, (__location__ " Bad CTDB version 0x%x rejected\n",
+                         hdr->ctdb_version));
+               goto failed;
+       }
+
+       /* tell the ctdb layer above that we have a packet */
+       conn->ctdb->upcalls->recv_pkt(conn->ctdb, data, cnt);
+       return;
+
+failed:
+       talloc_free(conn);
+}
+
+/*
+  queue a packet for sending on the node's outgoing queue
+  returns whatever ctdb_queue_send() returns
+*/
+int ctdb_tcp_queue_pkt(struct ctdb_node *node, uint8_t *data, uint32_t length)
+{
+       struct ctdb_tcp_node *tnode;
+
+       tnode = talloc_get_type(node->private_data, struct ctdb_tcp_node);
+
+       return ctdb_queue_send(tnode->out_queue, data, length);
+}
diff --git a/ctdb/tests/INSTALL b/ctdb/tests/INSTALL
new file mode 100755 (executable)
index 0000000..5581989
--- /dev/null
@@ -0,0 +1,91 @@
+#!/bin/sh
+
+# Script to install the CTDB testsuite on a machine.
+
+usage ()
+{
+    if [ -n "$1" ] ; then
+       echo "$1"
+       echo
+    fi
+
+    cat <<EOF
+  $0 --destdir=<DIR1> \\
+     --datarootdir=<DIR2>  \\
+     --libdir=<DIR3> \\
+     --bindir=<DIR4> \\
+     --etcdir=<DIR5>
+EOF
+    exit 1
+}
+
+parse_options ()
+{
+    temp=$(getopt -n "$prog" -o "h" -l help,destdir:,datarootdir:,libdir:,bindir:,etcdir: -- "$@")
+
+    [ $? != 0 ] && usage
+
+    eval set -- "$temp"
+
+    destdir=""
+    datarootdir=""
+    libdir=""
+    bindir=""
+    etcdir=""
+
+    while true ; do
+        case "$1" in
+            --destdir) destdir="$2" ; shift 2 ;;
+            --datarootdir) datarootdir="$2" ; shift 2 ;;
+            --libdir) libdir="$2" ; shift 2 ;;
+            --bindir) bindir="$2" ; shift 2 ;;
+            --etcdir) etcdir="$2" ; shift 2 ;;
+            --) shift ; break ;;
+            -h|--help|*) usage ;; # Shouldn't happen, so this is reasonable.
+        esac
+    done
+
+    [ $# -gt 0 ] && usage
+
+    [ -n "$destdir" ]     || usage "No option --destdir specified"
+    [ -n "$datarootdir" ] || usage "No option --datarootdir specified"
+    [ -n "$libdir" ]      || usage "No option --libdir specified"
+    [ -n "$bindir" ]      || usage "No option --bindir specified"
+    [ -n "$etcdir" ]      || usage "No option --etcdir specified"
+}
+
+parse_options "$@"
+
+# Make things neater!
+if [ "$destdir" = "/" ] ; then
+    destdir=""
+fi
+
+data_subdirs="complex events.d eventscripts onnode scripts simple takeover tool"
+
+ctdb_datadir="${destdir}${datarootdir}/ctdb-tests"
+echo "Installing test data files into ${ctdb_datadir}..."
+for d in $data_subdirs ; do
+    mkdir -p "${ctdb_datadir}/${d}"
+    cp -pr "tests/${d}" "${ctdb_datadir}"
+done
+# Some of the unit tests have relative symlinks back to in-tree bits
+# and pieces.  These links will be broken!
+for i in "events.d" "functions" "nfs-rpc-checks.d" ; do
+    ln -sf "${etcdir}/ctdb/${i}" "${ctdb_datadir}/eventscripts/etc-ctdb/${i}"
+done
+# test_wrap needs to set TEST_BIN_DIR
+sed -i -e "s@^TEST_SCRIPTS_DIR=.*@&\nexport TEST_BIN_DIR=\"${libdir}/ctdb-tests\"@" "${ctdb_datadir}/scripts/test_wrap"
+
+ctdb_libdir="${destdir}${libdir}/ctdb-tests"
+mkdir -p  "${destdir}${libdir}"
+echo "Installing test binary files into ${ctdb_libdir}..."
+cp -pr "tests/bin/" "${ctdb_libdir}"
+
+ctdb_bindir="${destdir}${bindir}"
+echo "Installing wrapper scripts into ${ctdb_bindir}..."
+mkdir -p "${ctdb_bindir}"
+out="${ctdb_bindir}/ctdb_run_tests"
+sed -e "s@^test_dir=.*@test_dir=${datarootdir}/ctdb-tests\nexport TEST_BIN_DIR=\"${libdir}/ctdb-tests\"@" "tests/run_tests.sh" >"$out"
+chmod 755 "$out"
+ln -s "ctdb_run_tests" "${ctdb_bindir}/ctdb_run_cluster_tests"
diff --git a/ctdb/tests/README b/ctdb/tests/README
new file mode 100644 (file)
index 0000000..1c9983b
--- /dev/null
@@ -0,0 +1,104 @@
+Introduction
+------------
+
+For a developer, the simplest way of running most tests on a local
+machine from within the git repository is:
+
+  make test
+
+This runs all unit tests (onnode, takeover, tool, eventscripts) and
+the tests against local daemons (simple) using the script
+tests/run_tests.sh.
+
+When running tests against a real or virtual cluster the script
+tests/run_cluster_tests.sh can be used.  This runs all integration
+tests (simple, complex).
+
+Both of these scripts can also take a list of tests to run.  You can
+also pass options, which are then passed to run_tests.  However, if
+you just try to pass options to run_tests then you lose the default
+list of tests that are run.  You can't have everything...
+
+scripts/run_tests
+-----------------
+
+The above scripts invoke tests/scripts/run_tests.  This script has a
+lot of command-line switches.  Some of the more useful options
+include:
+
+  -s  Print a summary of tests results after running all tests
+
+  -l  Use local daemons for integration tests
+
+      This allows the tests in "simple" to be run against local
+      daemons.
+
+      All integration tests communicate with cluster nodes using
+      onnode or the ctdb tool, which both have some test hooks to
+      support local daemons.
+
+      By default 3 daemons are used.  If you want to use a different
+      number of daemons then do not use this option but set
+      TEST_LOCAL_DAEMONS to the desired number of daemons instead.
+      The -l option just sets TEST_LOCAL_DAEMONS to 3...  :-)
+
+  -e  Exit on the first test failure
+
+  -C  Clean up - kill daemons and remove $TEST_VAR_DIR when done
+
+      Tests use a temporary/var directory for test state.  By default,
+      this directory is not removed when tests are complete, so you
+      can do forensics or, for integration tests, re-run tests that
+      have failed against the same directory (with the same local
+      daemons setup).  So this option cleans things up.
+
+      Also kills local daemons associated with directory.
+
+  -V  Use <dir> as $TEST_VAR_DIR
+
+      Use the specified temporary/var directory.
+
+  -H  No headers - for running single test with other wrapper
+
+      This allows tests to be embedded in some other test framework
+      and executed one-by-one with all the required
+      environment/infrastructure.
+
+      This replaces the old ctdb_test_env script.
+
+How do the tests find remote test programs?
+-------------------------------------------
+
+If all of the cluster nodes have the CTDB git tree in the same
+location as on the test client then no special action is necessary.
+The simplest way of doing this is to share the tree to cluster nodes
+and test clients via NFS.
+
+If cluster nodes do not have the CTDB git tree then
+CTDB_TEST_REMOTE_DIR can be set to a directory that, on each cluster
+node, contains the contents of tests/scripts/ and tests/bin/.
+
+In the future this will hopefully (also) be supported via a ctdb-test
+package.
+
+Running the ctdb tool under valgrind
+------------------------------------
+
+The easiest way of doing this is something like:
+
+  VALGRIND="valgrind -q" scripts/run_tests ...
+
+This can be used to cause all invocations of the ctdb client (and,
+with local daemons, the ctdbd daemons themselves) to occur under
+valgrind.
+
+NOTE: Some libc calls seem to do weird things and perhaps cause
+spurious output from ctdbd at start time.  Please read valgrind output
+carefully before reporting bugs.  :-)
+
+How is the ctdb tool invoked?
+-----------------------------
+
+$CTDB determines how to invoke the ctdb client.  If not already set
+and if $VALGRIND is set, this is set to "$VALGRIND ctdb".  If this is
+not already set but $VALGRIND is not set, this is simply set to "ctdb".
diff --git a/ctdb/tests/TODO b/ctdb/tests/TODO
new file mode 100644 (file)
index 0000000..be471cc
--- /dev/null
@@ -0,0 +1,4 @@
+* Make tests know about IPv6.
+* Tests that write to database.
+* Tests that check actual network connectivity on failover.
+* Handle interrupting tests better.
diff --git a/ctdb/tests/complex/11_ctdb_delip_removes_ip.sh b/ctdb/tests/complex/11_ctdb_delip_removes_ip.sh
new file mode 100755 (executable)
index 0000000..043c345
--- /dev/null
@@ -0,0 +1,115 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify that a node's public IP address can be deleted using 'ctdb deleteip'.
+
+Check that the address is actually deleted from the interface.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+* Test must be run on a real or virtual cluster rather than against
+  local daemons.  There is nothing intrinsic to this test that forces
+  this - it is because tests run against local daemons don't use the
+  regular eventscripts.  Local daemons put public addresses on
+  loopback, so we can't reliably test when IPs have moved between
+  nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Use 'ctdb ip' on one of the nodes to list the IP addresses being
+   served.
+3. Select an IP address being served by the node and check that it
+   actually appears on the interface it is supposed to be on.
+4. Delete the IP address using 'ctdb delip'.
+5. Verify that the deleted IP address is no longer listed using the
+   all_ips_on_node helper function.
+6. Verify that the deleted IP address no longer appears on the
+   interface it was on.
+
+Expected results:
+
+* 'ctdb delip' removes an IP address from the list of public IP
+  addresses being served by a node and from the network interface.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+set -e
+
+ctdb_test_init "$@"
+
+ctdb_test_check_real_cluster
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+echo "Getting list of public IPs..."
+all_ips_on_node -v 0
+
+# Select an IP/node to remove.
+num_ips=$(echo "$out" | wc -l)
+num_to_remove=$(($RANDOM % $num_ips))
+
+# Find the details in the list.
+i=0
+while [ $i -le $num_to_remove ] ; do
+    read ip_to_remove test_node
+    i=$(($i + 1))
+done <<<"$out"
+
+echo "Determining interface for ${ip_to_remove} on ${test_node}."
+try_command_on_node $test_node "ctdb ip -Y -v"
+iface=$(echo "$out" | awk -F: -v ip=${ip_to_remove} -v pnn=${test_node} '$2 == ip && $3 == pnn { print $4 }')
+echo "$iface"
+[ -n "$iface" ]
+
+echo "Checking that node ${test_node} hosts ${ip_to_remove} on interface ${iface}..."
+try_command_on_node $test_node "ip addr show dev $iface | grep -E 'inet[[:space:]]*${ip_to_remove}/'"
+
+echo "Attempting to remove ${ip_to_remove} from node ${test_node}."
+try_command_on_node $test_node $CTDB delip $ip_to_remove
+
+echo "Sleeping..."
+sleep_for 1
+
+test_node_ips=""
+while read ip pnn ; do
+    [ "$pnn" = "$test_node" ] && \
+       test_node_ips="${test_node_ips}${test_node_ips:+ }${ip}"
+done <<<"$out" # bashism to avoid problem setting variable in pipeline.
+
+if [ "${test_node_ips/${ip_to_remove}}" = "$test_node_ips" ] ; then
+    echo "GOOD: That worked!"
+else
+    echo "BAD: The remove IP address is still there!"
+    testfailures=1
+fi
+
+timeout=60
+increment=5
+count=0
+echo "Waiting for ${ip_to_remove} to disappear from ${iface}..."
+while : ; do
+    try_command_on_node -v $test_node "ip addr show dev $iface"
+    if echo "$out" | grep -E 'inet[[:space:]]*${ip_to_remove}/'; then
+       echo "Still there..."
+       if [ $(($count * $increment)) -ge $timeout ] ; then
+           echo "BAD: Timed out waiting..."
+           exit 1
+       fi
+       sleep_for $increment
+       count=$(($count + 1))
+    else
+       break
+    fi
+done
+
+echo "GOOD: IP was successfully removed!"
diff --git a/ctdb/tests/complex/31_nfs_tickle.sh b/ctdb/tests/complex/31_nfs_tickle.sh
new file mode 100755 (executable)
index 0000000..ce4ae81
--- /dev/null
@@ -0,0 +1,122 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify that NFS connections are monitored and that NFS tickles are sent.
+
+We create a connection to the NFS server on a node and confirm that
+this connection is registered in the nfs-tickles/ subdirectory in
+shared storage.  Then disable the relevant NFS server node and ensure
+that it send an appropriate reset packet.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 nodes with public addresses.
+
+* Test must be run on a real or virtual cluster rather than against
+  local daemons.
+
+* Test must not be run from a cluster node.
+
+* Cluster nodes must be listening on the NFS TCP port (2049).
+
+Steps:
+
+1. Verify that the cluster is healthy.
+2. Connect from the current host (test client) to TCP port 2049 using
+   the public address of a cluster node.
+3. Determine the source socket used for the connection.
+4. Ensure that CTDB records the source socket details in the nfs-tickles
+   directory on shared storage.
+5. Disable the node that the connection has been made to.
+6. Verify that a TCP tickle (a reset packet) is sent to the test client.
+
+Expected results:
+
+* CTDB should correctly record the socket in the nfs-tickles directory
+  and should send a reset packet when the node is disabled.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+set -e
+
+ctdb_test_init "$@"
+
+ctdb_test_check_real_cluster
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+ctdb_test_exit_hook_add ctdb_test_eventscript_uninstall
+
+ctdb_test_eventscript_install
+
+# We need this for later, so we know how long to sleep.
+try_command_on_node any $CTDB getvar MonitorInterval
+monitor_interval="${out#*= }"
+#echo "Monitor interval on node $test_node is $monitor_interval seconds."
+
+select_test_node_and_ips
+
+test_port=2049
+
+echo "Connecting to node ${test_node} on IP ${test_ip}:${test_port} with netcat..."
+
+nc -d -w $(($monitor_interval * 4)) $test_ip $test_port &
+nc_pid=$!
+ctdb_test_exit_hook_add "kill $nc_pid >/dev/null 2>&1"
+
+wait_until_get_src_socket "tcp" "${test_ip}:${test_port}" $nc_pid "nc"
+src_socket="$out"
+echo "Source socket is $src_socket"
+
+wait_for_monitor_event $test_node
+
+echo "Sleeping until tickles are synchronised across nodes..."
+try_command_on_node $test_node $CTDB getvar TickleUpdateInterval
+sleep_for "${out#*= }"
+
+if try_command_on_node any "test -r /etc/ctdb/events.d/61.nfstickle" ; then
+    echo "Trying to determine NFS_TICKLE_SHARED_DIRECTORY..."
+    if [ -f /etc/sysconfig/nfs ]; then
+       f="/etc/sysconfig/nfs"
+    elif [ -f /etc/default/nfs ]; then
+       f="/etc/default/nfs"
+    elif [ -f /etc/ctdb/sysconfig/nfs ]; then
+       f="/etc/ctdb/sysconfig/nfs"
+    fi
+    try_command_on_node -v any "[ -r $f ] &&  sed -n -e s@^NFS_TICKLE_SHARED_DIRECTORY=@@p $f" || true
+
+    nfs_tickle_shared_directory="${out:-/gpfs/.ctdb/nfs-tickles}"
+
+    try_command_on_node $test_node hostname
+    test_hostname=$out
+
+    try_command_on_node -v any cat "${nfs_tickle_shared_directory}/$test_hostname/$test_ip"
+else
+    echo "That's OK, we'll use \"ctdb gettickles\", which is newer..."
+    try_command_on_node -v any "ctdb -Y gettickles $test_ip $test_port"
+fi
+
+if [ "${out/${src_socket}/}" != "$out" ] ; then
+    echo "GOOD: NFS connection tracked OK."
+else
+    echo "BAD: Socket not tracked in NFS tickles."
+    testfailures=1
+fi
+
+tcptickle_sniff_start $src_socket "${test_ip}:${test_port}"
+
+# We need to be nasty to make sure that the node being failed out doesn't
+# get a chance to send any tickles and confuse our sniff.
+echo "Killing ctdbd on ${test_node}..."
+try_command_on_node $test_node killall -9 ctdbd
+
+wait_until_node_has_status $test_node disconnected
+
+tcptickle_sniff_wait_show
diff --git a/ctdb/tests/complex/32_cifs_tickle.sh b/ctdb/tests/complex/32_cifs_tickle.sh
new file mode 100755 (executable)
index 0000000..93634e7
--- /dev/null
@@ -0,0 +1,99 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify that CIFS connections are monitored and that CIFS tickles are sent.
+
+We create a connection to the CIFS server on a node and confirm that
+this connection is registered by CTDB.  Then disable the relevant CIFS
+server node and ensure that it send an appropriate reset packet.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 nodes with public addresses.
+
+* Test must be run on a real or virtual cluster rather than against
+  local daemons.
+
+* Test must not be run from a cluster node.
+
+* Clustered Samba must be listening on TCP port 445.
+
+Steps:
+
+1. Verify that the cluster is healthy.
+2. Connect from the current host (test client) to TCP port 445 using
+   the public address of a cluster node.
+3. Determine the source socket used for the connection.
+4. Using the "ctdb gettickle" command, ensure that CTDB records the
+   connection details.
+5. Disable the node that the connection has been made to.
+6. Verify that a TCP tickle (a reset packet) is sent to the test client.
+
+Expected results:
+
+* CTDB should correctly record the connection and should send a reset
+  packet when the node is disabled.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+set -e
+
+ctdb_test_init "$@"
+
+ctdb_test_check_real_cluster
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+# We need this for later, so we know how long to sleep.
+try_command_on_node 0 $CTDB getvar MonitorInterval
+monitor_interval="${out#*= }"
+#echo "Monitor interval on node $test_node is $monitor_interval seconds."
+
+select_test_node_and_ips
+
+test_port=445
+
+echo "Connecting to node ${test_node} on IP ${test_ip}:${test_port} with netcat..."
+
+nc -d -w $(($monitor_interval * 4)) $test_ip $test_port &
+nc_pid=$!
+ctdb_test_exit_hook_add "kill $nc_pid >/dev/null 2>&1"
+
+wait_until_get_src_socket "tcp" "${test_ip}:${test_port}" $nc_pid "nc"
+src_socket="$out"
+echo "Source socket is $src_socket"
+
+# This should happen as soon as connection is up... but unless we wait
+# we sometimes beat the registration.
+check_tickles ()
+{
+    try_command_on_node 0 ctdb gettickles $test_ip -n $test_node
+    # SRC: 10.0.2.45:49091   DST: 10.0.2.143:445
+    [ "${out/SRC: ${src_socket} /}" != "$out" ]
+}
+
+echo "Checking if CIFS connection is tracked by CTDB..."
+wait_until 10 check_tickles
+echo "$out"
+
+if [ "${out/SRC: ${src_socket} /}" != "$out" ] ; then
+    echo "GOOD: CIFS connection tracked OK by CTDB."
+else
+    echo "BAD: Socket not tracked by CTDB."
+    testfailures=1
+fi
+
+tcptickle_sniff_start $src_socket "${test_ip}:${test_port}"
+
+echo "Disabling node $test_node"
+try_command_on_node 1 $CTDB disable -n $test_node
+wait_until_node_has_status $test_node disabled
+
+tcptickle_sniff_wait_show
diff --git a/ctdb/tests/complex/33_gratuitous_arp.sh b/ctdb/tests/complex/33_gratuitous_arp.sh
new file mode 100755 (executable)
index 0000000..721b0f2
--- /dev/null
@@ -0,0 +1,87 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify that a gratuitous ARP is sent when a node is failed out.
+
+We ping a public IP and lookup the MAC address in the ARP table.  We
+then disable the node and check the ARP table again - the MAC address
+should have changed.  This test does NOT test connectivity after the
+failover.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 nodes with public addresses.
+
+* Test must be run on a real or virtual cluster rather than against
+  local daemons.
+
+* Test must not be run from a cluster node.
+
+Steps:
+
+1. Verify that the cluster is healthy.
+2. Select a public address and its corresponding node.
+3. Remove any entries for the chosen address from the ARP table.
+4. Send a single ping request packet to the selected public address.
+5. Determine the MAC address corresponding to the public address by
+   checking the ARP table.
+6. Disable the selected node.
+7. Check the ARP table and check the MAC associated with the public
+   address.
+
+Expected results:
+
+* When a node is disabled the MAC address associated with public
+  addresses on that node should change.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+set -e
+
+ctdb_test_init "$@"
+
+ctdb_test_check_real_cluster
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+select_test_node_and_ips
+
+echo "Removing ${test_ip} from the local ARP table..."
+arp -d $test_ip >/dev/null 2>&1 || true
+
+echo "Pinging ${test_ip}..."
+ping -q -n -c 1 $test_ip
+
+echo "Getting MAC address associated with ${test_ip}..."
+original_mac=$(arp -n $test_ip | awk '$2 == "ether" {print $3}')
+[ $? -eq 0 ]
+
+echo "MAC address is: ${original_mac}"
+
+gratarp_sniff_start
+
+echo "Disabling node $test_node"
+try_command_on_node 1 $CTDB disable -n $test_node
+wait_until_node_has_status $test_node disabled
+
+gratarp_sniff_wait_show
+
+echo "Getting MAC address associated with ${test_ip} again..."
+new_mac=$(arp -n $test_ip | awk '$2 == "ether" {print $3}')
+[ $? -eq 0 ]
+
+echo "MAC address is: ${new_mac}"
+
+if [ "$original_mac" != "$new_mac" ] ; then
+    echo "GOOD: MAC address changed"
+else
+    echo "BAD: MAC address did not change"
+    testfailures=1
+fi
diff --git a/ctdb/tests/complex/41_failover_ping_discrete.sh b/ctdb/tests/complex/41_failover_ping_discrete.sh
new file mode 100755 (executable)
index 0000000..88b2013
--- /dev/null
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify that it is possible to ping a public address after disabling a node.
+
+We ping a public IP, disable the node hosting it and then ping the
+public IP again.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 nodes with public addresses.
+
+* Test must be run on a real or virtual cluster rather than against
+  local daemons.
+
+* Test must not be run from a cluster node.
+
+Steps:
+
+1. Verify that the cluster is healthy.
+2. Select a public address and its corresponding node.
+3. Send a single ping request packet to the selected public address.
+4. Disable the selected node.
+5. Send another single ping request packet to the selected public address.
+
+Expected results:
+
+* When a node is disabled the public address fails over and the
+  address is still pingable.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+set -e
+
+ctdb_test_init "$@"
+
+ctdb_test_check_real_cluster
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+select_test_node_and_ips
+
+echo "Removing ${test_ip} from the local ARP table..."
+arp -d $test_ip >/dev/null 2>&1 || true
+
+echo "Pinging ${test_ip}..."
+ping -q -n -c 1 $test_ip
+
+gratarp_sniff_start
+
+echo "Disabling node $test_node"
+try_command_on_node 1 $CTDB disable -n $test_node
+wait_until_node_has_status $test_node disabled
+
+gratarp_sniff_wait_show
+
+echo "Removing ${test_ip} from the local ARP table again..."
+arp -d $test_ip >/dev/null 2>&1 || true
+
+echo "Pinging ${test_ip} again..."
+ping -q -n -c 1 $test_ip
diff --git a/ctdb/tests/complex/42_failover_ssh_hostname.sh b/ctdb/tests/complex/42_failover_ssh_hostname.sh
new file mode 100755 (executable)
index 0000000..defe15a
--- /dev/null
@@ -0,0 +1,78 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify that it is possible to SSH to a public address after disabling a node.
+
+We SSH to a public IP and check the hostname, disable the node hosting
+it and then SSH again to confirm that the hostname has changed.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 nodes with public addresses.
+
+* Test must be run on a real or virtual cluster rather than against
+  local daemons.
+
+* Test must not be run from a cluster node.
+
+Steps:
+
+1. Verify that the cluster is healthy.
+2. Select a public address and its corresponding node.
+3. SSH to the selected public address and run hostname.
+4. Disable the selected node.
+5. SSH to the selected public address again and run hostname.
+
+Expected results:
+
+* When a node is disabled the public address fails over and it is
+  still possible to SSH to the node.  The hostname should change.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+set -e
+
+ctdb_test_init "$@"
+
+ctdb_test_check_real_cluster
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+select_test_node_and_ips
+
+echo "Removing ${test_ip} from the local ARP table..."
+arp -d $test_ip >/dev/null 2>&1 || true
+
+echo "SSHing to ${test_ip} and running hostname..."
+original_hostname=$(ssh -o "StrictHostKeyChecking no" $test_ip hostname)
+[ $? -eq 0 ]
+
+echo "Hostname is: ${original_hostname}"
+
+gratarp_sniff_start
+
+echo "Disabling node $test_node"
+try_command_on_node 1 $CTDB disable -n $test_node
+wait_until_node_has_status $test_node disabled
+
+gratarp_sniff_wait_show
+
+echo "SSHing to ${test_ip} and running hostname (again)..."
+new_hostname=$(ssh -o "StrictHostKeyChecking no" $test_ip hostname)
+[ $? -eq 0 ]
+
+echo "Hostname is: ${new_hostname}"
+
+if [ "$original_hostname" != "$new_hostname" ] ; then
+    echo "GOOD: hostname changed"
+else
+    echo "BAD: hostname did not change"
+    testfailures=1
+fi
diff --git a/ctdb/tests/complex/43_failover_nfs_basic.sh b/ctdb/tests/complex/43_failover_nfs_basic.sh
new file mode 100755 (executable)
index 0000000..a68f7db
--- /dev/null
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify that a mounted NFS share is still operational after failover.
+
+We mount an NFS share from a node, write a file via NFS and then
+confirm that we can correctly read the file after a failover.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 nodes with public addresses.
+
+* Test must be run on a real or virtual cluster rather than against
+  local daemons.
+
+* Test must not be run from a cluster node.
+
+Steps:
+
+1. Verify that the cluster is healthy.
+2. Select a public address and its corresponding node.
+3. Select the 1st NFS share exported on the node.
+4. Mount the selected NFS share.
+5. Create a file in the NFS mount and calculate its checksum.
+6. Disable the selected node.
+7. Read the file and calculate its checksum.
+8. Compare the checksums.
+
+Expected results:
+
+* When a node is disabled the public address fails over and it is
+  possible to correctly read a file over NFS.  The checksums should be
+  the same before and after.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+set -e
+
+ctdb_test_init "$@"
+
+ctdb_test_check_real_cluster
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+nfs_test_setup
+
+echo "Create file containing random data..."
+dd if=/dev/urandom of=$nfs_local_file bs=1k count=1
+original_sum=$(sum $nfs_local_file)
+[ $? -eq 0 ]
+
+gratarp_sniff_start
+
+echo "Disabling node $test_node"
+try_command_on_node 0 $CTDB disable -n $test_node
+wait_until_node_has_status $test_node disabled
+
+gratarp_sniff_wait_show
+
+new_sum=$(sum $nfs_local_file)
+[ $? -eq 0 ]
+
+if [ "$original_md5" = "$new_md5" ] ; then
+    echo "GOOD: file contents unchanged after failover"
+else
+    echo "BAD: file contents are different after failover"
+    testfailures=1
+fi
diff --git a/ctdb/tests/complex/44_failover_nfs_oneway.sh b/ctdb/tests/complex/44_failover_nfs_oneway.sh
new file mode 100755 (executable)
index 0000000..aaec2ed
--- /dev/null
@@ -0,0 +1,94 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify that a file created on a node is readable via NFS after a failover.
+
+We write a file into an exported directory on a node, mount the NFS
+share from a node, verify that we can read the file via NFS and that
+we can still read it after a failover.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 nodes with public addresses.
+
+* Test must be run on a real or virtual cluster rather than against
+  local daemons.
+
+* Test must not be run from a cluster node.
+
+Steps:
+
+1.  Verify that the cluster is healthy.
+2.  Select a public address and its corresponding node.
+3.  Select the 1st NFS share exported on the node.
+4.  Write a file into exported directory on the node and calculate its
+    checksum.
+5.  Mount the selected NFS share.
+6.  Read the file via the NFS mount and calculate its checksum.
+7.  Compare checksums.
+8.  Disable the selected node.
+9.  Read the file via NFS and calculate its checksum.
+10. Compare the checksums.
+
+Expected results:
+
+* Checksums for the file on all 3 occasions should be the same.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+set -e
+
+ctdb_test_init "$@"
+
+ctdb_test_check_real_cluster
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+nfs_test_setup
+
+echo "Create file containing random data..."
+local_f=$(mktemp)
+ctdb_test_exit_hook_add rm -f "$local_f"
+dd if=/dev/urandom of=$local_f bs=1k count=1
+local_sum=$(sum $local_f)
+
+scp -p "$local_f" "${test_ip}:${nfs_remote_file}"
+try_command_on_node $test_node "chmod 644 $nfs_remote_file"
+
+nfs_sum=$(sum $nfs_local_file)
+
+if [ "$local_sum" = "$nfs_sum" ] ; then
+    echo "GOOD: file contents read correctly via NFS"
+else
+    echo "BAD: file contents are different over NFS"
+    echo "  original file: $local_sum"
+    echo "       NFS file: $nfs_sum"
+    exit 1
+fi
+
+gratarp_sniff_start
+
+echo "Disabling node $test_node"
+try_command_on_node 0 $CTDB disable -n $test_node
+wait_until_node_has_status $test_node disabled
+
+gratarp_sniff_wait_show
+
+new_sum=$(sum $nfs_local_file)
+[ $? -eq 0 ]
+
+if [ "$nfs_sum" = "$new_sum" ] ; then
+    echo "GOOD: file contents unchanged after failover"
+else
+    echo "BAD: file contents are different after failover"
+    echo "  original file: $nfs_sum"
+    echo "       NFS file: $new_sum"
+    exit 1
+fi
diff --git a/ctdb/tests/complex/45_failover_nfs_kill.sh b/ctdb/tests/complex/45_failover_nfs_kill.sh
new file mode 100755 (executable)
index 0000000..52b423f
--- /dev/null
@@ -0,0 +1,77 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify that a mounted NFS share is still operational after failover.
+
+We mount an NFS share from a node, write a file via NFS and then
+confirm that we can correctly read the file after a failover.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 nodes with public addresses.
+
+* Test must be run on a real or virtual cluster rather than against
+  local daemons.
+
+* Test must not be run from a cluster node.
+
+Steps:
+
+1. Verify that the cluster is healthy.
+2. Select a public address and its corresponding node.
+3. Select the 1st NFS share exported on the node.
+4. Mount the selected NFS share.
+5. Create a file in the NFS mount and calculate its checksum.
+6. Kill CTDB on the selected node.
+7. Read the file and calculate its checksum.
+8. Compare the checksums.
+
+Expected results:
+
+* When a node is disabled the public address fails over and it is
+  possible to correctly read a file over NFS.  The checksums should be
+  the same before and after.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+set -e
+
+ctdb_test_init "$@"
+
+ctdb_test_check_real_cluster
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+nfs_test_setup
+
+echo "Create file containing random data..."
+dd if=/dev/urandom of=$nfs_local_file bs=1k count=1
+original_sum=$(sum $nfs_local_file)
+[ $? -eq 0 ]
+
+gratarp_sniff_start
+
+echo "Killing node $test_node"
+try_command_on_node $test_node $CTDB getpid
+pid=${out#*:}
+try_command_on_node $test_node kill -9 $pid
+wait_until_node_has_status $test_node disconnected
+
+gratarp_sniff_wait_show
+
+new_sum=$(sum $nfs_local_file)
+[ $? -eq 0 ]
+
+if [ "$original_md5" = "$new_md5" ] ; then
+    echo "GOOD: file contents unchanged after failover"
+else
+    echo "BAD: file contents are different after failover"
+    testfailures=1
+fi
diff --git a/ctdb/tests/complex/README b/ctdb/tests/complex/README
new file mode 100644 (file)
index 0000000..72de396
--- /dev/null
@@ -0,0 +1,2 @@
+Complex integration tests.  These need a real or virtual cluster.
+That is, they can not be run against local daemons.
diff --git a/ctdb/tests/complex/scripts/local.bash b/ctdb/tests/complex/scripts/local.bash
new file mode 100644 (file)
index 0000000..eb4c41c
--- /dev/null
@@ -0,0 +1,143 @@
+# Hey Emacs, this is a -*- shell-script -*- !!!  :-)
+
+get_src_socket ()
+{
+    local proto="$1"
+    local dst_socket="$2"
+    local pid="$3"
+    local prog="$4"
+
+    local pat="^${proto}[[:space:]]+[[:digit:]]+[[:space:]]+[[:digit:]]+[[:space:]]+[^[:space:]]+[[:space:]]+${dst_socket//./\\.}[[:space:]]+ESTABLISHED[[:space:]]+${pid}/${prog}[[:space:]]*\$"
+    out=$(netstat -tanp |
+       egrep "$pat" |
+       awk '{ print $4 }')
+
+    [ -n "$out" ]
+}
+
+wait_until_get_src_socket ()
+{
+    local proto="$1"
+    local dst_socket="$2"
+    local pid="$3"
+    local prog="$4"
+
+    echo "Waiting for ${prog} to establish connection to ${dst_socket}..."
+
+    wait_until 5 get_src_socket "$@"
+}
+
+#######################################
+
+# Start a background tcpdump capture using the filter given in $1.
+#
+# The capture file name is left in the global $tcpdump_filename and
+# the filter in the global $tcpdump_filter.  The tcpdump PID is not
+# stored in a variable; it is only embedded in an exit hook that kills
+# the process.  Returns the status of the final wait_until call.
+tcpdump_start ()
+{
+    tcpdump_filter="$1" # global
+
+    echo "Running tcpdump..."
+    tcpdump_filename=$(mktemp)
+    ctdb_test_exit_hook_add "rm -f $tcpdump_filename"
+
+    # The only way of being sure that tcpdump is listening is to send
+    # some packets that it will see.  So we use dummy pings - the -U
+    # option to tcpdump ensures that packets are flushed to the file
+    # as they are captured.
+    local dummy_addr="127.3.2.1"
+    local dummy="icmp and dst host ${dummy_addr} and icmp[icmptype] == icmp-echo"
+    # Capture both the requested traffic and the dummy pings.
+    tcpdump -n -p -s 0 -e -U -w $tcpdump_filename -i any "($tcpdump_filter) or ($dummy)" &
+    # $! is the tcpdump just started; it must be read before the next
+    # background job is launched.
+    ctdb_test_exit_hook_add "kill $! >/dev/null 2>&1"
+
+    echo "Waiting for tcpdump output file to be ready..."
+    # This ping keeps running in the background until the exit hook
+    # registered on the next line kills it.
+    ping -q "$dummy_addr" >/dev/null 2>&1 &
+    ctdb_test_exit_hook_add "kill $! >/dev/null 2>&1"
+
+    # Passed to wait_until below; reads this function's local $dummy
+    # via dynamic scoping.
+    tcpdump_listen_for_dummy ()
+    {
+       tcpdump -n -r $tcpdump_filename -c 1 "$dummy" >/dev/null 2>&1
+    }
+
+    # NOTE(review): wait_until is defined elsewhere; presumably it
+    # retries the command for up to 10 seconds - confirm.
+    wait_until 10 tcpdump_listen_for_dummy
+}
+
+# Wait until the capture file contains enough matching packets.
+#
+# $1 - minimum number of matching packets (default 1)
+# $2 - tcpdump filter to count with (default $tcpdump_filter)
+#
+# Returns 1, after printing some debug output, if wait_until fails.
+tcpdump_wait ()
+{
+    local count="${1:-1}"
+    local filter="${2:-${tcpdump_filter}}"
+
+    # Passed to wait_until below; reads this function's locals $filter
+    # and $count via dynamic scoping.  Succeeds once tcpdump prints at
+    # least $count lines for $filter when reading the capture file.
+    tcpdump_check ()
+    {
+       local found=$(tcpdump -n -r $tcpdump_filename "$filter" 2>/dev/null | wc -l)
+       [ $found -ge $count ]
+    }
+
+    echo "Waiting for tcpdump to capture some packets..."
+    # NOTE(review): wait_until is defined elsewhere; presumably it
+    # retries the command for up to 30 seconds - confirm.
+    if ! wait_until 30 tcpdump_check ; then
+       echo "DEBUG AT $(date '+%F %T'):"
+       local i
+       # Each entry is echoed and then run unquoted, so it is word-split
+       # into a command and its arguments.  Failures are ignored.
+       for i in "onnode -q 0 $CTDB status" "netstat -tanp" "tcpdump -n -e -r $tcpdump_filename" ; do
+           echo "$i"
+           $i || true
+       done
+       return 1
+    fi
+}
+
+tcpdump_show ()
+{
+    local filter="${1:-${tcpdump_filter}}"
+
+    tcpdump -n -r $tcpdump_filename  "$filter" 2>/dev/null
+}
+
+# Start capturing the packets exchanged on one TCP connection.
+#
+# $1 - source socket (ADDR:PORT)
+# $2 - destination socket (ADDR:PORT)
+#
+# Leaves the filter that matches reset packets travelling from $2 to
+# $1 in the global $tcptickle_reset.
+tcptickle_sniff_start ()
+{
+    local src="$1"
+    local dst="$2"
+
+    # ${x%:*} is the address part of a socket, ${x##*:} the port part.
+    # "in" matches packets from dst to src, "out" packets from src to dst.
+    local in="src host ${dst%:*} and tcp src port ${dst##*:} and dst host ${src%:*} and tcp dst port ${src##*:}"
+    local out="src host ${src%:*} and tcp src port ${src##*:} and dst host ${dst%:*} and tcp dst port ${dst##*:}"
+    # Bytes 14 and 15 of the TCP header are the window size:
+    # 4 * 256 + 210 = 1234.
+    local tickle_ack="${in} and (tcp[tcpflags] & tcp-ack != 0) and (tcp[14] == 4) and (tcp[15] == 210)" # win == 1234
+    local ack_ack="${out} and (tcp[tcpflags] & tcp-ack != 0)"
+    # Not declared local, so it stays set after this function returns.
+    tcptickle_reset="${in} and tcp[tcpflags] & tcp-rst != 0"
+    local filter="(${tickle_ack}) or (${ack_ack}) or (${tcptickle_reset})"
+
+    tcpdump_start "$filter"
+}
+
+tcptickle_sniff_wait_show ()
+{
+    tcpdump_wait 1 "$tcptickle_reset"
+
+    echo "GOOD: here are some TCP tickle packets:"
+    tcpdump_show
+}
+
+gratarp_sniff_start ()
+{
+    tcpdump_start "arp host ${test_ip}"
+}
+
+gratarp_sniff_wait_show ()
+{
+    tcpdump_wait 2
+
+    echo "GOOD: this should be the some gratuitous ARPs:"
+    tcpdump_show
+}
+
+
+# Abort the test unless it is running against a real or virtual
+# cluster, from a host that is not itself one of the cluster nodes.
+ctdb_test_check_real_cluster ()
+{
+    # Tests in local-daemons mode are rejected.
+    if [ -n "$TEST_LOCAL_DAEMONS" ] ; then
+        die "ERROR: This test must be run against a real/virtual cluster, not local daemons."
+    fi
+
+    local this_host=$(hostname)
+
+    # Compare the local hostname with the hostname reported by each node.
+    local node_host
+    for node_host in $(onnode -q all hostname) ; do
+        if [ "$this_host" = "$node_host" ] ; then
+            die "ERROR: This test must not be run from a cluster node."
+        fi
+    done
+}
+
diff --git a/ctdb/tests/events.d/00.test b/ctdb/tests/events.d/00.test
new file mode 100755 (executable)
index 0000000..e3e15eb
--- /dev/null
@@ -0,0 +1,105 @@
+#!/bin/sh
+# Event script for 'make test'.  Usage: 00.test <event> [<interface> <ip> <maskbits>]
+
+cmd="$1"
+# Only shift when there is an argument: a bare "shift" may abort a POSIX sh.
+[ $# -eq 0 ] || shift
+
+case "$cmd" in
+    monitor)
+       echo "monitor event"
+       echo "monitor event stderr" >&2
+       exit 0
+       ;;
+
+     startrecovery)
+       echo "ctdb startrecovery event"
+       exit 0
+       ;;
+
+     init)
+       echo "ctdb init event"
+       exit 0
+       ;;
+     setup)
+       echo "ctdb setup event"
+       exit 0
+       ;;
+     startup)
+       echo "ctdb startup event"
+       # Set every interface from "ctdb ifaces -Y" up; printf avoids non-portable "echo -n".
+       IFACES=`ctdb ifaces -Y | grep -v '^:Name:LinkStatus:References:'`
+       for I in $IFACES; do
+               IFACE=`printf '%s' "$I" | cut -d ':' -f2`
+               ctdb setifacelink "$IFACE" up
+       done
+       exit 0
+       ;;
+
+     takeip)
+       if [ $# != 3 ]; then
+          echo "must supply interface, IP and maskbits"
+          exit 1
+       fi
+       iface=$1
+       ip=$2
+       maskbits=$3
+       # The address is only really added when TEST_LOCAL_DAEMONS is empty.
+       [ -n "$TEST_LOCAL_DAEMONS" ] || {
+           /sbin/ip addr add "$ip/$maskbits" dev "$iface" || {
+               echo "Failed to add $ip/$maskbits on dev $iface"
+               exit 1
+           }
+       }
+       echo "ctdb takeip event for $1 $2 $3"
+       exit 0
+       ;;
+
+     ##################################################
+     # called when ctdbd wants to release an IP address
+     releaseip)
+       if [ $# != 3 ]; then
+          echo "must supply interface, IP and maskbits"
+          exit 1
+       fi
+       iface=$1
+       ip=$2
+       maskbits=$3
+       [ -n "$TEST_LOCAL_DAEMONS" ] || {
+           /sbin/ip addr del "$ip/$maskbits" dev "$iface" || {
+               echo "Failed to del $ip on dev $iface"
+               exit 1
+           }
+       }
+       echo "ctdb releaseip event for $1 $2 $3"
+       exit 0
+       ;;
+
+     updateip)
+       echo "ctdb updateip event for $1"
+       exit 0
+       ;;
+
+     recovered)
+       echo "ctdb recovered event"
+       exit 0
+       ;;
+
+     ipreallocated)
+       echo "ctdb ipreallocated event"
+       exit 0
+       ;;
+
+     shutdown)
+       echo "ctdb shutdown event"
+       exit 0
+       ;;
+
+     stopped)
+       echo "ctdb stopped event"
+       exit 0
+       ;;
+esac
+
+echo "Invalid command $cmd"
+exit 1
diff --git a/ctdb/tests/eventscripts/00.ctdb.init.001.sh b/ctdb/tests/eventscripts/00.ctdb.init.001.sh
new file mode 100755 (executable)
index 0000000..320025a
--- /dev/null
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "TDB check, tdbtool supports check"
+
+setup_ctdb
+
+FAKE_TDBTOOL_SUPPORTS_CHECK="yes"
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/00.ctdb.init.002.sh b/ctdb/tests/eventscripts/00.ctdb.init.002.sh
new file mode 100755 (executable)
index 0000000..2777cc5
--- /dev/null
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "TDB check, tdbtool does not support check"
+
+setup_ctdb
+# Make the tdbtool stub report that it lacks the 'check' subcommand.
+FAKE_TDBTOOL_SUPPORTS_CHECK="no"
+# Expected output: only the warning about falling back to tdbdump.
+ok <<EOF
+WARNING: The installed 'tdbtool' does not offer the 'check' subcommand.
+ Using 'tdbdump' for database checks.
+ Consider updating 'tdbtool' for better checks!
+EOF
+
+simple_test
diff --git a/ctdb/tests/eventscripts/00.ctdb.init.003.sh b/ctdb/tests/eventscripts/00.ctdb.init.003.sh
new file mode 100755 (executable)
index 0000000..2770210
--- /dev/null
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "TDB check, tdbtool supports check, good TDB"
+
+setup_ctdb
+
+FAKE_TDBTOOL_SUPPORTS_CHECK="yes"
+
+touch "${CTDB_DBDIR}/foo.tdb.0"
+FAKE_TDB_IS_OK="yes"
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/00.ctdb.init.004.sh b/ctdb/tests/eventscripts/00.ctdb.init.004.sh
new file mode 100755 (executable)
index 0000000..b504d08
--- /dev/null
@@ -0,0 +1,22 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "TDB check, tdbtool supports check, bad TDB"
+
+setup_ctdb
+
+FAKE_TDBTOOL_SUPPORTS_CHECK="yes"
+
+db="${CTDB_DBDIR}/foo.tdb.0"
+touch "$db"
+FAKE_TDB_IS_OK="no"
+
+FAKE_DATE_OUTPUT="19690818.103000.000000001"
+
+ok <<EOF
+WARNING: database ${db} is corrupted.
+ Moving to backup ${db}.${FAKE_DATE_OUTPUT}.corrupt for later analysis.
+EOF
+
+simple_test
diff --git a/ctdb/tests/eventscripts/00.ctdb.init.005.sh b/ctdb/tests/eventscripts/00.ctdb.init.005.sh
new file mode 100755 (executable)
index 0000000..d11ab94
--- /dev/null
@@ -0,0 +1,20 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "TDB check, tdbtool does not support check, good TDB"
+
+setup_ctdb
+
+FAKE_TDBTOOL_SUPPORTS_CHECK="no"
+
+touch "${CTDB_DBDIR}/foo.tdb.0"
+FAKE_TDB_IS_OK="yes"
+
+ok <<EOF
+WARNING: The installed 'tdbtool' does not offer the 'check' subcommand.
+ Using 'tdbdump' for database checks.
+ Consider updating 'tdbtool' for better checks!
+EOF
+
+simple_test
diff --git a/ctdb/tests/eventscripts/00.ctdb.init.006.sh b/ctdb/tests/eventscripts/00.ctdb.init.006.sh
new file mode 100755 (executable)
index 0000000..745bca0
--- /dev/null
@@ -0,0 +1,25 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "TDB check, tdbtool does not support check, bad TDB"
+
+setup_ctdb
+
+FAKE_TDBTOOL_SUPPORTS_CHECK="no"
+
+db="${CTDB_DBDIR}/foo.tdb.0"
+touch "$db"
+FAKE_TDB_IS_OK="no"
+
+FAKE_DATE_OUTPUT="19690818.103000.000000001"
+
+ok <<EOF
+WARNING: The installed 'tdbtool' does not offer the 'check' subcommand.
+ Using 'tdbdump' for database checks.
+ Consider updating 'tdbtool' for better checks!
+WARNING: database ${db} is corrupted.
+ Moving to backup ${db}.${FAKE_DATE_OUTPUT}.corrupt for later analysis.
+EOF
+
+simple_test
diff --git a/ctdb/tests/eventscripts/00.ctdb.init.007.sh b/ctdb/tests/eventscripts/00.ctdb.init.007.sh
new file mode 100755 (executable)
index 0000000..1c954d7
--- /dev/null
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "TDB check, tdbtool supports check, good persistent TDB"
+
+setup_ctdb
+
+FAKE_TDBTOOL_SUPPORTS_CHECK="yes"
+
+touch "${CTDB_DBDIR}/persistent/foo.tdb.0"
+FAKE_TDB_IS_OK="yes"
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/00.ctdb.init.008.sh b/ctdb/tests/eventscripts/00.ctdb.init.008.sh
new file mode 100755 (executable)
index 0000000..a6afdd8
--- /dev/null
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "TDB check, tdbtool supports check, bad persistent TDB"
+
+setup_ctdb
+
+FAKE_TDBTOOL_SUPPORTS_CHECK="yes"
+
+db="${CTDB_DBDIR}/persistent/foo.tdb.0"
+touch "$db"
+FAKE_TDB_IS_OK="no"
+
+required_result 1 <<EOF
+Persistent database ${db} is corrupted! CTDB will not start.
+EOF
+
+simple_test
diff --git a/ctdb/tests/eventscripts/00.ctdb.init.021.sh b/ctdb/tests/eventscripts/00.ctdb.init.021.sh
new file mode 100755 (executable)
index 0000000..87dfa4d
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "Check public IP dropping, none assigned"
+
+setup_ctdb
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/00.ctdb.init.022.sh b/ctdb/tests/eventscripts/00.ctdb.init.022.sh
new file mode 100755 (executable)
index 0000000..6e59428
--- /dev/null
@@ -0,0 +1,18 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "Check public IP dropping, 1 assigned"
+
+setup_ctdb
+
+ctdb_get_1_public_address |
+while read dev ip bits ; do
+    ip addr add "${ip}/${bits}" dev "$dev"
+
+    ok <<EOF
+Removing public address ${ip}/${bits} from device ${dev}
+EOF
+
+    simple_test
+done
diff --git a/ctdb/tests/eventscripts/00.ctdb.init.023.sh b/ctdb/tests/eventscripts/00.ctdb.init.023.sh
new file mode 100755 (executable)
index 0000000..9b97e82
--- /dev/null
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "Check public IP dropping, all assigned"
+
+setup_ctdb
+
+nl="
+"
+ctdb_get_my_public_addresses | {
+    out=""
+    while read dev ip bits ; do
+       ip addr add "${ip}/${bits}" dev "$dev"
+
+       msg="Removing public address ${ip}/${bits} from device ${dev}"
+       out="${out}${out:+${nl}}${msg}"
+    done
+
+    ok "$out"
+
+    simple_test
+}
diff --git a/ctdb/tests/eventscripts/00.ctdb.monitor.001.sh b/ctdb/tests/eventscripts/00.ctdb.monitor.001.sh
new file mode 100755 (executable)
index 0000000..4290d13
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "Memory check, bad situation, no checks enabled"
+
+setup_memcheck "bad"
+
+CTDB_MONITOR_FREE_MEMORY=""
+CTDB_MONITOR_FREE_MEMORY_WARN=""
+CTDB_CHECK_SWAP_IS_NOT_USED="no"
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/00.ctdb.monitor.002.sh b/ctdb/tests/eventscripts/00.ctdb.monitor.002.sh
new file mode 100755 (executable)
index 0000000..6e94012
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "Memory check, good situation, all enabled"
+
+setup_memcheck
+
+CTDB_MONITOR_FREE_MEMORY="500"
+CTDB_MONITOR_FREE_MEMORY_WARN="1000"
+CTDB_CHECK_SWAP_IS_NOT_USED="yes"
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/00.ctdb.monitor.003.sh b/ctdb/tests/eventscripts/00.ctdb.monitor.003.sh
new file mode 100755 (executable)
index 0000000..9e63ab5
--- /dev/null
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "Memory check, bad situation, only swap check"
+
+setup_memcheck "bad"
+
+CTDB_MONITOR_FREE_MEMORY=""
+CTDB_MONITOR_FREE_MEMORY_WARN=""
+CTDB_CHECK_SWAP_IS_NOT_USED="yes"
+
+ok <<EOF
+We are swapping:
+$FAKE_PROC_MEMINFO
+$(ps foobar)
+EOF
+
+simple_test
diff --git a/ctdb/tests/eventscripts/00.ctdb.monitor.004.sh b/ctdb/tests/eventscripts/00.ctdb.monitor.004.sh
new file mode 100755 (executable)
index 0000000..fdf2032
--- /dev/null
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "Memory check, bad situation, only memory warning"
+
+setup_memcheck "bad"
+
+CTDB_MONITOR_FREE_MEMORY=""
+CTDB_MONITOR_FREE_MEMORY_WARN="500"
+CTDB_CHECK_SWAP_IS_NOT_USED="no"
+
+ok <<EOF
+WARNING: free memory is low - 468MB free <=  ${CTDB_MONITOR_FREE_MEMORY_WARN}MB (CTDB threshold)
+EOF
+
+simple_test
diff --git a/ctdb/tests/eventscripts/00.ctdb.monitor.005.sh b/ctdb/tests/eventscripts/00.ctdb.monitor.005.sh
new file mode 100755 (executable)
index 0000000..a46851a
--- /dev/null
@@ -0,0 +1,21 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "Memory check, bad situation, only memory critical"
+
+setup_memcheck "bad"
+
+CTDB_MONITOR_FREE_MEMORY="500"
+CTDB_MONITOR_FREE_MEMORY_WARN=""
+CTDB_CHECK_SWAP_IS_NOT_USED="no"
+
+ok <<EOF
+CRITICAL: OOM - 468MB free <= ${CTDB_MONITOR_FREE_MEMORY}MB (CTDB threshold)
+CRITICAL: Shutting down CTDB!!!
+$FAKE_PROC_MEMINFO
+$(ps foobar)
+CTDB says BYE!
+EOF
+
+simple_test
diff --git a/ctdb/tests/eventscripts/10.interface.init.001.sh b/ctdb/tests/eventscripts/10.interface.init.001.sh
new file mode 100755 (executable)
index 0000000..fae1a78
--- /dev/null
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "no public addresses"
+
+setup_ctdb
+
+export CTDB_PUBLIC_ADDRESSES="$CTDB_ETC/does/not/exist"
+
+ok "No public addresses file found. Nothing to do for 10.interfaces"
+
+simple_test
diff --git a/ctdb/tests/eventscripts/10.interface.init.002.sh b/ctdb/tests/eventscripts/10.interface.init.002.sh
new file mode 100755 (executable)
index 0000000..ba33f92
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "all interfaces up"
+
+setup_ctdb
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/10.interface.monitor.001.sh b/ctdb/tests/eventscripts/10.interface.monitor.001.sh
new file mode 100755 (executable)
index 0000000..42ef42d
--- /dev/null
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "no public addresses"
+
+setup_ctdb
+
+export CTDB_PUBLIC_ADDRESSES="$CTDB_ETC/does/not/exist"
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/10.interface.monitor.002.sh b/ctdb/tests/eventscripts/10.interface.monitor.002.sh
new file mode 100755 (executable)
index 0000000..ba33f92
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "all interfaces up"
+
+setup_ctdb
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/10.interface.monitor.003.sh b/ctdb/tests/eventscripts/10.interface.monitor.003.sh
new file mode 100755 (executable)
index 0000000..1eb7916
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "1 interface down"
+
+setup_ctdb
+
+iface=$(ctdb_get_1_interface)
+
+ethtool_interfaces_down $iface
+
+required_result 1 "ERROR: No link on the public network interface $iface"
+
+simple_test
diff --git a/ctdb/tests/eventscripts/10.interface.monitor.004.sh b/ctdb/tests/eventscripts/10.interface.monitor.004.sh
new file mode 100755 (executable)
index 0000000..69ffbd0
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "all interfaces up, 1 is a bond"
+
+setup_ctdb
+
+iface=$(ctdb_get_1_interface)
+
+setup_bond $iface
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/10.interface.monitor.005.sh b/ctdb/tests/eventscripts/10.interface.monitor.005.sh
new file mode 100755 (executable)
index 0000000..8cf7bbc
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "1 bond, no active slaves"
+
+setup_ctdb
+
+iface=$(ctdb_get_1_interface)
+
+setup_bond $iface "None"
+
+required_result 1 "ERROR: No active slaves for bond device $iface"
+
+simple_test
diff --git a/ctdb/tests/eventscripts/10.interface.monitor.006.sh b/ctdb/tests/eventscripts/10.interface.monitor.006.sh
new file mode 100755 (executable)
index 0000000..3c483a3
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "1 bond, active slaves, link down"
+
+setup_ctdb
+
+iface=$(ctdb_get_1_interface)
+
+setup_bond $iface "" "down"
+
+required_result 1 "ERROR: public network interface $iface is down"
+
+simple_test
diff --git a/ctdb/tests/eventscripts/10.interface.monitor.007.sh b/ctdb/tests/eventscripts/10.interface.monitor.007.sh
new file mode 100755 (executable)
index 0000000..c45900e
--- /dev/null
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "unknown interface, up"
+
+setup_ctdb
+
+export CTDB_PUBLIC_INTERFACE="dev999"
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/10.interface.monitor.008.sh b/ctdb/tests/eventscripts/10.interface.monitor.008.sh
new file mode 100755 (executable)
index 0000000..f73302b
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "unknown interface, down, up"
+
+setup_ctdb
+
+iface="dev999"
+export CTDB_PUBLIC_INTERFACE="$iface"
+
+#EVENTSCRIPTS_TESTS_TRACE="sh -x"
+iterate_test 3 "ok_null" \
+    1 'ethtool_interfaces_down "$iface" ; required_result 1 "ERROR: No link on the public network interface $iface"' \
+    2 'ethtool_interfaces_up "$iface"'
diff --git a/ctdb/tests/eventscripts/10.interface.monitor.009.sh b/ctdb/tests/eventscripts/10.interface.monitor.009.sh
new file mode 100755 (executable)
index 0000000..1b785ff
--- /dev/null
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "CTDB_PARTIALLY_ONLINE_INTERFACES, 1 down"
+
+setup_ctdb
+
+iface=$(ctdb_get_1_interface)
+
+export CTDB_PARTIALLY_ONLINE_INTERFACES="yes"
+
+ethtool_interfaces_down "$iface"
+
+ok "ERROR: No link on the public network interface $iface"
+
+simple_test
diff --git a/ctdb/tests/eventscripts/10.interface.monitor.010.sh b/ctdb/tests/eventscripts/10.interface.monitor.010.sh
new file mode 100755 (executable)
index 0000000..4d23319
--- /dev/null
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "CTDB_PARTIALLY_ONLINE_INTERFACES, all down"
+
+setup_ctdb
+
+ifaces=$(ctdb_get_interfaces)
+
+export CTDB_PARTIALLY_ONLINE_INTERFACES="yes"
+
+ethtool_interfaces_down $ifaces
+
+msg=$(for i in $ifaces ; do echo "ERROR: No link on the public network interface $i" ; done)
+
+required_result 1 "$msg"
+
+simple_test
diff --git a/ctdb/tests/eventscripts/10.interface.monitor.011.sh b/ctdb/tests/eventscripts/10.interface.monitor.011.sh
new file mode 100755 (executable)
index 0000000..21775d4
--- /dev/null
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "CTDB_PARTIALLY_ONLINE_INTERFACES, 1 bond down"
+
+setup_ctdb
+
+iface=$(ctdb_get_1_interface)
+
+setup_bond $iface "None"
+
+export CTDB_PARTIALLY_ONLINE_INTERFACES="yes"
+
+ethtool_interfaces_down "$iface"
+
+ok "ERROR: No active slaves for bond device $iface"
+
+simple_test
diff --git a/ctdb/tests/eventscripts/10.interface.monitor.012.sh b/ctdb/tests/eventscripts/10.interface.monitor.012.sh
new file mode 100755 (executable)
index 0000000..dbe84b7
--- /dev/null
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "CTDB_PARTIALLY_ONLINE_INTERFACES, all bonds down"
+
+setup_ctdb
+
+ifaces=$(ctdb_get_interfaces)
+# Set up every interface as a bond, passing "None" as setup_bond's second argument.
+for i in $ifaces ; do
+    setup_bond $i "None"
+done
+
+export CTDB_PARTIALLY_ONLINE_INTERFACES="yes"
+
+ethtool_interfaces_down $ifaces
+# One error line is expected per interface, together with result 1.
+msg=$(for i in $ifaces ; do echo "ERROR: No active slaves for bond device $i" ; done)
+
+required_result 1 "$msg"
+
+simple_test
diff --git a/ctdb/tests/eventscripts/10.interface.monitor.013.sh b/ctdb/tests/eventscripts/10.interface.monitor.013.sh
new file mode 100755 (executable)
index 0000000..0fcdcd8
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "1 bond, active slaves, link down"
+
+setup_ctdb
+
+iface=$(ctdb_get_1_interface)
+
+setup_bond $iface "" "up" "down"
+
+required_result 1 "ERROR: No active slaves for 802.ad bond device $iface"
+
+simple_test
diff --git a/ctdb/tests/eventscripts/10.interface.monitor.014.sh b/ctdb/tests/eventscripts/10.interface.monitor.014.sh
new file mode 100755 (executable)
index 0000000..ab23d30
--- /dev/null
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "spurious addresses on interface, no action"
+
+setup_ctdb
+
+iface=$(ctdb_get_1_interface)
+
+ip addr add 192.168.253.253/24 dev $iface
+ip addr add 192.168.254.254/24 dev $iface
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/10.interface.monitor.015.sh b/ctdb/tests/eventscripts/10.interface.monitor.015.sh
new file mode 100755 (executable)
index 0000000..1090cb9
--- /dev/null
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "Missing interface, fail"
+
+setup_ctdb
+
+iface=$(ctdb_get_1_interface)
+ip link delete "$iface"
+# The expected error names the interface deleted above, not a hard-coded device.
+required_result 1 <<EOF
+ERROR: Interface ${iface} does not exist but it is used by public addresses.
+EOF
+
+simple_test
diff --git a/ctdb/tests/eventscripts/10.interface.monitor.016.sh b/ctdb/tests/eventscripts/10.interface.monitor.016.sh
new file mode 100755 (executable)
index 0000000..6fd698a
--- /dev/null
@@ -0,0 +1,18 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "Missing interface, CTDB_PARTIALLY_ONLINE_INTERFACES=yes, warn"
+
+setup_ctdb
+
+export CTDB_PARTIALLY_ONLINE_INTERFACES="yes"
+
+iface=$(ctdb_get_1_interface)
+ip link delete "$iface"
+# The same error text is expected, but via ok rather than required_result 1.
+ok <<EOF
+ERROR: Interface ${iface} does not exist but it is used by public addresses.
+EOF
+
+simple_test
diff --git a/ctdb/tests/eventscripts/10.interface.multi.001.sh b/ctdb/tests/eventscripts/10.interface.multi.001.sh
new file mode 100755 (executable)
index 0000000..da8dcf1
--- /dev/null
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "takeip, releaseip"
+
+setup_ctdb
+
+public_address=$(ctdb_get_1_public_address)
+# $public_address stays unquoted below so it splits into separate event arguments.
+ok_null
+
+simple_test_event "takeip" $public_address
+simple_test_event "releaseip" $public_address
diff --git a/ctdb/tests/eventscripts/10.interface.releaseip.001.sh b/ctdb/tests/eventscripts/10.interface.releaseip.001.sh
new file mode 100755 (executable)
index 0000000..934b3dc
--- /dev/null
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "error - no args given"
+
+setup_ctdb
+
+iface=$(ctdb_get_1_interface)
+
+required_result 1 "ERROR: must supply interface, IP and maskbits"
+
+simple_test
diff --git a/ctdb/tests/eventscripts/10.interface.releaseip.002.sh b/ctdb/tests/eventscripts/10.interface.releaseip.002.sh
new file mode 100755 (executable)
index 0000000..9bcb7f1
--- /dev/null
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "error - remove a non-existent ip"
+
+setup_ctdb
+
+public_address=$(ctdb_get_1_public_address)
+ip="${public_address% *}" ; ip="${ip#* }"
+
+required_result 1 <<EOF
+RTNETLINK answers: Cannot assign requested address
+Failed to del ${ip} on dev ${public_address%% *}
+EOF
+
+simple_test $public_address
diff --git a/ctdb/tests/eventscripts/10.interface.releaseip.010.sh b/ctdb/tests/eventscripts/10.interface.releaseip.010.sh
new file mode 100755 (executable)
index 0000000..b6d9c7a
--- /dev/null
@@ -0,0 +1,32 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "Release 1 IP, 10 connections killed OK"
+
+setup_ctdb
+
+ctdb_get_1_public_address |
+while read dev ip bits ; do
+    ip addr add "${ip}/${bits}" dev "$dev"
+
+    # Setup 10 fake connections...
+    count=10
+    out=""
+    nl="
+"
+    i=0
+    while [ $i -lt $count ] ; do
+       echo "${ip}:445 10.254.254.1:1230${i}"
+       # Expected output for killing this connection
+       out="${out}${out:+${nl}}Killing TCP connection 10.254.254.1:1230${i} ${ip}:445"
+       i=$(($i + 1))
+    done >"$FAKE_NETSTAT_TCP_ESTABLISHED_FILE"
+
+    ok <<EOF
+$out
+Killed $count TCP connections to released IP $ip
+EOF
+
+    simple_test $dev $ip $bits
+done
diff --git a/ctdb/tests/eventscripts/10.interface.releaseip.011.sh b/ctdb/tests/eventscripts/10.interface.releaseip.011.sh
new file mode 100755 (executable)
index 0000000..17b7421
--- /dev/null
@@ -0,0 +1,41 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "Release 1 IP, 10 connections killed, 1 fails"
+
+setup_ctdb
+
+ctdb_get_1_public_address |
+while read dev ip bits ; do
+    ip addr add "${ip}/${bits}" dev "$dev"
+
+    # Setup 10 fake connections...
+    count=10
+    out=""
+    nl="
+"
+    i=0
+    while [ $i -lt $count ] ; do
+       echo "${ip}:445 10.254.254.1:1230${i}"
+       # Expected output for killing this connection
+       out="${out}${out:+${nl}}Killing TCP connection 10.254.254.1:1230${i} ${ip}:445"
+       i=$(($i + 1))
+    done >"$FAKE_NETSTAT_TCP_ESTABLISHED_FILE"
+
+    # Note that the fake TCP killing done by the "ctdb killtcp" stub
+    # can only kill connections in the file, so killing this connection
+    # will never succeed and it will look like a timeout.
+    FAKE_NETSTAT_TCP_ESTABLISHED="${ip}:445|10.254.254.1:43210"
+
+    ok <<EOF
+Killing TCP connection 10.254.254.1:43210 ${ip}:445
+$out
+Waiting for 1 connections to be killed for IP ${ip}
+Waiting for 1 connections to be killed for IP ${ip}
+Waiting for 1 connections to be killed for IP ${ip}
+Timed out killing tcp connections for IP $ip
+EOF
+
+    simple_test $dev $ip $bits
+done
diff --git a/ctdb/tests/eventscripts/10.interface.startup.001.sh b/ctdb/tests/eventscripts/10.interface.startup.001.sh
new file mode 100755 (executable)
index 0000000..42ef42d
--- /dev/null
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "no public addresses"
+
+setup_ctdb
+
+export CTDB_PUBLIC_ADDRESSES="$CTDB_ETC/does/not/exist"
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/10.interface.startup.002.sh b/ctdb/tests/eventscripts/10.interface.startup.002.sh
new file mode 100755 (executable)
index 0000000..ba33f92
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "all interfaces up"
+
+setup_ctdb
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/10.interface.takeip.001.sh b/ctdb/tests/eventscripts/10.interface.takeip.001.sh
new file mode 100755 (executable)
index 0000000..934b3dc
--- /dev/null
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "error - no args given"
+
+setup_ctdb
+
+iface=$(ctdb_get_1_interface)
+
+required_result 1 "ERROR: must supply interface, IP and maskbits"
+
+simple_test
diff --git a/ctdb/tests/eventscripts/10.interface.takeip.002.sh b/ctdb/tests/eventscripts/10.interface.takeip.002.sh
new file mode 100755 (executable)
index 0000000..8960b08
--- /dev/null
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "add an ip"
+
+setup_ctdb
+
+public_address=$(ctdb_get_1_public_address)
+
+ok_null
+
+simple_test $public_address
diff --git a/ctdb/tests/eventscripts/10.interface.takeip.003.sh b/ctdb/tests/eventscripts/10.interface.takeip.003.sh
new file mode 100755 (executable)
index 0000000..203cff0
--- /dev/null
@@ -0,0 +1,25 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "error - add same IP twice"
+
+setup_ctdb
+
+public_address=$(ctdb_get_1_public_address)
+dev="${public_address%% *}"
+t="${public_address#* }"
+ip="${t% *}"
+bits="${t#* }"
+
+# This is a bit gross and contrived.  The method of quoting the error
+# message so it makes it to required_result() is horrible.  Hopefully
+# improvements will come.
+
+err2="\
+RTNETLINK answers: File exists
+Failed to add $ip/$bits on dev $dev"
+
+#EVENTSCRIPTS_TESTS_TRACE="sh -x"
+iterate_test -- $public_address -- 2 "ok_null" \
+    2 'required_result 1 "$err2"'
diff --git a/ctdb/tests/eventscripts/13.per_ip_routing.001.sh b/ctdb/tests/eventscripts/13.per_ip_routing.001.sh
new file mode 100755 (executable)
index 0000000..8523c10
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "not configured"
+
+setup_ctdb
+# setup_ctdb_policy_routing is deliberately not called: both events expect no output.
+ok_null
+simple_test_event "takeip"
+
+ok_null
+simple_test_event "ipreallocated"
+
+check_routes 0
diff --git a/ctdb/tests/eventscripts/13.per_ip_routing.002.sh b/ctdb/tests/eventscripts/13.per_ip_routing.002.sh
new file mode 100755 (executable)
index 0000000..d6320c6
--- /dev/null
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "missing config file"
+
+setup_ctdb
+setup_ctdb_policy_routing
+
+# Error because policy routing is configured but the configuration
+# file is missing.
+required_result 1 <<EOF
+error: CTDB_PER_IP_ROUTING_CONF=${CTDB_BASE}/policy_routing file not found
+EOF
+
+for i in "startup" "ipreallocated" "monitor" ; do
+    simple_test_event "$i"
+done
+
diff --git a/ctdb/tests/eventscripts/13.per_ip_routing.003.sh b/ctdb/tests/eventscripts/13.per_ip_routing.003.sh
new file mode 100755 (executable)
index 0000000..bb2c4b7
--- /dev/null
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "empty config, ipreallocated"
+
+setup_ctdb
+setup_ctdb_policy_routing
+
+create_policy_routing_config 0
+
+# ipreallocated should silently add any missing routes
+ok_null
+simple_test_event "ipreallocated"
+
+# empty configuration file should mean there are no routes
+check_routes 0
diff --git a/ctdb/tests/eventscripts/13.per_ip_routing.004.sh b/ctdb/tests/eventscripts/13.per_ip_routing.004.sh
new file mode 100755 (executable)
index 0000000..4595313
--- /dev/null
@@ -0,0 +1,18 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "empty config, takeip"
+
+setup_ctdb
+setup_ctdb_policy_routing
+
+create_policy_routing_config 0
+
+public_address=$(ctdb_get_1_public_address)
+
+ok_null
+simple_test_event "takeip" $public_address
+
+# empty configuration file should mean there are no routes
+check_routes 0
diff --git a/ctdb/tests/eventscripts/13.per_ip_routing.005.sh b/ctdb/tests/eventscripts/13.per_ip_routing.005.sh
new file mode 100755 (executable)
index 0000000..9495cc5
--- /dev/null
@@ -0,0 +1,21 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "1 IP configured, takeip"
+
+setup_ctdb
+setup_ctdb_policy_routing
+
+# Configuration for 1 IP
+create_policy_routing_config 1 default
+
+# takeip should add routes for the given address
+ctdb_get_1_public_address |
+while read dev ip bits ; do
+    ok_null
+    simple_test_event "takeip" $dev $ip $bits
+done
+
+# Should have routes for 1 IP
+check_routes 1 default
diff --git a/ctdb/tests/eventscripts/13.per_ip_routing.006.sh b/ctdb/tests/eventscripts/13.per_ip_routing.006.sh
new file mode 100755 (executable)
index 0000000..b93b6cd
--- /dev/null
@@ -0,0 +1,25 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "1 IP configured, takeip, releaseip"
+
+setup_ctdb
+setup_ctdb_policy_routing
+
+# create config for 1 IP
+create_policy_routing_config 1 default
+
+ctdb_get_1_public_address |
+while read dev ip bits ; do
+    # takeip adds routes
+    ok_null
+    simple_test_event "takeip" $dev $ip $bits
+
+    # releaseip removes routes
+    ok_null
+    simple_test_event "releaseip" $dev $ip $bits
+done
+
+# should have no routes
+check_routes 0
diff --git a/ctdb/tests/eventscripts/13.per_ip_routing.007.sh b/ctdb/tests/eventscripts/13.per_ip_routing.007.sh
new file mode 100755 (executable)
index 0000000..096bc96
--- /dev/null
@@ -0,0 +1,18 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "1 IP configured, ipreallocated"
+
+setup_ctdb
+setup_ctdb_policy_routing
+
+# create config for 1 IP
+create_policy_routing_config 1 default
+
+# no takeip, but ipreallocated should add any missing routes
+ok_null
+simple_test_event "ipreallocated"
+
+# should have routes for 1 IP
+check_routes 1 default
diff --git a/ctdb/tests/eventscripts/13.per_ip_routing.008.sh b/ctdb/tests/eventscripts/13.per_ip_routing.008.sh
new file mode 100755 (executable)
index 0000000..9bb0c19
--- /dev/null
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "1 IP configured, takeip twice"
+
+setup_ctdb
+setup_ctdb_policy_routing
+
+# create config for 1 IP
+create_policy_routing_config 1 default
+
+ctdb_get_1_public_address |
+while read dev ip bits ; do
+    ok_null
+    simple_test_event "takeip" $dev $ip $bits
+
+    # 2nd takeip event for the same IP should be a no-op
+    ok_null
+    simple_test_event "takeip" $dev $ip $bits
+done
+
+# should be routes for 1 IP
+check_routes 1 default
diff --git a/ctdb/tests/eventscripts/13.per_ip_routing.009.sh b/ctdb/tests/eventscripts/13.per_ip_routing.009.sh
new file mode 100755 (executable)
index 0000000..cbea1ad
--- /dev/null
@@ -0,0 +1,21 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "All IPs configured, takeip 1 address"
+
+setup_ctdb
+setup_ctdb_policy_routing
+
+# configure all addresses
+create_policy_routing_config all default
+
+# add routes for 1 IP
+ctdb_get_1_public_address |
+while read dev ip bits ; do
+    ok_null
+    simple_test_event "takeip" $dev $ip $bits
+done
+
+# should have routes for 1 IP
+check_routes 1 default
diff --git a/ctdb/tests/eventscripts/13.per_ip_routing.010.sh b/ctdb/tests/eventscripts/13.per_ip_routing.010.sh
new file mode 100755 (executable)
index 0000000..d11585e
--- /dev/null
@@ -0,0 +1,20 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "All IPs configured, takeip on all nodes"
+
+setup_ctdb
+setup_ctdb_policy_routing
+
+# create config for all IPs
+create_policy_routing_config all default
+
+ctdb_get_my_public_addresses |
+while read dev ip bits ; do
+    ok_null
+    simple_test_event "takeip" $dev $ip $bits
+done
+
+# should have routes for all IPs
+check_routes all default
diff --git a/ctdb/tests/eventscripts/13.per_ip_routing.011.sh b/ctdb/tests/eventscripts/13.per_ip_routing.011.sh
new file mode 100755 (executable)
index 0000000..d8ec9ac
--- /dev/null
@@ -0,0 +1,20 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "__auto_link_local__, takeip all on node"
+
+setup_ctdb
+setup_ctdb_policy_routing
+
+# do link local fu instead of creating configuration
+export CTDB_PER_IP_ROUTING_CONF="__auto_link_local__"
+
+# add routes for all addresses
+ctdb_get_my_public_addresses |
+while read dev ip bits ; do
+    ok_null
+    simple_test_event "takeip" $dev $ip $bits
+done
+
+check_routes all
diff --git a/ctdb/tests/eventscripts/13.per_ip_routing.012.sh b/ctdb/tests/eventscripts/13.per_ip_routing.012.sh
new file mode 100755 (executable)
index 0000000..6c8a6ab
--- /dev/null
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "1 IP configured, takeip, releaseip, ipreallocated"
+
+# This partly tests the test infrastructure.  If the (stub) "ctdb
+# moveip" doesn't do anything then the IP being released will still be
+# on the node and the ipreallocated event will add the routes back.
+
+setup_ctdb
+setup_ctdb_policy_routing
+
+create_policy_routing_config 1 default
+
+ctdb_get_1_public_address |
+while read dev ip bits ; do
+    ok_null
+    simple_test_event "takeip" $dev $ip $bits
+
+    ok_null
+    ctdb moveip $ip 1
+    simple_test_event "releaseip" $dev $ip $bits
+
+    ok_null
+    simple_test_event "ipreallocated"
+done
+
+# all routes should have been removed and not added back
+check_routes 0
diff --git a/ctdb/tests/eventscripts/13.per_ip_routing.013.sh b/ctdb/tests/eventscripts/13.per_ip_routing.013.sh
new file mode 100755 (executable)
index 0000000..567622e
--- /dev/null
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "1 IP configured, releaseip of unassigned"
+
+setup_ctdb
+setup_ctdb_policy_routing
+
+create_policy_routing_config 1 default
+
+ctdb_get_1_public_address |
+while read dev ip bits ; do
+    ok <<EOF
+WARNING: Failed to delete policy routing rule
+  Command "ip rule del from $ip pref $CTDB_PER_IP_ROUTING_RULE_PREF table ctdb.$ip" failed:
+  RTNETLINK answers: No such file or directory
+EOF
+
+    simple_test_event "releaseip" $dev $ip $bits
+done
+
+# there should be no routes
+check_routes 0
diff --git a/ctdb/tests/eventscripts/13.per_ip_routing.014.sh b/ctdb/tests/eventscripts/13.per_ip_routing.014.sh
new file mode 100755 (executable)
index 0000000..ee08c36
--- /dev/null
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "1 IP configured, takeip, moveip, ipreallocated"
+
+# We move the IP to another node but don't run releaseip.
+# ipreallocated should remove the bogus routes.
+
+setup_ctdb
+setup_ctdb_policy_routing
+
+create_policy_routing_config 1 default
+
+ctdb_get_1_public_address |
+while read dev ip bits ; do
+    ok_null
+    # Set up the routes for an IP that we have
+    simple_test_event "takeip" $dev $ip $bits
+
+    # Now move that IP but don't run the associated "releaseip"
+    ctdb moveip $ip 1
+
+    # This should handle removal of the routes
+    ok "Removing ip rule/routes for unhosted public address $ip"
+    simple_test_event "ipreallocated"
+done
+
+# no routes left
+check_routes 0
diff --git a/ctdb/tests/eventscripts/13.per_ip_routing.015.sh b/ctdb/tests/eventscripts/13.per_ip_routing.015.sh
new file mode 100755 (executable)
index 0000000..2b9ecba
--- /dev/null
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "1 IP configured, releaseip of unassigned, bad table ID"
+
+setup_ctdb
+setup_ctdb_policy_routing
+
+export IP_ROUTE_BAD_TABLE_ID=true  # presumably makes the stub "ip" reject table names - confirm
+
+create_policy_routing_config 1 default
+
+ctdb_get_1_public_address |
+{
+    read dev ip bits
+
+    ok <<EOF
+WARNING: Failed to delete policy routing rule
+  Command "ip rule del from $ip pref $CTDB_PER_IP_ROUTING_RULE_PREF table ctdb.$ip" failed:
+  Error: argument ctdb.$ip is wrong: invalid table ID
+  Error: argument ctdb.$ip is wrong: table id value is invalid
+EOF
+
+    simple_test_event "releaseip" $dev $ip $bits
+}
+
+
+# there should be no routes
+check_routes 0
diff --git a/ctdb/tests/eventscripts/13.per_ip_routing.016.sh b/ctdb/tests/eventscripts/13.per_ip_routing.016.sh
new file mode 100755 (executable)
index 0000000..85320b6
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "empty config, reconfigure, NOOP"
+
+setup_ctdb
+setup_ctdb_policy_routing
+
+create_policy_routing_config 0
+
+ok "Reconfiguring service \"${service_name}\"..."
+simple_test_event "reconfigure"
+
+check_routes 0
diff --git a/ctdb/tests/eventscripts/13.per_ip_routing.017.sh b/ctdb/tests/eventscripts/13.per_ip_routing.017.sh
new file mode 100755 (executable)
index 0000000..8870015
--- /dev/null
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "1 IP configured, reconfigure"
+
+setup_ctdb
+setup_ctdb_policy_routing
+
+create_policy_routing_config 1 default
+
+# no takeip, but reconfigure should add any missing routes
+ok "Reconfiguring service \"${service_name}\"..."
+simple_test_event "reconfigure"
+
+check_routes 1 default
diff --git a/ctdb/tests/eventscripts/13.per_ip_routing.018.sh b/ctdb/tests/eventscripts/13.per_ip_routing.018.sh
new file mode 100755 (executable)
index 0000000..ce91989
--- /dev/null
@@ -0,0 +1,22 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "1 IP configured, ipreallocated, more routes, reconfigure"
+
+setup_ctdb
+setup_ctdb_policy_routing
+
+create_policy_routing_config 1
+
+# no takeip, but ipreallocated should add any missing routes
+ok_null
+simple_test_event "ipreallocated"
+
+create_policy_routing_config 1 default
+
+# reconfigure should update routes even though rules are unchanged
+ok "Reconfiguring service \"${service_name}\"..."
+simple_test_event "reconfigure"
+
+check_routes 1 default
diff --git a/ctdb/tests/eventscripts/13.per_ip_routing.019.sh b/ctdb/tests/eventscripts/13.per_ip_routing.019.sh
new file mode 100755 (executable)
index 0000000..072c929
--- /dev/null
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "1 IP configured, ipreallocated, less routes, reconfigure"
+
+setup_ctdb
+setup_ctdb_policy_routing
+
+create_policy_routing_config 1 default
+
+# no takeip, but ipreallocated should add any missing routes
+ok_null
+simple_test_event "ipreallocated"
+
+# rewrite the configuration without the "default" argument, which
+# takes out the default routes
+create_policy_routing_config 1
+
+# reconfigure should update routes even though rules are unchanged
+ok "Reconfiguring service \"${service_name}\"..."
+simple_test_event "reconfigure"
+
+check_routes 1
diff --git a/ctdb/tests/eventscripts/20.multipathd.monitor.001.sh b/ctdb/tests/eventscripts/20.multipathd.monitor.001.sh
new file mode 100755 (executable)
index 0000000..4eafefc
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "No multipath devices configure to check"
+
+setup_multipathd
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/20.multipathd.monitor.002.sh b/ctdb/tests/eventscripts/20.multipathd.monitor.002.sh
new file mode 100755 (executable)
index 0000000..fbfe952
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 multipath devices configure to check, all up"
+
+setup_multipathd "mpatha"  "mpathb"  "mpathc"
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/20.multipathd.monitor.003.sh b/ctdb/tests/eventscripts/20.multipathd.monitor.003.sh
new file mode 100755 (executable)
index 0000000..d9a2125
--- /dev/null
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 multipath devices configure to check, one down"
+
+setup_multipathd "mpatha"  "!mpathb"  "mpathc"
+
+required_result 1 <<EOF
+ERROR: multipath device "mpathb" has no active paths
+multipath monitoring failed
+EOF
+
+simple_test
diff --git a/ctdb/tests/eventscripts/20.multipathd.monitor.004.sh b/ctdb/tests/eventscripts/20.multipathd.monitor.004.sh
new file mode 100755 (executable)
index 0000000..5f45c73
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 multipath devices configure to check, multipath hangs"
+
+setup_multipathd "mpatha"  "!mpathb"  "mpathc"
+export FAKE_MULTIPATH_HANG="yes"
+
+required_result 1 <<EOF
+ERROR: callout to multipath checks hung
+multipath monitoring failed
+EOF
+
+simple_test
diff --git a/ctdb/tests/eventscripts/40.vsftpd.monitor.001.sh b/ctdb/tests/eventscripts/40.vsftpd.monitor.001.sh
new file mode 100755 (executable)
index 0000000..fdad12a
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "not managed, check no-op"
+
+setup_vsftpd "down"
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/41.httpd.monitor.001.sh b/ctdb/tests/eventscripts/41.httpd.monitor.001.sh
new file mode 100755 (executable)
index 0000000..f400eaa
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "not managed, check no-op"
+
+setup_httpd "down"
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/49.winbind.monitor.001.sh b/ctdb/tests/eventscripts/49.winbind.monitor.001.sh
new file mode 100755 (executable)
index 0000000..94253d8
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "not managed, check no-op"
+
+setup_winbind "down"
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/49.winbind.monitor.050.sh b/ctdb/tests/eventscripts/49.winbind.monitor.050.sh
new file mode 100755 (executable)
index 0000000..02589b3
--- /dev/null
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "auto-start, simple"
+
+setup_winbind "down"
+
+export CTDB_SERVICE_AUTOSTARTSTOP="yes"
+export CTDB_MANAGED_SERVICES="foo winbind bar"
+
+ok <<EOF
+Starting service "winbind" - now managed
+&Starting winbind: OK
+EOF
+simple_test
diff --git a/ctdb/tests/eventscripts/49.winbind.monitor.051.sh b/ctdb/tests/eventscripts/49.winbind.monitor.051.sh
new file mode 100755 (executable)
index 0000000..fbad928
--- /dev/null
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "auto-stop, simple"
+
+setup_winbind
+
+export CTDB_SERVICE_AUTOSTARTSTOP="yes"
+export CTDB_MANAGED_SERVICES="foo"
+unset CTDB_MANAGES_WINBIND
+
+ok <<EOF
+Stopping service "winbind" - no longer managed
+&Stopping winbind: OK
+EOF
+simple_test
diff --git a/ctdb/tests/eventscripts/49.winbind.monitor.101.sh b/ctdb/tests/eventscripts/49.winbind.monitor.101.sh
new file mode 100755 (executable)
index 0000000..ec2952b
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "all OK"
+
+setup_winbind
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/49.winbind.monitor.102.sh b/ctdb/tests/eventscripts/49.winbind.monitor.102.sh
new file mode 100755 (executable)
index 0000000..e4a4cac
--- /dev/null
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "winbind down"
+
+setup_winbind
+wbinfo_down
+
+required_result 1 "ERROR: wbinfo -p returned error"
+
+simple_test
diff --git a/ctdb/tests/eventscripts/50.samba.monitor.001.sh b/ctdb/tests/eventscripts/50.samba.monitor.001.sh
new file mode 100755 (executable)
index 0000000..ac3708f
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "not managed, check no-op"
+
+setup_samba "down"
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/50.samba.monitor.050.sh b/ctdb/tests/eventscripts/50.samba.monitor.050.sh
new file mode 100755 (executable)
index 0000000..69530f3
--- /dev/null
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "auto-start, simple"
+
+setup_samba "down"
+
+export CTDB_SERVICE_AUTOSTARTSTOP="yes"
+export CTDB_MANAGED_SERVICES="foo samba winbind bar"
+
+ok <<EOF
+Starting service "samba" - now managed
+&Starting smb: OK
+EOF
+simple_test
diff --git a/ctdb/tests/eventscripts/50.samba.monitor.051.sh b/ctdb/tests/eventscripts/50.samba.monitor.051.sh
new file mode 100755 (executable)
index 0000000..04c1fce
--- /dev/null
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "auto-stop, simple"
+
+setup_samba
+
+export CTDB_SERVICE_AUTOSTARTSTOP="yes"
+export CTDB_MANAGED_SERVICES="foo"
+unset CTDB_MANAGES_SAMBA
+
+ok <<EOF
+Stopping service "samba" - no longer managed
+&Stopping smb: OK
+EOF
+simple_test
diff --git a/ctdb/tests/eventscripts/50.samba.monitor.101.sh b/ctdb/tests/eventscripts/50.samba.monitor.101.sh
new file mode 100755 (executable)
index 0000000..cf3b53a
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "all OK"
+
+setup_samba
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/50.samba.monitor.103.sh b/ctdb/tests/eventscripts/50.samba.monitor.103.sh
new file mode 100755 (executable)
index 0000000..6f71a96
--- /dev/null
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "port 445 down"
+
+setup_samba
+tcp_port_down 445
+
+required_result 1 "ERROR: samba tcp port 445 is not responding"
+
+simple_test
diff --git a/ctdb/tests/eventscripts/50.samba.monitor.104.sh b/ctdb/tests/eventscripts/50.samba.monitor.104.sh
new file mode 100755 (executable)
index 0000000..9de0223
--- /dev/null
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "port 139 down"
+
+setup_samba
+tcp_port_down 139
+
+required_result 1 "ERROR: samba tcp port 139 is not responding"
+
+simple_test
diff --git a/ctdb/tests/eventscripts/50.samba.monitor.105.sh b/ctdb/tests/eventscripts/50.samba.monitor.105.sh
new file mode 100755 (executable)
index 0000000..9936eff
--- /dev/null
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "non-existent share path"
+
+setup_samba
+shares_missing "ERROR: samba directory \"%s\" not available" 2
+
+required_result 1 "$MISSING_SHARES_TEXT"
+
+simple_test
diff --git a/ctdb/tests/eventscripts/50.samba.monitor.106.sh b/ctdb/tests/eventscripts/50.samba.monitor.106.sh
new file mode 100755 (executable)
index 0000000..8fabfb3
--- /dev/null
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "non-existent share - not checked"
+
+setup_samba
+shares_missing "ERROR: samba directory \"%s\" not available" 2
+
+export CTDB_SAMBA_SKIP_SHARE_CHECK="yes"
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/50.samba.monitor.107.sh b/ctdb/tests/eventscripts/50.samba.monitor.107.sh
new file mode 100755 (executable)
index 0000000..573ff80
--- /dev/null
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "port 139 down, default tcp checker, debug"
+
+export CTDB_SCRIPT_DEBUGLEVEL=4
+
+setup_samba
+tcp_port_down 139
+
+required_result 1 <<EOF
+ERROR: samba tcp port 139 is not responding
+DEBUG: "ctdb checktcpport 139" was able to bind to port
+EOF
+
+simple_test
diff --git a/ctdb/tests/eventscripts/60.ganesha.monitor.101.sh b/ctdb/tests/eventscripts/60.ganesha.monitor.101.sh
new file mode 100755 (executable)
index 0000000..d68ad6a
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "all services available"
+
+setup_nfs_ganesha
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/60.ganesha.monitor.131.sh b/ctdb/tests/eventscripts/60.ganesha.monitor.131.sh
new file mode 100755 (executable)
index 0000000..95ce450
--- /dev/null
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "rquotad down"
+
+setup_nfs_ganesha
+rpc_services_down "rquotad"
+
+ok<<EOF
+ERROR: rquotad failed RPC check:
+rpcinfo: RPC: Program not registered
+program rquotad version 1 is not available
+Trying to restart rquotad [rpc.rquotad]
+EOF
+
+simple_test
diff --git a/ctdb/tests/eventscripts/60.ganesha.monitor.141.sh b/ctdb/tests/eventscripts/60.ganesha.monitor.141.sh
new file mode 100755 (executable)
index 0000000..9cd82f8
--- /dev/null
@@ -0,0 +1,39 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "statd down, 6 iterations"
+
+# statd fails and attempts to restart it fail.
+
+setup_nfs_ganesha
+rpc_services_down "status"
+
+ok_null
+simple_test || exit $?
+
+ok<<EOF
+Trying to restart statd [rpc.statd]
+EOF
+simple_test || exit $?
+
+ok_null
+simple_test || exit $?
+
+ok<<EOF
+ERROR: status failed RPC check:
+rpcinfo: RPC: Program not registered
+program status version 1 is not available
+Trying to restart statd [rpc.statd]
+EOF
+simple_test || exit $?
+
+ok_null
+simple_test || exit $?
+
+required_result 1 <<EOF
+ERROR: status failed RPC check:
+rpcinfo: RPC: Program not registered
+program status version 1 is not available
+EOF
+simple_test || exit $?
diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.001.sh b/ctdb/tests/eventscripts/60.nfs.monitor.001.sh
new file mode 100755 (executable)
index 0000000..c62e5cf
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "not managed, check no-op"
+
+setup_nfs "down"
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.101.sh b/ctdb/tests/eventscripts/60.nfs.monitor.101.sh
new file mode 100755 (executable)
index 0000000..1a68927
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "all services available"
+
+setup_nfs
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.102.sh b/ctdb/tests/eventscripts/60.nfs.monitor.102.sh
new file mode 100755 (executable)
index 0000000..bb988aa
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "all services available, check nfsd thread count, count matches"
+
+setup_nfs
+
+CTDB_MONITOR_NFS_THREAD_COUNT="yes"
+RPCNFSDCOUNT=8
+FAKE_NFSD_THREAD_PIDS="1 2 3 4 5 6 7 8"
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.103.sh b/ctdb/tests/eventscripts/60.nfs.monitor.103.sh
new file mode 100755 (executable)
index 0000000..75d7291
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "all services available, check nfsd thread count, not enough threads"
+
+setup_nfs
+
+CTDB_MONITOR_NFS_THREAD_COUNT="yes"
+RPCNFSDCOUNT=8
+FAKE_NFSD_THREAD_PIDS="1 2 3 4 5"
+
+ok "Attempting to correct number of nfsd threads from 5 to 8"
+
+simple_test
diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.104.sh b/ctdb/tests/eventscripts/60.nfs.monitor.104.sh
new file mode 100755 (executable)
index 0000000..a052be8
--- /dev/null
@@ -0,0 +1,18 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+# Add this extra test to catch a design change where we only ever
+# increase the number of threads.  That is, this test would need to be
+# consciously removed.
+define_test "all services available, check nfsd thread count, too many threads"
+
+setup_nfs
+
+CTDB_MONITOR_NFS_THREAD_COUNT="yes"
+RPCNFSDCOUNT=4
+FAKE_NFSD_THREAD_PIDS="1 2 3 4 5 6"
+
+ok "Attempting to correct number of nfsd threads from 6 to 4"
+
+simple_test
diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.111.sh b/ctdb/tests/eventscripts/60.nfs.monitor.111.sh
new file mode 100755 (executable)
index 0000000..414fcc8
--- /dev/null
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "knfsd down, 1 iteration"
+
+setup_nfs
+rpc_services_down "nfs"
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.112.sh b/ctdb/tests/eventscripts/60.nfs.monitor.112.sh
new file mode 100755 (executable)
index 0000000..49ee335
--- /dev/null
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "knfsd down, 10 iterations"
+
+# knfsd fails and attempts to restart it fail.
+
+setup_nfs
+rpc_services_down "nfs"
+
+iterate_test 10 'rpc_set_service_failure_response "nfsd"'
diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.113.sh b/ctdb/tests/eventscripts/60.nfs.monitor.113.sh
new file mode 100755 (executable)
index 0000000..505df1b
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "knfsd down, 10 iterations, dump 5 threads, none hung"
+
+# knfsd fails and attempts to restart it fail.
+setup_nfs
+rpc_services_down "nfs"
+
+# Additionally, any hung threads should have stack traces dumped.
+CTDB_NFS_DUMP_STUCK_THREADS=5
+FAKE_NFSD_THREAD_PIDS=""
+
+iterate_test 10 'rpc_set_service_failure_response "nfsd"'
diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.114.sh b/ctdb/tests/eventscripts/60.nfs.monitor.114.sh
new file mode 100755 (executable)
index 0000000..496f5e7
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "knfsd down, 10 iterations, dump 5 threads, 3 hung"
+
+# knfsd fails and attempts to restart it fail.
+setup_nfs
+rpc_services_down "nfs"
+
+# Additionally, any hung threads should have stack traces dumped.
+CTDB_NFS_DUMP_STUCK_THREADS=5
+FAKE_NFSD_THREAD_PIDS="1001 1002 1003"
+
+iterate_test 10 'rpc_set_service_failure_response "nfsd"'
diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.121.sh b/ctdb/tests/eventscripts/60.nfs.monitor.121.sh
new file mode 100755 (executable)
index 0000000..6d27f60
--- /dev/null
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "lockd down, 15 iterations"
+
+# This simulates an ongoing failure in the eventscript's automated
+# attempts to restart the service.  That is, the eventscript is unable
+# to restart the service.
+
+setup_nfs
+rpc_services_down "nlockmgr"
+
+#EVENTSCRIPTS_TESTS_TRACE="sh -x"
+iterate_test 15 "ok_null" \
+    10 "rpc_set_service_failure_response 'lockd'" \
+    15 "rpc_set_service_failure_response 'lockd'"
diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.122.sh b/ctdb/tests/eventscripts/60.nfs.monitor.122.sh
new file mode 100755 (executable)
index 0000000..fc5cea8
--- /dev/null
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "lockd down, 15 iterations, back up after 10"
+
+# This simulates success of the eventscript's automated attempts to
+# restart the service.
+
+setup_nfs
+rpc_services_down "nlockmgr"
+
+# Iteration 10 should try to restart rpc.lockd.  However, our test
+# stub rpc.lockd does nothing, so we have to explicitly flag it as up.
+
+iterate_test 15 "ok_null" \
+    10 "rpc_set_service_failure_response 'lockd'" \
+    11 "rpc_services_up nlockmgr"
+
diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.131.sh b/ctdb/tests/eventscripts/60.nfs.monitor.131.sh
new file mode 100755 (executable)
index 0000000..1cf72a9
--- /dev/null
@@ -0,0 +1,10 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "rquotad down, 5 iterations"
+
+setup_nfs
+rpc_services_down "rquotad"
+
+iterate_test 5 'rpc_set_service_failure_response "rquotad"'
diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.132.sh b/ctdb/tests/eventscripts/60.nfs.monitor.132.sh
new file mode 100755 (executable)
index 0000000..b8f3f2b
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "rquotad down, 5 iterations, back up after 1"
+
+# rquotad fails once but then comes back of its own accord after 1
+# failure.
+
+setup_nfs
+rpc_services_down "rquotad"
+
+iterate_test 5 'ok_null' \
+    1 'rpc_set_service_failure_response "rquotad"' \
+    2 'rpc_services_up "rquotad"'
diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.141.sh b/ctdb/tests/eventscripts/60.nfs.monitor.141.sh
new file mode 100755 (executable)
index 0000000..c77b1a7
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "statd down, 6 iterations"
+
+# statd fails and attempts to restart it fail.
+
+setup_nfs
+rpc_services_down "status"
+
+iterate_test 6 'ok_null' \
+    2 'rpc_set_service_failure_response "statd"' \
+    4 'rpc_set_service_failure_response "statd"' \
+    6 'rpc_set_service_failure_response "statd"'
diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.142.sh b/ctdb/tests/eventscripts/60.nfs.monitor.142.sh
new file mode 100755 (executable)
index 0000000..4373d8d
--- /dev/null
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "statd down, 8 iterations, back up after 2"
+
+# statd fails and the first attempt to restart it succeeds.
+
+setup_nfs
+rpc_services_down "status"
+
+iterate_test 8 'ok_null' \
+    2 'rpc_set_service_failure_response "statd"' \
+    3 'rpc_services_up "status"'
diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.151.sh b/ctdb/tests/eventscripts/60.nfs.monitor.151.sh
new file mode 100755 (executable)
index 0000000..ea9aa78
--- /dev/null
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "mountd down, 1 iteration"
+
+setup_nfs
+rpc_services_down "mountd"
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.152.sh b/ctdb/tests/eventscripts/60.nfs.monitor.152.sh
new file mode 100755 (executable)
index 0000000..c4eb419
--- /dev/null
@@ -0,0 +1,18 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "mountd down, 10 iterations"
+
+# This simulates an ongoing failure in the eventscript's automated
+# attempts to restart the service.  That is, the eventscript is unable
+# to restart the service.
+
+setup_nfs
+rpc_services_down "mountd"
+
+iterate_test 10 "ok_null" \
+    5 "rpc_set_service_failure_response 'mountd'" \
+    10 "rpc_set_service_failure_response 'mountd'"
+
+#export FAKE_NETSTAT_TCP_ESTABLISHED="10.0.0.1:2049|10.254.254.1:12301 10.0.0.1:2049|10.254.254.1:12302 10.0.0.1:2049|10.254.254.1:12303 10.0.0.1:2049|10.254.254.2:12304 10.0.0.1:2049|10.254.254.2:12305"
diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.153.sh b/ctdb/tests/eventscripts/60.nfs.monitor.153.sh
new file mode 100755 (executable)
index 0000000..cf33e39
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "mountd down, 10 iterations, back up after 5"
+
+setup_nfs
+rpc_services_down "mountd"
+
+# Iteration 5 should try to restart rpc.mountd.  However, our test
+# stub rpc.mountd does nothing, so we have to explicitly flag it as
+# up.
+iterate_test 10 "ok_null" \
+    5 "rpc_set_service_failure_response 'mountd'" \
+    6 "rpc_services_up mountd"
diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.161.sh b/ctdb/tests/eventscripts/60.nfs.monitor.161.sh
new file mode 100755 (executable)
index 0000000..1e07c18
--- /dev/null
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "2nd share missing"
+
+setup_nfs
+
+shares_missing "ERROR: nfs directory \"%s\" not available" 2
+
+required_result 1 "$MISSING_SHARES_TEXT"
+
+simple_test
diff --git a/ctdb/tests/eventscripts/60.nfs.monitor.162.sh b/ctdb/tests/eventscripts/60.nfs.monitor.162.sh
new file mode 100755 (executable)
index 0000000..ccd4ca8
--- /dev/null
@@ -0,0 +1,14 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "2nd share missing, skipping share checks"
+
+setup_nfs
+export CTDB_NFS_SKIP_SHARE_CHECK="yes"
+
+shares_missing "ERROR: nfs directory \"%s\" not available" 2
+
+ok_null
+
+simple_test
diff --git a/ctdb/tests/eventscripts/60.nfs.multi.001.sh b/ctdb/tests/eventscripts/60.nfs.multi.001.sh
new file mode 100755 (executable)
index 0000000..f983df7
--- /dev/null
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "takeip, ipreallocated -> reconfigure"
+
+setup_nfs
+
+public_address=$(ctdb_get_1_public_address)
+
+ok_null
+
+simple_test_event "takeip" $public_address
+
+ok <<EOF
+Reconfiguring service "nfs"...
+EOF
+
+simple_test_event "ipreallocated"
diff --git a/ctdb/tests/eventscripts/60.nfs.multi.002.sh b/ctdb/tests/eventscripts/60.nfs.multi.002.sh
new file mode 100755 (executable)
index 0000000..350c1bc
--- /dev/null
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "takeip, monitor -> reconfigure"
+
+setup_nfs
+
+public_address=$(ctdb_get_1_public_address)
+
+ok_null
+
+simple_test_event "takeip" $public_address
+
+# This currently assumes that ctdb scriptstatus will always return a
+# good status (when replaying).  That should change and we will need
+# to split this into 2 tests.
+ok <<EOF
+Reconfiguring service "nfs"...
+Replaying previous status for this script due to reconfigure...
+EOF
+
+simple_test_event "monitor"
diff --git a/ctdb/tests/eventscripts/60.nfs.multi.003.sh b/ctdb/tests/eventscripts/60.nfs.multi.003.sh
new file mode 100755 (executable)
index 0000000..68f45ab
--- /dev/null
@@ -0,0 +1,25 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "takeip, monitor -> reconfigure, replay error"
+
+setup_nfs
+
+public_address=$(ctdb_get_1_public_address)
+
+err="foo: bar error occurred"
+
+ok_null
+
+simple_test_event "takeip" $public_address
+
+ctdb_fake_scriptstatus 1 "ERROR" "$err"
+
+required_result 1 <<EOF
+Reconfiguring service "nfs"...
+Replaying previous status for this script due to reconfigure...
+$err
+EOF
+
+simple_test_event "monitor"
diff --git a/ctdb/tests/eventscripts/60.nfs.multi.004.sh b/ctdb/tests/eventscripts/60.nfs.multi.004.sh
new file mode 100755 (executable)
index 0000000..b071ec8
--- /dev/null
@@ -0,0 +1,25 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "takeip, monitor -> reconfigure, replay timedout"
+
+setup_nfs
+
+public_address=$(ctdb_get_1_public_address)
+
+err="waiting, waiting..."
+
+ok_null
+
+simple_test_event "takeip" $public_address
+
+ctdb_fake_scriptstatus -62 "TIMEDOUT" "$err"
+
+required_result 1 <<EOF
+Reconfiguring service "nfs"...
+Replaying previous status for this script due to reconfigure...
+[Replay of TIMEDOUT scriptstatus - note incorrect return code.] $err
+EOF
+
+simple_test_event "monitor"
diff --git a/ctdb/tests/eventscripts/60.nfs.multi.005.sh b/ctdb/tests/eventscripts/60.nfs.multi.005.sh
new file mode 100755 (executable)
index 0000000..82802aa
--- /dev/null
@@ -0,0 +1,25 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "takeip, monitor -> reconfigure, replay disabled"
+
+setup_nfs
+
+public_address=$(ctdb_get_1_public_address)
+
+err=""
+
+ok_null
+
+simple_test_event "takeip" $public_address
+
+ctdb_fake_scriptstatus -8 "DISABLED" "$err"
+
+ok <<EOF
+Reconfiguring service "nfs"...
+Replaying previous status for this script due to reconfigure...
+[Replay of DISABLED scriptstatus - note incorrect return code.] $err
+EOF
+
+simple_test_event "monitor"
diff --git a/ctdb/tests/eventscripts/60.nfs.multi.006.sh b/ctdb/tests/eventscripts/60.nfs.multi.006.sh
new file mode 100755 (executable)
index 0000000..84bb9ef
--- /dev/null
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "reconfigure (synthetic), twice"
+# This checks that the lock is released...
+
+setup_nfs
+
+public_address=$(ctdb_get_1_public_address)
+
+err=""
+
+ok <<EOF
+Reconfiguring service "nfs"...
+EOF
+
+simple_test_event "reconfigure"
+simple_test_event "reconfigure"
diff --git a/ctdb/tests/eventscripts/README b/ctdb/tests/eventscripts/README
new file mode 100644 (file)
index 0000000..266c530
--- /dev/null
@@ -0,0 +1,47 @@
+eventscript unit tests
+======================
+
+This directory contains some eventscript unit tests for CTDB.  These
+tests can be run as a non-privileged user.  There are a lot of stub
+implementations of commands (located in stubs/) used to make the
+eventscripts think they're running against a real system.
+
+Test case filenames look like:
+
+  <eventscript>.<event>.NNN.sh
+
+The test helper functions will run <eventscript> with specified
+options.  If using the simple_test() or iterate_test() helper
+functions then the 1st <event> argument is automatically passed.  When
+simple_test_event() is used the event name must be explicitly passed
+as the 1st argument - this is more flexible and supports multiple
+events per test.
+
+Examples:
+
+* ../run_tests.sh .
+
+  Run all tests, displaying minimal output.
+
+* ../run_tests.sh -s .
+
+  Run all tests, displaying minimal output and a summary.
+
+* ../run_tests.sh -s ./10.interface.*.sh
+
+  Run all the tests against the 10.interface eventscript.
+
+* ../run_tests.sh -v -s .
+
+  Run all tests, displaying extra output and a summary.
+
+* ../run_tests.sh -sq .
+
+  Run all tests, displaying only a summary.
+
+* ../run_tests.sh -X ./10.interface.startup.002.sh
+
+  Run a test and have the eventscript itself run with "sh -x".  This
+  will usually make a test fail because the (undesirable) trace output
+  will be included with the output of the eventscript.  However, this
+  is useful for finding out why a test might be failing.
diff --git a/ctdb/tests/eventscripts/etc-ctdb/events.d b/ctdb/tests/eventscripts/etc-ctdb/events.d
new file mode 120000 (symlink)
index 0000000..69d2396
--- /dev/null
@@ -0,0 +1 @@
+../../../config/events.d
\ No newline at end of file
diff --git a/ctdb/tests/eventscripts/etc-ctdb/functions b/ctdb/tests/eventscripts/etc-ctdb/functions
new file mode 120000 (symlink)
index 0000000..86ba904
--- /dev/null
@@ -0,0 +1 @@
+../../../config/functions
\ No newline at end of file
diff --git a/ctdb/tests/eventscripts/etc-ctdb/nfs-rpc-checks.d b/ctdb/tests/eventscripts/etc-ctdb/nfs-rpc-checks.d
new file mode 120000 (symlink)
index 0000000..991b966
--- /dev/null
@@ -0,0 +1 @@
+../../../config/nfs-rpc-checks.d
\ No newline at end of file
diff --git a/ctdb/tests/eventscripts/etc-ctdb/public_addresses b/ctdb/tests/eventscripts/etc-ctdb/public_addresses
new file mode 100644 (file)
index 0000000..cd2f6be
--- /dev/null
@@ -0,0 +1,9 @@
+10.0.0.1/24 dev123
+10.0.0.2/24 dev123
+10.0.0.3/24 dev123
+10.0.0.4/24 dev123
+10.0.0.5/24 dev123
+10.0.0.6/24 dev123
+10.0.1.1/24 dev456
+10.0.1.2/24 dev456
+10.0.1.3/24 dev456
diff --git a/ctdb/tests/eventscripts/etc-ctdb/rc.local b/ctdb/tests/eventscripts/etc-ctdb/rc.local
new file mode 100755 (executable)
index 0000000..6052d87
--- /dev/null
@@ -0,0 +1,61 @@
+# Hey Emacs, this is a -*- shell-script -*- !!!  :-)
+
+# Run the "service" command found in $PATH when one exists.  When
+# there is none, clear $_nice and delegate to _service() instead.
+service ()
+{
+    _t=$(which "service" 2>/dev/null) || {
+       _nice=""
+       _service "$@"
+       return
+    }
+    "$_t" "$@"
+}
+
+# Like service(), but the external command is run via nice(1).  When
+# no "service" command is in $PATH, $_nice is set to "nice" and the
+# call is delegated to _service().
+nice_service ()
+{
+    _t=$(which "service" 2>/dev/null) || {
+       _nice="nice"
+       _service "$@"
+       return
+    }
+    nice "$_t" "$@"
+}
+
+# Always succeeds.  Stub that ignores its arguments and does nothing.
+set_proc () { : ; }
+
+# Fake reader for a /proc-style path given as $1.  Output comes from
+# FAKE_* variables or fixed values; any path that is not listed below
+# prints an error and exits with status 1.
+get_proc ()
+{
+    case "$1" in
+       net/bonding/*)
+           # Only the last path component selects the fake bonding file.
+           cat "$FAKE_PROC_NET_BONDING/${1##*/}"
+           ;;
+       sys/net/ipv4/conf/all/arp_filter)
+           echo 1
+           ;;
+       fs/nfsd/threads)
+           # Thread count is the number of words in the PID list.
+           echo "$FAKE_NFSD_THREAD_PIDS" | wc -w
+           ;;
+       */stack)
+           # The whole of $1 (including "/stack") is embedded in the output.
+           echo "[<ffffffff87654321>] fake_stack_trace_for_pid_${1}+0x0/0xff"
+           ;;
+       meminfo)
+           echo "$FAKE_PROC_MEMINFO"
+           ;;
+       *)
+           echo "get_proc: \"$1\" not implemented"
+           exit 1
+    esac
+}
+
+# Always succeeds.  Shell function named "iptables" that ignores its
+# arguments and does nothing.
+iptables () { : ; }
+
+# Do not actually background - we want to see the output.
+# Runs the given command in the foreground with stdin from /dev/null
+# and stderr merged into stdout; sed prefixes every output line with
+# a literal "&".
+background_with_logging ()
+{
+    "$@" 2>&1 </dev/null | sed -e 's@^@\&@'
+}
+
+CTDB_INIT_STYLE="redhat"
+PATH="${EVENTSCRIPTS_PATH}:$PATH"
diff --git a/ctdb/tests/eventscripts/etc-ctdb/statd-callout b/ctdb/tests/eventscripts/etc-ctdb/statd-callout
new file mode 100755 (executable)
index 0000000..51779bd
--- /dev/null
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+# For now, always succeed.
+
+exit 0
diff --git a/ctdb/tests/eventscripts/etc/init.d/nfs b/ctdb/tests/eventscripts/etc/init.d/nfs
new file mode 100755 (executable)
index 0000000..43eb308
--- /dev/null
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+# This is not used.  The fake "service" script is used instead.  This
+# is only needed to shut up functions like startstop_nfs(), which look
+# for this script.
+
+exit 0
diff --git a/ctdb/tests/eventscripts/etc/init.d/nfslock b/ctdb/tests/eventscripts/etc/init.d/nfslock
new file mode 100755 (executable)
index 0000000..43eb308
--- /dev/null
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+# This is not used.  The fake "service" script is used instead.  This
+# is only needed to shut up functions like startstop_nfs(), which look
+# for this script.
+
+exit 0
diff --git a/ctdb/tests/eventscripts/etc/samba/smb.conf b/ctdb/tests/eventscripts/etc/samba/smb.conf
new file mode 100644 (file)
index 0000000..da89db2
--- /dev/null
@@ -0,0 +1,42 @@
+[global]
+       # enable clustering
+       clustering=yes
+       ctdb:registry.tdb=yes
+
+       security = ADS
+       auth methods = guest sam winbind
+
+       netbios name = cluster1
+       workgroup = CLUSTER1
+       realm = CLUSTER1.COM
+       server string = "Clustered Samba"
+       disable netbios = yes
+       disable spoolss = yes
+       fileid:mapping = fsname
+       use mmap = yes
+       gpfs:sharemodes = yes
+       gpfs:leases = yes
+       passdb backend = tdbsam
+       preferred master = no
+       kernel oplocks = yes
+       syslog = 1
+       host msdfs = no
+       notify:inotify = no
+       vfs objects = shadow_copy2 syncops gpfs fileid
+       shadow:snapdir = .snapshots
+       shadow:fixinodes = yes
+       wide links = no
+       smbd:backgroundqueue = False
+       read only = no
+       use sendfile = yes
+       strict locking = yes
+       posix locking = yes
+       large readwrite = yes
+       force unknown acl user = yes
+       nfs4:mode = special
+       nfs4:chown = yes
+       nfs4:acedup = merge
+       nfs4:sidmap = /etc/samba/sidmap.tdb
+       map readonly = no
+       ea support = yes
+       dmapi support = no
diff --git a/ctdb/tests/eventscripts/etc/sysconfig/ctdb b/ctdb/tests/eventscripts/etc/sysconfig/ctdb
new file mode 100644 (file)
index 0000000..4584c11
--- /dev/null
@@ -0,0 +1,2 @@
+CTDB_RECOVERY_LOCK="/some/place/on/shared/storage"
+CTDB_DEBUGLEVEL=ERR
diff --git a/ctdb/tests/eventscripts/etc/sysconfig/nfs b/ctdb/tests/eventscripts/etc/sysconfig/nfs
new file mode 100644 (file)
index 0000000..090d786
--- /dev/null
@@ -0,0 +1,2 @@
+NFS_HOSTNAME="cluster1"
+STATD_HOSTNAME="$NFS_HOSTNAME -H /etc/ctdb/statd-callout "
diff --git a/ctdb/tests/eventscripts/scripts/local.sh b/ctdb/tests/eventscripts/scripts/local.sh
new file mode 100644 (file)
index 0000000..e6186a0
--- /dev/null
@@ -0,0 +1,1037 @@
+# Hey Emacs, this is a -*- shell-script -*- !!!  :-)
+
+# Augment PATH with relevant stubs/ directories.  We do this by actually
+# setting PATH, and also by setting $EVENTSCRIPTS_PATH and then
+# prepending that to $PATH in rc.local to avoid the PATH reset in
+# functions.
+
+EVENTSCRIPTS_PATH=""
+
+if [ -d "${TEST_SUBDIR}/stubs" ] ; then
+    EVENTSCRIPTS_PATH="${TEST_SUBDIR}/stubs"
+fi
+
+export EVENTSCRIPTS_PATH
+
+PATH="${EVENTSCRIPTS_PATH}:${PATH}"
+
+# Start every run with a fresh var directory.  The comparison with
+# "/unit_eventscripts" skips the "rm -r" when the path has that value,
+# which is what it expands to if $TEST_VAR_DIR is empty.
+export EVENTSCRIPTS_TESTS_VAR_DIR="${TEST_VAR_DIR}/unit_eventscripts"
+if [ -d "$EVENTSCRIPTS_TESTS_VAR_DIR" -a \
+    "$EVENTSCRIPTS_TESTS_VAR_DIR" != "/unit_eventscripts" ] ; then
+    rm -r "$EVENTSCRIPTS_TESTS_VAR_DIR"
+fi
+mkdir -p "$EVENTSCRIPTS_TESTS_VAR_DIR"
+export CTDB_VARDIR="$EVENTSCRIPTS_TESTS_VAR_DIR/ctdb"
+
+export CTDB_LOGFILE="${EVENTSCRIPTS_TESTS_VAR_DIR}/log.ctdb"
+touch "$CTDB_LOGFILE" || die "Unable to create CTDB_LOGFILE=$CTDB_LOGFILE"
+
+if [ -d "${TEST_SUBDIR}/etc" ] ; then    
+    cp -a "${TEST_SUBDIR}/etc" "$EVENTSCRIPTS_TESTS_VAR_DIR"
+    export CTDB_ETCDIR="${EVENTSCRIPTS_TESTS_VAR_DIR}/etc"
+else
+    die "Unable to setup \$CTDB_ETCDIR"
+fi
+
+if [ -d "${TEST_SUBDIR}/etc-ctdb" ] ; then
+    cp -prL "${TEST_SUBDIR}/etc-ctdb" "$EVENTSCRIPTS_TESTS_VAR_DIR"
+    export CTDB_BASE="${EVENTSCRIPTS_TESTS_VAR_DIR}/etc-ctdb"
+else
+    die "Unable to set \$CTDB_BASE"
+fi
+export CTDB_BASE
+
+if [ ! -d "${CTDB_BASE}/events.d" ] ; then
+    cat <<EOF
+ERROR: Directory ${CTDB_BASE}/events.d does not exist.
+
+That means that no eventscripts can be tested.
+
+One possible explanation:
+
+  You have CTDB installed via RPMs (or similar), so the regular
+  CTDB_BASE directory is in /etc/ctdb/
+
+  BUT
+
+  You have done a regular "configure" and "make install" so the tests
+  are installed under /usr/local/.
+
+If so, one possible hack to fix this is to create a symlink:
+
+  ln -s /etc/ctdb /usr/local/etc/ctdb
+
+This is nasty but it works...  :-)
+EOF
+    exit 1
+fi
+
+######################################################################
+
+# debug() is a silent no-op unless running "$TEST_VERBOSE" as a
+# command succeeds, in which case it echoes its arguments.
+debug () { : ; }
+if "$TEST_VERBOSE" ; then
+    debug () { echo "$@" ; }
+fi
+
+# Accumulated cleanup commands, separated by " ; ".
+eventscripts_tests_cleanup_hooks=""
+
+# Append a command to the cleanup hooks.  The arguments are joined
+# with "$*" and later re-parsed by eval.
+# This loses quoting!
+eventscripts_test_add_cleanup ()
+{
+    eventscripts_tests_cleanup_hooks="${eventscripts_tests_cleanup_hooks}${eventscripts_tests_cleanup_hooks:+ ; }$*"
+}
+
+# Run the accumulated hooks when the shell exits (signal 0 is EXIT).
+trap 'eval $eventscripts_tests_cleanup_hooks' 0
+
+
+######################################################################
+
+# General setup fakery
+
+# Common setup used by the other setup_*() functions: creates fake
+# shares and the state directories used by the stubs, emptying the
+# bonding, ethtool and IP state directories, and exports the FAKE_*
+# variables that the stubs read.
+setup_generic ()
+{
+    debug "Setting up shares (3 existing shares)"
+    # Create 3 fake shares/exports.
+    export FAKE_SHARES=""
+    for i in $(seq 1 3) ; do
+       _s="${EVENTSCRIPTS_TESTS_VAR_DIR}/shares/${i}_existing"
+       mkdir -p "$_s"
+       FAKE_SHARES="${FAKE_SHARES}${FAKE_SHARES:+ }${_s}"
+    done
+
+    export FAKE_PROC_NET_BONDING="$EVENTSCRIPTS_TESTS_VAR_DIR/proc-net-bonding"
+    mkdir -p "$FAKE_PROC_NET_BONDING"
+    rm -f "$FAKE_PROC_NET_BONDING"/*
+
+    export FAKE_ETHTOOL_LINK_DOWN="$EVENTSCRIPTS_TESTS_VAR_DIR/ethtool-link-down"
+    mkdir -p "$FAKE_ETHTOOL_LINK_DOWN"
+    rm -f "$FAKE_ETHTOOL_LINK_DOWN"/*
+
+    # This can only have 2 levels.  We don't want to resort to using
+    # something dangerous like "rm -r" at setup time.
+    export FAKE_IP_STATE="$EVENTSCRIPTS_TESTS_VAR_DIR/fake-ip-state"
+    mkdir -p "$FAKE_IP_STATE"
+    rm -f "$FAKE_IP_STATE"/*/*
+    rm -f "$FAKE_IP_STATE"/* 2>/dev/null || true
+    rmdir "$FAKE_IP_STATE"/* 2>/dev/null || true
+
+
+    export CTDB_DBDIR="${EVENTSCRIPTS_TESTS_VAR_DIR}/db"
+    mkdir -p "${CTDB_DBDIR}/persistent"
+
+    # Exported without a value here so that values assigned later by a
+    # test are visible to the stubs.
+    export FAKE_TDBTOOL_SUPPORTS_CHECK="yes"
+    export FAKE_TDB_IS_OK
+    export FAKE_DATE_OUTPUT
+
+    export FAKE_NETSTAT_TCP_ESTABLISHED FAKE_TCP_LISTEN FAKE_NETSTAT_UNIX_LISTEN
+    export FAKE_NETSTAT_TCP_ESTABLISHED_FILE=$(mktemp --tmpdir="$EVENTSCRIPTS_TESTS_VAR_DIR")
+}
+
+# Mark each TCP port given as an argument as not listening, by
+# deleting "<address>:<port>" entries (and any whitespace before
+# them) from $FAKE_TCP_LISTEN.
+tcp_port_down ()
+{
+    for _i ; do
+       debug "Marking TCP port \"${_i}\" as not listening"
+       FAKE_TCP_LISTEN=$(echo "$FAKE_TCP_LISTEN" | sed -r -e "s@[[:space:]]*[\.0-9]+:${_i}@@g")
+    done
+}
+
+# Replace some of the shares in $FAKE_SHARES with missing ones.
+# $1 is a printf format applied to each missing share's path; the
+# results are collected, one per line, in $MISSING_SHARES_TEXT.
+# The remaining arguments are 1-based share numbers.  They need to be
+# in ascending order, because only the first remaining number is
+# compared against the current share.
+shares_missing ()
+{
+    _fmt="$1" ; shift
+
+    # Replace some shares with non-existent ones.
+    _t=""
+    _n=1
+    _nl="
+"
+    export MISSING_SHARES_TEXT=""
+    for _i in $FAKE_SHARES ; do
+       if [ $_n = "$1" ] ; then
+           shift
+           # Rename <n>_existing to <n>_missing and drop the directory.
+           _i="${_i%_existing}_missing"
+           debug "Replacing share $_n with missing share \"$_i\""
+           rmdir "$_i" 2>/dev/null || true
+           MISSING_SHARES_TEXT="${MISSING_SHARES_TEXT}${MISSING_SHARES_TEXT:+${_nl}}"$(printf "$_fmt" "${_i}")
+       fi
+       _t="${_t}${_t:+ }${_i}"
+       _n=$(($_n + 1))
+    done
+    FAKE_SHARES="$_t"
+}
+
+# Setup some fake /proc/net/bonding files with just enough info for
+# the eventscripts.
+
+# arg1 is interface name, arg2 is currently active slave (use "None"
+# if none; defaults to <arg1>_sl_0), arg3 is MII status of the bond
+# ("up" or "down", default "up"), arg4 is MII status written for both
+# pretend slave adapters (defaults to arg3).
+setup_bond ()
+{
+    _iface="$1"
+    _slave="${2:-${_iface}_sl_0}"
+    _mii_s="${3:-up}"
+    _mii_subs="${4:-${_mii_s:-up}}"
+    echo "Setting $_iface to be a bond with active slave $_slave and MII status $_mii_s"
+    cat >"${FAKE_PROC_NET_BONDING}/$_iface" <<EOF
+Bonding Mode: IEEE 802.3ad Dynamic link aggregation
+Currently Active Slave: $_slave
+# Status of the bond
+MII Status: $_mii_s
+# Status of 1st pretend adapter
+MII Status: $_mii_subs
+# Status of 2nd pretend adapter
+MII Status: $_mii_subs
+EOF
+}
+
+# Flag every interface named on the command line as link-down by
+# creating a marker file for it in $FAKE_ETHTOOL_LINK_DOWN.
+ethtool_interfaces_down ()
+{
+    for _i in "$@" ; do
+       echo "Marking interface $_i DOWN for ethtool"
+       touch "${FAKE_ETHTOOL_LINK_DOWN}/${_i}"
+    done
+}
+
+# Flag every interface named on the command line as link-up by
+# removing its marker file from $FAKE_ETHTOOL_LINK_DOWN.
+ethtool_interfaces_up ()
+{
+    for _i in "$@" ; do
+       echo "Marking interface $_i UP for ethtool"
+       rm -f "${FAKE_ETHTOOL_LINK_DOWN}/${_i}"
+    done
+}
+
+# Set $OUT_FILTER to sed expressions that replace the timestamps in
+# Nmap's "scan initiated" and "done at" debug lines with "DATE".
+setup_nmap_output_filter ()
+{
+    OUT_FILTER="-e 's@^(DEBUG: # Nmap 5.21 scan initiated) .+ (as:)@\1 DATE \2@' -e 's@^(DEBUG: # Nmap done at) .+ (--)@\1 DATE \2@'"
+}
+
+# Print the output of "ip rule show", followed by the routing table
+# of each rule whose priority equals $CTDB_PER_IP_ROUTING_RULE_PREF.
+dump_routes ()
+{
+    echo "# ip rule show"
+    ip rule show
+
+    ip rule show |
+    while read _p _x _i _x _t ; do
+       # Remove trailing colon after priority/preference.
+       _p="${_p%:}"
+       # Only show tables for rules that match our priority/preference.
+       [ "$CTDB_PER_IP_ROUTING_RULE_PREF" = "$_p" ] || continue
+
+       echo "# ip route show table $_t"
+       ip route show table "$_t"
+    done
+}
+
+# Copied from 13.per_ip_routing for now... so this is lazy testing  :-(
+#
+# Print the network containing an IPv4 host address, as
+# "<network>/<maskbits>".  $1 is the dotted-quad host address, $2 is
+# the number of mask bits.
+ipv4_host_addr_to_net ()
+{
+    _host="$1"
+    _maskbits="$2"
+
+    # Convert the host address to an unsigned long by splitting out
+    # the octets and doing the math.
+    _host_ul=0
+    for _o in $(export IFS="." ; echo $_host) ; do
+       _host_ul=$(( ($_host_ul << 8) + $_o)) # work around Emacs color bug
+    done
+
+    # Calculate the mask and apply it.
+    _mask_ul=$(( 0xffffffff << (32 - $_maskbits) ))
+    _net_ul=$(( $_host_ul & $_mask_ul ))
+
+    # Now convert to a network address one byte at a time.
+    # Each pass prepends the lowest remaining byte.
+    _net=""
+    for _o in $(seq 1 4) ; do
+       _net="$(($_net_ul & 255))${_net:+.}${_net}"
+       _net_ul=$(($_net_ul >> 8))
+    done
+
+    echo "${_net}/${_maskbits}"
+}
+
+######################################################################
+
+# CTDB fakery
+
+# Evaluate an expression that probably calls functions or uses
+# variables from the CTDB functions file.  This is used for test
+# initialisation.  The functions file is sourced and the command run
+# inside a subshell, so variable changes do not reach the caller.
+eventscript_call ()
+{
+    (
+       . "$CTDB_BASE/functions"
+       "$@"
+    )
+}
+
+# Set output for ctdb command.  Optional 1st argument is return code
+# (default 0).  The output text is read from stdin.  Both are written
+# to files under $EVENTSCRIPTS_TESTS_VAR_DIR that are removed by a
+# cleanup hook.
+ctdb_set_output ()
+{
+    _out="$EVENTSCRIPTS_TESTS_VAR_DIR/ctdb.out"
+    cat >"$_out"
+
+    _rc="$EVENTSCRIPTS_TESTS_VAR_DIR/ctdb.rc"
+    echo "${1:-0}" >"$_rc"
+
+    eventscripts_test_add_cleanup "rm -f $_out $_rc"
+}
+
+# Set up the fake CTDB environment, after calling setup_generic.
+# $1 is the number of fake nodes (default 3).
+# $2 is the PNN of this node (default 0).
+# $3 is an optional whitespace-separated list of "<ip>@<ifaces>"
+#    entries.  When given, these are written to a temporary file that
+#    replaces ${CTDB_BASE}/public_addresses as CTDB_PUBLIC_ADDRESSES.
+setup_ctdb ()
+{
+    setup_generic
+
+    export FAKE_CTDB_NUMNODES="${1:-3}"
+    echo "Setting up CTDB with ${FAKE_CTDB_NUMNODES} fake nodes"
+
+    export FAKE_CTDB_PNN="${2:-0}"
+    echo "Setting up CTDB with PNN ${FAKE_CTDB_PNN}"
+
+    export CTDB_PUBLIC_ADDRESSES="${CTDB_BASE}/public_addresses"
+    if [ -n "$3" ] ; then
+       echo "Setting up CTDB_PUBLIC_ADDRESSES: $3"
+       CTDB_PUBLIC_ADDRESSES=$(mktemp)
+       for _i in $3 ; do
+           # Split each entry at "@" into address and interface list.
+           _ip="${_i%@*}"
+           _ifaces="${_i#*@}"
+           echo "${_ip} ${_ifaces}" >>"$CTDB_PUBLIC_ADDRESSES"
+       done
+       eventscripts_test_add_cleanup "rm -f $CTDB_PUBLIC_ADDRESSES"
+    fi
+
+    export FAKE_CTDB_STATE="$EVENTSCRIPTS_TESTS_VAR_DIR/fake-ctdb"
+
+    export FAKE_CTDB_IFACES_DOWN="$FAKE_CTDB_STATE/ifaces-down"
+    mkdir -p "$FAKE_CTDB_IFACES_DOWN"
+    rm -f "$FAKE_CTDB_IFACES_DOWN"/*
+
+    export FAKE_CTDB_SCRIPTSTATUS="$FAKE_CTDB_STATE/scriptstatus"
+    mkdir -p "$FAKE_CTDB_SCRIPTSTATUS"
+    rm -f "$FAKE_CTDB_SCRIPTSTATUS"/*
+
+    export CTDB_PARTIALLY_ONLINE_INTERFACES
+}
+
+# Set up fake memory information, after calling setup_ctdb.  Exports
+# FAKE_PROC_MEMINFO and FAKE_FREE_M.  If $1 is "bad" then low values
+# are used for free swap and for the cached/free memory columns;
+# otherwise free swap equals total swap.
+setup_memcheck ()
+{
+    setup_ctdb
+
+    _swap_total="5857276"
+
+    # The leading spaces in the "bad" values keep the columns aligned.
+    if [ "$1" = "bad" ] ; then
+       _swap_free="   4352"
+       _mem_cached=" 112"
+       _mem_free=" 468"
+    else
+       _swap_free="$_swap_total"
+       _mem_cached="1112"
+       _mem_free="1468"
+    fi
+
+    export FAKE_PROC_MEMINFO="\
+MemTotal:        3940712 kB
+MemFree:          225268 kB
+Buffers:          146120 kB
+Cached:          1139348 kB
+SwapCached:        56016 kB
+Active:          2422104 kB
+Inactive:        1019928 kB
+Active(anon):    1917580 kB
+Inactive(anon):   523080 kB
+Active(file):     504524 kB
+Inactive(file):   496848 kB
+Unevictable:        4844 kB
+Mlocked:            4844 kB
+SwapTotal:       ${_swap_total} kB
+SwapFree:        ${_swap_free} kB
+..."
+
+    export FAKE_FREE_M="\
+             total       used       free     shared    buffers     cached
+Mem:          3848       3634        213          0        142       ${_mem_cached}
+-/+ buffers/cache:       2379       ${_mem_free}
+Swap:         5719        246       5473"
+
+    # Exported so that values assigned by a test are visible to the
+    # eventscript.
+    export CTDB_MONITOR_FREE_MEMORY
+    export CTDB_MONITOR_FREE_MEMORY_WARN
+    export CTDB_CHECK_SWAP_IS_NOT_USED
+}
+
+# Print, on a single line, the 2nd colon-separated field of every
+# line after the first of "ctdb ifaces -Y" output.
+ctdb_get_interfaces ()
+{
+    # The echo/subshell forces all the output onto 1 line.
+    echo $(ctdb ifaces -Y | awk -F: 'FNR > 1 {print $2}')
+}
+
+# Print the first interface listed by ctdb_get_interfaces.
+ctdb_get_1_interface ()
+{
+    _t=$(ctdb_get_interfaces)
+    # Strip everything from the first space onwards.
+    echo ${_t%% *}
+}
+
+# Print all public addresses as: interface IP maskbits
+# Each line is suitable for passing to takeip/releaseip
+# Addresses are read from $CTDB_PUBLIC_ADDRESSES, falling back to
+# ${CTDB_BASE}/public_addresses.
+ctdb_get_all_public_addresses ()
+{
+    _f="${CTDB_PUBLIC_ADDRESSES:-${CTDB_BASE}/public_addresses}"
+    # Adding "/" to IFS splits "IP/maskbits" into separate fields.
+    while IFS="/$IFS" read _ip _maskbits _ifaces ; do
+       echo "$_ifaces $_ip $_maskbits"
+    done <"$_f"
+}
+
+# Print public addresses on this node as: interface IP maskbits
+# Each line is suitable for passing to takeip/releaseip
+# The address and interface come from "ctdb ip -v -Y"; the mask bits
+# are looked up in the public addresses file.
+ctdb_get_my_public_addresses ()
+{
+    ctdb ip -v -Y | {
+       read _x # skip header line
+
+       while IFS=":" read _x _ip _x _iface _x ; do
+           # Lines with an empty interface field are skipped.
+           [ -n "$_iface" ] || continue
+           while IFS="/$IFS" read _i _maskbits _x ; do
+               if [ "$_ip" = "$_i" ] ; then
+                   echo $_iface $_ip $_maskbits
+                   break
+               fi
+           done <"${CTDB_PUBLIC_ADDRESSES:-${CTDB_BASE}/public_addresses}"
+       done
+    }
+}
+
+# Prints the 1st public address as: interface IP maskbits
+# This is suitable for passing to takeip/releaseip
+# It is the first line printed by ctdb_get_my_public_addresses.
+ctdb_get_1_public_address ()
+{
+    ctdb_get_my_public_addresses | head -n 1
+}
+
+# Make the ctdb stub treat command $1 as not implemented, by exporting
+# CTDB_NOT_IMPLEMENTED.  Also sets $ctdb_not_implemented to the
+# corresponding "DEBUG: ..." message text.
+ctdb_not_implemented ()
+{
+    export CTDB_NOT_IMPLEMENTED="$1"
+    ctdb_not_implemented="\
+DEBUG: ctdb: command \"$1\" not implemented in stub"
+}
+
+# Record a fake script status for the eventscript named by $script.
+# $1 is the code, $2 the status and $3 the error output; they are
+# written as one line to $FAKE_CTDB_SCRIPTSTATUS/$script.
+ctdb_fake_scriptstatus ()
+{
+    _code="$1"
+    _status="$2"
+    _err_out="$3"
+
+    # NOTE(review): _d1 and _d2 are not used in this function - confirm
+    # that nothing else reads them before removing.
+    _d1=$(date '+%s.%N')
+    _d2=$(date '+%s.%N')
+
+    echo "$_code $_status $_err_out" >"$FAKE_CTDB_SCRIPTSTATUS/$script"
+}
+
+######################################################################
+
+# Export the CTDB_PER_IP_ROUTING_* settings used by the policy routing
+# tests and remove any existing routing configuration file.
+setup_ctdb_policy_routing ()
+{
+    service_name="per_ip_routing"
+
+    export CTDB_PER_IP_ROUTING_CONF="$CTDB_BASE/policy_routing"
+    export CTDB_PER_IP_ROUTING_RULE_PREF=100
+    export CTDB_PER_IP_ROUTING_TABLE_ID_LOW=1000
+    export CTDB_PER_IP_ROUTING_TABLE_ID_HIGH=2000
+
+    # Tests need to create and populate this file
+    rm -f "$CTDB_PER_IP_ROUTING_CONF"
+}
+
+# Create policy routing configuration in $CTDB_PER_IP_ROUTING_CONF.
+# $1 is the number of assigned IPs to use (<num>, all), defaulting to
+# 1.  If $2 is "default" then a default route is also added.
+# For each IP a "<ip> <network>" line is written; the default route
+# uses host .1 of that network as its gateway.
+create_policy_routing_config ()
+{
+    _num_ips="${1:-1}"
+    _should_add_default="$2"
+
+    ctdb_get_my_public_addresses |
+    if [ "$_num_ips" = "all" ] ; then
+       cat
+    else
+       head -n "$_num_ips"
+    fi |
+    while read _dev _ip _bits ; do
+       _net=$(ipv4_host_addr_to_net "$_ip" "$_bits")
+       _gw="${_net%.*}.1" # a dumb, calculated default
+
+       echo "$_ip $_net"
+
+       if [ "$_should_add_default" = "default" ] ; then
+           echo "$_ip 0.0.0.0/0 $_gw"
+       fi
+    done >"$CTDB_PER_IP_ROUTING_CONF"
+}
+
+# Check the routes against those that are expected.  $1 is the number
+# of assigned IPs to use (<num>, all), defaulting to 1.  If $2 is
+# "default" then expect default routes to have been added.
+# The expected text is built up per IP, registered with ok() and then
+# compared against the output of dump_routes.
+check_routes ()
+{
+    _num_ips="${1:-1}"
+    _should_add_default="$2"
+
+    _policy_rules=""
+    _policy_routes=""
+
+    ctdb_get_my_public_addresses |
+    if [ "$_num_ips" = "all" ] ; then
+       cat
+    else
+       head -n "$_num_ips"
+    fi | {
+       # The braces keep the loop, ok() and the test in the same
+       # pipeline stage, so the accumulated variables are still set
+       # when ok() is called.
+       while read _dev _ip _bits ; do
+           _net=$(ipv4_host_addr_to_net "$_ip" "$_bits")
+           _gw="${_net%.*}.1" # a dumb, calculated default
+
+           _policy_rules="${_policy_rules}
+${CTDB_PER_IP_ROUTING_RULE_PREF}:      from $_ip lookup ctdb.$_ip "
+           _policy_routes="${_policy_routes}
+# ip route show table ctdb.$_ip
+$_net dev $_dev  scope link "
+
+           if [ "$_should_add_default" = "default" ] ; then
+               _policy_routes="${_policy_routes}
+default via $_gw dev $_dev "
+           fi
+       done
+
+       ok <<EOF
+# ip rule show
+0:     from all lookup local ${_policy_rules}
+32766: from all lookup main 
+32767: from all lookup default ${_policy_routes}
+EOF
+
+       simple_test_command dump_routes
+    }
+}
+
+######################################################################
+
+# Samba/winbind fakery
+
+# Set up a fake Samba environment, after calling setup_ctdb.  Unless
+# $1 is "down" the Samba services are marked as started, listening on
+# ports 445 and 139, and managed by CTDB; with "down" they are marked
+# as stopped, not listening and unmanaged.
+setup_samba ()
+{
+    setup_ctdb
+
+    service_name="samba"
+
+    if [ "$1" != "down" ] ; then
+
+       debug "Marking Samba services as up, listening and managed by CTDB"
+        # Get into known state.
+       eventscript_call ctdb_service_managed
+
+        # All possible service names for all known distros.
+       for i in "smb" "nmb" "samba" ; do
+           service "$i" force-started
+       done
+
+       export CTDB_SAMBA_SKIP_SHARE_CHECK="no"
+       export CTDB_MANAGED_SERVICES="foo samba bar"
+
+       export FAKE_TCP_LISTEN="0.0.0.0:445 0.0.0.0:139"
+       export FAKE_WBINFO_FAIL="no"
+
+       # Some things in 50.samba are backgrounded and waited for.  If
+       # we don't sleep at all then timeouts can happen.  This avoids
+       # that...  :-)
+       export FAKE_SLEEP_FORCE=0.1
+    else
+       debug "Marking Samba services as down, not listening and not managed by CTDB"
+        # Get into known state.
+       eventscript_call ctdb_service_unmanaged
+
+        # All possible service names for all known distros.
+       for i in "smb" "nmb" "samba" ; do
+           service "$i" force-stopped
+       done
+
+       export CTDB_SAMBA_SKIP_SHARE_CHECK="no"
+       export CTDB_MANAGED_SERVICES="foo bar"
+       unset CTDB_MANAGES_SAMBA
+
+       export FAKE_TCP_LISTEN=""
+       export FAKE_WBINFO_FAIL="yes"
+    fi
+
+    # This is ugly but if this file isn't removed before each test
+    # then configuration changes between tests don't stick.
+    rm -f "$CTDB_VARDIR/state/samba/smb.conf.cache"
+}
+
+# Set up a fake Winbind environment, after calling setup_ctdb.  Unless
+# $1 is "down" the winbind service is marked as started and managed
+# by CTDB and wbinfo is set to succeed; with "down" the opposite is
+# set up.
+setup_winbind ()
+{
+    setup_ctdb
+
+    service_name="winbind"
+
+    if [ "$1" != "down" ] ; then
+
+       debug "Marking Winbind service as up and managed by CTDB"
+        # Get into known state.
+       eventscript_call ctdb_service_managed
+
+       service "winbind" force-started
+
+       export CTDB_MANAGED_SERVICES="foo winbind bar"
+
+       export FAKE_WBINFO_FAIL="no"
+
+    else
+       debug "Marking Winbind service as down and not managed by CTDB"
+        # Get into known state.
+       eventscript_call ctdb_service_unmanaged
+
+       service "winbind" force-stopped
+
+       export CTDB_MANAGED_SERVICES="foo bar"
+       unset CTDB_MANAGES_WINBIND
+
+       export FAKE_WBINFO_FAIL="yes"
+    fi
+}
+
+# Set FAKE_WBINFO_FAIL to "yes" so that wbinfo commands fail.
+wbinfo_down ()
+{
+    debug "Making wbinfo commands fail"
+    FAKE_WBINFO_FAIL="yes"
+}
+
+######################################################################
+
+# NFS fakery
+
+# Set up a fake NFS environment, after calling setup_ctdb.  Unless $1
+# is "down" NFS is marked as started and managed by CTDB and all RPC
+# services are marked as available; with "down" NFS is stopped and
+# unmanaged, and no RPC services are registered.
+setup_nfs ()
+{
+    setup_ctdb
+
+    service_name="nfs"
+
+    export FAKE_RPCINFO_SERVICES=""
+
+    export CTDB_NFS_SKIP_SHARE_CHECK="no"
+
+    export CTDB_MONITOR_NFS_THREAD_COUNT RPCNFSDCOUNT FAKE_NFSD_THREAD_PIDS
+    export CTDB_NFS_DUMP_STUCK_THREADS
+
+    # Reset the failcounts for nfs services.
+    # The single quotes delay expansion of $ctdb_fail_dir until the
+    # eval inside eventscript_call's subshell.
+    eventscript_call eval rm -f '$ctdb_fail_dir/nfs_*'
+
+    if [ "$1" != "down" ] ; then
+       debug "Setting up NFS environment: all RPC services up, NFS managed by CTDB"
+
+       eventscript_call ctdb_service_managed
+       service "nfs" force-started  # might not be enough
+
+       export CTDB_MANAGED_SERVICES="foo nfs bar"
+
+       rpc_services_up "nfs" "mountd" "rquotad" "nlockmgr" "status"
+    else
+       debug "Setting up NFS environment: all RPC services down, NFS not managed by CTDB"
+
+       eventscript_call ctdb_service_unmanaged
+       service "nfs" force-stopped  # might not be enough
+       eventscript_call startstop_nfs stop
+
+       export CTDB_MANAGED_SERVICES="foo bar"
+       unset CTDB_MANAGES_NFS
+    fi
+}
+
+# Same as setup_nfs, but with CTDB_NFS_SERVER_MODE set to "ganesha".
+# Unless $1 is "down", CTDB_MANAGES_NFS is also set to "yes".
+setup_nfs_ganesha ()
+{
+    setup_nfs "$@"
+    export CTDB_NFS_SERVER_MODE="ganesha"
+    if [ "$1" != "down" ] ; then
+       export CTDB_MANAGES_NFS="yes"
+    fi
+
+    # We do not support testing the Ganesha-nfsd-specific part of the
+    # eventscript.
+    export CTDB_SKIP_GANESHA_NFSD_CHECK="yes"
+    export CTDB_NFS_SKIP_SHARE_CHECK="yes"
+}
+
+# Mark each RPC service given as an argument as unavailable, by
+# deleting its "<name>:<num>:<num>" entry (and any whitespace before
+# it) from $FAKE_RPCINFO_SERVICES.
+rpc_services_down ()
+{
+    for _i ; do
+       debug "Marking RPC service \"${_i}\" as unavailable"
+       FAKE_RPCINFO_SERVICES=$(echo "$FAKE_RPCINFO_SERVICES" | sed -r -e "s@[[:space:]]*${_i}:[0-9]+:[0-9]+@@g")
+    done
+}
+
+# Mark each RPC service given as an argument as available, by
+# appending "<name>:<num>:<num>" to $FAKE_RPCINFO_SERVICES.  Calls
+# die for a service name that is not listed below.
+rpc_services_up ()
+{
+    for _i ; do
+       debug "Marking RPC service \"${_i}\" as available"
+       # NOTE(review): the number pairs look like lowest:highest
+       # supported RPC version - confirm against the rpcinfo stub.
+       case "$_i" in
+           nfs)      _t="2:3" ;;
+           mountd)   _t="1:3" ;;
+           rquotad)  _t="1:2" ;;
+           nlockmgr) _t="3:4" ;;
+           status)   _t="1:1" ;;
+           *) die "Internal error - unsupported RPC service \"${_i}\"" ;;
+       esac
+
+       FAKE_RPCINFO_SERVICES="${FAKE_RPCINFO_SERVICES}${FAKE_RPCINFO_SERVICES:+ }${_i}:${_t}"
+    done
+}
+
+# Set the required result for a particular RPC program having failed
+# for a certain number of iterations.  This is probably still a work
+# in progress.  Note that we could hook aggressively
+# nfs_check_rpc_service() to try to implement this but we're better
+# off testing nfs_check_rpc_service() using independent code...  even
+# if it is incomplete and hacky.  So, if the 60.nfs eventscript
+# changes and the tests start to fail then it may be due to this
+# function being incomplete.
+#
+# $1 is the RPC program name, which selects the file
+#    ${CTDB_BASE}/nfs-rpc-checks.d/NN.<name>.check.
+# $2 is the number of failures, defaulting to $iteration.
+# The first line of the check file whose condition matches decides
+# the required output and return code; if none matches then the
+# result set by ok_null stands.
+rpc_set_service_failure_response ()
+{
+    _progname="$1"
+    # The number of failures defaults to the iteration number.  This
+    # will be true when we fail from the 1st iteration... but we need
+    # the flexibility to set the number of failures.
+    _numfails="${2:-${iteration}}"
+
+    # Source the first readable NFS sysconfig file, if there is one.
+    _etc="$CTDB_ETCDIR" # shortcut for readability
+    for _c in "$_etc/sysconfig/nfs" "$_etc/default/nfs" "$_etc/ctdb/sysconfig/nfs" ; do
+       if [ -r "$_c" ] ; then
+           . "$_c"
+           break
+       fi
+    done
+
+    # A handy newline.  :-)
+    _nl="
+"
+
+    # Default
+    ok_null
+
+    _file=$(ls "${CTDB_BASE}/nfs-rpc-checks.d/"[0-9][0-9]."${_progname}.check")
+    [ -r "$_file" ] || die "RPC check file \"$_file\" does not exist or is not unique"
+
+    # Each line is: <operator> <limit> <actions...>
+    while read _op _li _actions ; do
+       # Skip comments
+       case "$_op" in
+           \#*) continue ;;
+       esac
+
+       # "%" matches when the failure count is a multiple of the limit.
+       # Any other operator is passed straight to test(1).
+       _hit=false
+       if [ "$_op" != "%" ] ; then
+           if [ $_numfails $_op $_li ] ; then
+               _hit=true
+           fi
+       else
+           if [ $(($_numfails $_op $_li)) -eq 0 ] ; then
+               _hit=true
+           fi
+       fi
+       if $_hit ; then
+           _out=""
+           _rc=0
+           for _action in $_actions ; do
+               case "$_action" in
+                   verbose)
+                       _ver=1
+                       _pn="$_progname"
+                       case "$_progname" in
+                           nfsd) _ver=3 ; _pn="nfs" ;;
+                           lockd) _ver=4 ; _pn="nlockmgr" ;;
+                           statd) _pn="status" ;;
+                       esac
+                       _out="\
+ERROR: $_pn failed RPC check:
+rpcinfo: RPC: Program not registered
+program $_pn version $_ver is not available"
+                       ;;
+                   restart*)
+                       _p="rpc.${_progname}"
+                       # An action ending in ":b" prefixes the expected
+                       # restart output lines with "&".
+                       case "$_action" in
+                           *:b) _bg="&" ;;
+                           *)   _bg=""  ;;
+                       esac
+                       case "$_progname" in
+                           nfsd)
+                               _t="\
+Trying to restart NFS service"
+
+                               if [ -n "$CTDB_NFS_DUMP_STUCK_THREADS" ] ; then
+                                   for _pid in $FAKE_NFSD_THREAD_PIDS ; do
+                                       _t="\
+$_t
+${_bg}Stack trace for stuck nfsd thread [${_pid}]:
+${_bg}[<ffffffff87654321>] fake_stack_trace_for_pid_${_pid}/stack+0x0/0xff"
+                                   done
+                               fi
+
+                               _t="\
+${_t}
+${_bg}Starting nfslock: OK
+${_bg}Starting nfs: OK"
+                               ;;
+                           lockd)
+                               _t="\
+Trying to restart lock manager service
+${_bg}Starting nfslock: OK"
+                               ;;
+                           *)
+                               _t="Trying to restart $_progname [${_p}]"
+                       esac
+                       _out="${_out}${_out:+${_nl}}${_t}"
+                       ;;
+                   unhealthy)
+                       _rc=1
+               esac
+           done
+           # Only the first matching line is used.
+           required_result $_rc "$_out"
+           return
+       fi
+    done <"$_file"
+}
+
+######################################################################
+
+# VSFTPD fakery
+
+# Set up a fake VSFTPD environment.  Only the "down" state is
+# implemented; any other value of $1 calls die.
+setup_vsftpd ()
+{
+    service_name="vsftpd"
+
+    if [ "$1" != "down" ] ; then
+       die "setup_vsftpd up not implemented!!!"
+    else
+       debug "Setting up VSFTPD environment: service down, not managed by CTDB"
+
+       eventscript_call ctdb_service_unmanaged
+       service vsftpd force-stopped
+
+       export CTDB_MANAGED_SERVICES="foo"
+       unset CTDB_MANAGES_VSFTPD
+    fi
+}
+
+######################################################################
+
+# HTTPD fakery
+
+# Set up a fake HTTPD environment.  Only the "down" state is
+# implemented; any other value of $1 calls die.
+setup_httpd ()
+{
+    if [ "$1" != "down" ] ; then
+       die "setup_httpd up not implemented!!!"
+    else
+       debug "Setting up HTTPD environment: service down, not managed by CTDB"
+
+       # $service_name is the loop variable, so it is left set to the
+       # last name in the list.
+       for service_name in "apache2" "httpd" ; do
+           eventscript_call ctdb_service_unmanaged
+           service "$service_name" force-stopped
+       done
+
+       export CTDB_MANAGED_SERVICES="foo"
+       unset CTDB_MANAGES_HTTPD
+    fi
+}
+
+######################################################################
+
+# multipathd fakery
+
+# Set up fake multipath devices.  Each argument is a device name to
+# monitor; a name prefixed with "!" is monitored and is also marked
+# as having no active paths.
+#
+# Names are appended, space-separated, to $CTDB_MONITOR_MPDEVICES and
+# (for "!" names) to $FAKE_MULTIPATH_FAILURES.  Both are exported.
+setup_multipathd ()
+{
+    for i ; do
+       case "$i" in
+           \!*)
+               _t="${i#!}"
+               echo "Marking ${_t} as having no active paths"
+               # Add a separating space only when the list is non-empty.
+               FAKE_MULTIPATH_FAILURES="${FAKE_MULTIPATH_FAILURES}${FAKE_MULTIPATH_FAILURES:+ }${_t}"
+               ;;
+           *)
+               _t="$i"
+               ;;
+       esac
+       CTDB_MONITOR_MPDEVICES="${CTDB_MONITOR_MPDEVICES}${CTDB_MONITOR_MPDEVICES:+ }${_t}"
+    done
+
+    export CTDB_MONITOR_MPDEVICES FAKE_MULTIPATH_FAILURES
+    export FAKE_SLEEP_FORCE=0.1
+}
+
+######################################################################
+
+# Result and test functions
+
+# Set some globals and print the summary.
+# $1 is the test description.  $script, $event and the test number
+# are parsed from the name of the running test file ($0); $event is
+# unset when the filename has no event component.
+define_test ()
+{
+    desc="$1"
+
+    _f=$(basename "$0" ".sh")
+
+    # Remaining format should be NN.service.event.NNN or NN.service.NNN:
+    _num="${_f##*.}"
+    _f="${_f%.*}"
+    case "$_f" in
+       *.*.*)
+           script="${_f%.*}"
+           event="${_f##*.}"
+           ;;
+       *.*)
+           script="$_f"
+           unset event
+           ;;
+       *)
+           die "Internal error - unknown testcase filename format"
+    esac
+
+    printf "%-17s %-10s %-4s - %s\n\n" "$script" "$event" "$_num" "$desc"
+}
+
+# Print CTDB_BASE, CTDB_ETCDIR and the path of the ctdb client found
+# in $PATH, for inclusion in test result output.
+_extra_header ()
+{
+    cat <<EOF
+CTDB_BASE="$CTDB_BASE"
+CTDB_ETCDIR="$CTDB_ETCDIR"
+ctdb client is "$(which ctdb)"
+EOF
+}
+
+# Run an eventscript once.  The test passes if the return code and
+# output match those required.
+
+# Any args are passed to the eventscript.
+
+# $event must be set.  The eventscript is run with "sh -x" if running
+# $TEST_COMMAND_TRACE as a command succeeds, otherwise with "sh".
+simple_test ()
+{
+    [ -n "$event" ] || die 'simple_test: $event not set'
+
+    _extra_header=$(_extra_header)
+
+    echo "Running eventscript \"$script $event${1:+ }$*\""
+    _shell=""
+    if $TEST_COMMAND_TRACE ; then
+       _shell="sh -x"
+    else
+       _shell="sh"
+    fi
+    # result_check must directly follow this assignment.
+    # NOTE(review): presumably it reads $? and $_out - confirm.
+    _out=$($_shell "${CTDB_BASE}/events.d/$script" "$event" "$@" 2>&1)
+
+    result_check "$_extra_header"
+}
+
+# Like simple_test, but the event name is passed explicitly as the
+# 1st argument, so a test can run several events.  Returns 1 without
+# running anything if $_passed is already false.
+simple_test_event ()
+{
+    # If something has previously failed then don't continue.
+    : ${_passed:=true}
+    $_passed || return 1
+
+    event="$1" ; shift
+    echo "=================================================="
+    simple_test "$@"
+}
+
+# Run the given command, capturing stdout and stderr in $_out, and
+# then call result_check.  Returns 1 without running anything if
+# $_passed is already false.
+simple_test_command ()
+{
+    # If something has previously failed then don't continue.
+    : ${_passed:=true}
+    $_passed || return 1
+
+    echo "=================================================="
+    echo "Running command \"$*\""
+    _out=$("$@" 2>&1)
+
+    result_check
+}
+
+# Capture the contents of $CTDB_LOGFILE in $_out and then call
+# result_check.  Returns 1 without doing anything if $_passed is
+# already false.
+check_ctdb_logfile ()
+{
+    # If something has previously failed then don't continue.
+    : ${_passed:=true}
+    $_passed || return 1
+
+    echo "=================================================="
+    echo "Checking CTDB_LOGFILE=\"${CTDB_LOGFILE}\""
+    _out=$(cat "$CTDB_LOGFILE" 2>&1)
+
+    result_check
+}
+
+# Run an eventscript iteratively.
+# - 1st argument is the number of iterations.
+# - 2nd argument is something to eval to do setup for every iteration.
+#   The easiest thing to do here is to define a function and pass it
+#   here.
+# - Subsequent arguments come in pairs: an iteration number and
+#   something to eval for that iteration.  Each time an iteration
+#   number is matched the associated argument is given to eval after
+#   the default setup is done.  The iteration numbers need to be given
+#   in ascending order.
+#
+# Some optional args can be given *before* these, surrounded by extra
+# "--" args.  These args are passed to the eventscript.  Quoting is
+# lost.
+#
+# One use of the 2nd and further arguments is to call
+# required_result() to change what is expected of a particular
+# iteration.
+#
+# $event must be set.  Each iteration passes if the (optionally
+# $OUT_FILTER-filtered) output equals $required_output and the return
+# code equals $required_rc.
+iterate_test ()
+{
+    [ -n "$event" ] || die 'iterate_test: $event not set'
+
+    args=""
+    if [ "$1" = "--" ] ; then
+       shift
+       while [ "$1" != "--" ] ; do
+           # Without this check a missing closing "--" would loop
+           # forever, because "$1" stays empty once the arguments
+           # run out.
+           [ $# -gt 0 ] || die 'iterate_test: missing closing "--"'
+           args="${args}${args:+ }$1"
+           shift
+       done
+       shift
+    fi
+
+    _repeats="$1"
+    _setup_default="$2"
+    shift 2
+
+    echo "Running $_repeats iterations of \"$script $event\" $args"
+
+    _result=true
+
+    for iteration in $(seq 1 $_repeats) ; do
+       # This is inefficient because the iteration-specific setup
+       # might completely replace the default one.  However, running
+       # the default is good because it allows you to revert to a
+       # particular result without needing to specify it explicitly.
+       eval $_setup_default
+       if [ $iteration = "$1" ] ; then
+           eval $2
+           shift 2
+       fi
+
+       _shell=""
+       if $TEST_COMMAND_TRACE ; then
+           _shell="sh -x"
+       else
+           _shell="sh"
+       fi
+       _out=$($_shell "${CTDB_BASE}/events.d/$script" "$event" $args 2>&1)
+       _rc=$?
+
+       # Compare against the filtered output, but report the raw output.
+       if [ -n "$OUT_FILTER" ] ; then
+           _fout=$(echo "$_out" | eval sed -r $OUT_FILTER)
+       else
+           _fout="$_out"
+       fi
+
+       if [ "$_fout" = "$required_output" -a $_rc = $required_rc ] ; then
+           _passed=true
+       else
+           _passed=false
+           _result=false
+       fi
+
+       result_print "$_passed" "$_out" "$_rc" "Iteration $iteration"
+    done
+
+    result_footer "$_result" "$(_extra_header)"
+}
diff --git a/ctdb/tests/eventscripts/stubs/ctdb b/ctdb/tests/eventscripts/stubs/ctdb
new file mode 100755 (executable)
index 0000000..da84ed7
--- /dev/null
@@ -0,0 +1,334 @@
+#!/bin/sh
+
+prog="ctdb"
+
+not_implemented_exit_code=1
+
+usage ()
+{
+    cat >&2 <<EOF
+Usage: $prog [-Y] cmd
+
+A fake CTDB stub that prints items depending on the variables
+FAKE_CTDB_PNN (default 0) depending on command-line options.
+EOF
+    exit 1
+}
+
+not_implemented ()
+{
+    echo "${prog}: command \"$1\" not implemented in stub" >&2
+    exit $not_implemented_exit_code
+}
+
+# Don't set $POSIXLY_CORRECT here.
+_temp=$(getopt -n "$prog" -o "Yvhn:" -l help -- "$@") || \
+    usage
+
+eval set -- "$_temp"
+
+verbose=false
+machine_readable=false
+nodespec=""
+
+args="$*"
+
+while true ; do
+    case "$1" in
+       -Y) machine_readable=true ; shift ;;
+       -v) verbose=true ; shift ;;
+       -n) nodespec="$2" ; shift 2 ;;
+       --) shift ; break ;;
+       -h|--help|*) usage ;; # * shouldn't happen, so this is reasonable.
+    esac
+done
+
+[ $# -ge 1 ] || usage
+
+setup_tickles ()
+{
+    # Make sure tickles file exists.
+    tickles_file="$CTDB_VARDIR/fake-ctdb/tickles"
+    mkdir -p $(dirname "$tickles_file")
+    touch "$tickles_file"
+}
+
+ctdb_killtcp ()
+{
+    while read _src _dst ; do
+       sed -i -e "/^$_dst $_src\$/d" "$FAKE_NETSTAT_TCP_ESTABLISHED_FILE"
+    done
+}
+
+setup_pstore ()
+{
+    pstore_dir="$CTDB_VARDIR/fake-ctdb/pstore/$1"
+    mkdir -p "$pstore_dir"
+}
+
+parse_nodespec ()
+{
+    if [ "$nodespec" = "all" ] ; then
+       nodes="$(seq 0 $((FAKE_CTDB_NUMNODES - 1)) )"
+    elif [ -n "$nodespec" ] ; then
+       nodes="$(echo $nodespec | sed -e 's@,@ @g')"
+    else
+       _t=$(ctdb_pnn)
+       nodes="${_t#PNN:}"
+    fi
+}
+
+# For testing backward compatibility...
+for i in $CTDB_NOT_IMPLEMENTED ; do
+    if [ "$i" = "$1" ] ; then
+       not_implemented "$i"
+    fi
+done
+
+ctdb_pnn ()
+{
+    # Defaults to 0
+    echo "PNN:${FAKE_CTDB_PNN:-0}"
+}
+
+######################################################################
+
+FAKE_CTDB_NODE_STATE="$FAKE_CTDB_STATE/node-state"
+FAKE_CTDB_NODES_DISABLED="$FAKE_CTDB_NODE_STATE/0x4"
+
+######################################################################
+
+# NOTE: all nodes share $CTDB_PUBLIC_ADDRESSES
+
+FAKE_CTDB_IP_LAYOUT="$FAKE_CTDB_STATE/ip-layout"
+
+ip_reallocate ()
+{
+    touch "$FAKE_CTDB_IP_LAYOUT"
+
+    (
+       flock 0
+
+       _pa="${CTDB_PUBLIC_ADDRESSES:-${CTDB_BASE}/public_addresses}"
+
+       if [ ! -s "$FAKE_CTDB_IP_LAYOUT" ] ; then
+           sed -n -e 's@^\([^#][^/]*\)/.*@\1 -1@p' \
+               "$_pa" >"$FAKE_CTDB_IP_LAYOUT"
+       fi
+
+       _t="${FAKE_CTDB_IP_LAYOUT}.new"
+
+       _flags=""
+       for _i in $(seq 0 $((FAKE_CTDB_NUMNODES - 1)) ) ; do
+           if ls "$FAKE_CTDB_STATE/node-state/"*"/$_i" >/dev/null 2>&1 ; then
+               # Have non-zero flags
+               _this=0
+               for _j in "$FAKE_CTDB_STATE/node-state/"*"/$_i" ; do
+                   _tf="${_j%/*}" # dirname
+                   _f="${_tf##*/}" # basename
+                   _this=$(( $_this | $_f ))
+               done
+           else
+               _this="0"
+           fi
+           _flags="${_flags}${_flags:+,}${_this}"
+       done
+       CTDB_TEST_LOGLEVEL=2 \
+           "ctdb_takeover_tests" \
+           "ctdb_takeover_run_core" "$_flags" <"$FAKE_CTDB_IP_LAYOUT" |
+           sort >"$_t"
+       mv "$_t" "$FAKE_CTDB_IP_LAYOUT"
+    ) <"$FAKE_CTDB_IP_LAYOUT"
+}
+
+ctdb_ip ()
+{
+    # If nobody has done any IP-fu then generate a layout.
+    [ -f "$FAKE_CTDB_IP_LAYOUT" ] || ip_reallocate
+
+    if $verbose ; then
+       echo ":Public IP:Node:ActiveInterface:AvailableInterfaces:ConfiguredInterfaces:"
+    else
+       echo ":Public IP:Node:"
+    fi
+
+    _mypnn=$(ctdb_pnn | sed -e 's@PNN:@@')
+
+    # Join public addresses file with $FAKE_CTDB_IP_LAYOUT, and
+    # process output line by line...
+    _pa="${CTDB_PUBLIC_ADDRESSES:-${CTDB_BASE}/public_addresses}"
+    sed -e 's@/@ @' "$_pa" | sort | join - "$FAKE_CTDB_IP_LAYOUT" |
+    while read _ip _bit _ifaces _pnn ; do
+       if $verbose ; then
+           # If more than 1 interface, assume all addresses are on the 1st.
+           _first_iface="${_ifaces%%,*}"
+           # Only show interface if address is on this node.
+           _my_iface=""
+           if [ "$_pnn" = "$_mypnn" ]; then
+               _my_iface="$_first_iface"
+           fi
+           echo ":${_ip}:${_pnn}:${_my_iface}:${_first_iface}:${_ifaces}:"
+       else
+           echo ":${_ip}:${_pnn}:"
+       fi
+    done
+}
+
+ctdb_moveip ()
+{
+    [ "$1" != "moveip" ] || shift  # dispatcher passes "$@" with the command name still in $1
+    _ip="$1"
+    _target="$2"
+
+    ip_reallocate  # should be harmless and ensures we have good state
+
+    (
+       flock 0
+       _t="${FAKE_CTDB_IP_LAYOUT}.new"
+
+       while read _i _pnn ; do
+           if [ "$_ip" = "$_i" ] ; then
+               echo "$_ip $_target"
+           else
+               echo "$_i $_pnn"  # every other address keeps its current node
+           fi
+       done | sort >"$_t"
+       mv "$_t" "$FAKE_CTDB_IP_LAYOUT"
+    ) <"$FAKE_CTDB_IP_LAYOUT"
+}
+
+######################################################################
+
+ctdb_enable ()
+{
+    parse_nodespec
+    
+    for _i in $nodes ; do
+       rm -f "${FAKE_CTDB_NODES_DISABLED}/${_i}"
+    done
+
+    ip_reallocate
+}
+
+ctdb_disable ()
+{
+    parse_nodespec
+
+    for _i in $nodes ; do
+       mkdir -p "$FAKE_CTDB_NODES_DISABLED"
+       touch "${FAKE_CTDB_NODES_DISABLED}/${_i}"
+    done
+
+    ip_reallocate
+}
+
+######################################################################
+
+ctdb_shutdown ()
+{
+    echo "CTDB says BYE!"
+}
+
+######################################################################
+
+case "$1" in
+    gettickles)
+       setup_tickles
+       echo ":source ip:port:destination ip:port:"
+       while read src dst ; do
+           echo ":${src}:${dst}:"
+       done <"$tickles_file"
+       ;;
+    addtickle)
+       setup_tickles
+       echo "$2 $3" >>"$tickles_file"
+       ;;
+    deltickle)
+       setup_tickles
+       grep -F -v "$2 $3" "$tickles_file" >"${tickles_file}.new"  # empty result => empty file, no stray blank line
+       mv "${tickles_file}.new" "$tickles_file"
+       ;;
+    pstore)
+       setup_pstore "$2"
+       cat "$4" >"${pstore_dir}/$3"
+       ;;
+    pfetch)
+       setup_pstore "$2"
+       cat "${pstore_dir}/$3" >"$4" 2>/dev/null
+       ;;
+    ifaces)
+       # Assume -Y.  A marker file under $FAKE_CTDB_IFACES_DOWN means link down.
+       echo ":Name:LinkStatus:References:"
+       _f="${CTDB_PUBLIC_ADDRESSES:-${CTDB_BASE}/public_addresses}"
+       if [ -r "$_f" ] ; then
+           while read _ip _iface ; do
+               case "$_ip" in
+                   ""|\#*) : ;;  # skip blank lines and comments
+                   *)
+                       _status=1
+                       # For now assume _iface contains only 1.
+                       if [ -f "${FAKE_CTDB_IFACES_DOWN}/${_iface}" ] ; then
+                           _status=0
+                       fi
+                       # Nobody looks at references
+                       echo ":${_iface}:${_status}:0"
+               esac
+           done <"$_f" |
+           sort -u
+       fi
+       ;;
+    setifacelink)
+       # Existence of file means CTDB thinks interface is down.
+       _f="${FAKE_CTDB_IFACES_DOWN}/$2"
+       case "$3" in
+           up)   rm -f "$_f" ;;
+           down) touch "$_f" ;;
+           *)
+               echo "ctdb setifacelink: unsupported interface status $3"
+               exit 1
+       esac
+       ;;
+    checktcpport)
+       for _i in $FAKE_TCP_LISTEN ; do
+           if [ "$2" = "${_i##*:}" ] ; then
+               exit 98
+           fi
+       done
+
+       exit 0
+       ;;
+    scriptstatus)
+       $machine_readable || not_implemented "$1, without -Y"
+       [ "$2" != "all" ] || not_implemented "scriptstatus all"
+       # For now just assume everything is good.
+       echo ":Type:Name:Code:Status:Start:End:Error Output...:"
+       for _i in "$CTDB_BASE/events.d/"*.* ; do
+           _d1=$(date '+%s.%N')
+           _b="${_i##*/}" # basename
+
+           _f="$FAKE_CTDB_SCRIPTSTATUS/$_b"
+           if [ -r "$_f" ] ; then
+               read _code _status _err_out <"$_f"
+           else
+               _code="0"
+               _status="OK"
+               if [ ! -x "$_i" ] ; then
+                   _status="DISABLED"
+                   _code="-8"
+               fi
+               _err_out=""
+           fi
+           _d2=$(date '+%s.%N')
+           echo ":${2:-monitor}:${_b}:${_code}:${_status}:${_d1}:${_d2}:${_err_out}:"
+       done
+       ;;
+    gratiousarp) : ;;  # Do nothing for now
+    killtcp)    ctdb_killtcp "$@" ;;
+    ip)          ctdb_ip "$@" ;;
+    pnn|xpnn)    ctdb_pnn ;;
+    enable)      ctdb_enable "$@";;
+    disable)     ctdb_disable "$@";;
+    moveip)      ctdb_moveip "$@";;
+    shutdown)    ctdb_shutdown "$@";;
+    *) not_implemented "$1" ;;
+esac
diff --git a/ctdb/tests/eventscripts/stubs/date b/ctdb/tests/eventscripts/stubs/date
new file mode 100755 (executable)
index 0000000..2f470a8
--- /dev/null
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+if [ "$FAKE_DATE_OUTPUT" ] ; then
+    echo "$FAKE_DATE_OUTPUT"
+else   
+    /bin/date "$@"
+fi
diff --git a/ctdb/tests/eventscripts/stubs/ethtool b/ctdb/tests/eventscripts/stubs/ethtool
new file mode 100755 (executable)
index 0000000..bd173f4
--- /dev/null
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+link="yes"
+
+if [ -f  "${FAKE_ETHTOOL_LINK_DOWN}/${1}" ] ; then
+    link="no"
+fi
+
+# Expect to add more fields later.
+cat <<EOF
+       Link detected: ${link}
+EOF
diff --git a/ctdb/tests/eventscripts/stubs/exportfs b/ctdb/tests/eventscripts/stubs/exportfs
new file mode 100755 (executable)
index 0000000..46c6522
--- /dev/null
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+opts="10.0.0.0/16(rw,async,insecure,no_root_squash,no_subtree_check)"
+
+for i in $FAKE_SHARES ; do
+    # Directories of 15 or more characters are printed on their own
+    # line.
+    if [ ${#i} -ge 15 ] ; then
+       printf '%s\n\t\t%s\n' "$i" "$opts"
+    else
+       printf '%s\t%s\n' "$i" "$opts"
+    fi
+done
diff --git a/ctdb/tests/eventscripts/stubs/free b/ctdb/tests/eventscripts/stubs/free
new file mode 100755 (executable)
index 0000000..6453509
--- /dev/null
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+if [ "$1" = "-m" ] ; then
+    echo "$FAKE_FREE_M"
+    exit 0
+else
+    echo "free: not implemented - $*"
+    exit 1
+fi
diff --git a/ctdb/tests/eventscripts/stubs/ip b/ctdb/tests/eventscripts/stubs/ip
new file mode 100755 (executable)
index 0000000..053da75
--- /dev/null
@@ -0,0 +1,505 @@
+#!/bin/sh
+
+not_implemented ()
+{
+    echo "ip stub command: \"$1\" not implemented"
+    exit 127
+}
+
+######################################################################
+
+ip_link ()
+{
+    case "$1" in  # dispatch on the "ip link" subcommand
+       set)
+           shift
+           # After the shift: $1 is the interface, $2 the requested state.
+           case "$2" in
+               up)   ip_link_set_up "$1"  ;;
+               down) ip_link_set_down "$1" ;;
+               *)    not_implemented "\"$2\" in \"$orig_args\"" ;;
+           esac
+           ;;
+       show) shift ; ip_link_show "$@" ;;
+       del*) shift ; ip_link_delete "$@" ;;
+       *) not_implemented "$*" ;;
+    esac
+}
+
+ip_link_delete ()
+{
+    mkdir -p "${FAKE_IP_STATE}/interfaces-deleted"
+    touch "${FAKE_IP_STATE}/interfaces-deleted/$1"
+}
+
+ip_link_set_up ()
+{
+    rm -f "${FAKE_IP_STATE}/interfaces-down/$1"
+    rm -f "${FAKE_IP_STATE}/interfaces-deleted/$1"
+}
+
+ip_link_set_down ()
+{
+    rm -f "${FAKE_IP_STATE}/interfaces-deleted/$1"
+    mkdir -p "${FAKE_IP_STATE}/interfaces-down"
+    touch "${FAKE_IP_STATE}/interfaces-down/$1"
+}
+
+ip_link_show ()
+{
+    dev="$1"
+    if [ "$dev" = "dev" -a -n "$2" ] ; then
+       dev="$2"
+    fi
+
+    if [ -e "${FAKE_IP_STATE}/interfaces-deleted/$dev" ] ; then
+       echo "Device \"${dev}\" does not exist." >&2
+       exit 255
+    fi
+
+    mac=$(echo $dev | md5sum | sed -r -e 's@(..)(..)(..)(..)(..)(..).*@\1:\2:\3:\4:\5:\6@')
+    _state="UP"
+    _flags=",UP,LOWER_UP"
+    if [ -e "${FAKE_IP_STATE}/interfaces-down/$dev" ] ; then
+       _state="DOWN"
+       _flags=""
+    fi
+    cat <<EOF
+${n}: ${dev}: <BROADCAST,MULTICAST${_flags}> mtu 1500 qdisc pfifo_fast state ${_state} qlen 1000
+    link/ether ${mac} brd ff:ff:ff:ff:ff:ff
+EOF
+}
+
+# This is incomplete because it doesn't actually look up table ids in
+# /etc/iproute2/rt_tables.  The rules/routes are actually associated
+# with the name instead of the number.  However, we include a variable
+# to fake a bad table id.
+[ -n "$IP_ROUTE_BAD_TABLE_ID" ] || IP_ROUTE_BAD_TABLE_ID=false
+
+ip_check_table ()
+{
+    _cmd="$1"
+
+    [ -n "$_table" ] || not_implemented "ip rule/route without \"table\""
+
+    # Only allow tables names from 13.per_ip_routing.  This is a cheap
+    # way of avoiding implementing the default/main/local tables.
+    case "$_table" in
+       ctdb.*)
+           if $IP_ROUTE_BAD_TABLE_ID ; then
+               # Ouch.  Simulate inconsistent errors from ip.  :-(
+               case "$_cmd" in
+                   route)
+                       echo "Error: argument "${_table}" is wrong: table id value is invalid" >&2
+                       
+                       ;;
+                   *)
+                       echo "Error: argument "${_table}" is wrong: invalid table ID" >&2
+               esac
+               exit 255
+           fi
+           ;;
+       *) not_implemented "table=${_table} ${orig_args}" ;;
+    esac
+}
+
+######################################################################
+
+ip_addr ()
+{
+    case "$1" in
+       show|list|"") shift ; ip_addr_show "$@" ;;
+       add*)         shift ; ip_addr_add  "$@" ;;
+       del*)         shift ; ip_addr_del  "$@" ;;
+       *) not_implemented "\"$1\" in \"$orig_args\"" ;;
+    esac
+}
+
+ip_addr_show ()
+{
+    dev=""
+    primary=true
+    secondary=true
+    _to=""
+    while [ -n "$1" ] ; do
+       case "$1" in
+           dev)
+               dev="$2" ; shift 2
+               ;;
+            # Do stupid things and stupid things will happen!
+           primary)
+               primary=true ; secondary=false ; shift
+               ;;
+           secondary)
+               secondary=true ; primary=false ; shift
+               ;;
+           to)
+               _to="$2" ; shift 2
+               ;;
+           *)
+               # Assume an interface name
+               dev="$1" ; shift 1
+       esac
+    done
+    devices="$dev"
+    if [ -z "$devices" ] ; then
+       # No device specified?  Get all the primaries...
+       devices=$(ls "${FAKE_IP_STATE}/addresses/"*-primary 2>/dev/null | \
+           sed -e 's@.*/@@' -e 's@-primary$@@')
+    fi
+    calc_brd ()
+    {
+       case "${local#*/}" in
+           24)
+               brd="${local%.*}.255"
+               ;;
+           *)
+               not_implemented "list ... fake bits other than 24: ${local#*/}"
+       esac
+    }
+    show_iface()
+    {
+       pf="${FAKE_IP_STATE}/addresses/${dev}-primary"
+       sf="${FAKE_IP_STATE}/addresses/${dev}-secondary"
+       ip_link_show "$dev"
+       if $primary && [ -r "$pf" ] ; then
+           read local <"$pf"
+           if [ -z "$_to" -o "${_to%/*}" = "${local%/*}" ] ; then
+               calc_brd
+               cat <<EOF
+    inet ${local} brd ${brd} scope global ${dev}
+EOF
+           fi
+       fi
+       if $secondary && [ -r "$sf" ] ; then
+           while read local ; do
+               if [ -z "$_to" -o "${_to%/*}" = "${local%/*}" ] ; then
+                   calc_brd
+                   cat <<EOF
+    inet ${local} brd ${brd} scope global secondary ${dev}
+EOF
+               fi
+           done <"$sf"
+       fi
+       if [ -z "$_to" ] ; then
+           cat <<EOF
+       valid_lft forever preferred_lft forever
+EOF
+       fi
+    }
+    n=1
+    for dev in $devices ; do
+       if [ -z "$_to" ] || \
+           grep -F "${_to%/*}/" "${FAKE_IP_STATE}/addresses/${dev}-"* >/dev/null ; then
+           show_iface
+       fi
+       n=$(($n + 1))
+    done
+}
+
+ip_addr_add ()
+{
+    local=""
+    dev=""
+    brd=""
+    while [ -n "$1" ] ; do
+       case "$1" in
+           *.*.*.*/*)
+               local="$1" ; shift
+               ;;
+           local)
+               local="$2" ; shift 2
+               ;;
+           broadcast|brd)
+               # For now assume this is always '+'.
+               if [ "$2" != "+" ] ; then
+                   not_implemented "addr add ... brd $2 ..."
+               fi
+               shift 2
+               ;;
+           dev)
+               dev="$2" ; shift 2
+               ;;
+           *)
+               not_implemented "$@"
+       esac
+    done
+    if [ -z "$dev" ] ; then
+       not_implemented "addr add (without dev)"
+    fi
+    mkdir -p "${FAKE_IP_STATE}/addresses"
+    pf="${FAKE_IP_STATE}/addresses/${dev}-primary"
+    sf="${FAKE_IP_STATE}/addresses/${dev}-secondary"
+    # We could lock here... but we should be the only ones playing
+    # around here with these stubs.
+    if [ ! -f "$pf" ] ; then
+       echo "$local" >"$pf"
+    elif grep -Fq "$local" "$pf" ; then 
+       echo "RTNETLINK answers: File exists" >&2
+       exit 254
+    elif [ -f "$sf" ] && grep -Fq "$local" "$sf" ; then 
+       echo "RTNETLINK answers: File exists" >&2
+       exit 254
+    else
+       echo "$local" >>"$sf"
+    fi
+}
+
+ip_addr_del ()
+{
+    local=""
+    dev=""
+    while [ -n "$1" ] ; do
+       case "$1" in
+           *.*.*.*/*)
+               local="$1" ; shift
+               ;;
+           local)
+               local="$2" ; shift 2
+               ;;
+           dev)
+               dev="$2" ; shift 2
+               ;;
+           *)
+               not_implemented "addr del ... $1 ..."
+       esac
+    done
+    if [ -z "$dev" ] ; then
+       not_implemented "addr del (without dev)"
+    fi
+    mkdir -p "${FAKE_IP_STATE}/addresses"
+    pf="${FAKE_IP_STATE}/addresses/${dev}-primary"
+    sf="${FAKE_IP_STATE}/addresses/${dev}-secondary"
+    # We could lock here... but we should be the only ones playing
+    # around here with these stubs.
+    if [ ! -f "$pf" ] ; then
+       echo "RTNETLINK answers: Cannot assign requested address" >&2
+       exit 254
+    elif grep -Fq "$local" "$pf" ; then
+       # Remove primaries AND SECONDARIES.
+       rm -f "$pf" "$sf"
+    elif [ -f "$sf" ] && grep -Fq "$local" "$sf" ; then 
+       grep -Fv "$local" "$sf" >"${sf}.new"
+       mv "${sf}.new" "$sf"
+    else
+       echo "RTNETLINK answers: Cannot assign requested address" >&2
+       exit 254
+    fi
+}
+
+######################################################################
+
+ip_rule ()
+{
+    case "$1" in
+       show|list|"") shift ; ip_rule_show "$@" ;;
+       add)          shift ; ip_rule_add  "$@" ;;
+       del*)         shift ; ip_rule_del  "$@" ;;
+       *) not_implemented "$1 in \"$orig_args\"" ;;
+    esac
+
+}
+
+# All non-default rules are in $FAKE_IP_STATE_RULES/rules.  As with
+# the real version, rules can be repeated.  Deleting just deletes the
+# 1st match.
+
+ip_rule_show ()
+{
+    ip_rule_show_1 ()
+    {
+       _pre="$1"
+       _table="$2"
+       _selectors="$3"
+       # potentially more options
+
+       printf "%d:\t%s lookup %s \n" $_pre "$_selectors" "$_table"
+    }
+
+    ip_rule_show_some ()
+    {
+       _min="$1"
+       _max="$2"
+
+       [ -f "${FAKE_IP_STATE}/rules" ] || return
+
+       while read _pre _table _selectors ; do
+           # Only print those in range
+           [ $_min -le $_pre -a $_pre -le $_max ] || continue
+
+           ip_rule_show_1 $_pre "$_table" "$_selectors"
+       done <"${FAKE_IP_STATE}/rules"
+    }
+
+    ip_rule_show_1 0 "local" "from all"
+
+    ip_rule_show_some 1 32765
+
+    ip_rule_show_1 32766 "main" "from all"
+    ip_rule_show_1 32767 "default" "from all"
+
+    ip_rule_show_some 32768 2147483648
+}
+
+ip_rule_common ()
+{
+    _from=""
+    _pre=""
+    _table=""
+    while [ -n "$1" ] ; do
+       case "$1" in
+           from)  _from="$2"  ; shift 2 ;;
+           pref)  _pre="$2"   ; shift 2 ;;
+           table) _table="$2" ; shift 2 ;;
+           *) not_implemented "$1 in \"$orig_args\"" ;;
+       esac
+    done
+
+    [ -n "$_pre" ]   || not_implemented "ip rule without \"pref\""
+    ip_check_table "rule"
+    # Relax this if more selectors added later...
+    [ -n "$_from" ]  || not_implemented "ip rule without \"from\""
+}
+
+ip_rule_add ()
+{
+    ip_rule_common "$@"
+
+    _f="${FAKE_IP_STATE}/rules"
+    touch "$_f"
+    (
+       flock 0
+       # Filter order must be consistent with the comparison in ip_rule_del()
+       echo "$_pre $_table${_from:+ from }$_from" >>"$_f"
+    ) <"$_f"
+}
+
+ip_rule_del ()
+{
+    ip_rule_common "$@"
+
+    _f="${FAKE_IP_STATE}/rules"
+    touch "$_f"
+    (
+       flock 0
+       _tmp="$(mktemp)"
+       _found=false
+       while read _p _t _s ; do
+           if ! $_found && \
+               [ "$_p" = "$_pre" -a "$_t" = "$_table" -a \
+               "$_s" = "${_from:+from }$_from" ] ; then
+               # Found.  Skip this one but not future ones.
+               _found=true
+           else
+               echo "$_p $_t $_s" >>"$_tmp"
+           fi
+       done
+       if cmp -s "$_tmp" "$_f" ; then
+           # No changes, must not have found what we wanted to delete
+           echo "RTNETLINK answers: No such file or directory" >&2
+           rm -f "$_tmp"
+           exit 2
+       else
+           mv "$_tmp" "$_f"
+       fi
+    ) <"$_f" || exit $?
+}
+
+######################################################################
+
+ip_route ()
+{
+    case "$1" in
+       show|list)    shift ; ip_route_show  "$@" ;;
+       flush)        shift ; ip_route_flush "$@" ;;
+       add)          shift ; ip_route_add   "$@" ;;
+       *) not_implemented "$1 in \"ip route\"" ;;
+    esac
+}
+
+ip_route_common ()
+{
+    [ "$1" = table ] || not_implemented "$1 in \"$orig_args\""
+    _table="$2"
+
+    ip_check_table "route"
+}
+
+# Routes are in a file per table in the directory
+# $FAKE_IP_STATE/routes.  These routes just use the table ID
+# that is passed and don't do any lookup.  This could be "improved" if
+# necessary.
+
+ip_route_show ()
+{
+    ip_route_common "$@"
+
+    # Missing file is just an empty table
+    cat "$FAKE_IP_STATE/routes/${_table}" 2>/dev/null || true
+}
+
+ip_route_flush ()
+{
+    ip_route_common "$@"
+
+    rm -f "$FAKE_IP_STATE/routes/${_table}"
+}
+
+ip_route_add ()
+{
+    _prefix=""
+    _dev=""
+    _gw=""
+    _table=""
+
+    while [ -n "$1" ] ; do
+       case "$1" in
+           *.*.*.*/*|*.*.*.*) _prefix="$1" ; shift 1 ;;
+           local) _prefix="$2" ; shift 2 ;;
+           dev)   _dev="$2"   ; shift 2 ;;
+           via)   _gw="$2"    ; shift 2 ;;
+           table) _table="$2" ; shift 2 ;;
+           *) not_implemented "$1 in \"$orig_args\"" ;;
+       esac
+    done
+
+    ip_check_table "route"
+    [ -n "$_prefix" ] || not_implemented "ip route without inet prefix in \"$orig_args\""
+    [ -n "$_dev" ] || not_implemented "ip route without \"dev\" in \"$orig_args\""
+
+    # Alias or add missing bits
+    case "$_prefix" in
+       0.0.0.0/0) _prefix="default" ;;
+       */*) : ;;
+       *) _prefix="${_prefix}/32" ;;
+    esac
+
+    _f="$FAKE_IP_STATE/routes/${_table}"
+    mkdir -p "$FAKE_IP_STATE/routes"
+    touch "$_f"
+
+    (
+       flock 0
+
+       if [ -n "$_gw" ] ; then
+           echo "${_prefix} via ${_gw} dev ${_dev} "
+       else
+           echo "${_prefix} dev ${_dev}  scope link "
+       fi >>"$_f"
+    ) <"$_f"
+}
+
+
+######################################################################
+
+orig_args="$*"
+
+case "$1" in
+    link)   shift ; ip_link  "$@" ;;
+    addr*)  shift ; ip_addr  "$@" ;;
+    rule)   shift ; ip_rule  "$@" ;;
+    route)  shift ; ip_route "$@" ;;
+    *) not_implemented "$1" ;;
+esac
+
+exit 0
diff --git a/ctdb/tests/eventscripts/stubs/iptables b/ctdb/tests/eventscripts/stubs/iptables
new file mode 100755 (executable)
index 0000000..2c65f7b
--- /dev/null
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+# Always succeed.
+
+exit 0
diff --git a/ctdb/tests/eventscripts/stubs/kill b/ctdb/tests/eventscripts/stubs/kill
new file mode 100755 (executable)
index 0000000..b69e3e6
--- /dev/null
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+# Always succeed.  This means that kill -0 will always find a
+# process and anything else will successfully kill.  This should
+# exercise a good variety of code paths.
+
+exit 0
diff --git a/ctdb/tests/eventscripts/stubs/killall b/ctdb/tests/eventscripts/stubs/killall
new file mode 100755 (executable)
index 0000000..1e182e1
--- /dev/null
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+# Always succeed.  This means that killall -0 will always find a
+# process and anything else will successfully kill.  This should
+# exercise a good variety of code paths.
+
+exit 0
diff --git a/ctdb/tests/eventscripts/stubs/multipath b/ctdb/tests/eventscripts/stubs/multipath
new file mode 100755 (executable)
index 0000000..64f95e7
--- /dev/null
@@ -0,0 +1,36 @@
+#!/bin/sh
+
+usage ()
+{
+    echo "usage: ${0} -ll device" >&2 ; exit 1  # this stub defines no "die" helper
+}
+    
+[ "$1" = "-ll" ] || usage
+shift
+[ $# -eq 1 ] || usage
+
+device="$1"
+
+if [ -n "$FAKE_MULTIPATH_HANG" ] ; then
+    FAKE_SLEEP_REALLY="yes" sleep 999
+fi
+
+path1_state="active"
+path2_state="enabled"
+
+for i in $FAKE_MULTIPATH_FAILURES ; do
+    if [ "$device" = "$i" ] ; then
+       path1_state="inactive"
+       path2_state="inactive"
+       break
+    fi
+done
+
+       cat <<EOF
+${device} (AUTO-01234567) dm-0 ,
+size=10G features='0' hwhandler='0' wp=rw
+|-+- policy='round-robin 0' prio=1 status=${path1_state}
+| \`- #:#:#:# vda 252:0  active ready running
+\`-+- policy='round-robin 0' prio=1 status=${path2_state}
+  \`- #:#:#:# vdb 252:16 active ready running
+EOF
diff --git a/ctdb/tests/eventscripts/stubs/net b/ctdb/tests/eventscripts/stubs/net
new file mode 100755 (executable)
index 0000000..3f96413
--- /dev/null
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+# Always succeed for now...
+
+exit 0
diff --git a/ctdb/tests/eventscripts/stubs/netstat b/ctdb/tests/eventscripts/stubs/netstat
new file mode 100755 (executable)
index 0000000..bd542bb
--- /dev/null
@@ -0,0 +1,109 @@
+#!/bin/bash
+
+prog="netstat"
+
+# Pretend that we're the shell and that this command could not be
+# found.
+if [ "$FAKE_NETSTAT_NOT_FOUND" = "yes" ] ; then
+    echo "sh: ${prog}: command not found" >&2
+    exit 127
+fi
+
+usage ()
+{
+    cat >&2 <<EOF
+Usage: $prog [ -t | --unix ] [ -n ] [ -a ] [ -l ]
+
+A fake netstat stub that prints items depending on the variables
+FAKE_NETSTAT_TCP_ESTABLISHED, FAKE_TCP_LISTEN,
+FAKE_NETSTAT_UNIX_LISTEN, depending on command-line options.
+
+Note that -n is ignored.
+
+EOF
+    exit 1
+}
+
+# Defaults.
+tcp=false
+unix=false
+all=false
+listen=false
+
+parse_options ()
+{
+    # $POSIXLY_CORRECT means that the command passed to onnode can
+    # take options and getopt won't reorder things to make them
+    # options to this script.
+    _temp=$(POSIXLY_CORRECT=1 getopt -n "$prog" -o "tnalh" -l unix -l help -- "$@")
+
+    [ $? != 0 ] && usage
+
+    eval set -- "$_temp"
+
+    while true ; do
+       case "$1" in
+           -n) shift ;;
+           -a) all=true ; shift ;;
+           -t) tcp=true ; shift ;;
+           -l) listen=true ; shift ;;
+           --unix) unix=true ; shift ;;
+           --) shift ; break ;;
+           -h|--help|*) usage ;; # * shouldn't happen, so this is reasonable.
+       esac
+    done
+
+    [ $# -gt 0 ] && usage
+
+    # If neither -t or --unix specified then print all.
+    $tcp || $unix || { tcp=true ; unix=true ; }
+}
+
+parse_options "$@"
+
+if $tcp ; then
+    if $listen ; then
+       echo "Active Internet connections (servers only)"
+    elif $all ; then
+       echo "Active Internet connections (servers and established)"
+    else
+       echo "Active Internet connections (w/o servers)"
+    fi
+
+    echo "Proto Recv-Q Send-Q Local Address           Foreign Address         State"
+
+    tcp_fmt="tcp        0      0 %-23s %-23s %s\n"
+    for i in $FAKE_NETSTAT_TCP_ESTABLISHED ; do
+       src="${i%|*}"
+       dst="${i#*|}"
+       printf "$tcp_fmt" $src $dst "ESTABLISHED"
+    done
+    while read src dst ; do
+       printf "$tcp_fmt" $src $dst "ESTABLISHED"
+    done <"$FAKE_NETSTAT_TCP_ESTABLISHED_FILE"
+
+    if $all || $listen ; then
+       for i in $FAKE_TCP_LISTEN ; do
+           printf "$tcp_fmt" $i "0.0.0.0:*" "LISTEN"
+       done
+    fi
+fi
+
+if $unix ; then
+    if $listen ; then
+       echo "Active UNIX domain sockets (servers only)"
+    elif $all ; then
+       echo "Active UNIX domain sockets (servers and established)"
+    else
+       echo "Active UNIX domain sockets (w/o servers)"
+    fi
+    
+    echo "Proto RefCnt Flags       Type       State         I-Node   Path"
+
+    unix_fmt="unix  2      [ ACC ]     STREAM     LISTENING     %-8d %s\n"
+    if $all || $listen ; then
+       for i in $FAKE_NETSTAT_UNIX_LISTEN ; do
+           printf "$unix_fmt" 12345 "$i"
+       done
+    fi
+fi
diff --git a/ctdb/tests/eventscripts/stubs/nmap b/ctdb/tests/eventscripts/stubs/nmap
new file mode 100755 (executable)
index 0000000..f01fe32
--- /dev/null
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+prog="nmap"
+
+# Pretend that we're the shell and that this command could not be
+# found.
+if [ "$FAKE_NMAP_NOT_FOUND" = "yes" ] ; then
+    echo "sh: ${prog}: command not found" >&2
+    exit 127
+fi
+
+usage ()
+{
+    cat >&2 <<EOF
+Usage: $prog -n -oG - -PS 127.0.0.1 -p <port>[,<port> ...]
+
+A fake nmap stub that prints items depending on the variable
+FAKE_TCP_LISTEN and the ports specified.
+
+Note that all options apart from -p are ignored.
+
+EOF
+    exit 1
+}
+
+ports=""
+
+parse_options ()
+{
+    _temp=$(getopt -n "$prog" -a -o "np:" -l help -l PS: -l oG: -- "$@")
+
+    [ $? != 0 ] && usage
+
+    eval set -- "$_temp"
+
+    while true ; do
+       case "$1" in
+           -n) shift ;;
+           --oG|--PS) shift 2 ;;
+           -p) ports="${ports}${ports:+ }${2//,/ }" ; shift 2 ;;
+           --) shift ; break ;;
+           -h|--help|*) usage ;; # * shouldn't happen, so this is reasonable.
+       esac
+    done
+
+    [ $# -gt 0 ] && usage
+
+    [ -n "$ports" ] || usage
+}
+
+# For printing out...
+args="$*"
+
+parse_options "$@"
+
+port_states=""
+
+for p in $ports ; do
+    pn=$(getent services "$p" | sed -e 's@[[:space:]].*@@')
+    for i in $FAKE_TCP_LISTEN ; do
+       lp="${i##*:}"
+       if [ "$p" = "$lp" ] ; then
+           port_states="${port_states}${port_states:+, }${p}/open/tcp//${pn}///"
+           continue 2
+       fi
+    done
+    port_states="${port_states}${port_states:+, }${p}/closed/tcp//${pn}///"
+done
+
+cat <<EOF
+# Nmap 5.21 scan initiated $(date) as: nmap $args
+Host: 127.0.0.1 ()     Status: Up
+Host: 127.0.0.1 ()     Ports: $port_states
+# Nmap done at $(date) -- 1 IP address (1 host up) scanned in 0.04 seconds
+EOF
diff --git a/ctdb/tests/eventscripts/stubs/pidof b/ctdb/tests/eventscripts/stubs/pidof
new file mode 100755 (executable)
index 0000000..b6ad6d8
--- /dev/null
@@ -0,0 +1,10 @@
+#!/bin/sh
+
+case "$1" in
+    nfsd)
+       echo "$FAKE_NFSD_THREAD_PIDS"
+       ;;
+    *)
+       echo "pidof: \"$1\" not implemented"
+       exit 1
+esac
diff --git a/ctdb/tests/eventscripts/stubs/pkill b/ctdb/tests/eventscripts/stubs/pkill
new file mode 100755 (executable)
index 0000000..b3f1de5
--- /dev/null
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+# Always succeed.  This means that pkill -0 will always find a
+# process and anything else will successfully kill.  This should
+# exercise a good variety of code paths.
+
+exit 0
diff --git a/ctdb/tests/eventscripts/stubs/ps b/ctdb/tests/eventscripts/stubs/ps
new file mode 100755 (executable)
index 0000000..5abeaf9
--- /dev/null
@@ -0,0 +1,12 @@
+#!/bin/sh
+
+cat <<EOF
+USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
+root         2  0.0  0.0      0     0 ?        S    Aug28   0:00 [kthreadd]
+root         3  0.0  0.0      0     0 ?        S    Aug28   0:43  \_ [ksoftirqd/0]
+...
+root         1  0.0  0.0   2976   624 ?        Ss   Aug28   0:07 init [2]  
+root       495  0.0  0.0   3888  1640 ?        Ss   Aug28   0:00 udevd --daemon
+...
+[MORE FAKE ps OUTPUT]
+EOF
diff --git a/ctdb/tests/eventscripts/stubs/rpc.lockd b/ctdb/tests/eventscripts/stubs/rpc.lockd
new file mode 100755 (executable)
index 0000000..e71f6cd
--- /dev/null
@@ -0,0 +1,6 @@
+#!/bin/sh
+
+# Restart always "works".  However, the test infrastructure may
+# continue to mark the service as down, so that's what matters.
+
+exit 0
diff --git a/ctdb/tests/eventscripts/stubs/rpc.mountd b/ctdb/tests/eventscripts/stubs/rpc.mountd
new file mode 100755 (executable)
index 0000000..e71f6cd
--- /dev/null
@@ -0,0 +1,6 @@
+#!/bin/sh
+
+# Restart always "works".  However, the test infrastructure may
+# continue to mark the service as down, so that's what matters.
+
+exit 0
diff --git a/ctdb/tests/eventscripts/stubs/rpc.rquotad b/ctdb/tests/eventscripts/stubs/rpc.rquotad
new file mode 100755 (executable)
index 0000000..e71f6cd
--- /dev/null
@@ -0,0 +1,6 @@
+#!/bin/sh
+
+# Restart always "works".  However, the test infrastructure may
+# continue to mark the service as down, so that's what matters.
+
+exit 0
diff --git a/ctdb/tests/eventscripts/stubs/rpc.statd b/ctdb/tests/eventscripts/stubs/rpc.statd
new file mode 100755 (executable)
index 0000000..e71f6cd
--- /dev/null
@@ -0,0 +1,6 @@
+#!/bin/sh
+
+# Restart always "works".  However, the test infrastructure may
+# continue to mark the service as down, so that's what matters.
+
+exit 0
diff --git a/ctdb/tests/eventscripts/stubs/rpcinfo b/ctdb/tests/eventscripts/stubs/rpcinfo
new file mode 100755 (executable)
index 0000000..dd175f3
--- /dev/null
@@ -0,0 +1,79 @@
+#!/bin/bash
+
+prog="rpcinfo"
+
+usage ()
+{
+    cat >&2 <<EOF
+Usage: $prog -u host program [version]
+
+A fake rpcinfo stub that succeeds for items in FAKE_RPCINFO_SERVICES,
+depending on command-line options.
+
+Note that "-u host" is ignored.
+
+EOF
+    exit 1
+}
+
+parse_options ()
+{
+    # $POSIXLY_CORRECT means that getopt stops parsing at the first
+    # non-option argument, so later arguments are not reordered and
+    # treated as options to this script.
+    _temp=$(POSIXLY_CORRECT=1 getopt -n "$prog" -o "u:h" -l unix -l help -- "$@")
+
+    [ $? != 0 ] && usage
+
+    eval set -- "$_temp"
+
+    while true ; do
+       case "$1" in
+           -u) shift 2 ;;  # ignore
+           --) shift ; break ;;
+           -h|--help|*) usage ;; # * shouldn't happen, so this is reasonable.
+       esac
+    done
+
+    [ 1 -le $# -a $# -le 2 ] || usage
+
+    p="$1"
+    v="$2"
+}
+
+parse_options "$@"
+
+for i in ${FAKE_RPCINFO_SERVICES} ; do
+    # This is stupidly cumulative, but needs to happen after the
+    # initial split of the list above.
+    IFS="${IFS}:"
+    set -- $i
+    # $1 = program, $2 = low version, $3 = high version
+    
+    if [ "$1" = "$p" ] ; then
+       if [ -n "$v" ] ; then
+           if [ "$2" -le "$v" -a "$v" -le "$3" ] ; then
+               echo "program ${p} version ${v} ready and waiting"
+               exit 0
+           else
+               echo "rpcinfo: RPC: Program/version mismatch; low version = ${2}, high version = ${3}" >&2
+               echo "program ${p} version ${v} is not available"
+               exit 1
+           fi
+       else
+           for j in $(seq $2 $3) ; do
+               echo "program ${p} version ${j} ready and waiting"
+           done
+           exit 0
+       fi
+    fi
+done
+
+echo "rpcinfo: RPC: Program not registered" >&2
+if [ -n "$v" ] ; then
+    echo "program ${p} version ${v} is not available"
+else
+    echo "program ${p} is not available"
+fi
+
+exit 1
diff --git a/ctdb/tests/eventscripts/stubs/service b/ctdb/tests/eventscripts/stubs/service
new file mode 100755 (executable)
index 0000000..5f47b55
--- /dev/null
@@ -0,0 +1,64 @@
+#!/bin/sh
+
+service_status_dir="${EVENTSCRIPTS_TESTS_VAR_DIR}/service_fake_status"
+mkdir -p "$service_status_dir"
+
+service="$1"
+flag="${service_status_dir}/${service}"
+
+start()
+{
+    if [ -f "$flag" ] ; then
+       echo "service: can't start ${service} - already running"
+       exit 1
+    else
+       touch "$flag"
+       echo "Starting ${service}: OK"
+    fi
+}
+
+stop ()
+{
+    if [ -f "$flag" ] ; then
+       echo "Stopping ${service}: OK"
+       rm -f "$flag"
+    else
+       echo "service: can't stop ${service} - not running"
+       exit 1
+    fi
+}
+
+case "$2" in
+    start)
+       start
+       ;;
+    stop)
+       stop
+       ;;
+    restart|reload)
+       stop
+       start
+       ;;
+    status)
+       if [ -f "$flag" ] ; then
+           echo "$service running"
+           exit 0
+       else
+           echo "$service not running"
+           exit 3
+       fi
+       ;;
+    force-started)
+       # For test setup...
+       touch "$flag"
+       ;;
+    force-stopped)
+       # For test setup...
+       rm -f "$flag"
+       ;;
+    *)
+       echo "service $service $2 not supported"
+       exit 1
+esac
+
+exit 0
diff --git a/ctdb/tests/eventscripts/stubs/sleep b/ctdb/tests/eventscripts/stubs/sleep
new file mode 100755 (executable)
index 0000000..e454244
--- /dev/null
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+if [ "$FAKE_SLEEP_REALLY" = "yes" ] ; then
+    /bin/sleep "$@"
+elif [ -n "$FAKE_SLEEP_FORCE" ] ; then
+    /bin/sleep "$FAKE_SLEEP_FORCE"
+else
+    :
+fi
diff --git a/ctdb/tests/eventscripts/stubs/tdbdump b/ctdb/tests/eventscripts/stubs/tdbdump
new file mode 100755 (executable)
index 0000000..986c5c5
--- /dev/null
@@ -0,0 +1,9 @@
+#!/bin/sh
+
+if [ "$FAKE_TDB_IS_OK" = "yes" ] ; then
+    echo "TDB good"
+    exit 0
+else
+    echo "TDB busted"
+    exit 1
+fi
diff --git a/ctdb/tests/eventscripts/stubs/tdbtool b/ctdb/tests/eventscripts/stubs/tdbtool
new file mode 100755 (executable)
index 0000000..c6c0a16
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+if [ -z "$1" ] ; then
+    if [ "$FAKE_TDBTOOL_SUPPORTS_CHECK" = "yes" ] ; then
+       echo "check"
+    fi
+fi
+
+if [ "$FAKE_TDB_IS_OK" = "yes" ] ; then
+    echo "Database integrity is OK"
+else
+    echo "Database is busted"
+fi
+
+exit 0
diff --git a/ctdb/tests/eventscripts/stubs/testparm b/ctdb/tests/eventscripts/stubs/testparm
new file mode 100755 (executable)
index 0000000..aac5b18
--- /dev/null
@@ -0,0 +1,51 @@
+#!/bin/sh
+
+not_implemented ()
+{
+    echo "testparm: option \"$1\" not implemented in stub" >&2
+    exit 2
+}
+
+# Ensure that testparm always uses our canned configuration instead of
+# the global one, unless some other file is specified. 
+
+file=""
+parameter=""
+for i ; do
+    case "$i" in
+       --parameter-name=*) parameter="${i#--parameter-name=}" ;;
+       -*) : ;;
+       *) file="$i" ;;
+    esac
+done
+
+# Just hard-code parameter requests for now.  Later on they could be
+# parsed out of the file.
+case "$parameter" in
+    security) echo "ADS" ; exit 0 ;;
+    smb*ports) echo "445, 139" ; exit 0 ;;
+    ?*) not_implemented "--parameter-name=$parameter" ;;
+    # Fall through if $parameter not set
+esac
+
+if [ -n "$file" ] ; then
+    # This should include the shares, since this is used when the
+    # samba eventscript caches the output.
+    cat "$file"
+else
+    # We force our own smb.conf and add the shares.
+    cat "${CTDB_ETCDIR}/samba/smb.conf"
+
+    for i in $FAKE_SHARES ; do
+       bi=$(basename "$i")
+cat <<EOF
+
+[${bi}]
+       path            = $i
+       comment         = fake share $bi
+       guest ok        = no
+       read only       = no
+       browseable      = yes
+EOF
+    done
+fi
diff --git a/ctdb/tests/eventscripts/stubs/wbinfo b/ctdb/tests/eventscripts/stubs/wbinfo
new file mode 100755 (executable)
index 0000000..4fc6b98
--- /dev/null
@@ -0,0 +1,7 @@
+#!/bin/sh
+
+if [ "$FAKE_WBINFO_FAIL" = "yes" ] ; then
+   exit 1
+fi
+
+exit 0
diff --git a/ctdb/tests/onnode/0001.sh b/ctdb/tests/onnode/0001.sh
new file mode 100755 (executable)
index 0000000..2853374
--- /dev/null
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+cmd="$ONNODE all hostname"
+
+define_test "$cmd" "all nodes OK"
+
+required_result <<EOF
+
+>> NODE: 192.168.1.101 <<
+-n 192.168.1.101 hostname
+
+>> NODE: 192.168.1.102 <<
+-n 192.168.1.102 hostname
+
+>> NODE: 192.168.1.103 <<
+-n 192.168.1.103 hostname
+
+>> NODE: 192.168.1.104 <<
+-n 192.168.1.104 hostname
+EOF
+
+simple_test $cmd
diff --git a/ctdb/tests/onnode/0002.sh b/ctdb/tests/onnode/0002.sh
new file mode 100755 (executable)
index 0000000..c3c8c77
--- /dev/null
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+cmd="$ONNODE -q all hostname"
+
+define_test "$cmd" "all nodes OK"
+
+required_result <<EOF
+-n 192.168.1.101 hostname
+-n 192.168.1.102 hostname
+-n 192.168.1.103 hostname
+-n 192.168.1.104 hostname
+EOF
+
+simple_test $cmd
diff --git a/ctdb/tests/onnode/0003.sh b/ctdb/tests/onnode/0003.sh
new file mode 100755 (executable)
index 0000000..d79bca2
--- /dev/null
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+cmd="$ONNODE -p all hostname"
+
+define_test "$cmd" "all nodes OK"
+
+required_result <<EOF
+[192.168.1.101] -n 192.168.1.101 hostname
+[192.168.1.102] -n 192.168.1.102 hostname
+[192.168.1.103] -n 192.168.1.103 hostname
+[192.168.1.104] -n 192.168.1.104 hostname
+EOF
+
+simple_test -s $cmd
diff --git a/ctdb/tests/onnode/0004.sh b/ctdb/tests/onnode/0004.sh
new file mode 100755 (executable)
index 0000000..d0986b2
--- /dev/null
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+cmd="$ONNODE -pq all hostname"
+
+define_test "$cmd" "all nodes OK"
+
+required_result <<EOF
+-n 192.168.1.101 hostname
+-n 192.168.1.102 hostname
+-n 192.168.1.103 hostname
+-n 192.168.1.104 hostname
+EOF
+
+simple_test -s $cmd
diff --git a/ctdb/tests/onnode/0005.sh b/ctdb/tests/onnode/0005.sh
new file mode 100755 (executable)
index 0000000..0eccbb0
--- /dev/null
@@ -0,0 +1,13 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+cmd="$ONNODE 3 hostname"
+
+define_test "$cmd" "all nodes OK"
+
+required_result <<EOF
+-n 192.168.1.104 hostname
+EOF
+
+simple_test $cmd
diff --git a/ctdb/tests/onnode/0006.sh b/ctdb/tests/onnode/0006.sh
new file mode 100755 (executable)
index 0000000..b027850
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+cmd="$ONNODE -v 3 hostname"
+
+define_test "$cmd" "all nodes OK"
+
+required_result <<EOF
+
+>> NODE: 192.168.1.104 <<
+-n 192.168.1.104 hostname
+EOF
+
+simple_test $cmd
diff --git a/ctdb/tests/onnode/0070.sh b/ctdb/tests/onnode/0070.sh
new file mode 100755 (executable)
index 0000000..b071e80
--- /dev/null
@@ -0,0 +1,32 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+cmd="$ONNODE ok hostname"
+
+define_test "$cmd" "all nodes OK"
+
+ctdb_set_output <<EOF
+:Node:IP:Disconnected:Banned:Disabled:Unhealthy:Stopped:Inactive:PartiallyOnline:ThisNode:
+:0:192.168.1.101:0:0:0:0:0:0:0:Y:
+:1:192.168.1.102:0:0:0:0:0:0:0:N:
+:2:192.168.1.103:0:0:0:0:0:0:0:N:
+:3:192.168.1.104:0:0:0:0:0:0:0:N:
+EOF
+
+required_result <<EOF
+
+>> NODE: 192.168.1.101 <<
+-n 192.168.1.101 hostname
+
+>> NODE: 192.168.1.102 <<
+-n 192.168.1.102 hostname
+
+>> NODE: 192.168.1.103 <<
+-n 192.168.1.103 hostname
+
+>> NODE: 192.168.1.104 <<
+-n 192.168.1.104 hostname
+EOF
+
+simple_test $cmd
diff --git a/ctdb/tests/onnode/0071.sh b/ctdb/tests/onnode/0071.sh
new file mode 100755 (executable)
index 0000000..d594323
--- /dev/null
@@ -0,0 +1,29 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+cmd="$ONNODE ok hostname"
+
+define_test "$cmd" "2nd node disconnected"
+
+ctdb_set_output <<EOF 
+:Node:IP:Disconnected:Banned:Disabled:Unhealthy:Stopped:Inactive:PartiallyOnline:ThisNode:
+:0:192.168.1.101:0:0:0:0:0:0:0:Y:
+:1:192.168.1.102:1:0:0:0:0:0:0:N:
+:2:192.168.1.103:0:0:0:0:0:0:0:N:
+:3:192.168.1.104:0:0:0:0:0:0:0:N:
+EOF
+
+required_result <<EOF
+
+>> NODE: 192.168.1.101 <<
+-n 192.168.1.101 hostname
+
+>> NODE: 192.168.1.103 <<
+-n 192.168.1.103 hostname
+
+>> NODE: 192.168.1.104 <<
+-n 192.168.1.104 hostname
+EOF
+
+simple_test $cmd
diff --git a/ctdb/tests/onnode/0072.sh b/ctdb/tests/onnode/0072.sh
new file mode 100755 (executable)
index 0000000..cb29e3b
--- /dev/null
@@ -0,0 +1,29 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+cmd="$ONNODE ok hostname"
+
+define_test "$cmd" "2nd node disconnected, extra status columns"
+
+ctdb_set_output <<EOF
+:Node:IP:Disconnected:Banned:Disabled:Unhealthy:Stopped:Inactive:X1:X2:X3:X4:
+:0:192.168.1.101:0:0:0:0:0:0:0:0:0:0:
+:1:192.168.1.102:1:0:0:0:0:0:0:0:0:0:
+:2:192.168.1.103:0:0:0:0:0:0:0:0:0:0:
+:3:192.168.1.104:0:0:0:0:0:0:0:0:0:0:
+EOF
+
+required_result <<EOF
+
+>> NODE: 192.168.1.101 <<
+-n 192.168.1.101 hostname
+
+>> NODE: 192.168.1.103 <<
+-n 192.168.1.103 hostname
+
+>> NODE: 192.168.1.104 <<
+-n 192.168.1.104 hostname
+EOF
+
+simple_test $cmd
diff --git a/ctdb/tests/onnode/0075.sh b/ctdb/tests/onnode/0075.sh
new file mode 100755 (executable)
index 0000000..4276e9c
--- /dev/null
@@ -0,0 +1,29 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+cmd="$ONNODE con hostname"
+
+define_test "$cmd" "1st node disconnected"
+
+ctdb_set_output <<EOF
+:Node:IP:Disconnected:Banned:Disabled:Unhealthy:Stopped:Inactive:PartiallyOnline:ThisNode:
+:0:192.168.1.101:1:0:0:0:0:0:0:N:
+:1:192.168.1.102:0:0:0:0:0:0:0:Y:
+:2:192.168.1.103:0:0:0:0:0:0:0:N:
+:3:192.168.1.104:0:0:0:0:0:0:0:N:
+EOF
+
+required_result <<EOF
+
+>> NODE: 192.168.1.102 <<
+-n 192.168.1.102 hostname
+
+>> NODE: 192.168.1.103 <<
+-n 192.168.1.103 hostname
+
+>> NODE: 192.168.1.104 <<
+-n 192.168.1.104 hostname
+EOF
+
+simple_test $cmd
diff --git a/ctdb/tests/onnode/0080.sh b/ctdb/tests/onnode/0080.sh
new file mode 100755 (executable)
index 0000000..bca478a
--- /dev/null
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+cmd="$ONNODE recmaster hostname"
+
+define_test "$cmd" "node 1 (192.168.1.102) is recmaster"
+
+ctdb_set_output <<EOF
+1
+EOF
+
+required_result <<EOF
+-n 192.168.1.102 hostname
+EOF
+
+simple_test $cmd
diff --git a/ctdb/tests/onnode/0081.sh b/ctdb/tests/onnode/0081.sh
new file mode 100755 (executable)
index 0000000..412db87
--- /dev/null
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+cmd="$ONNODE lvsmaster hostname"
+
+define_test "$cmd" "no lvsmaster"
+
+ctdb_set_output 255 <<EOF
+There is no LVS master
+EOF
+
+required_result 1 <<EOF
+onnode: No lvsmaster available
+EOF
+
+simple_test $cmd
diff --git a/ctdb/tests/onnode/0090.sh b/ctdb/tests/onnode/0090.sh
new file mode 100755 (executable)
index 0000000..dd50c51
--- /dev/null
@@ -0,0 +1,21 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+cmd="$ONNODE natgw hostname"
+
+define_test "$cmd" "no natgw"
+
+ctdb_set_output <<EOF
+-1 0.0.0.0
+:0:192.168.1.101:0:0:0:0:0:
+:1:192.168.1.102:0:0:0:0:0:
+:2:192.168.1.103:0:0:0:0:0:
+:3:192.168.1.104:0:0:0:0:0:
+EOF
+
+required_result 1 <<EOF
+onnode: No natgwlist available
+EOF
+
+simple_test $cmd
diff --git a/ctdb/tests/onnode/0091.sh b/ctdb/tests/onnode/0091.sh
new file mode 100755 (executable)
index 0000000..528eec1
--- /dev/null
@@ -0,0 +1,21 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+cmd="$ONNODE natgw hostname"
+
+define_test "$cmd" "node 2 (192.168.1.103) is natgw"
+
+ctdb_set_output <<EOF
+2 192.168.1.103
+:0:192.168.1.101:0:0:0:0:0:
+:1:192.168.1.102:0:0:0:0:0:
+:2:192.168.1.103:0:0:0:0:0:
+:3:192.168.1.104:0:0:0:0:0:
+EOF
+
+required_result <<EOF
+-n 192.168.1.103 hostname
+EOF
+
+simple_test $cmd
diff --git a/ctdb/tests/onnode/README b/ctdb/tests/onnode/README
new file mode 100644 (file)
index 0000000..5bb6952
--- /dev/null
@@ -0,0 +1,36 @@
+onnode unit tests
+=================
+
+Examples:
+
+* ../run_tests.sh .
+
+  Run all tests, displaying output.
+
+* ../run_tests.sh -s .
+
+  Run all tests, displaying output and a summary.
+
+* ../run_tests.sh -sq .
+
+  Run all tests, displaying only a summary.
+
+* ONNODE=onnode-buggy-001 ../run_tests.sh -s .
+
+  Run against stubs/onnode-buggy-001 instead of default onnode version.
+
+  Add more buggy versions of onnode to this directory as bugs are
+  fixed to enable test validation using this feature.
+
+* ../run_tests.sh ./009*.sh
+
+  Run only the specified tests.
+
+* ONNODE="stubs/onnode-buggy-001" ../run_tests.sh -X ./0090.sh
+  ../run_tests.sh -X ./0090.sh
+
+  Debug the specified test or test failure by tracing onnode with
+  "bash -x".  The test will fail because the bash trace output will be
+  included in the test output.
+
+  To see if the test passes, the -X can be dropped...
diff --git a/ctdb/tests/onnode/nodes b/ctdb/tests/onnode/nodes
new file mode 100644 (file)
index 0000000..e2fe268
--- /dev/null
@@ -0,0 +1,4 @@
+192.168.1.101
+192.168.1.102
+192.168.1.103
+192.168.1.104
diff --git a/ctdb/tests/onnode/scripts/local.sh b/ctdb/tests/onnode/scripts/local.sh
new file mode 100644 (file)
index 0000000..9973a55
--- /dev/null
@@ -0,0 +1,86 @@
+# Hey Emacs, this is a -*- shell-script -*- !!!  :-)
+
+# Set indirectly by run_tests at top level.
+unset CTDB_NODES_SOCKETS
+
+# Default to just "onnode".
+: ${ONNODE:=onnode}
+
+# Augment PATH with relevant stubs/ directories.
+
+if [ -d "${TEST_SUBDIR}/stubs" ] ; then
+    PATH="${TEST_SUBDIR}/stubs:$PATH"
+fi
+
+# Find CTDB nodes file.
+if [ -z "$CTDB_NODES_FILE" ] ; then
+    if [ -r "${TEST_SUBDIR}/nodes" ] ; then
+       CTDB_NODES_FILE="${TEST_SUBDIR}/nodes"
+    else
+       CTDB_NODES_FILE="${CTDB_BASE:-/etc/ctdb}/nodes"
+    fi
+fi
+
+export CTDB_NODES_FILE
+
+export ONNODE_TESTS_VAR_DIR="${TEST_VAR_DIR}/unit_onnode"
+mkdir -p "$ONNODE_TESTS_VAR_DIR"
+
+if [ -z "$CTDB_BASE" ] ; then
+    export CTDB_BASE=$(dirname "$CTDB_NODES_FILE")
+fi
+
+define_test ()
+{
+    _f=$(basename "$0")
+
+    echo "$_f $1 - $2"
+}
+
+# Set output for ctdb command.  Optional 1st argument is return code.
+ctdb_set_output ()
+{
+    _out="$ONNODE_TESTS_VAR_DIR/ctdb.out"
+    cat >"$_out"
+
+    _rc="$ONNODE_TESTS_VAR_DIR/ctdb.rc"
+    echo "${1:-0}" >"$_rc"
+
+    trap "rm -f $_out $_rc" 0
+}
+
+_extra_header ()
+{
+    cat <<EOF
+CTDB_NODES_FILE="${CTDB_NODES_FILE}"
+CTDB_BASE="$CTDB_BASE"
+$(which ctdb)
+
+EOF
+}
+
+simple_test ()
+{
+    _sort="cat"
+    if [ "$1" = "-s" ] ; then
+       shift
+       _sort="sort"
+    fi
+
+    if $TEST_COMMAND_TRACE ; then
+       _onnode=$(which "$1") ; shift
+       _out=$(bash -x "$_onnode" "$@" 2>&1)
+    else
+       _out=$("$@" 2>&1)
+    fi
+    _rc=$?
+    _out=$(echo "$_out" | $_sort )
+
+    # Can't do this inline or it affects return code
+    _extra_header="$(_extra_header)"
+
+    # Get the return code back into $?
+    (exit $_rc)
+
+    result_check "$_extra_header"
+}
diff --git a/ctdb/tests/onnode/stubs/ctdb b/ctdb/tests/onnode/stubs/ctdb
new file mode 100755 (executable)
index 0000000..e420d25
--- /dev/null
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+# Fake ctdb client for onnode tests.
+
+cmd=$(echo "$*" | sed -r -e 's@[[:space:]]+@_@g')
+
+out="${ONNODE_TESTS_VAR_DIR}/ctdb.out"
+if [ -r "$out" ] ; then
+    cat "$out"
+
+    rc="${ONNODE_TESTS_VAR_DIR}/ctdb.rc"
+    if [ -r "$rc" ] ; then
+       exit $(cat "$rc")
+    fi
+
+    exit 0
+fi
+
+f="${ONNODE_TESTCASE_DIR}/ctdb.d/${cmd}.sh"
+if [ -x "$f" ] ; then
+    "$f"
+    exit $?
+fi
+
+f="${ONNODE_TESTCASE_DIR}/ctdb.d/${cmd}.out"
+if [ -r "$f" ] ; then
+    cat "$f"
+    exit 0
+fi
+
+echo "fake ctdb: no implementation for \"$*\""
+
+exit 1
diff --git a/ctdb/tests/onnode/stubs/onnode-buggy-001 b/ctdb/tests/onnode/stubs/onnode-buggy-001
new file mode 100755 (executable)
index 0000000..77a1207
--- /dev/null
@@ -0,0 +1,376 @@
+#!/bin/bash
+
+# Run commands on CTDB nodes.
+
+# See http://ctdb.samba.org/ for more information about CTDB.
+
+# Copyright (C) Martin Schwenke  2008
+
+# Based on an earlier script by Andrew Tridgell and Ronnie Sahlberg.
+
+# Copyright (C) Andrew Tridgell  2007
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+   
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+   
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+prog=$(basename $0)
+
+usage ()
+{
+    cat >&2 <<EOF
+Usage: onnode [OPTION] ... <NODES> <COMMAND> ...
+  options:
+    -c          Run in current working directory on specified nodes.
+    -o <prefix> Save standard output from each node to file <prefix>.<ip>
+    -p          Run command in parallel on specified nodes.
+    -q          Do not print node addresses (overrides -v).
+    -n          Allow nodes to be specified by name.
+    -f          Specify nodes file, overrides CTDB_NODES_FILE.
+    -v          Print node address even for a single node.
+  <NODES>       "all", "any", "ok" (or "healthy"), "con" (or "connected"),
+                "rm" (or "recmaster"), "lvs" (or "lvsmaster"),
+                "natgw" (or "natgwlist"); or
+                a node number (0 base); or
+                a hostname (if -n is specified); or
+                list (comma separated) of <NODES>; or
+                range (hyphen separated) of node numbers.
+EOF
+    exit 1
+
+}
+
+invalid_nodespec ()
+{
+    echo "Invalid <nodespec>" >&2 ; echo >&2
+    usage
+}
+
+# Defaults.
+current=false
+parallel=false
+verbose=false
+quiet=false
+prefix=""
+names_ok=false
+
+ctdb_base="${CTDB_BASE:-/etc/ctdb}"
+
+parse_options ()
+{
+    # $POSIXLY_CORRECT means that the command passed to onnode can
+    # take options and getopt won't reorder things to make them
+    # options to onnode.
+    local temp
+    # Not on the previous line - local returns 0!
+    temp=$(POSIXLY_CORRECT=1 getopt -n "$prog" -o "cf:hno:pqv" -l help -- "$@")
+
+    [ $? != 0 ] && usage
+
+    eval set -- "$temp"
+
+    while true ; do
+       case "$1" in
+           -c) current=true ; shift ;;
+           -f) CTDB_NODES_FILE="$2" ; shift 2 ;;
+           -n) names_ok=true ; shift ;;
+           -o) prefix="$2" ; shift 2 ;;
+           -p) parallel=true ; shift ;;
+           -q) quiet=true ; shift ;;
+           -v) verbose=true ; shift ;;
+           --) shift ; break ;;
+           -h|--help|*) usage ;; # Shouldn't happen, so this is reasonable.
+       esac
+    done
+
+    [ $# -lt 2 ] && usage
+
+    nodespec="$1" ; shift
+    command="$@"
+}
+
+echo_nth ()
+{
+    local n="$1" ; shift
+
+    shift $n
+    local node="$1"
+
+    if [ -n "$node" -a "$node" != "#DEAD" ] ; then
+       echo $node
+    else
+       echo "${prog}: \"node ${n}\" does not exist" >&2
+       exit 1
+    fi
+}
+
+parse_nodespec ()
+{
+    # Subshell avoids hacks to restore $IFS.
+    (
+       IFS=","
+       for i in $1 ; do
+           case "$i" in
+               *-*) seq "${i%-*}" "${i#*-}" 2>/dev/null || invalid_nodespec ;;
+               # Separate lines for readability.
+               all|any|ok|healthy|con|connected) echo "$i" ;;
+               rm|recmaster|lvs|lvsmaster|natgw|natgwlist) echo "$i" ;;
+               *)
+                   [ $i -gt -1 ] 2>/dev/null || $names_ok || invalid_nodespec
+                   echo $i
+           esac
+       done
+    )
+}
+
+ctdb_status_output="" # cache
+get_nodes_with_status ()
+{
+    local all_nodes="$1"
+    local status="$2"
+
+    local bits
+    case "$status" in
+       healthy)
+           bits="0:0:0:0:0:0"
+           ;;
+       connected)
+           bits="0:[0-1]:[0-1]:[0-1]:[0-1]:[0-1]"
+           ;;
+       *)
+           invalid_nodespec
+    esac
+
+    if [ -z "$ctdb_status_output" ] ; then
+       # FIXME: need to do something if $CTDB_NODES_SOCKETS is set.
+       ctdb_status_output=$(ctdb -Y status 2>/dev/null)
+       if [ $? -ne 0 ] ; then
+           echo "${prog}: unable to get status of CTDB nodes" >&2
+           exit 1
+       fi
+       ctdb_status_output="${ctdb_status_output#* }"
+    fi
+
+    local nodes=""
+    local i
+    for i in $ctdb_status_output ; do
+       # Try removing bits from end.
+       local t="${i%:${bits}:}"
+       if [ "$t" != "$i" ] ; then
+           # Succeeded.  Get address.  NOTE: this is an optimisation.
+           # It might be better to get the node number and then get
+           # the nth node to get the address.  This would make things
+           # more consistent if $ctdb_base/nodes actually contained
+           # hostnames.
+           nodes="${nodes} ${t#:*:}"
+       fi
+    done
+
+    echo $nodes
+}
+
+ctdb_props="" # cache
+get_node_with_property ()
+{
+    local all_nodes="$1"
+    local prop="$2"
+
+    local prop_node=""
+    if [ "${ctdb_props##:${prop}:}" = "$ctdb_props" ] ; then
+       prop_node=$(ctdb "$prop" -Y 2>/dev/null)
+       # We only want the first line.
+       local nl="
+"
+       prop_node="${prop_node%%${nl}*}"
+       if [ $? -eq 0 ] ; then
+           ctdb_props="${ctdb_props}${ctdb_props:+ }:${prop}:${prop_node}"
+       else
+           prop_node=""
+       fi
+    else
+       prop_node="${ctdb_props##:${prop}:}"
+       prop_node="${prop_node%% *}"
+    fi
+    if [ -n "$prop_node" ] ; then
+       echo_nth "$prop_node" $all_nodes
+    else
+       echo "${prog}: No ${prop} available" >&2
+       exit 1
+    fi
+}
+
+get_any_available_node ()
+{
+    local all_nodes="$1"
+
+    # We do a recursive onnode to find which nodes are up and running.
+    local out=$($0 -pq all ctdb pnn 2>&1)
+    local line
+    while read line ; do 
+       local pnn="${line#PNN:}"
+       if [ "$pnn" != "$line" ] ; then
+           echo_nth "$pnn" $all_nodes
+           return 0
+       fi
+       # Else must be an error message from a down node.
+    done <<<"$out"
+    return 1
+}
+
+get_nodes ()
+{
+    local all_nodes
+
+    if [ -n "$CTDB_NODES_SOCKETS" ] ; then 
+       all_nodes="$CTDB_NODES_SOCKETS"
+    else
+       local f="${ctdb_base}/nodes"
+       if [ -n "$CTDB_NODES_FILE" ] ; then
+           f="$CTDB_NODES_FILE"
+           if [ ! -e "$f" -a "${f#/}" = "$f" ] ; then
+               # $f is relative, try in $ctdb_base
+               f="${ctdb_base}/${f}"
+           fi
+       fi
+
+       if [ ! -r "$f" ] ; then
+           echo "${prog}: unable to open nodes file  \"${f}\"" >&2
+           exit 1
+       fi
+
+       all_nodes=$(sed -e 's@#.*@@g' -e 's@ *@@g' -e 's@^$@#DEAD@' "$f")
+    fi
+
+    local nodes=""
+    local n
+    for n in $(parse_nodespec "$1") ; do
+       [ $? != 0 ] && exit 1  # Required to catch exit in above subshell.
+       case "$n" in
+           all)
+               echo "${all_nodes//#DEAD/}"
+               ;;
+           any)
+               get_any_available_node "$all_nodes" || exit 1
+               ;;
+           ok|healthy) 
+               get_nodes_with_status "$all_nodes" "healthy" || exit 1
+               ;;
+           con|connected) 
+               get_nodes_with_status "$all_nodes" "connected" || exit 1
+               ;;
+           rm|recmaster)
+               get_node_with_property "$all_nodes" "recmaster" || exit 1
+               ;;
+           lvs|lvsmaster)
+               get_node_with_property "$all_nodes" "lvsmaster" || exit 1
+               ;;
+           natgw|natgwlist)
+               get_node_with_property "$all_nodes" "natgwlist" || exit 1
+               ;;
+           [0-9]|[0-9][0-9]|[0-9][0-9][0-9])
+               echo_nth $n $all_nodes
+               ;;
+           *)
+               $names_ok || invalid_nodespec
+               echo $n
+       esac
+    done
+}
+
+fakessh ()
+{
+    CTDB_SOCKET="$1" sh -c "$2" 3>/dev/null
+}
+
+stdout_filter ()
+{
+    if [ -n "$prefix" ] ; then
+       cat >"${prefix}.${n//\//_}"
+    elif $verbose && $parallel ; then
+       sed -e "s@^@[$n] @"
+    else
+       cat
+    fi
+}
+
+stderr_filter ()
+{
+    if $verbose && $parallel ; then
+       sed -e "s@^@[$n] @"
+    else
+       cat
+    fi
+}
+
+######################################################################
+
+parse_options "$@"
+
+$current && command="cd $PWD && $command"
+
+ssh_opts=
+if [ -n "$CTDB_NODES_SOCKETS" ] ; then
+    SSH=fakessh
+else 
+    # Could "2>/dev/null || true" but want to see errors from typos in file.
+    [ -r "${ctdb_base}/onnode.conf" ] && . "${ctdb_base}/onnode.conf"
+    [ -n "$SSH" ] || SSH=ssh
+    if [ "$SSH" = "ssh" ] ; then
+       ssh_opts="-n"
+    else
+       : # rsh? All bets are off!
+    fi
+fi
+
+######################################################################
+
+nodes=$(get_nodes "$nodespec")
+[ $? != 0 ] && exit 1   # Required to catch exit in above subshell.
+
+if $quiet ; then
+    verbose=false
+else
+    # If $nodes contains a space or a newline then assume multiple nodes.
+    nl="
+"
+    [ "$nodes" != "${nodes%[ ${nl}]*}" ] && verbose=true
+fi
+
+pids=""
+trap 'kill -TERM $pids 2>/dev/null' INT TERM
+# There's a small race here where the kill can fail if no processes
+# have been added to $pids and the script is interrupted.  However,
+# the part of the window where it matters is very small.
+retcode=0
+for n in $nodes ; do
+    set -o pipefail 2>/dev/null
+    if $parallel ; then
+       { exec 3>&1 ; { $SSH $ssh_opts $EXTRA_SSH_OPTS $n "$command" | stdout_filter >&3 ; } 2>&1 | stderr_filter ; } &
+       pids="${pids} $!"
+    else
+       if $verbose ; then
+           echo >&2 ; echo ">> NODE: $n <<" >&2
+       fi
+
+       { exec 3>&1 ; { $SSH $ssh_opts $EXTRA_SSH_OPTS $n "$command" | stdout_filter >&3 ; } 2>&1 | stderr_filter ; }
+       [ $? = 0 ] || retcode=$?
+    fi
+done
+
+$parallel && {
+    for p in $pids; do
+       wait $p
+       [ $? = 0 ] || retcode=$?
+    done
+}
+
+exit $retcode
diff --git a/ctdb/tests/onnode/stubs/ssh b/ctdb/tests/onnode/stubs/ssh
new file mode 100755 (executable)
index 0000000..7be778f
--- /dev/null
@@ -0,0 +1,2 @@
+#!/bin/sh
+echo "$*"
diff --git a/ctdb/tests/recover.sh b/ctdb/tests/recover.sh
new file mode 100755 (executable)
index 0000000..c626441
--- /dev/null
@@ -0,0 +1,107 @@
+#!/bin/sh
+
+killall -q ctdbd
+
+echo "Starting 4 ctdb daemons"
+bin/ctdbd --recovery-daemon --nlist tests/4nodes.txt
+bin/ctdbd --recovery-daemon --nlist tests/4nodes.txt --listen=127.0.0.2 --socket=/tmp/ctdb.socket.127.0.0.2
+bin/ctdbd --recovery-daemon --nlist tests/4nodes.txt --listen=127.0.0.3 --socket=/tmp/ctdb.socket.127.0.0.3
+bin/ctdbd --recovery-daemon --nlist tests/4nodes.txt --listen=127.0.0.4 --socket=/tmp/ctdb.socket.127.0.0.4
+
+echo
+echo "Attaching to some databases"
+bin/ctdb_control attach test1.tdb || exit 1
+bin/ctdb_control attach test2.tdb || exit 1
+bin/ctdb_control attach test3.tdb || exit 1
+bin/ctdb_control attach test4.tdb || exit 1
+
+echo "Clearing all databases to make sure they are all empty"
+bin/ctdb_control getdbmap 0 | egrep "^dbid:" | sed -e "s/^dbid://" -e "s/ .*$//" | while read DB; do
+       seq 0 3 | while read NODE; do
+               bin/ctdb_control cleardb $NODE $DB
+       done
+done
+
+
+echo
+echo
+echo "Printing all databases on all nodes. they should all be empty"
+echo "============================================================="
+bin/ctdb_control getdbmap 0 | egrep "^dbid:" | sed -e "s/^.*name://" -e "s/ .*$//" | while read DBNAME; do
+       seq 0 3 | while read NODE; do
+               echo "Content of DBNAME:$DBNAME NODE:$NODE :"
+               bin/ctdb_control catdb $DBNAME $NODE
+       done
+done
+
+echo
+echo
+echo "Populating the databases"
+./bin/ctdb_control writerecord 0 0x220c2a7b testkey1 testdata1
+./bin/ctdb_control setdmaster 0 0x220c2a7b 1
+
+./bin/ctdb_control writerecord 1 0x220c2a7b testkey1 testdata1
+./bin/ctdb_control writerecord 1 0x220c2a7b testkey1 testdata1
+./bin/ctdb_control setdmaster 1 0x220c2a7b 2
+
+./bin/ctdb_control writerecord 2 0x220c2a7b testkey1 testdata1
+./bin/ctdb_control writerecord 2 0x220c2a7b testkey1 testdata1
+./bin/ctdb_control writerecord 2 0x220c2a7b testkey1 testdata1
+./bin/ctdb_control setdmaster 2 0x220c2a7b 3
+
+./bin/ctdb_control writerecord 3 0x220c2a7b testkey1 testdata1
+./bin/ctdb_control writerecord 3 0x220c2a7b testkey1 testdata1
+./bin/ctdb_control writerecord 3 0x220c2a7b testkey1 testdata1
+./bin/ctdb_control writerecord 3 0x220c2a7b testkey1 testdata1
+./bin/ctdb_control setdmaster 3 0x220c2a7b 3
+
+
+echo
+echo
+echo "Printing all databases on all nodes. there should be a record there"
+echo "============================================================="
+bin/ctdb_control getdbmap 0 | egrep "^dbid:" | sed -e "s/^.*name://" -e "s/ .*$//" | while read DBNAME; do
+       seq 0 3 | while read NODE; do
+               echo "Content of DBNAME:$DBNAME NODE:$NODE :"
+               bin/ctdb_control catdb $DBNAME $NODE
+       done
+done
+
+echo
+echo
+echo "killing off node #2"
+echo "==================="
+CTDBPID=`./bin/ctdb_control getpid 2 | sed -e "s/Pid://"`
+kill $CTDBPID
+sleep 1
+
+
+echo
+echo
+echo "wait 3 seconds to let the recovery daemon do its job"
+echo "===================================================="
+sleep 3
+
+echo
+echo
+echo "Printing all databases on all nodes."
+echo "The databases should be the same now on all nodes"
+echo "and the record will have been migrated to node 0"
+echo "================================================="
+echo "Node 0:"
+bin/ctdb_control catdb test4.tdb 0
+echo "Node 1:"
+bin/ctdb_control catdb test4.tdb 1
+echo "Node 3:"
+bin/ctdb_control catdb test4.tdb 3
+echo "nodemap:"
+bin/ctdb_control getnodemap 0
+
+echo
+echo
+echo "Traverse the cluster and dump the database"
+bin/ctdb_control catdb test4.tdb
+
+
+#leave the ctdb daemons running   so one can look at the box in more detail
+#killall -q ctdbd
diff --git a/ctdb/tests/run_cluster_tests.sh b/ctdb/tests/run_cluster_tests.sh
new file mode 120000 (symlink)
index 0000000..5236e32
--- /dev/null
@@ -0,0 +1 @@
+run_tests.sh
\ No newline at end of file
diff --git a/ctdb/tests/run_tests.sh b/ctdb/tests/run_tests.sh
new file mode 100755 (executable)
index 0000000..5fcd89d
--- /dev/null
@@ -0,0 +1,46 @@
+#!/bin/sh
+
+test_dir=$(dirname "$0")
+
+case $(basename "$0") in
+    *run_cluster_tests*)
+       # Running on a cluster:
+       # * print summary, run any integration tests against cluster
+       # * default to running: all integration tests, no unit tests
+       opts="-s"
+       tests="simple complex"
+       ;;
+    *)
+       # Running on local machine:
+       # * print summary, run any integration tests against local daemons
+       # * default to running: all unit tests, simple integration tests
+       opts="-s -l"
+       tests="onnode takeover tool eventscripts simple"
+       # If running in the source tree then use a fixed TEST_VAR_DIR.
+       # If this script is installed using the INSTALL script then
+       # TEST_BIN_DIR will be set, so use this as the test.
+       if [ -z "$TEST_BIN_DIR" ] ; then
+           opts="${opts} -V ${test_dir}/var"
+       fi
+esac
+
+# Allow options to be passed to this script.  However, if any options
+# are passed there must be a "--" between the options and the tests.
+# This makes it easy to handle options that take arguments.
+case "$1" in
+    -*)
+       while [ -n "$1" ] ; do
+           case "$1" in
+               --) shift ; break ;;
+               *) opts="$opts $1" ; shift ;;
+           esac
+       done
+esac
+
+# If no tests are specified, then run the defaults.
+[ -n "$1" ] || set -- $tests
+
+"${test_dir}/scripts/run_tests" $opts "$@" || exit 1
+
+echo "All OK"
+exit 0
diff --git a/ctdb/tests/scripts/common.sh b/ctdb/tests/scripts/common.sh
new file mode 100644 (file)
index 0000000..64a176b
--- /dev/null
@@ -0,0 +1,41 @@
+# Hey Emacs, this is a -*- shell-script -*- !!!  :-)
+
+# Common variables and functions for all CTDB tests.
+
+# This expands the most probable problem cases like "." and "..".
+TEST_SUBDIR=$(dirname "$0")
+if [ "$(dirname "$TEST_SUBDIR")" = "." ] ; then
+    TEST_SUBDIR=$(cd "$TEST_SUBDIR" ; pwd)
+fi
+
+_test_dir=$(dirname "$TEST_SUBDIR")
+
+# If we are running from within the source tree then, depending on the
+# tests that we're running, we may need to add the top level bin/ and
+# tools/ subdirectories to $PATH.  This means we need a way of
+# determining if we're running from within the source tree.  There is
+# no use looking outside the tests/ subdirectory because anything
+# above that level may be meaningless and outside our control.
+# Therefore, we'll use existence of $_test_dir/run_tests.sh to
+# indicate that we're running in-tree - on a system where the tests
+# have been installed, this file will be absent (renamed and placed in
+# some bin/ directory).
+if [ -f "${_test_dir}/run_tests.sh" ] ; then
+    ctdb_dir=$(dirname "$_test_dir")
+
+    _tools_dir="${ctdb_dir}/tools"
+    if [ -d "$_tools_dir" ] ; then
+       PATH="${_tools_dir}:$PATH"
+    fi
+fi
+
+_test_bin_dir="${TEST_BIN_DIR:-${_test_dir}/bin}"
+if [ -d "$_test_bin_dir" ] ; then
+    PATH="${_test_bin_dir}:$PATH"
+fi
+
+# Print a message and exit.
+die ()
+{
+    echo "$1" >&2 ; exit ${2:-1}
+}
diff --git a/ctdb/tests/scripts/integration.bash b/ctdb/tests/scripts/integration.bash
new file mode 100644 (file)
index 0000000..040a360
--- /dev/null
@@ -0,0 +1,980 @@
+# Hey Emacs, this is a -*- shell-script -*- !!!  :-)
+
+. "${TEST_SCRIPTS_DIR}/common.sh"
+
+# If we're not running on a real cluster then we need a local copy of
+# ctdb (and other stuff) in $PATH and we will use local daemons.
+if [ -n "$TEST_LOCAL_DAEMONS" ] ; then
+    export CTDB_NODES_SOCKETS=""
+    for i in $(seq 0 $(($TEST_LOCAL_DAEMONS - 1))) ; do
+       CTDB_NODES_SOCKETS="${CTDB_NODES_SOCKETS}${CTDB_NODES_SOCKETS:+ }${TEST_VAR_DIR}/sock.${i}"
+    done
+
+    # Use in-tree binaries if running against local daemons.
+    # Otherwise CTDB needs to be installed on all nodes.
+    if [ -n "$ctdb_dir" -a -d "${ctdb_dir}/bin" ] ; then
+       PATH="${ctdb_dir}/bin:${PATH}"
+        export CTDB_LOCK_HELPER="${ctdb_dir}/bin/ctdb_lock_helper"
+    fi
+
+    export CTDB_NODES="${TEST_VAR_DIR}/nodes.txt"
+fi
+
+######################################################################
+
+export CTDB_TIMEOUT=60
+
+if [ -n "$CTDB_TEST_REMOTE_DIR" ] ; then
+    CTDB_TEST_WRAPPER="${CTDB_TEST_REMOTE_DIR}/test_wrap"
+else
+    _d=$(cd "${TEST_SCRIPTS_DIR}" ; echo "$PWD")  # absolute path of scripts dir
+    CTDB_TEST_WRAPPER="$_d/test_wrap"
+fi
+export CTDB_TEST_WRAPPER
+
+# If $VALGRIND is set then use it whenever ctdb is called, but only if
+# $CTDB is not already set.
+[ -n "$CTDB" ] || export CTDB="${VALGRIND}${VALGRIND:+ }ctdb"
+
+# why???
+PATH="${TEST_SCRIPTS_DIR}:${PATH}"
+
+######################################################################
+
+ctdb_check_time_logs ()
+{
+    local threshold=20
+
+    local jump=false
+    local prev=""
+    local ds_prev=""
+    local node=""
+
+    out=$(onnode all tail -n 20 "${TEST_VAR_DIR}/ctdb.test.time.log" 2>&1)
+
+    if [ $? -eq 0 ] ; then
+       local line
+       while read line ; do
+           case "$line" in
+               \>\>\ NODE:\ *\ \<\<)
+                   node="${line#>> NODE: }"
+                   node=${node% <<*}
+                   ds_prev=""
+                   ;;
+               *\ *)
+                   set -- $line
+                   ds_curr="$1${2:0:1}"
+                   if [ -n "$ds_prev" ] && \
+                       [ $(($ds_curr - $ds_prev)) -ge $threshold ] ; then
+                       echo "Node $node had time jump of $(($ds_curr - $ds_prev))ds between $(date +'%T' -d @${ds_prev%?}) and $(date +'%T' -d @${ds_curr%?})"
+                       jump=true
+                   fi
+                   prev="$line"
+                   ds_prev="$ds_curr"
+                   ;;
+           esac
+       done <<<"$out"
+    else
+       echo Error getting time logs
+    fi
+    if $jump ; then
+       echo "Check time sync (test client first):"
+       date
+       onnode -p all date
+       echo "Information from test client:"
+       hostname
+       top -b -n 1
+       echo "Information from cluster nodes:"
+       onnode all "top -b -n 1 ; echo '/proc/slabinfo' ; cat /proc/slabinfo"
+    fi
+}
+
+ctdb_test_exit ()
+{
+    local status=$?
+
+    trap - 0
+
+    [ $(($testfailures+0)) -eq 0 -a $status -ne 0 ] && testfailures=$status
+    status=$(($testfailures+0))
+
+    # Avoid making a test fail from this point onwards.  The test is
+    # now complete.
+    set +e
+
+    echo "*** TEST COMPLETED (RC=$status) AT $(date '+%F %T'), CLEANING UP..."
+
+    if [ -z "$TEST_LOCAL_DAEMONS" -a -n "$CTDB_TEST_TIME_LOGGING" -a \
+       $status -ne 0 ] ; then
+       ctdb_check_time_logs
+    fi
+
+    eval "$ctdb_test_exit_hook" || true
+    unset ctdb_test_exit_hook
+
+    if $ctdb_test_restart_scheduled || ! cluster_is_healthy ; then
+
+       restart_ctdb
+    else
+       # This could be made unconditional but then we might get
+       # duplication from the recovery in restart_ctdb.  We want to
+       # leave the recovery in restart_ctdb so that future tests that
+       # might do a manual restart mid-test will benefit.
+       echo "Forcing a recovery..."
+       onnode 0 $CTDB recover
+    fi
+
+    exit $status
+}
+
+ctdb_test_exit_hook_add ()
+{
+    ctdb_test_exit_hook="${ctdb_test_exit_hook}${ctdb_test_exit_hook:+ ; }$*"
+}
+
+ctdb_test_init ()
+{
+    scriptname=$(basename "$0")
+    testfailures=0
+    ctdb_test_restart_scheduled=false
+
+    trap "ctdb_test_exit" 0
+}
+
+########################################
+
+# Run a command on node(s) via "onnode -q".  Sets: $out (stdout and stderr).
+try_command_on_node ()
+{
+    local nodespec="$1" ; shift
+
+    local verbose=false
+    local onnode_opts=""
+    # Leading "-" arguments: -v is handled here, the rest are passed to onnode.
+    while [ "${nodespec#-}" != "$nodespec" ] ; do
+       if [ "$nodespec" = "-v" ] ; then
+           verbose=true
+       else
+           onnode_opts="${onnode_opts}${onnode_opts:+ }${nodespec}"
+       fi
+       nodespec="$1" ; shift
+    done
+
+    local cmd="$*"
+
+    out=$(onnode -q $onnode_opts "$nodespec" "$cmd" 2>&1) || {
+
+       echo "Failed to execute \"$cmd\" on node(s) \"$nodespec\""
+       echo "$out"
+       return 1
+    }
+
+    if $verbose ; then
+       echo "Output of \"$cmd\":"
+       echo "$out"
+    fi
+}
+
+sanity_check_output ()
+{
+    local min_lines="$1"
+    local regexp="$2" # Should be anchored as necessary.
+    local output="$3"
+
+    local ret=0
+
+    local num_lines=$(echo "$output" | wc -l)
+    echo "There are $num_lines lines of output"
+    if [ $num_lines -lt $min_lines ] ; then
+       echo "BAD: that's less than the required number (${min_lines})"
+       ret=1
+    fi
+
+    local status=0
+    local unexpected # local doesn't pass through status of command on RHS.
+    unexpected=$(echo "$output" | egrep -v "$regexp") || status=$?
+
+    # Note that this is reversed.
+    if [ $status -eq 0 ] ; then
+       echo "BAD: unexpected lines in output:"
+       echo "$unexpected" | cat -A
+       ret=1
+    else
+       echo "Output lines look OK"
+    fi
+
+    return $ret
+}
+
+sanity_check_ips ()
+{
+    local ips="$1" # list of "ip node" lines
+
+    echo "Sanity checking IPs..."
+
+    local x ipp prev
+    prev=""
+    while read x ipp ; do
+       [ "$ipp" = "-1" ] && break
+       if [ -n "$prev" -a "$ipp" != "$prev" ] ; then
+           echo "OK"
+           return 0
+       fi
+       prev="$ipp"
+    done <<<"$ips"
+
+    echo "BAD: a node was -1 or IPs are only assigned to one node"
+    echo "Are you running an old version of CTDB?"
+    return 1
+}
+
+# This returns a list of "ip node" lines in $out
+all_ips_on_node()
+{
+    local node=$@
+    try_command_on_node $node "$CTDB ip -Y -n all | cut -d ':' -f1-3 | sed -e '1d' -e 's@^:@@' -e 's@:@ @g'"
+}
+
+_select_test_node_and_ips ()
+{
+    all_ips_on_node 0 || return 1 # don't parse an error message as IPs
+
+    test_node=""  # this matches no PNN
+    test_node_ips=""
+    local ip pnn
+    while read ip pnn ; do
+       if [ -z "$test_node" -a "$pnn" != "-1" ] ; then
+           test_node="$pnn"
+       fi
+       if [ "$pnn" = "$test_node" ] ; then
+            test_node_ips="${test_node_ips}${test_node_ips:+ }${ip}"
+       fi
+    done <<<"$out" # bashism to avoid problem setting variable in pipeline.
+
+    echo "Selected node ${test_node} with IPs: ${test_node_ips}."
+    test_ip="${test_node_ips%% *}"
+
+    [ -n "$test_node" ] || return 1
+}
+
+select_test_node_and_ips ()
+{
+    local timeout=10
+    while ! _select_test_node_and_ips ; do
+       echo "Unable to find a test node with IPs assigned"
+       if [ $timeout -le 0 ] ; then
+           echo "BAD: Too many attempts"
+           return 1
+       fi
+       sleep_for 1
+       timeout=$(($timeout - 1))
+    done
+
+    return 0
+}
+
+#######################################
+
+# Wait until either timeout expires or command succeeds.  The command
+# will be tried once per second.
+wait_until ()
+{
+    local timeout="$1" ; shift # "$@" is the command...
+
+    local negate=false
+    if [ "$1" = "!" ] ; then
+       negate=true
+       shift
+    fi
+
+    echo -n "<${timeout}|"
+    local t=$timeout
+    while [ $t -gt 0 ] ; do
+       local rc=0
+       "$@" || rc=$?
+       if { ! $negate && [ $rc -eq 0 ] ; } || \
+           { $negate && [ $rc -ne 0 ] ; } ; then
+           echo "|$(($timeout - $t))|"
+           echo "OK"
+           return 0
+       fi
+       echo -n .
+       t=$(($t - 1))
+       sleep 1
+    done
+
+    echo "*TIMEOUT*"
+
+    return 1
+}
+
+sleep_for ()
+{
+    echo -n "=${1}|"
+    for i in $(seq 1 $1) ; do
+       echo -n '.'
+       sleep 1
+    done
+    echo '|'
+}
+
+_cluster_is_healthy ()
+{
+    $CTDB nodestatus all >/dev/null && \
+       node_has_status 0 recovered
+}
+
+cluster_is_healthy ()
+{
+    if onnode 0 $CTDB_TEST_WRAPPER _cluster_is_healthy ; then
+       echo "Cluster is HEALTHY"
+       return 0
+    else
+       echo "Cluster is UNHEALTHY"
+       if ! ${ctdb_test_restart_scheduled:-false} ; then
+           echo "DEBUG AT $(date '+%F %T'):"
+           local i
+           for i in "onnode -q 0 $CTDB status" "onnode -q 0 onnode all $CTDB scriptstatus" ; do
+               echo "$i"
+               $i || true
+           done
+       fi
+       return 1
+    fi
+}
+
+wait_until_healthy ()
+{
+    local timeout="${1:-120}"  # seconds to wait; defaults to 120
+
+    echo "Waiting for cluster to become healthy..."
+
+    wait_until "$timeout" _cluster_is_healthy
+}
+
+# This function is becoming nicely overloaded.  Soon it will collapse!  :-)
+node_has_status ()
+{
+    local pnn="$1"
+    local status="$2"
+
+    local bits fpat mpat rpat
+    case "$status" in
+       (unhealthy)    bits="?:?:?:1:*" ;;
+       (healthy)      bits="?:?:?:0:*" ;;
+       (disconnected) bits="1:*" ;;
+       (connected)    bits="0:*" ;;
+       (banned)       bits="?:1:*" ;;
+       (unbanned)     bits="?:0:*" ;;
+       (disabled)     bits="?:?:1:*" ;;
+       (enabled)      bits="?:?:0:*" ;;
+       (stopped)      bits="?:?:?:?:1:*" ;;
+       (notstopped)   bits="?:?:?:?:0:*" ;;
+       (frozen)       fpat='^[[:space:]]+frozen[[:space:]]+1$' ;;
+       (unfrozen)     fpat='^[[:space:]]+frozen[[:space:]]+0$' ;;
+       (monon)        mpat='^Monitoring mode:ACTIVE \(0\)$' ;;
+       (monoff)       mpat='^Monitoring mode:DISABLED \(1\)$' ;;
+       (recovered)    rpat='^Recovery mode:NORMAL \(0\)$' ;;
+       *)
+           echo "node_has_status: unknown status \"$status\""
+           return 1
+    esac
+
+    if [ -n "$bits" ] ; then
+       local out x line
+
+       out=$($CTDB -Y status 2>&1) || return 1
+
+       {
+            read x
+            while read line ; do
+               # This needs to be done in 2 steps to avoid false matches.
+               local line_bits="${line#:${pnn}:*:}"
+               [ "$line_bits" = "$line" ] && continue
+               [ "${line_bits#${bits}}" != "$line_bits" ] && return 0
+            done
+           return 1
+       } <<<"$out" # Yay bash!
+    elif [ -n "$fpat" ] ; then
+       $CTDB statistics -n "$pnn" | egrep -q "$fpat"
+    elif [ -n "$mpat" ] ; then
+       $CTDB getmonmode -n "$pnn" | egrep -q "$mpat"
+    elif [ -n "$rpat" ] ; then
+        $CTDB status -n "$pnn" | egrep -q "$rpat"
+    else
+       echo 'node_has_status: unknown mode, neither $bits nor $fpat is set'
+       return 1
+    fi
+}
+
+wait_until_node_has_status ()
+{
+    local pnn="$1"
+    local status="$2"
+    local timeout="${3:-30}"
+    local proxy_pnn="${4:-any}"
+
+    echo "Waiting until node $pnn has status \"$status\"..."
+
+    if ! wait_until $timeout onnode $proxy_pnn $CTDB_TEST_WRAPPER node_has_status "$pnn" "$status" ; then
+       for i in "onnode -q any $CTDB status" "onnode -q any onnode all $CTDB scriptstatus" ; do
+           echo "$i"
+           $i || true
+       done
+
+       return 1
+    fi
+
+}
+
+# Useful for superficially testing IP failover.
+# IPs must be on nodes matching nodeglob.
+# If the first argument is '!' then the IPs must not be on nodes
+# matching nodeglob.
+ips_are_on_nodeglob ()
+{
+    local negating=false
+    if [ "$1" = "!" ] ; then
+       negating=true ; shift
+    fi
+    local nodeglob="$1" ; shift
+    local ips="$*"
+
+    local out check ip pnn
+    # Sets $out to "ip node" lines; give up if the query itself failed.
+    all_ips_on_node 1 || return 1
+
+    for check in $ips ; do
+       while read ip pnn ; do
+           if [ "$check" = "$ip" ] ; then
+               case "$pnn" in
+                   ($nodeglob) if $negating ; then return 1 ; fi ;;
+                   (*) if ! $negating ; then return 1 ; fi  ;;
+               esac
+               ips="${ips/${ip}}" # Remove from list
+               break
+           fi
+           # If we're negating and we didn't see the address then it
+           # isn't hosted by anyone!
+           if $negating ; then
+               ips="${ips/${check}}"
+           fi
+       done <<<"$out" # bashism to avoid problem setting variable in pipeline.
+    done
+
+    ips="${ips// }" # Remove any spaces.
+    [ -z "$ips" ]
+}
+
+wait_until_ips_are_on_nodeglob ()
+{
+    echo "Waiting for IPs to fail over..."
+
+    wait_until 60 ips_are_on_nodeglob "$@"
+}
+
+node_has_some_ips ()
+{
+    local node="$1"
+
+    local out ip pnn
+    # Sets $out to "ip node" lines; give up if the query itself failed.
+    all_ips_on_node 1 || return 1
+
+    while read ip pnn ; do
+       if [ "$node" = "$pnn" ] ; then
+           return 0
+       fi
+    done <<<"$out" # bashism to avoid problem setting variable in pipeline.
+
+    return 1
+}
+
+wait_until_node_has_some_ips ()
+{
+    echo "Waiting for node to have some IPs..."
+
+    wait_until 60 node_has_some_ips "$@"
+}
+
+ip2ipmask ()
+{
+    _ip="$1"
+
+    ip addr show to "$_ip" | awk '$1 == "inet" { print $2 }'
+}
+
+#######################################
+
+daemons_stop ()
+{
+    echo "Attempting to politely shutdown daemons..."
+    onnode 1 $CTDB shutdown -n all || true
+
+    echo "Sleeping for a while..."
+    sleep_for 1
+
+    local pat="ctdbd --socket=.* --nlist .* --nopublicipcheck"
+    if pgrep -f "$pat" >/dev/null ; then
+       echo "Killing remaining daemons..."
+       pkill -f "$pat"
+
+       if pgrep -f "$pat" >/dev/null ; then
+           echo "Once more with feeling.."
+           pkill -9 -f "$pat"
+       fi
+    fi
+
+    rm -rf "${TEST_VAR_DIR}/test.db"
+}
+
+daemons_setup ()
+{
+    mkdir -p "${TEST_VAR_DIR}/test.db/persistent"
+
+    local public_addresses_all="${TEST_VAR_DIR}/public_addresses_all"
+    local no_public_addresses="${TEST_VAR_DIR}/no_public_addresses.txt"
+    rm -f $CTDB_NODES $public_addresses_all $no_public_addresses
+
+    # If there are (strictly) greater than 2 nodes then we'll randomly
+    # choose a node to have no public addresses.
+    local no_public_ips=-1
+    [ $TEST_LOCAL_DAEMONS -gt 2 ] && no_public_ips=$(($RANDOM % $TEST_LOCAL_DAEMONS))
+    echo "$no_public_ips" >$no_public_addresses
+
+    # When running certain tests we add and remove eventscripts, so we
+    # need to be able to modify the events.d/ directory.  Therefore,
+    # we use a temporary events.d/ directory under $TEST_VAR_DIR.  We
+    # copy the actual test eventscript(s) in there from the original
+    # events.d/ directory that sits alongside $TEST_SCRIPTS_DIR.
+    local top=$(dirname "$TEST_SCRIPTS_DIR")
+    local events_d="${top}/events.d"
+    mkdir -p "${TEST_VAR_DIR}/events.d"
+    cp -p "${events_d}/"* "${TEST_VAR_DIR}/events.d/"
+
+    local i
+    for i in $(seq 1 $TEST_LOCAL_DAEMONS) ; do
+       if [ "${CTDB_USE_IPV6}x" != "x" ]; then
+           echo ::$i >>"$CTDB_NODES"
+           ip addr add ::$i/128 dev lo
+       else
+           echo 127.0.0.$i >>"$CTDB_NODES"
+           # 2 public addresses on most nodes, just to make things interesting.
+           if [ $(($i - 1)) -ne $no_public_ips ] ; then
+               echo "192.168.234.$i/24 lo" >>"$public_addresses_all"
+               echo "192.168.234.$(($i + $TEST_LOCAL_DAEMONS))/24 lo" >>"$public_addresses_all"
+           fi
+       fi
+    done
+}
+
+daemons_start_1 ()
+{
+    local pnn="$1"
+    shift # "$@" gets passed to ctdbd
+
+    local public_addresses_all="${TEST_VAR_DIR}/public_addresses_all"
+    local public_addresses_mine="${TEST_VAR_DIR}/public_addresses.${pnn}"
+    local no_public_addresses="${TEST_VAR_DIR}/no_public_addresses.txt"
+
+    local no_public_ips=-1
+    [ -r $no_public_addresses ] && read no_public_ips <$no_public_addresses
+
+    if  [ "$no_public_ips" = $pnn ] ; then
+       echo "Node $no_public_ips will have no public IPs."
+    fi
+
+    local node_ip=$(sed -n -e "$(($pnn + 1))p" "$CTDB_NODES")
+    local ctdb_options="--sloppy-start --reclock=${TEST_VAR_DIR}/rec.lock --nlist $CTDB_NODES --nopublicipcheck --listen=${node_ip} --event-script-dir=${TEST_VAR_DIR}/events.d --logfile=${TEST_VAR_DIR}/daemon.${pnn}.log -d 3 --log-ringbuf-size=10000 --dbdir=${TEST_VAR_DIR}/test.db --dbdir-persistent=${TEST_VAR_DIR}/test.db/persistent --dbdir-state=${TEST_VAR_DIR}/test.db/state"
+
+    if [ $pnn -eq $no_public_ips ] ; then
+       ctdb_options="$ctdb_options --public-addresses=/dev/null"
+    else
+       cp "$public_addresses_all" "$public_addresses_mine"
+       ctdb_options="$ctdb_options --public-addresses=$public_addresses_mine"
+    fi
+
+    # We'll use "pkill -f" to kill the daemons with
+    # "--socket=.* --nlist .* --nopublicipcheck" as context.
+    $VALGRIND ctdbd --socket="${TEST_VAR_DIR}/sock.$pnn" $ctdb_options "$@" ||return 1
+}
+
+daemons_start ()
+{
+    # "$@" gets passed to ctdbd
+
+    echo "Starting $TEST_LOCAL_DAEMONS ctdb daemons..."
+
+    for i in $(seq 0 $(($TEST_LOCAL_DAEMONS - 1))) ; do
+       daemons_start_1 $i "$@" || return 1 # let the caller see the failure
+    done
+}
+
+#######################################
+
+_ctdb_hack_options ()
+{
+    local ctdb_options="$*"
+
+    case "$ctdb_options" in
+       *--start-as-stopped*)
+           export CTDB_START_AS_STOPPED="yes"
+    esac
+}
+
+_restart_ctdb ()
+{
+    _ctdb_hack_options "$@"
+
+    if [ -e /etc/redhat-release ] ; then
+       service ctdb restart
+    else
+       /etc/init.d/ctdb restart
+    fi
+}
+
+_ctdb_start ()
+{
+    _ctdb_hack_options "$@"
+
+    /etc/init.d/ctdb start
+}
+
+setup_ctdb ()
+{
+    if [ -n "$CTDB_NODES_SOCKETS" ] ; then
+       daemons_setup
+    fi
+}
+
+# Common things to do after starting one or more nodes.
+_ctdb_start_post ()
+{
+    onnode -q 1  $CTDB_TEST_WRAPPER wait_until_healthy || return 1
+
+    echo "Setting RerecoveryTimeout to 1"
+    onnode -pq all "$CTDB setvar RerecoveryTimeout 1"
+
+    # In recent versions of CTDB, forcing a recovery like this blocks
+    # until the recovery is complete.  Hopefully this will help the
+    # cluster to stabilise before a subsequent test.
+    echo "Forcing a recovery..."
+    onnode -q 0 $CTDB recover
+    sleep_for 1
+
+    echo "ctdb is ready"
+}
+
+# This assumes that ctdbd is not running on the given node.
+ctdb_start_1 ()
+{
+    local pnn="$1"
+    shift # "$@" is passed to ctdbd start.
+
+    echo -n "Starting CTDB on node ${pnn}..."
+
+    if [ -n "$CTDB_NODES_SOCKETS" ] ; then
+       daemons_start_1 $pnn "$@"
+    else
+       onnode $pnn $CTDB_TEST_WRAPPER _ctdb_start "$@"
+    fi
+
+    # If we're starting only 1 node then we're doing something weird.
+    ctdb_restart_when_done
+}
+
+restart_ctdb ()
+{
+    # "$@" is passed to ctdbd start.
+
+    echo -n "Restarting CTDB"
+    if $ctdb_test_restart_scheduled ; then
+       echo -n " (scheduled)"
+    fi
+    echo "..."
+
+    local i
+    for i in $(seq 1 5) ; do
+       if [ -n "$CTDB_NODES_SOCKETS" ] ; then
+           daemons_stop
+           daemons_start "$@"
+       else
+           onnode -p all $CTDB_TEST_WRAPPER _restart_ctdb "$@"
+       fi || {
+           echo "Restart failed.  Trying again in a few seconds..."
+           sleep_for 5
+           continue
+       }
+
+       onnode -q 1  $CTDB_TEST_WRAPPER wait_until_healthy || {
+           echo "Cluster didn't become healthy.  Restarting..."
+           continue
+       }
+
+       echo "Setting RerecoveryTimeout to 1"
+       onnode -pq all "$CTDB setvar RerecoveryTimeout 1"
+
+       # In recent versions of CTDB, forcing a recovery like this
+       # blocks until the recovery is complete.  Hopefully this will
+       # help the cluster to stabilise before a subsequent test.
+       echo "Forcing a recovery..."
+       onnode -q 0 $CTDB recover
+       sleep_for 1
+
+       # If the cluster is no longer healthy after the recovery, restart again.
+       if ! onnode 0 $CTDB_TEST_WRAPPER _cluster_is_healthy ; then
+           echo "Cluster became UNHEALTHY again [$(date)]"
+           onnode -p all ctdb status -Y 2>&1
+           onnode -p all ctdb scriptstatus 2>&1
+           echo "Restarting..."
+           continue
+       fi
+
+       echo "Doing a sync..."
+       onnode -q 0 $CTDB sync
+
+       echo "ctdb is ready"
+       return 0
+    done
+
+    echo "Cluster UNHEALTHY...  too many attempts..."
+    onnode -p all ctdb status -Y 2>&1
+    onnode -p all ctdb scriptstatus 2>&1
+
+    # Try to make the calling test fail
+    status=1
+    return 1
+}
+
+ctdb_restart_when_done ()
+{
+    ctdb_test_restart_scheduled=true
+}
+
+get_ctdbd_command_line_option ()
+{
+    local pnn="$1"
+    local option="$2"
+
+    try_command_on_node "$pnn" "$CTDB getpid" || \
+       die "Unable to get PID of ctdbd on node $pnn"
+
+    local pid="${out#*:}"
+    try_command_on_node "$pnn" "ps -p $pid -o args hww" || \
+       die "Unable to get command-line of PID $pid"
+
+    # Strip everything up to and including --option
+    local t="${out#*--${option}}"
+    # Strip leading '=' or space if present
+    t="${t#=}"
+    t="${t# }"
+    # Strip any following options and print
+    echo "${t%% -*}"
+}
+
+#######################################
+
+install_eventscript ()
+{
+    local script_name="$1"
+    local script_contents="$2"
+
+    if [ -z "$TEST_LOCAL_DAEMONS" ] ; then
+       # The quoting here is *very* fragile.  However, we do
+       # experience the joy of installing a short script using
+       # onnode, and without needing to know the IP addresses of the
+       # nodes.
+       onnode all "f=\"\${CTDB_BASE:-/etc/ctdb}/events.d/${script_name}\" ; echo \"Installing \$f\" ; echo '${script_contents}' > \"\$f\" ; chmod 755 \"\$f\""
+    else
+       f="${TEST_VAR_DIR}/events.d/${script_name}"
+       echo "$script_contents" >"$f"
+       chmod 755 "$f"
+    fi
+}
+
+uninstall_eventscript ()
+{
+    local script_name="$1"
+
+    if [ -z "$TEST_LOCAL_DAEMONS" ] ; then
+       onnode all "rm -vf \"\${CTDB_BASE:-/etc/ctdb}/events.d/${script_name}\""
+    else
+       rm -vf "${TEST_VAR_DIR}/events.d/${script_name}"
+    fi
+}
+
+#######################################
+
+# This section deals with the 99.ctdb_test eventscript.
+
+# Metafunctions: create, remove or check for a ctdb-test file of the
+# given type on a node.
+ctdb_test_eventscript_file_create ()
+{
+    local pnn="$1"
+    local type="$2"
+
+    try_command_on_node $pnn touch "/tmp/ctdb-test-${type}.${pnn}"
+}
+
+ctdb_test_eventscript_file_remove ()
+{
+    local pnn="$1"
+    local type="$2"
+
+    try_command_on_node $pnn rm -f "/tmp/ctdb-test-${type}.${pnn}"
+}
+
+ctdb_test_eventscript_file_exists ()
+{
+    local pnn="$1"
+    local type="$2"
+
+    try_command_on_node $pnn test -f "/tmp/ctdb-test-${type}.${pnn}" >/dev/null 2>&1
+}
+
+
+# Handle a flag file on a node that is removed by 99.ctdb_test on the
+# given event.
+ctdb_test_eventscript_flag ()
+{
+    local cmd="$1"
+    local pnn="$2"
+    local event="$3"
+
+    ctdb_test_eventscript_file_${cmd} "$pnn" "flag-${event}"
+}
+
+
+# Handle a trigger that causes 99.ctdb_test to fail its monitor
+# event.
+ctdb_test_eventscript_unhealthy_trigger ()
+{
+    local cmd="$1"
+    local pnn="$2"
+
+    ctdb_test_eventscript_file_${cmd} "$pnn" "unhealthy-trigger"
+}
+
+# Handle the file that 99.ctdb_test created to show that it has marked
+# a node unhealthy because it detected the above trigger.
+ctdb_test_eventscript_unhealthy_detected ()
+{
+    local cmd="$1"
+    local pnn="$2"
+
+    ctdb_test_eventscript_file_${cmd} "$pnn" "unhealthy-detected"
+}
+
+# Handle a trigger that causes 99.ctdb_test to time out on the given
+# event.  This should cause the node to be banned.
+ctdb_test_eventscript_timeout_trigger ()
+{
+    local cmd="$1"
+    local pnn="$2"
+    local event="$3"
+
+    ctdb_test_eventscript_file_${cmd} "$pnn" "${event}-timeout"
+}
+
+# Note that the eventscript can't use the above functions!
+ctdb_test_eventscript_install ()
+{
+    # Keep single quotes out of the script text: it is single-quoted here.
+    local script='#!/bin/sh
+out=$(ctdb pnn)
+pnn="${out#PNN:}"
+
+rm -vf "/tmp/ctdb-test-flag-${1}.${pnn}"
+
+trigger="/tmp/ctdb-test-unhealthy-trigger.${pnn}"
+detected="/tmp/ctdb-test-unhealthy-detected.${pnn}"
+timeout_trigger="/tmp/ctdb-test-${1}-timeout.${pnn}"
+case "$1" in
+    monitor)
+        if [ -e "$trigger" ] ; then
+            echo "${0}: Unhealthy because \"$trigger\" detected"
+            touch "$detected"
+            exit 1
+        elif [ -e "$detected" -a ! -e "$trigger" ] ; then
+            echo "${0}: Healthy again, \"$trigger\" no longer detected"
+            rm "$detected"
+        fi
+
+       ;;
+    *)
+        if [ -e "$timeout_trigger" ] ; then
+            echo "${0}: Sleeping for a long time because \"$timeout_trigger\" detected"
+            sleep 9999
+        fi
+       ;;
+
+
+esac
+
+exit 0
+'
+    install_eventscript "99.ctdb_test" "$script"
+}
+
+ctdb_test_eventscript_uninstall ()
+{
+    uninstall_eventscript "99.ctdb_test"
+}
+
+# Note that this only works if you know all other monitor events will
+# succeed.  You also need to install the eventscript before using it.
+wait_for_monitor_event ()
+{
+    local pnn="$1"
+
+    echo "Waiting for a monitor event on node ${pnn}..."
+    ctdb_test_eventscript_flag create $pnn "monitor"
+
+    wait_until 120 ! ctdb_test_eventscript_flag exists $pnn "monitor"
+
+}
+
+#######################################
+
+nfs_test_setup ()
+{
+    select_test_node_and_ips
+
+    nfs_first_export=$(showmount -e $test_ip | sed -n -e '2s/ .*//p')
+
+    echo "Creating test subdirectory..."
+    try_command_on_node $test_node "mktemp -d --tmpdir=$nfs_first_export"
+    nfs_test_dir="$out"
+    try_command_on_node $test_node "chmod 777 $nfs_test_dir"
+
+    nfs_mnt_d=$(mktemp -d)
+    nfs_local_file="${nfs_mnt_d}/${nfs_test_dir##*/}/TEST_FILE"
+    nfs_remote_file="${nfs_test_dir}/TEST_FILE"
+
+    ctdb_test_exit_hook_add nfs_test_cleanup
+
+    echo "Mounting ${test_ip}:${nfs_first_export} on ${nfs_mnt_d} ..."
+    mount -o timeo=1,hard,intr,vers=3 \
+       ${test_ip}:${nfs_first_export} ${nfs_mnt_d}
+}
+
+nfs_test_cleanup ()
+{
+    rm -f "$nfs_local_file"
+    umount -f "$nfs_mnt_d"
+    rmdir "$nfs_mnt_d"
+    onnode -q $test_node rmdir "$nfs_test_dir"
+}
+
+
+
+#######################################
+
+# Make sure that $CTDB is set.
+: ${CTDB:=ctdb}
+
+local="${TEST_SUBDIR}/scripts/local.bash"
+if [ -r "$local" ] ; then
+    . "$local"
+fi
diff --git a/ctdb/tests/scripts/run_tests b/ctdb/tests/scripts/run_tests
new file mode 100755 (executable)
index 0000000..171e819
--- /dev/null
@@ -0,0 +1,273 @@
+#!/bin/bash
+
+# Print the usage message on stdout and exit with status 1.
+usage() {
+    cat <<EOF
+Usage: run_tests [OPTIONS] [TESTS]
+
+Options:
+  -s           Print a summary of tests results after running all tests
+  -l           Use local daemons for integration tests
+  -e           Exit on the first test failure
+  -V <dir>     Use <dir> as TEST_VAR_DIR
+  -C           Clean up - kill daemons and remove TEST_VAR_DIR when done
+  -v           Verbose - print test output for non-failures (only some tests)
+  -A           Use "cat -A" to print test output (only some tests)
+  -D           Show diff between failed/expected test output (some tests only)
+  -X           Trace certain scripts run by tests using -x (only some tests)
+  -d           Print descriptions of tests instead of filenames (dodgy!)
+  -H           No headers - for running single test with other wrapper
+  -q           Quiet - don't show tests being run (hint: use with -s)
+  -x           Trace this script with the -x option
+EOF
+    exit 1
+}
+
+# Print a message and exit.
+die ()
+{
+    echo "$1" >&2 ; exit ${2:-1}
+}
+
+######################################################################
+
+with_summary=false
+with_desc=false
+quiet=false
+exit_on_fail=false
+no_header=false
+
+export TEST_VERBOSE=false
+export TEST_COMMAND_TRACE=false
+export TEST_CAT_RESULTS_OPTS=""
+export TEST_DIFF_RESULTS=false
+export TEST_LOCAL_DAEMONS  # No default, developer can "override"!
+export TEST_VAR_DIR=""
+export TEST_CLEANUP=false
+
+temp=$(getopt -n "$prog" -o "xdehlqsvV:XACDH" -l help -- "$@")
+
+[ $? != 0 ] && usage
+
+eval set -- "$temp"
+
+while true ; do
+    case "$1" in
+       -x) set -x; shift ;;
+       -d) with_desc=true ; shift ;;  # 4th line of output is description
+       -e) exit_on_fail=true ; shift ;;
+       -l) TEST_LOCAL_DAEMONS="3" ; shift ;;
+       -q) quiet=true ; shift ;;
+       -s) with_summary=true ; shift ;;
+       -v) TEST_VERBOSE=true ; shift ;;
+       -V) TEST_VAR_DIR="$2" ; shift 2 ;;
+       -X) TEST_COMMAND_TRACE=true ; shift ;;
+       -A) TEST_CAT_RESULTS_OPTS="-A" ; shift ;;
+       -C) TEST_CLEANUP=true ; shift ;;
+       -D) TEST_DIFF_RESULTS=true ; shift ;;
+       -H) no_header=true ; shift ;;
+       --) shift ; break ;;
+       *) usage ;;
+    esac
+done
+
+# show_progress is a stdin filter for test output: it discards
+# everything in quiet mode and copies it to stdout otherwise.
+if $quiet ; then
+    show_progress() { cat >/dev/null ; }
+else
+    show_progress() { cat ; }
+fi
+
+######################################################################
+
+ctdb_test_begin ()
+{
+    local name="$1"
+
+    teststarttime=$(date '+%s')
+    testduration=0
+
+    echo "--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--"
+    echo "Running test $name ($(date '+%T'))"
+    echo "--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--"
+}
+
+ctdb_test_end ()
+{
+    local name="$1" ; shift
+    local status="$1" ; shift
+    # "$@" is command-line
+
+    local interp="SKIPPED"
+    local statstr=" (reason $*)"
+    if [ -n "$status" ] ; then
+       if [ $status -eq 0 ] ; then
+           interp="PASSED"
+           statstr=""
+           echo "ALL OK: $*"
+       else
+           interp="FAILED"
+           statstr=" (status $status)"
+       fi
+    fi
+
+    testduration=$(($(date +%s)-$teststarttime))
+
+    echo "=========================================================================="
+    echo "TEST ${interp}: ${name}${statstr} (duration: ${testduration}s)"
+    echo "=========================================================================="
+
+}
+
+# Run one test and return its exit status.
+#   $1    - test name
+#   $2... - command to run; when $2 is empty the name itself is run
+# The begin/end banners are printed unless $no_header is true.
+ctdb_test_run ()
+{
+    local name="$1" ; shift
+
+    # No explicit command given: the test name is the command.
+    [ -n "$1" ] || set -- "$name"
+
+    $no_header || ctdb_test_begin "$name"
+
+    # Capture the command's exit status in $status.
+    local status=0
+    "$@" || status=$?
+
+    $no_header || ctdb_test_end "$name" "$status" "$*"
+
+    return $status
+}
+
+######################################################################
+
+# Counters updated by run_one_test.
+tests_total=0
+tests_passed=0
+tests_failed=0
+# NOTE(review): "summary" is not referenced in the part of the script
+# visible here; the summary lines are collected in the file $sf.
+summary=""
+
+# Fallback mktemp for systems without one.  Only "-d" as the first
+# argument is recognised; the name is built from the PID and $RANDOM.
+if ! which mktemp >/dev/null 2>&1 ; then
+    # Not perfect, but it will do...
+    mktemp ()
+    {
+       _dir=false
+       if [ "$1" = "-d" ] ; then
+           _dir=true
+       fi
+       _t="${TMPDIR:-/tmp}/tmp.$$.$RANDOM"
+       # Subshell so that the restrictive umask does not leak out.
+       (
+           umask 077
+           if $_dir ; then
+               mkdir "$_t"
+           else
+               >"$_t"
+           fi
+       )
+       echo "$_t"
+    }
+fi
+
+# tf: output of the test currently being run (written by tee).
+# sf: accumulated summary lines.
+tf=$(mktemp)
+sf=$(mktemp)
+
+# Make a pipeline's status reflect a failing command anywhere in it,
+# so run_one_test sees the test's status through "| tee | show_progress".
+set -o pipefail
+
+run_one_test ()
+{
+    _f="$1"
+
+    [ -x "$_f" ] || die "test \"$_f\" is not executable"
+    tests_total=$(($tests_total + 1))
+
+    ctdb_test_run "$_f" | tee "$tf" | show_progress
+    status=$?
+    if [ $status -eq 0 ] ; then
+       tests_passed=$(($tests_passed + 1))
+    else
+       tests_failed=$(($tests_failed + 1))
+    fi
+    if $with_summary ; then
+       if [ $status -eq 0 ] ; then
+           _t=" PASSED "
+       else
+           _t="*FAILED*"
+       fi
+       if $with_desc ; then
+           desc=$(tail -n +4 $tf | head -n 1)
+           _f="$desc"
+       fi
+       echo "$_t $_f" >>"$sf"
+    fi
+}
+
+find_and_run_one_test ()
+{
+    _t="$1"
+    _dir="$2"
+
+    _f="${_dir}${_dir:+/}${_t}"
+
+    if [ -d "$_f" ] ; then
+       for _i in $(ls "${_f%/}/"*".sh" 2>/dev/null) ; do
+           run_one_test "$_i"
+           if $exit_on_fail && [ $status -ne 0 ] ; then
+               break
+           fi
+       done
+    elif [ -f "$_f" ] ; then
+       run_one_test "$_f"
+    else
+       status=127
+    fi
+}
+
+[ -n "$TEST_VAR_DIR" ] || TEST_VAR_DIR=$(mktemp -d)
+mkdir -p "$TEST_VAR_DIR"
+# Must be absolute
+TEST_VAR_DIR=$(cd "$TEST_VAR_DIR"; echo "$PWD")
+echo "TEST_VAR_DIR=$TEST_VAR_DIR"
+
+export TEST_SCRIPTS_DIR=$(dirname "$0")
+
+for f ; do
+    find_and_run_one_test "$f"
+
+    if [ $status -eq 127 ] ; then
+       # Find the the top-level tests directory
+       tests_dir=$(dirname $(cd $TEST_SCRIPTS_DIR; echo $PWD))
+       # Strip off current directory from beginning, if there, just
+       # to make paths more friendly.
+       tests_dir=${tests_dir#$PWD/}
+       find_and_run_one_test "$f" "$tests_dir"
+    fi
+
+    if [ $status -eq 127 ] ; then
+           die "test \"$f\" is not recognised"
+    fi
+
+    if $exit_on_fail && [ $status -ne 0 ] ; then
+           break
+    fi
+done
+
+rm -f "$tf"
+
+if $with_summary ; then
+    echo
+    cat "$sf"
+    echo
+    echo "${tests_passed}/${tests_total} tests passed"
+fi
+
+rm -f "$sf"
+
+echo
+
+if $TEST_CLEANUP ; then
+    echo "Removing TEST_VAR_DIR=$TEST_VAR_DIR"
+    rm -rf "$TEST_VAR_DIR"
+else
+    echo "Not cleaning up TEST_VAR_DIR=$TEST_VAR_DIR"
+fi
+
+if $no_header || $exit_on_fail ; then
+    exit $status
+elif [ $tests_failed -gt 0 ] ; then
+    exit 1
+else
+    exit 0
+fi
diff --git a/ctdb/tests/scripts/test_wrap b/ctdb/tests/scripts/test_wrap
new file mode 100755 (executable)
index 0000000..176310e
--- /dev/null
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Execute the given command.  The intention is that it is a function
+# from "${TEST_SCRIPTS_DIR}/integration.bash".
+
+PATH="$(dirname $0):${PATH}"
+
+TEST_SCRIPTS_DIR=$(dirname $0)
+
+# We need the test binaries (i.e. tests/bin/) to be in $PATH.  If they
+# aren't already in $PATH then we know that tests/bin/ sits alongside
+# tests/scripts/.
+f="ctdb_bench"
+if [ ! $(which $f >/dev/null 2>&1) ] ; then
+    d=$(dirname "$TEST_SCRIPTS_DIR")/bin
+    [ -x "$d/$f" ] && PATH="$d:$PATH"
+fi
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+"$@"
diff --git a/ctdb/tests/scripts/unit.sh b/ctdb/tests/scripts/unit.sh
new file mode 100644 (file)
index 0000000..c7c2b7a
--- /dev/null
@@ -0,0 +1,141 @@
+# Hey Emacs, this is a -*- shell-script -*- !!!  :-)
+
+. "${TEST_SCRIPTS_DIR}/common.sh"
+
+# Common variables and functions for CTDB unit tests.
+
+# Set the required result for a test.
+# - Argument 1 is exit code.
+# - Argument 2, if present is the required test output but "--"
+#   indicates empty output.
+# If argument 2 is not present or null then read required test output
+# from stdin.
+required_result ()
+{
+    required_rc="${1:-0}"
+    if [ -n "$2" ] ; then
+       if [ "$2" = "--" ] ; then
+           required_output=""
+       else
+           required_output="$2"
+       fi
+    else
+       if ! tty -s ; then
+           required_output=$(cat)
+       else
+           required_output=""
+       fi
+    fi
+}
+
+# Require exit code 0; any arguments are passed on to required_result
+# as the required output.
+ok ()
+{
+    required_result 0 "$@"
+}
+
+# Require exit code 0 and empty output.
+ok_null ()
+{
+    ok --
+}
+
+# Print the details of a test result.
+#   $1 - "true" or "false": whether the test passed (run as a command)
+#   $2 - actual output
+#   $3 - actual exit status
+#   $4 - optional extra header text
+# The actual output is shown on failure or when TEST_VERBOSE is true.
+# On failure the required output is shown too and, when
+# TEST_DIFF_RESULTS is true, a unified diff against $_fout.
+# Reads the globals required_output, required_rc and _fout (the latter
+# is set by result_check in this file).
+result_print ()
+{
+    _passed="$1"
+    _out="$2"
+    _rc="$3"
+    _extra_header="$4"
+
+    if "$TEST_VERBOSE" || ! $_passed ; then
+       if [ -n "$_extra_header" ] ; then
+           cat <<EOF
+
+##################################################
+$_extra_header
+EOF
+       fi
+
+cat <<EOF
+--------------------------------------------------
+Output (Exit status: ${_rc}):
+--------------------------------------------------
+EOF
+       # TEST_CAT_RESULTS_OPTS is left unquoted so that an empty value
+       # adds no argument to cat.
+       echo "$_out" | cat $TEST_CAT_RESULTS_OPTS
+    fi
+
+    if ! $_passed ; then
+       cat <<EOF
+--------------------------------------------------
+Required output (Exit status: ${required_rc}):
+--------------------------------------------------
+EOF
+       echo "$required_output" | cat $TEST_CAT_RESULTS_OPTS
+
+       if $TEST_DIFF_RESULTS ; then
+           # Write both outputs to temporary files for diff.
+           _outr=$(mktemp)
+           echo "$required_output" >"$_outr"
+
+           # Note: this is the filtered output, not $_out.
+           _outf=$(mktemp)
+           echo "$_fout" >"$_outf"
+
+           cat <<EOF
+--------------------------------------------------
+Diff:
+--------------------------------------------------
+EOF
+           # The diff is always shown via "cat -A", regardless of
+           # TEST_CAT_RESULTS_OPTS.
+           diff -u "$_outr" "$_outf" | cat -A
+           rm "$_outr" "$_outf"
+       fi
+    fi
+}
+
+result_footer ()
+{
+    _passed="$1"
+    _extra_footer="$2"
+
+    if "$TEST_VERBOSE" || ! $_passed ; then
+       if [ -n "$_extra_footer" ] ; then
+           cat <<EOF
+--------------------------------------------------
+$_extra_footer
+--------------------------------------------------
+EOF
+       fi
+    fi
+
+    if $_passed ; then
+       echo "PASSED"
+       return 0
+    else
+       echo
+       echo "FAILED"
+       return 1
+    fi
+}
+
+# Compare the result of the command that has just run with the
+# expectation recorded by required_result, then report it.
+#   $1 - optional extra header text, passed on to result_print
+# Must be called immediately after the command under test, because the
+# first statement captures $?.  The caller must have put the command's
+# output in $_out.  If OUT_FILTER is set, it is applied to the output
+# with "sed -r" before the comparison.
+# Returns result_footer's status: 0 on pass, 1 on failure.
+result_check ()
+{
+    _rc=$?
+
+    _extra_header="$1"
+
+    if [ -n "$OUT_FILTER" ] ; then
+        # eval so that quoting inside OUT_FILTER is honoured.
+        _fout=$(echo "$_out" | eval sed -r $OUT_FILTER)
+    else
+        _fout="$_out"
+    fi
+
+    # Two separate tests joined by && instead of a single test using
+    # -a: test output is arbitrary text, and -a is ambiguous when an
+    # operand looks like an operator ("!", "(", "=", ...).
+    if [ "$_fout" = "$required_output" ] && [ "$_rc" = "$required_rc" ] ; then
+        _passed=true
+    else
+        _passed=false
+    fi
+
+    result_print "$_passed" "$_out" "$_rc" "$_extra_header"
+    result_footer "$_passed"
+}
+
+local="${TEST_SUBDIR}/scripts/local.sh"
+if [ -r "$local" ] ; then
+    . "$local"
+fi
diff --git a/ctdb/tests/simple/00_ctdb_init.sh b/ctdb/tests/simple/00_ctdb_init.sh
new file mode 100755 (executable)
index 0000000..bd15fd7
--- /dev/null
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+# Print the description of this test on stdout.
+test_info()
+{
+    cat <<EOF
+Restart the ctdbd daemons of a CTDB cluster.
+
+No error if ctdbd is not already running on the cluster.
+
+Prerequisites:
+
+* Nodes must be accessible via 'onnode'.
+
+Steps:
+
+1. Restart the ctdb daemons on all nodes using a method according to
+   the test environment and platform.
+
+Expected results:
+
+* The cluster is healthy within a reasonable timeframe.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+setup_ctdb
+restart_ctdb
diff --git a/ctdb/tests/simple/00_ctdb_onnode.sh b/ctdb/tests/simple/00_ctdb_onnode.sh
new file mode 100755 (executable)
index 0000000..3bc8f8b
--- /dev/null
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# Print the description of this test on stdout.
+test_info()
+{
+    cat <<EOF
+Use 'onnode' to confirm connectivity between all cluster nodes.
+
+Steps:
+
+1. Do a recursive "onnode all" to make sure all the nodes can connect
+   to each other.  On a cluster this ensures that SSH keys are known
+   between all hosts, which will stop output being corrupted with
+   messages about nodes being added to the list of known hosts.
+
+Expected results:
+
+* 'onnode' works between all nodes.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+
+# 
+
+echo "Checking connectivity between nodes..."
+onnode all onnode -p all hostname
+
+# We're seeing some weirdness with CTDB controls timing out.  We're
+# wondering if time is jumping forward, so this creates a time log on
+# each node that we can examine later if tests fail weirdly.
+if [ -z "$TEST_LOCAL_DAEMONS" -a -n "$CTDB_TEST_TIME_LOGGING" ] ; then
+    echo "Starting time logging on each node..."
+    f="${TEST_VAR_DIR}/ctdb.test.time.log"
+    onnode -p all "[ -f $f ] || while : ; do date '+%s %N' ; sleep 1 ; done >$f 2>&1 </dev/null &"  &
+fi
diff --git a/ctdb/tests/simple/01_ctdb_version.sh b/ctdb/tests/simple/01_ctdb_version.sh
new file mode 100755 (executable)
index 0000000..3e1ed3e
--- /dev/null
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+# Print the description of this test on stdout.
+test_info()
+{
+    cat <<EOF
+Verify the output of the 'ctdb version' command.
+
+This test assumes an RPM-based installation and needs to be skipped on
+non-RPM systems.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Run the 'ctdb version' command on one of the cluster nodes.
+3. Compare the version displayed with that listed by the rpm command
+   for the ctdb package.
+
+Expected results:
+
+* The 'ctdb version' command displays the ctdb version number.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+if ! try_command_on_node -v 0 "rpm -q ctdb" ; then
+    echo "No useful output from rpm, SKIPPING rest of test".
+    exit 0
+fi
+rpm_ver="${out#ctdb-}"
+# Some version of RPM append the architecture to the version.
+# And also remove the release suffix.
+arch=$(uname -m)
+rpm_ver="${rpm_ver%-*.${arch}}"
+
+try_command_on_node -v 0 "$CTDB version"
+ctdb_ver="${out#CTDB version: }"
+
+if [ "$ctdb_ver" = "$rpm_ver" ] ; then
+    echo "OK: CTDB version = RPM version"
+else
+    echo "BAD: CTDB version != RPM version"
+    testfailures=1
+fi
diff --git a/ctdb/tests/simple/02_ctdb_listvars.sh b/ctdb/tests/simple/02_ctdb_listvars.sh
new file mode 100755 (executable)
index 0000000..2f709a8
--- /dev/null
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Print the description of this test on stdout.
+test_info()
+{
+    cat <<EOF
+Verify that 'ctdb listvars' shows a list of all tunable variables.
+
+This test simply checks that at least 5 sane looking lines are
+printed.  It does not check that the list is complete or that the
+values are sane.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Run 'ctdb listvars' and verify that it shows a list of tunable
+   variables and their current values.
+
+Expected results:
+
+* 'ctdb listvars' works as expected.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+try_command_on_node -v 0 "$CTDB listvars"
+
+sanity_check_output \
+    5 \
+    '^[[:alpha:]][[:alnum:]]+[[:space:]]*=[[:space:]]*[[:digit:]]+$' \
+    "$out"
diff --git a/ctdb/tests/simple/03_ctdb_getvar.sh b/ctdb/tests/simple/03_ctdb_getvar.sh
new file mode 100755 (executable)
index 0000000..a58aa3b
--- /dev/null
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Print the description of this test on stdout.
+test_info()
+{
+    cat <<EOF
+Verify that 'ctdb getvar' works correctly.
+
+Expands on the steps below as it actually checks the values of all
+variables listed by 'ctdb listvars'.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Run 'ctdb getvar <varname>' with a valid variable name (possibly
+   obtained via 'ctdb listvars').
+3. Verify that the command displays the correct value of the variable
+   (corroborate with the value shown by 'ctdb listvars').
+
+Expected results:
+
+* 'ctdb getvar' shows the correct value of the variable.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+try_command_on_node -v 0 "$CTDB listvars"
+
+echo "Veryifying all variable values using \"ctdb getvar\"..."
+
+echo "$out" |
+while read var x val ; do
+    try_command_on_node 0 "$CTDB getvar $var"
+
+    val2="${out#*= }"
+
+    if [ "$val" != "$val2" ] ; then
+       echo "MISMATCH on $var: $val != $val2"
+       exit 1
+    fi
+done
diff --git a/ctdb/tests/simple/04_ctdb_setvar.sh b/ctdb/tests/simple/04_ctdb_setvar.sh
new file mode 100755 (executable)
index 0000000..5012e31
--- /dev/null
@@ -0,0 +1,79 @@
+#!/bin/bash
+
+# Print the description of this test on stdout.
+test_info()
+{
+    cat <<EOF
+Verify that 'ctdb setvar' works correctly.
+
+Doesn't strictly follow the procedure outlined below, since it doesn't
+pick a variable from the output of 'ctdb listvars'.  However, it
+verifies the value with 'ctdb getvar' in addition to 'ctdb listvars'.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Get a list of all the ctdb tunable variables, using the 'ctdb
+   listvars' command.
+3. Set the value of one of the variables using the 'setvar' control on
+   one of the nodes.  E.g. 'ctdb setvar DeterministicIPs 0'.
+4. Verify that the 'listvars' control now shows the new value for the
+   variable.
+
+Expected results:
+
+* After setting a value using 'ctdb setvar', 'ctdb listvars' shows the
+  modified value of the variable.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+var="RecoverTimeout"
+
+try_command_on_node -v 0 $CTDB getvar $var
+
+val="${out#*= }"
+
+echo "Going to try incrementing it..."
+
+incr=$(($val + 1))
+
+try_command_on_node 0 $CTDB setvar $var $incr
+
+echo "That seemed to work, let's check the value..."
+
+try_command_on_node -v 0 $CTDB getvar $var
+
+newval="${out#*= }"
+
+if [ "$incr" != "$newval" ] ; then
+    echo "Nope, that didn't work..."
+    exit 1
+fi
+
+echo "Look's good!  Now verifying with \"ctdb listvars\""
+try_command_on_node -v 0 "$CTDB listvars | grep '^$var'"
+
+check="${out#*= }"
+
+if [ "$incr" != "$check" ] ; then
+    echo "Nope, that didn't work..."
+    exit 1
+fi
+
+echo "Look's good!  Putting the old value back..."
+cmd="$CTDB setvar $var $val"
+try_command_on_node 0 $cmd
diff --git a/ctdb/tests/simple/05_ctdb_listnodes.sh b/ctdb/tests/simple/05_ctdb_listnodes.sh
new file mode 100755 (executable)
index 0000000..a84e4af
--- /dev/null
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+# Print the description of this test on stdout.
+test_info()
+{
+    cat <<EOF
+Verify that 'ctdb listnodes' shows the list of nodes in a ctdb cluster.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Run 'ctdb listnodes' on all the nodes of the cluster.
+3. Verify that on all the nodes the command displays a list of
+   current cluster nodes.
+
+Expected results:
+
+* 'ctdb listnodes' displays the correct information.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+try_command_on_node -v 0 "$CTDB listnodes"
+
+num_nodes=$(echo "$out" | wc -l)
+
+# Each line should look like an IP address.
+sanity_check_output \
+    2 \
+    '^[[:digit:]]+\.[[:digit:]]+\.[[:digit:]]+\.[[:digit:]]+$' \
+    "$out"
+
+out_0="$out"
+
+echo "Checking other nodes..."
+
+n=1
+while [ $n -lt $num_nodes ] ; do
+    echo -n "Node ${n}: "
+    try_command_on_node $n "$CTDB listnodes"
+    if [ "$out_0" = "$out" ] ; then
+       echo "OK"
+    else
+       echo "DIFFERs from node 0:"
+       echo "$out"
+       testfailures=1
+    fi
+    n=$(($n + 1))
+done
diff --git a/ctdb/tests/simple/06_ctdb_getpid.sh b/ctdb/tests/simple/06_ctdb_getpid.sh
new file mode 100755 (executable)
index 0000000..7152ad4
--- /dev/null
@@ -0,0 +1,84 @@
+#!/bin/bash
+
+# Print the description of this test on stdout.
+test_info()
+{
+    cat <<EOF
+Verify that 'ctdb getpid' works as expected.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Run 'ctdb getpid -n <number>' on the nodes to check the PID of the
+   ctdbd process.
+3. Verify that the output is valid.
+4. Verify that with the '-n all' option the command shows the PIDs on
+   all the nodes
+
+Expected results:
+
+* 'ctdb getpid' shows valid output.
+EOF
+}
+
+# Load the shared integration-test helpers.
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+# This is an attempt at being independent of the number of nodes
+# reported by "ctdb getpid -n all".
+try_command_on_node 0 "$CTDB listnodes | wc -l"
+num_nodes="$out"
+echo "There are $num_nodes nodes..."
+
+# Call getpid a few different ways and make sure the answer is always the same.
+
+# 1. Via onnode, asking every node for its own PID.
+try_command_on_node -v 0 "onnode -q all $CTDB getpid"
+pids_onnode="$out"
+
+# 2. Via a single "getpid -n all".
+try_command_on_node -v 0 "$CTDB getpid -n all"
+pids_getpid_all="$out"
+
+# 3. Via one "getpid -n <n>" per node, joined with "; " into a single
+#    command line.
+cmd=""
+n=0
+while [ $n -lt $num_nodes ] ; do
+    cmd="${cmd}${cmd:+; }$CTDB getpid -n $n"
+    n=$(($n + 1))
+done
+try_command_on_node -v 0 "( $cmd )"
+pids_getpid_n="$out"
+
+if [ "$pids_onnode" = "$pids_getpid_all" -a \
+    "$pids_getpid_all" = "$pids_getpid_n" ] ; then
+    echo "They're the same... cool!"
+else
+    echo "Error: they differ."
+    testfailures=1
+fi
+
+echo "Checking each PID for validity"
+
+# Read one line of $pids_onnode per node (here-string at "done") and
+# look at which executable /proc/<pid>/exe points to on that node.
+n=0
+while [ $n -lt $num_nodes ] ; do
+    read line
+    pid=${line#Pid:}
+    # Strip everything up to the last "/" of the "ls -l" output.
+    try_command_on_node $n "ls -l /proc/${pid}/exe | sed -e 's@.*/@@'"
+    echo -n "Node ${n}, PID ${pid} looks to be running \"$out\" - "
+    if [ "$out" = "ctdbd" ] ; then
+       echo "GOOD!"
+    elif [ -n "$VALGRIND" -a "$out" = "memcheck" ] ; then
+       # We could check cmdline too if this isn't good enough.
+       echo "GOOD enough!"
+    else
+       echo "BAD!"
+       testfailures=1
+    fi
+    n=$(($n + 1))
+done <<<"$pids_onnode"
diff --git a/ctdb/tests/simple/07_ctdb_process_exists.sh b/ctdb/tests/simple/07_ctdb_process_exists.sh
new file mode 100755 (executable)
index 0000000..83205aa
--- /dev/null
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# Print the description of this test on stdout.
+test_info()
+{
+    cat <<EOF
+Verify that 'ctdb process-exists' shows correct information.
+
+The implementation is creative about how it gets PIDs for existing and
+non-existing processes.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. On one of the cluster nodes, get the PID of an existing process
+   (using ps wax).
+3. Run 'ctdb process-exists <pid>' on the node and verify that the
+   correct output is shown.
+4. Run 'ctdb process-exists <pid>' with a pid of a non-existent
+   process and verify that the correct output is shown.
+
+Expected results:
+
+* 'ctdb process-exists' shows the correct output.
+EOF
+}
+
+# Load the shared integration-test helpers.
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+test_node=1
+
+# Create a background process on $test_node that will last for 60 seconds.
+# It should still be there when we check.
+try_command_on_node $test_node 'sleep 60 >/dev/null 2>&1 & echo $!'
+pid="$out"
+
+echo "Checking for PID $pid on node $test_node"
+# set -e is good, but avoid it here.  Capture the command's own output
+# so that what gets printed below is not the stale $out (the PID) left
+# over from the previous try_command_on_node call.
+status=0
+out=$(onnode 0 "$CTDB process-exists ${test_node}:${pid}") || status=$?
+echo "$out"
+
+if [ $status -eq 0 ] ; then
+    echo "OK"
+else
+    echo "BAD"
+    testfailures=1
+fi
+
+# Now just echo the PID of the shell from the onnode process on
+# $test_node.  This PID will disappear and PIDs shouldn't roll around
+# fast enough to trick the test...  but there is a chance that will
+# happen!
+try_command_on_node $test_node 'echo $$'
+pid="$out"
+
+echo "Checking for PID $pid on node $test_node"
+# The leading "!" makes the remote command succeed only when
+# process-exists fails, i.e. when the process is gone.
+try_command_on_node -v 0 "! $CTDB process-exists ${test_node}:${pid}"
diff --git a/ctdb/tests/simple/08_ctdb_isnotrecmaster.sh b/ctdb/tests/simple/08_ctdb_isnotrecmaster.sh
new file mode 100755 (executable)
index 0000000..138f59c
--- /dev/null
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+# Print the description of this test on stdout.
+test_info()
+{
+    cat <<EOF
+Verify the operation of 'ctdb isnotrecmaster'.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Run 'ctdb isnotrecmaster' on each node.
+
+3. Verify that only 1 node shows the output 'This node is the
+   recmaster' and all the other nodes show the output 'This node is
+   not the recmaster'.
+
+Expected results:
+
+* 'ctdb isnotrecmaster' shows the correct output.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+cmd="$CTDB isnotrecmaster || true"
+try_command_on_node all "$cmd"
+echo "Output of \"$cmd\":"
+echo "$out"
+
+num_all_lines=$(echo "$out" |  wc -l)
+num_rm_lines=$(echo "$out" | fgrep -c 'this node is the recmaster') || true
+num_not_rm_lines=$(echo "$out" | fgrep -c 'this node is not the recmaster') || true
+
+if [ $num_rm_lines -eq 1 ] ; then
+    echo "OK, there is only 1 recmaster"
+else
+    echo "BAD, there are ${num_rm_lines} nodes claiming to be the recmaster"
+    testfailures=1
+fi
+
+if [ $(($num_all_lines - $num_not_rm_lines)) -eq 1 ] ; then
+    echo "OK, all the other nodes claim not to be the recmaster"
+else
+    echo "BAD, there are only ${num_not_rm_lines} nodes claiming not to be the recmaster"
+    testfailures=1
+fi
diff --git a/ctdb/tests/simple/09_ctdb_ping.sh b/ctdb/tests/simple/09_ctdb_ping.sh
new file mode 100755 (executable)
index 0000000..ab6ba14
--- /dev/null
@@ -0,0 +1,57 @@
+#!/bin/bash
+
+# Print the description of this test on stdout.
+test_info()
+{
+    cat <<EOF
+Verify the operation of the 'ctdb ping' command.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Run the 'ctdb ping' command on one of the nodes and verify that it
+   shows valid and expected output.
+3. Shutdown one of the cluster nodes, using the 'ctdb shutdown'
+   command.
+4. Run the 'ctdb ping -n <node>' command from another node to this
+   node.
+5. Verify that the command is not successful since the ctdb daemon is
+   not running on the node.
+
+Expected results:
+
+* The 'ctdb ping' command shows valid and expected output.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+try_command_on_node -v 0 "$CTDB ping -n 1"
+
+sanity_check_output \
+    1 \
+    '^response from 1 time=-?[.0-9]+ sec[[:space:]]+\([[:digit:]]+ clients\)$' \
+    "$out"
+
+try_command_on_node -v 0 "$CTDB shutdown -n 1"
+
+wait_until_node_has_status 1 disconnected 30 0
+
+try_command_on_node -v 0 "! $CTDB ping -n 1"
+
+sanity_check_output \
+    1 \
+    "(: ctdb_control error: ('ctdb_control to disconnected node'|'node is disconnected')|Unable to get ping response from node 1|Node 1 is DISCONNECTED|ctdb_control for getpnn failed|: Can not access node. Node is not operational\.|Node 1 has status DISCONNECTED\|UNHEALTHY\|INACTIVE$)" \
+    "$out"
diff --git a/ctdb/tests/simple/11_ctdb_ip.sh b/ctdb/tests/simple/11_ctdb_ip.sh
new file mode 100755 (executable)
index 0000000..c1aec0e
--- /dev/null
@@ -0,0 +1,70 @@
+#!/bin/bash
+
+# Print the description of this test on stdout.
+test_info()
+{
+    cat <<EOF
+Verify that 'ctdb ip' shows the correct output.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Run 'ctdb ip' on one of the nodes and verify the list of IP
+   addresses displayed (cross check the result with the output of
+   'ip addr show' on the node).
+3. Verify that colon-separated output is generated with the -Y option.
+
+Expected results:
+
+* 'ctdb ip' shows the list of public IPs being served by a node.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+echo "Getting list of public IPs..."
+try_command_on_node -v 1 "$CTDB ip -n all | tail -n +2"
+ips=$(echo "$out" | sed \
+       -e 's@ node\[@ @' \
+       -e 's@\].*$@@')
+machineout=$(echo "$out" | sed -r \
+       -e 's@^| |$@:@g' \
+       -e 's@[[:alpha:]]+\[@@g' \
+       -e 's@\]@@g')
+
+if [ -z "$TEST_LOCAL_DAEMONS" ]; then
+    while read ip pnn ; do
+        try_command_on_node $pnn "ip addr show"
+        if [ "${out/inet ${ip}\/}" != "$out" ] ; then
+            echo "GOOD: node $pnn appears to have $ip assigned"
+        else
+            echo "BAD:  node $pnn does not appear to have $ip assigned"
+            testfailures=1
+        fi
+    done <<<"$ips" # bashism to avoid problem setting variable in pipeline.
+fi
+
+[ "$testfailures" != 1 ] && echo "Looks good!"
+
+cmd="$CTDB -Y ip -n all | tail -n +2"
+echo "Checking that \"$cmd\" produces expected output..."
+
+try_command_on_node 1 "$cmd"
+if [ "$out" = "$machineout" ] ; then
+    echo "Yep, looks good!"
+else
+    echo "Nope, it looks like this:"
+    echo "$out"
+    echo "Should be like this:"
+    echo "$machineout"
+    testfailures=1
+fi
diff --git a/ctdb/tests/simple/12_ctdb_getdebug.sh b/ctdb/tests/simple/12_ctdb_getdebug.sh
new file mode 100755 (executable)
index 0000000..4a4926d
--- /dev/null
@@ -0,0 +1,82 @@
+#!/bin/bash
+
+# Print the description of this test on stdout.
+test_info()
+{
+    cat <<EOF
+Verify that 'ctdb getdebug' works as expected.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Get the current debug level on a node, using 'ctdb getdebug -n <node>'.
+3. Verify that colon-separated output is generated with the -Y option.
+4. Verify that the '-n all' option shows the debug level on all nodes.
+
+Expected results:
+
+* 'ctdb getdebug' shows the debug level on all the nodes.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+try_command_on_node 0 "$CTDB listnodes | wc -l"
+num_nodes="$out"
+
+try_command_on_node -v 1 "onnode -q all $CTDB getdebug"
+getdebug_onnode="$out"
+
+sanity_check_output \
+    $num_nodes \
+    '^Node [[:digit:]]+ is at debug level [[:alpha:]]+ \([[:digit:]]+\)$' \
+    "$out"
+
+try_command_on_node -v 1 "$CTDB getdebug -n all"
+getdebug_all="$out"
+
+cmd=""
+n=0
+while [ $n -lt $num_nodes ] ; do
+    cmd="${cmd}${cmd:+; }$CTDB getdebug -n $n"
+    n=$(($n + 1))
+done
+try_command_on_node -v 1 "$cmd"
+getdebug_n="$out"
+
+if [ "$getdebug_onnode" = "$getdebug_all" -a \
+    "$getdebug_all" = "$getdebug_n" ] ; then
+    echo "They're the same... cool!"
+else
+    echo "Error: they differ."
+    testfailures=1
+fi
+
+colons=""
+nl="
+"
+while read line ; do
+    t=$(echo "$line" | sed -r -e 's@Node [[:digit:]]+ is at debug level ([[:alpha:]]+) \((-?[[:digit:]]+)\)$@:\1:\2:@')
+    colons="${colons}${colons:+${nl}}:Name:Level:${nl}${t}"
+done <<<"$getdebug_onnode"
+
+cmd="$CTDB -Y getdebug -n all"
+echo "Checking that \"$cmd\" produces expected output..."
+
+try_command_on_node 1 "$cmd"
+if [ "$out" = "$colons" ] ; then
+    echo "Yep, looks good!"
+else
+    echo "Nope, it looks like this:"
+    echo "$out"
+    testfailures=1
+fi
diff --git a/ctdb/tests/simple/13_ctdb_setdebug.sh b/ctdb/tests/simple/13_ctdb_setdebug.sh
new file mode 100755 (executable)
index 0000000..d1d1f22
--- /dev/null
@@ -0,0 +1,83 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify that 'ctdb setdebug' works as expected.
+
+This is a little superficial.  It checks that CTDB thinks the debug
+level has been changed but doesn't actually check that logging occurs
+at the new level.
+
+A test should also be added to see if setting the debug value via a
+numerical value works too.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Get the current debug level on a node, using 'ctdb getdebug'.
+3. Change the debug level to some other value (e.g. EMERG) using
+   'ctdb setdebug'.
+4. Verify that the new debug level is correctly set using 'ctdb getdebug'.
+
+Expected results:
+
+* 'ctdb setdebug' correctly sets the debug level on a node.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+# Run "$CTDB getdebug" on a node and extract the symbolic debug level
+# name from output of the form "Node N is at debug level NAME (NUM)".
+#
+# Arguments: $1 - node on which to run the command
+# Sets:      check_debug - the level name extracted by the sed expression
+get_debug ()
+{
+    # Sets: check_debug
+    local node="$1"
+
+    # Keep $out local so the caller's $out is not overwritten.
+    # NOTE(review): presumably try_command_on_node stores the command
+    # output in $out - confirm against integration.bash.
+    local out
+    
+    try_command_on_node -v $node "$CTDB getdebug"
+    check_debug=$(echo "$out" |
+       sed -r -e 's@Node [[:digit:]]+ is at debug level ([[:alpha:]]+) \(-?[[:digit:]]+\)$@\1@')
+}
+
+# Set the debug level on a node with "$CTDB setdebug", then read it back
+# with get_debug and compare.
+#
+# Arguments: $1 - node to operate on
+#            $2 - debug level name to set
+# Sets testfailures=1 if the level read back differs from $2.
+set_and_check_debug ()
+{
+    local node="$1" level="$2"
+    # get_debug assigns check_debug; declare it here to keep it local.
+    local check_debug
+
+    echo "Setting debug level on node ${node} to ${level}."
+    try_command_on_node $node "$CTDB setdebug ${level}"
+    get_debug $node
+
+    if [ "$check_debug" != "$level" ] ; then
+       echo "BAD: Debug level should have changed to \"$level\" but it is \"$check_debug\"."
+       testfailures=1
+       return
+    fi
+
+    echo "That seemed to work... cool!"
+}
+
+get_debug 1
+initial_debug="$check_debug"
+
+new_debug="EMERG"
+[ "$initial_debug" = "$new_debug" ] && new_debug="ALERT"
+
+set_and_check_debug 1 "$new_debug"
+
+if [ "$testfailures" != 1 ] ; then
+    echo "Returning the debug level to its initial value..."
+    set_and_check_debug 1 "$initial_debug"
+fi
diff --git a/ctdb/tests/simple/14_ctdb_statistics.sh b/ctdb/tests/simple/14_ctdb_statistics.sh
new file mode 100755 (executable)
index 0000000..9cc5ac1
--- /dev/null
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify that 'ctdb statistics' works as expected.
+
+This is pretty superficial and could do more validation.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Run 'ctdb statistics' on a node, and verify that the output is
+   valid.
+3. Repeat the command with the '-n all' option and verify that the
+   output is valid.
+
+Expected results:
+
+* 'ctdb statistics' shows valid output on all the nodes.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+pattern='^(CTDB version 1|Current time of statistics[[:space:]]*:.*|Statistics collected since[[:space:]]*:.*|Gathered statistics for [[:digit:]]+ nodes|[[:space:]]+[[:alpha:]_]+[[:space:]]+[[:digit:]]+|[[:space:]]+(node|client|timeouts|locks)|[[:space:]]+([[:alpha:]_]+_latency|max_reclock_[[:alpha:]]+)[[:space:]]+[[:digit:]-]+\.[[:digit:]]+[[:space:]]sec|[[:space:]]*(locks_latency|reclock_ctdbd|reclock_recd|call_latency|lockwait_latency|childwrite_latency)[[:space:]]+MIN/AVG/MAX[[:space:]]+[-.[:digit:]]+/[-.[:digit:]]+/[-.[:digit:]]+ sec out of [[:digit:]]+|[[:space:]]+(hop_count_buckets|lock_buckets):[[:space:][:digit:]]+)$'
+
+try_command_on_node -v 1 "$CTDB statistics"
+
+sanity_check_output 40 "$pattern" "$out"
+
+try_command_on_node -v 1 "$CTDB statistics -n all"
+
+sanity_check_output 40 "$pattern" "$out"
diff --git a/ctdb/tests/simple/15_ctdb_statisticsreset.sh b/ctdb/tests/simple/15_ctdb_statisticsreset.sh
new file mode 100755 (executable)
index 0000000..eaa60d6
--- /dev/null
@@ -0,0 +1,83 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify that 'ctdb statisticsreset' works as expected.
+
+This is pretty superficial.  It just checks that a few particular
+items reduce.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Run 'ctdb statisticsreset' on all nodes and verify that it executes
+   successfully.
+
+Expected results:
+
+* 'ctdb statisticsreset' executes successfully.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+try_command_on_node 0 "$CTDB listnodes | wc -l"
+num_nodes="$out"
+
+# Print the value of one counter found in statistics output.
+#
+# Arguments: $1 - counter label; interpolated into the sed regex
+#            $2 - text to search (callers pass "$CTDB statistics" output)
+# Output:    the first line where the label appears after leading
+#            whitespace and is followed by whitespace and a digit, with
+#            everything before the value removed.
+get_stat ()
+{
+    local label="$1"
+    local out="$2"
+
+    # The group captures only the first digit, but the pattern is not
+    # anchored at the end of the line, so the remaining digits (and any
+    # trailing text) are left in place by the substitution.  head -1
+    # keeps only the first matching line.
+    echo "$out" | sed -rn -e "s@^[[:space:]]+${label}[[:space:]]+([[:digit:]])@\1@p" | head -1
+}
+
+# Record a test failure unless a counter has decreased.
+#
+# Arguments: $1 - label, used in messages only
+#            $2 - value before the reset
+#            $3 - value after the reset
+# Sets testfailures=1 when $3 is not numerically less than $2, or when
+# either value is missing.
+check_reduced ()
+{
+    local label="$1"
+    local before="$2"
+    local after="$3"
+
+    # Quote the operands and reject empty values.  Unquoted, two empty
+    # values collapse the test to "[ -lt ]", which is true, so a missing
+    # statistic would wrongly be reported as GOOD.
+    if [ -n "$before" ] && [ -n "$after" ] && \
+       [ "$after" -lt "$before" ] ; then
+       echo "GOOD: ${label} reduced from ${before} to ${after}"
+    else
+       echo "BAD: ${label} did not reduce from ${before} to ${after}"
+       testfailures=1
+    fi
+}
+
+n=0
+while [ $n -lt $num_nodes ] ; do
+    echo "Getting initial statistics for node ${n}..."
+    
+    try_command_on_node -v $n $CTDB statistics
+
+    before_req_control=$(get_stat "req_control" "$out")
+    before_reply_control=$(get_stat "reply_control" "$out")
+    before_node_packets_recv=$(get_stat "node_packets_recv" "$out")
+
+    try_command_on_node $n $CTDB statisticsreset
+
+    try_command_on_node -v $n $CTDB statistics
+
+    after_req_control=$(get_stat "req_control" "$out")
+    after_reply_control=$(get_stat "reply_control" "$out")
+    after_node_packets_recv=$(get_stat "node_packets_recv" "$out")
+
+    check_reduced "req_control" "$before_req_control" "$after_req_control"
+    check_reduced "reply_control" "$before_reply_control" "$after_reply_control"
+    check_reduced "node_packets_recv" "$before_node_packets_recv" "$after_node_packets_recv"
+
+    n=$(($n + 1))
+done
diff --git a/ctdb/tests/simple/16_ctdb_config_add_ip.sh b/ctdb/tests/simple/16_ctdb_config_add_ip.sh
new file mode 100755 (executable)
index 0000000..b770bd6
--- /dev/null
@@ -0,0 +1,120 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify that an IP address can be added to a node using 'ctdb addip'.
+
+This test goes to some trouble to figure out which IP address to add
+but assumes a 24-bit subnet mask.  It does not handle IPv6.  It does
+not do any network level checks that the new IP address is reachable
+but simply trusts 'ctdb ip' that the address has been added.  There is
+also an extra prerequisite that the node being added to already has
+public addresses - this is difficult to avoid if the extra address is
+to be sensibly chosen.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Use 'ctdb ip' on one of the nodes to list the IP addresses being
+   served.
+3. Add an additional public address to be served by the node, using
+   'ctdb addip'.
+4. Verify that this IP address has been added to the list of IP
+   addresses being served by the node, using the 'ctdb ip' command.
+
+Expected results:
+
+* 'ctdb addip' adds an IP address to the list of public IP addresses
+  being served by a node.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+echo "Getting list of public IPs..."
+all_ips_on_node 0
+
+# When selecting test_node we just want a node that has public IPs.
+# This will work and is economically semi-random.  :-)
+read x test_node <<<"$out"
+
+test_node_ips=""
+all_ips=""
+while read ip pnn ; do
+    all_ips="${all_ips}${all_ips:+ }${ip}"
+    [ "$pnn" = "$test_node" ] && \
+       test_node_ips="${test_node_ips}${test_node_ips:+ }${ip}"
+done <<<"$out"
+
+echo "Selected node ${test_node} with IPs: $test_node_ips"
+
+# Try to find a free IP address.  This is inefficient but should
+# succeed quickly.
+try_command_on_node $test_node "ip addr show"
+all_test_node_ips=$(echo "$out" | sed -rn -e 's@^[[:space:]]+inet[[:space:]]+([[:digit:]]+\.[[:digit:]]+\.[[:digit:]]+\.[[:digit:]]+/[[:digit:]]+).*[[:space:]]([^[:space:]]+)+$@\1:\2@p')
+
+add_ip=""
+
+# Use an IP already on one of the nodes, remove the last octet and
+# loop through the possible IP addreses.
+for i in $test_node_ips ; do
+    prefix="${i%.*}"
+    for j in $(seq 101 199) ; do
+       try="${prefix}.${j}"
+       # Try to make sure it isn't used anywhere!
+
+       # First, make sure it isn't an existing public address on the
+       # cluster.
+       for k in $all_ips ; do
+           [ "$try" = "$k" ] && continue 2
+       done
+
+       # Also make sure it isn't some other address in use on the
+       # node.
+       for k in $all_test_node_ips ; do
+           [ "$try" = "${k%/*}" ] && continue 2
+       done
+
+       # Get the interface details for $i, which our address is a
+       # close relative of.  This should never fail but it can't hurt
+       # to be careful...
+       try_command_on_node $test_node "ctdb ip -v -Y"
+       while IFS=":" read x ip pnn iface x ; do
+           if [ "$i" = "$ip" ]; then
+               add_ip="$try/32:$iface"
+               break 3
+           fi
+       done <<<"$out"
+    done
+done
+
+if [ -z "$add_ip" ] ; then
+    echo "BAD: Unable to find IP address to add."
+    exit 1
+fi
+
+echo "Adding IP: ${add_ip/:/ on interface }"
+try_command_on_node $test_node $CTDB addip ${add_ip/:/ }
+
+echo "Waiting for IP to be added..."
+if wait_until 60 ips_are_on_nodeglob $test_node ${add_ip%/*} ; then
+    echo "That worked!"
+else
+    echo "BAD: IP didn't get added."
+    try_command_on_node $test_node $CTDB ip -n all
+    exit 1
+fi
diff --git a/ctdb/tests/simple/17_ctdb_config_delete_ip.sh b/ctdb/tests/simple/17_ctdb_config_delete_ip.sh
new file mode 100755 (executable)
index 0000000..1ad9f33
--- /dev/null
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify that a node's public IP address can be deleted using 'ctdb delip'.
+
+This test does not do any network level checks that the IP address is
+no longer reachable but simply trusts 'ctdb ip' that the address has
+been deleted.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Use 'ctdb ip' on one of the nodes to list the IP addresses being
+   served.
+3. Delete one public IP address being served by the node, using
+   'ctdb delip'.
+4. Verify that the deleted IP address is no longer listed using the
+   all_ips_on_node helper function.
+
+Expected results:
+
+* 'ctdb delip' removes an IP address from the list of public IP
+  addresses being served by a node.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+echo "Getting list of public IPs..."
+all_ips_on_node -v 0
+
+# Select an IP/node to remove.
+num_ips=$(echo "$out" | wc -l)
+num_to_remove=$(($RANDOM % $num_ips))
+
+# Find the details in the list.
+i=0
+while [ $i -le $num_to_remove ] ; do
+    read ip_to_remove test_node
+    i=$(($i + 1))
+done <<<"$out"
+
+echo "Attempting to remove ${ip_to_remove} from node ${test_node}."
+try_command_on_node $test_node $CTDB delip $ip_to_remove
+
+echo "Sleeping..."
+sleep_for 1
+
+test_node_ips=""
+while read ip pnn ; do
+    [ "$pnn" = "$test_node" ] && \
+       test_node_ips="${test_node_ips}${test_node_ips:+ }${ip}"
+done <<<"$out" # bashism to avoid problem setting variable in pipeline.
+
+if [ "${test_node_ips/${ip_to_remove}}" = "$test_node_ips" ] ; then
+    echo "GOOD: That worked!"
+else
+    echo "BAD: The remove IP address is still there!"
+    testfailures=1
+fi
diff --git a/ctdb/tests/simple/18_ctdb_reloadips.sh b/ctdb/tests/simple/18_ctdb_reloadips.sh
new file mode 100755 (executable)
index 0000000..760e476
--- /dev/null
@@ -0,0 +1,105 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify that IPs can be rearranged using 'ctdb reloadips'.
+
+Various sub-tests that remove addresses from the public_addresses file
+on a node or delete the entire contents of the public_addresses file.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Expected results:
+
+* When addresses are deconfigured "ctdb ip" no longer reports them and
+  when added they are seen again.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+select_test_node_and_ips
+
+echo "Emptying public addresses file on $test_node"
+
+addresses=$(get_ctdbd_command_line_option $test_node "public-addresses")
+echo "Public addresses file on node $test_node is \"$addresses\""
+backup="${addresses}.$$"
+
+# Move the backup copy ($backup) of the public addresses file back to
+# $addresses on the test node.  Output from mv is discarded and a failed
+# mv is ignored ("|| true"), so calling this when no backup exists does
+# not make the remote command fail.
+restore_public_addresses ()
+{
+    try_command_on_node $test_node "mv $backup $addresses >/dev/null 2>&1 || true"
+}
+ctdb_test_exit_hook_add restore_public_addresses
+
+try_command_on_node $test_node "mv $addresses $backup && touch $addresses"
+
+try_command_on_node any $CTDB reloadips all
+
+echo "Getting list of public IPs on node $test_node"
+try_command_on_node $test_node "$CTDB ip | tail -n +2"
+
+if [ -n "$out" ] ; then
+    cat <<EOF
+BAD: node $test_node still has ips:
+$out
+EOF
+    exit 1
+fi
+
+echo "GOOD: no IPs left on node $test_node"
+
+echo "Restoring addresses"
+restore_public_addresses
+
+try_command_on_node any $CTDB reloadips all
+
+echo "Getting list of public IPs on node $test_node"
+try_command_on_node $test_node "$CTDB ip | tail -n +2"
+
+if [ -z "$out" ] ; then
+    echo "BAD: node $test_node has no ips"
+    exit 1
+fi
+
+cat <<EOF
+GOOD: node $test_node has these addresses:
+$out
+EOF
+
+try_command_on_node any $CTDB sync
+
+select_test_node_and_ips
+
+echo "Removing IP $test_ip from node $test_node"
+
+try_command_on_node $test_node "mv $addresses $backup && grep -v '^${test_ip}/' $backup >$addresses"
+
+try_command_on_node any $CTDB reloadips all
+
+try_command_on_node $test_node $CTDB ip
+
+if grep "^${test_ip} " <<<"$out" ; then
+    cat <<EOF
+BAD: node $test_node can still host IP $test_ip:
+$out
+EOF
+    exit 1
+fi
+
+cat <<EOF
+GOOD: node $test_node is no longer hosting IP $test_ip:
+$out
+EOF
diff --git a/ctdb/tests/simple/20_delip_iface_gc.sh b/ctdb/tests/simple/20_delip_iface_gc.sh
new file mode 100755 (executable)
index 0000000..bc43567
--- /dev/null
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify that an interface is deleted when all IPs on it are deleted.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+echo "Getting public IPs information..."
+try_command_on_node -v any "$CTDB ip -v -n all -Y | tail -n +2"
+ip_info="$out"
+
+# Select the first node and find out its interfaces
+test_node=$(awk -F: 'NR == 1 { print $3}' <<<"$ip_info")
+ifaces=$(awk -F: -v tn=$test_node '$3 == tn { print $6 }' <<<"$ip_info" | sed 's@, @ @g' | xargs -n 1 | sort -u)
+echo "Selected test node ${test_node} with interfaces: ${ifaces}"
+
+# Delete all IPs on each interface...  deleting IPs from one interface
+# can cause other interfaces to disappear, so we need to be careful...
+for i in $ifaces ; do
+    try_command_on_node $test_node "$CTDB ifaces -Y"
+    info=$(awk -F: -v iface="$i" '$2 == iface { print $0 }' <<<"$out")
+
+    if [ -z "$info" ] ; then
+       echo "Interface ${i} missing... assuming already deleted!"
+       continue
+    fi
+
+    echo "Deleting IPs on interface ${i}, with this information:"
+    echo " $info"
+
+    try_command_on_node $test_node "$CTDB ip -v -Y | tail -n +2"
+    awk -F: -v i="$i" \
+       '$6 == i { print $2 }' <<<"$out" |
+    while read ip ; do
+       echo "  $ip"
+       try_command_on_node $test_node "$CTDB delip $ip"
+    done
+
+    try_command_on_node $test_node "$CTDB ifaces -Y"
+    info=$(awk -F: -v iface="$i" '$2 == iface { print $0 }' <<<"$out")
+    
+    if [ -z "$info" ] ; then
+       echo "GOOD: Interface ${i} has been garbage collected"
+    else
+       echo "BAD: Interface ${i} still exists"
+       echo "$out"
+       exit 1
+    fi
+done
diff --git a/ctdb/tests/simple/23_ctdb_moveip.sh b/ctdb/tests/simple/23_ctdb_moveip.sh
new file mode 100755 (executable)
index 0000000..7c09e58
--- /dev/null
@@ -0,0 +1,101 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify that  'ctdb moveip' allows movement of public IPs between cluster nodes.
+
+To work, this test unsets DeterministicIPs and sets NoIPFailback.
+
+This test does not do any network level checks that the IP address has
+actually moved but simply trusts 'ctdb ip' that the address has
+been moved.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Use 'ctdb ip' on one of the nodes to list the IP addresses being
+   served.
+3. Use 'ctdb moveip' to move an address from one node to another.
+4. Verify that the IP is no longer being hosted by the first node and is now being hosted by the second node.
+
+Expected results:
+
+* 'ctdb moveip' allows an IP address to be moved between cluster nodes.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+try_command_on_node 0 "$CTDB listnodes | wc -l"
+num_nodes="$out"
+echo "There are $num_nodes nodes..."
+
+if [ $num_nodes -lt 2 ] ; then
+    echo "Less than 2 nodes!"
+    exit 1
+fi
+
+echo "Getting list of public IPs..."
+all_ips_on_node -v 0
+
+sanity_check_ips "$out"
+
+# Select an IP/node to move.
+num_ips=$(echo "$out" | wc -l)
+num_to_move=$(($RANDOM % $num_ips))
+
+# Find the details in the list.
+i=0
+while [ $i -le $num_to_move ] ; do
+    read ip_to_move test_node
+    i=$(($i + 1))
+done <<<"$out"
+
+# Can only move address to a node that is willing to host $ip_to_move.
+# This is inefficient but shouldn't take long or get stuck.
+to_node=$test_node
+while [ $test_node -eq $to_node ] ; do
+    n=$(($RANDOM % $num_ips))
+    i=0
+    while [ $i -le $n ] ; do
+       read x to_node
+       i=$(($i + 1))
+    done <<<"$out"
+done
+
+echo "Turning off DeterministicIPs..."
+try_command_on_node 0 $CTDB setvar DeterministicIPs 0 -n all
+
+echo "Turning on NoIPFailback..."
+try_command_on_node 0 $CTDB setvar NoIPFailback 1 -n all
+
+echo "Attempting to move ${ip_to_move} from node ${test_node} to node ${to_node}."
+try_command_on_node $test_node $CTDB moveip $ip_to_move $to_node
+
+if wait_until_ips_are_on_nodeglob "[!${test_node}]" $ip_to_move ; then
+    echo "IP moved from ${test_node}."
+else
+    echo "BAD: IP didn't move from ${test_node}."
+    exit 1
+fi
+
+if wait_until_ips_are_on_nodeglob "$to_node" $ip_to_move ; then
+    echo "IP moved to ${to_node}."
+else
+    echo "BAD: IP didn't move to ${to_node}."
+    exit 1
+fi
diff --git a/ctdb/tests/simple/24_ctdb_getdbmap.sh b/ctdb/tests/simple/24_ctdb_getdbmap.sh
new file mode 100755 (executable)
index 0000000..9bed590
--- /dev/null
@@ -0,0 +1,71 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify that  'ctdb getdbmap' operates as expected.
+
+This test creates some test databases using 'ctdb attach'.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Get the database map using 'ctdb getdbmap'.
+3. Verify that the output is valid.
+
+Expected results:
+
+* 'ctdb getdbmap' shows a valid listing of databases.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+# Print a random database file name: the MD5 hex digest of 512 bytes
+# read from /dev/urandom, followed by ".tdb".
+make_temp_db_filename ()
+{
+    dd if=/dev/urandom count=1 bs=512 2>/dev/null |
+    md5sum |
+    awk '{printf "%s.tdb\n", $1}'
+}
+
+try_command_on_node -v 0 "$CTDB getdbmap"
+
+db_map_pattern='^(Number of databases:[[:digit:]]+|dbid:0x[[:xdigit:]]+ name:[^[:space:]]+ path:[^[:space:]]+)$'
+
+sanity_check_output $(($num_db_init + 1)) "$dbmap_pattern" "$out"
+
+num_db_init=$(echo "$out" | sed -n -e '1s/.*://p')
+
+for i in $(seq 1 5) ; do
+    f=$(make_temp_db_filename)
+    echo "Creating test database: $f"
+    try_command_on_node 0 $CTDB attach "$f"
+    try_command_on_node 0 $CTDB getdbmap
+    sanity_check_output $(($num_db_init + 1)) "$dbmap_pattern" "$out"
+    num=$(echo "$out" | sed -n -e '1s/^.*://p')
+    if [ $num = $(($num_db_init + $i)) ] ; then
+       echo "OK: correct number of additional databases"
+    else
+       echo "BAD: no additional database"
+       exit 1
+    fi
+    if [ "${out/name:${f} /}" != "$out" ] ; then
+       echo "OK: getdbmap knows about \"$f\""
+    else
+       echo "BAD: getdbmap does not know about \"$f\""
+       exit 1
+    fi
+done
diff --git a/ctdb/tests/simple/25_dumpmemory.sh b/ctdb/tests/simple/25_dumpmemory.sh
new file mode 100755 (executable)
index 0000000..4082da1
--- /dev/null
@@ -0,0 +1,52 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify that 'ctdb dumpmemory' shows expected output.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Run 'ctdb dumpmemory' and verify that it shows expected output
+3. Verify that the command takes the '-n all' option and that it
+   causes output for all nodes to be displayed.
+
+Expected results:
+
+* 'ctdb dumpmemory' shows valid output.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+try_command_on_node -v 0 "$CTDB dumpmemory"
+
+pat='^([[:space:]].+[[:space:]]+contains[[:space:]]+[[:digit:]]+ bytes in[[:space:]]+[[:digit:]]+ blocks \(ref [[:digit:]]+\)[[:space:]]+0x[[:xdigit:]]+|[[:space:]]+reference to: .+|full talloc report on .+ \(total[[:space:]]+[[:digit:]]+ bytes in [[:digit:]]+ blocks\))$'
+
+sanity_check_output 10 "$pat" "$out"
+
+echo "Checking output using '-n all'..."
+
+try_command_on_node 0 "$CTDB listnodes"
+num_nodes=$(echo "$out" | wc -l)
+
+try_command_on_node 0 "$CTDB dumpmemory" -n all
+sanity_check_output 10 "$pat" "$out"
+
+if [ $(fgrep -c 'full talloc report on' <<<"$out") -eq  $num_nodes ] ; then
+    echo "OK: there looks to be output for all $num_nodes nodes"
+else
+    echo "BAD: there not look to be output for all $num_nodes nodes"
+    exit 1
+fi    
diff --git a/ctdb/tests/simple/26_ctdb_config_check_error_on_unreachable_ctdb.sh b/ctdb/tests/simple/26_ctdb_config_check_error_on_unreachable_ctdb.sh
new file mode 100755 (executable)
index 0000000..6642b17
--- /dev/null
@@ -0,0 +1,70 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify an error occurs if a ctdb command is run against a node without a ctdbd.
+
+That is, check that an error message is printed if an attempt is made
+to execute a ctdb command against a node that is not running ctdbd.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Shutdown ctdb on a node using 'ctdb shutdown -n <node>'.
+3. Verify that the status of the node changes to 'DISCONNECTED'.
+4. Now run 'ctdb ip -n <node>' from another node.
+5. Verify that an error message is printed stating that the node is
+   disconnected.
+6. Execute some other commands against the shutdown node.  For example,
+   disable, enable, ban, unban, listvars.
+7. For each command, verify that an error message is printed stating
+   that the node is disconnected. 
+
+Expected results:
+
+* For a node on which ctdb is not running, all commands display an
+  error message stating that the node is disconnected.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+test_node=1
+
+try_command_on_node 0 "$CTDB listnodes"
+num_nodes=$(echo "$out" | wc -l)
+echo "There are $num_nodes nodes."
+
+echo "Shutting down node ${test_node}..."
+try_command_on_node $test_node $CTDB shutdown
+
+wait_until_node_has_status $test_node disconnected 30 0
+
+wait_until_node_has_status 0 recovered 30 0
+
+pat="ctdb_control error: 'ctdb_control to disconnected node'|ctdb_control error: 'node is disconnected'|Node $test_node is DISCONNECTED|Node $test_node has status DISCONNECTED\|UNHEALTHY\|INACTIVE"
+
+for i in ip disable enable "ban 0" unban listvars ; do
+    try_command_on_node -v 0 ! $CTDB $i -n $test_node
+
+    if egrep -q "$pat" <<<"$out" ; then
+       echo "OK: \"ctdb ${i}\" fails with expected \"disconnected node\" message"
+    else
+       echo "BAD: \"ctdb ${i}\" does not fail with expected \"disconnected node\" message"
+       exit 1
+    fi
+done
diff --git a/ctdb/tests/simple/31_ctdb_disable.sh b/ctdb/tests/simple/31_ctdb_disable.sh
new file mode 100755 (executable)
index 0000000..d021454
--- /dev/null
@@ -0,0 +1,57 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify the operation of 'ctdb disable'.
+
+This is a superficial test of the 'ctdb disable' command.  It trusts
+information from CTDB that indicates that the IP failover has happened
+correctly.  Another test should check that the failover has actually
+happened at the networking level.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Disable one of the nodes using 'ctdb disable -n <node>'.
+3. Verify that the status of the node changes to 'disabled'.
+4. Verify that the IP addresses served by the disabled node are failed
+   over to other nodes.
+
+Expected results:
+
+* The status of the disabled node changes as expected and IP addresses
+  failover as expected.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+select_test_node_and_ips
+
+echo "Disabling node $test_node"
+
+try_command_on_node 1 $CTDB disable -n $test_node
+
+# Avoid a potential race condition...
+wait_until_node_has_status $test_node disabled
+
+if wait_until_ips_are_on_nodeglob "[!${test_node}]" $test_node_ips ; then
+    echo "All IPs moved."
+else
+    echo "Some IPs didn't move."
+    testfailures=1
+fi
diff --git a/ctdb/tests/simple/32_ctdb_enable.sh b/ctdb/tests/simple/32_ctdb_enable.sh
new file mode 100755 (executable)
index 0000000..7cc3da3
--- /dev/null
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify the operation of 'ctdb enable'.
+
+This is a superficial test of the 'ctdb enable' command.  It trusts
+information from CTDB that indicates that the IP failover has happened
+correctly.  Another test should check that the failover has actually
+happened at the networking level.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Disable one of the nodes using 'ctdb disable -n <node>'.
+3. Verify that the status of the node changes to 'disabled'.
+4. Verify that the public IP addresses served by the disabled node are
+   failed over to other nodes.
+5. Enable the disabled node using 'ctdb enable -n <node>'.
+6. Verify that the status changes back to 'OK'.
+7. Verify that some public IP addresses are failed back to the node.
+
+
+Expected results:
+
+* The status of a re-enabled node changes as expected and IP addresses
+  fail back as expected.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+########################################
+
+set -e
+
+cluster_is_healthy
+
+select_test_node_and_ips
+
+echo "Disabling node $test_node"
+try_command_on_node 1 $CTDB disable -n $test_node
+
+wait_until_node_has_status $test_node disabled
+
+if wait_until_ips_are_on_nodeglob "[!${test_node}]" $test_node_ips ; then
+    echo "All IPs moved."
+else
+    echo "Some IPs didn't move."
+    testfailures=1
+fi
+
+echo "Reenabling node $test_node"
+try_command_on_node 1 $CTDB enable -n $test_node
+
+wait_until_node_has_status $test_node enabled
+
+wait_until_node_has_some_ips "$test_node"
diff --git a/ctdb/tests/simple/41_ctdb_stop.sh b/ctdb/tests/simple/41_ctdb_stop.sh
new file mode 100755 (executable)
index 0000000..1a45d8f
--- /dev/null
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify the operation of the 'ctdb stop' command.
+
+This is a superficial test of the 'ctdb stop' command.  It trusts
+information from CTDB that indicates that the IP failover has
+happened correctly.  Another test should check that the failover
+has actually happened at the networking level.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Stop one of the nodes using the 'ctdb stop' command.
+3. Verify that the status of the node changes to 'stopped'.
+4. Verify that the public IP addresses that were being served by
+   the node are failed over to one of the other nodes.
+
+Expected results:
+
+* The status of the stopped nodes changes as expected and IP addresses
+  failover as expected.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+select_test_node_and_ips
+
+echo "Stopping node ${test_node}..."
+try_command_on_node 1 $CTDB stop -n $test_node
+
+wait_until_node_has_status $test_node stopped
+
+if wait_until_ips_are_on_nodeglob "[!${test_node}]" $test_node_ips ; then
+    echo "All IPs moved."
+else
+    echo "Some IPs didn't move."
+    testfailures=1
+fi
diff --git a/ctdb/tests/simple/42_ctdb_continue.sh b/ctdb/tests/simple/42_ctdb_continue.sh
new file mode 100755 (executable)
index 0000000..381baf5
--- /dev/null
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Verify the operation of the 'ctdb continue' command.
+
+This is a superficial test of the 'ctdb continue' command.  It trusts
+information from CTDB that indicates that the IP failover and failback
+has happened correctly.  Another test should check that the failover
+and failback has actually happened at the networking level.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Stop one of the nodes using the 'ctdb stop' command.
+3. Verify that the status of the node changes to 'stopped'.
+4. Verify that the public IP addresses that were being served by
+   the node are failed over to one of the other nodes.
+5. Use 'ctdb continue' to bring the node back online.
+6. Verify that the status of the node changes back to 'OK' and that
+   some public IP addresses move back to the node.
+
+Expected results:
+
+* The 'ctdb continue' command successfully brings a stopped node online.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+select_test_node_and_ips
+
+echo "Stopping node ${test_node}..."
+try_command_on_node 1 $CTDB stop -n $test_node
+
+wait_until_node_has_status $test_node stopped
+
+if wait_until_ips_are_on_nodeglob "[!${test_node}]" $test_node_ips ; then
+    echo "All IPs moved."
+else
+    echo "Some IPs didn't move."
+    testfailures=1
+fi
+
+echo "Continuing node $test_node"
+try_command_on_node 1 $CTDB continue -n $test_node
+
+wait_until_node_has_status $test_node notstopped
+
+wait_until_node_has_some_ips "$test_node"
diff --git a/ctdb/tests/simple/43_stop_recmaster_yield.sh b/ctdb/tests/simple/43_stop_recmaster_yield.sh
new file mode 100755 (executable)
index 0000000..e7a8250
--- /dev/null
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+test_info()  # Print this test's description (the here-document below) on stdout.
+{
+    cat <<EOF
+Verify that 'ctdb stop' causes a node to yield the recovery master role.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Determine which node is the recmaster.
+2. Stop this node using the 'ctdb stop' command.
+3. Verify that the status of the node changes to 'stopped'.
+4. Verify that this node no longer has the recovery master role.
+
+Expected results:
+
+* The 'ctdb stop' command causes a node to yield the recmaster role.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+echo "Finding out which node is the recovery master..."
+try_command_on_node -v 0 "$CTDB recmaster"
+test_node=$out
+
+echo "Stopping node ${test_node} - it is the current recmaster..."
+try_command_on_node 1 $CTDB stop -n $test_node
+
+wait_until_node_has_status $test_node stopped
+
+echo "Checking which node is the recovery master now..."
+try_command_on_node -v 0 "$CTDB recmaster"
+recmaster=$out
+
+if [ "$recmaster" != "$test_node" ] ; then
+    echo "OK: recmaster moved to node $recmaster"
+else
+    echo "BAD: recmaster did not move"
+    exit 1
+fi
diff --git a/ctdb/tests/simple/51_ctdb_bench.sh b/ctdb/tests/simple/51_ctdb_bench.sh
new file mode 100755 (executable)
index 0000000..d4f7c54
--- /dev/null
@@ -0,0 +1,92 @@
+#!/bin/bash
+
+test_info()  # Print this test's description (the here-document below) on stdout.
+{
+    cat <<EOF
+Run the ctdb_bench test and sanity check the output.
+
+This doesn't test for performance regressions or similarly anything
+useful.  Only vague sanity checking of results is done.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Run ctdb_bench on all nodes with default options.
+3. Ensure that the number of +ve and -ive messages are within 5% of
+   each other.
+4. Ensure that the number of messages per second is at least 10.
+
+Expected results:
+
+* ctdb_bench runs without error and prints reasonable results.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+try_command_on_node 0 "$CTDB listnodes"
+num_nodes=$(echo "$out" | wc -l)
+
+echo "Running ctdb_bench on all $num_nodes nodes."
+try_command_on_node -v -pq all $CTDB_TEST_WRAPPER $VALGRIND ctdb_bench -n $num_nodes
+
+# Get the last line of output.
+while read line ; do
+    prev=$line
+done <<<"$out"
+
+pat='^(Ring: [[:digit:]]+(\.[[:digit:]]+)? msgs/sec \(\+ve=[[:digit:]]+ -ve=[[:digit:]]+\)[[:space:]]?|Waiting for cluster[[:space:]]?)+$'
+sanity_check_output 1 "$pat" "$out"
+
+# $prev should look like this:
+#    Ring: 10670.93 msgs/sec (+ve=53391 -ve=53373)
+stuff="${prev##*Ring: }"
+mps="${stuff% msgs/sec*}"
+
+if [ ${mps%.*} -ge 10 ] ; then
+    echo "OK: $mps msgs/sec >= 10 msgs/sec"
+else
+    echo "BAD: $mps msgs/sec < 10 msgs/sec"
+    exit 1
+fi
+
+stuff="${stuff#*msgs/sec (+ve=}"
+positive="${stuff%% *}"
+
+if [ $positive -gt 0 ] ; then
+    echo "OK: +ive ($positive) > 0"
+else
+    echo "BAD: +ive ($positive) = 0"
+    exit 1
+fi
+
+stuff="${stuff#*-ve=}"
+negative="${stuff%)}"
+
+if [ $negative -gt 0 ] ; then
+    echo "OK: -ive ($negative) > 0"
+else
+    echo "BAD: -ive ($negative) = 0"
+    exit 1
+fi
+
+perc_diff=$(( ($positive - $negative) * 100 / $positive ))
+perc_diff=${perc_diff#-}
+
+check_percent=5
+if [ $perc_diff -le $check_percent ] ; then
+    echo "OK: percentage difference between +ive and -ive ($perc_diff%) <= $check_percent%"
+else
+    echo "BAD: percentage difference between +ive and -ive ($perc_diff%) > $check_percent%"
+    exit 1
+fi
diff --git a/ctdb/tests/simple/52_ctdb_fetch.sh b/ctdb/tests/simple/52_ctdb_fetch.sh
new file mode 100755 (executable)
index 0000000..54405d0
--- /dev/null
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+test_info()  # Print this test's description (the here-document below) on stdout.
+{
+    cat <<EOF
+Run the ctdb_fetch test and sanity check the output.
+
+This doesn't test for performance regressions or similarly anything
+useful.  Only vague sanity checking of results is done.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Run ctdb_fetch on all nodes with default options.
+3. Ensure that every line of output matches one of the expected
+   message patterns.
+4. Ensure that the number of messages per second is at least 10.
+
+Expected results:
+
+* ctdb_fetch runs without error and prints reasonable results.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+try_command_on_node 0 "$CTDB listnodes"
+num_nodes=$(echo "$out" | wc -l)
+
+echo "Running ctdb_fetch on all $num_nodes nodes."
+try_command_on_node -v -pq all $CTDB_TEST_WRAPPER $VALGRIND ctdb_fetch -n $num_nodes
+
+pat='^(Fetch: [[:digit:]]+(\.[[:digit:]]+)? msgs/sec[[:space:]]?|msg_count=[[:digit:]]+ on node [[:digit:]]|Fetching final record|DATA:|Test data|Waiting for cluster[[:space:]]?|.*: Reqid wrap!|Sleeping for [[:digit:]]+ seconds|)+$'
+sanity_check_output 1 "$pat" "$out"
+
+# Filter out the performance figures:
+out_fetch=$(echo "$out" | egrep '^(Fetch: .*)+$')
+
+# Get the last line of output.
+while read line ; do
+    prev=$line
+done <<<"$out_fetch"
+
+# $prev should look like this:
+#    Fetch: 10670.93 msgs/sec
+stuff="${prev##*Fetch: }"
+mps="${stuff% msgs/sec*}"
+
+if [ ${mps%.*} -ge 10 ] ; then
+    echo "OK: $mps msgs/sec >= 10 msgs/sec"
+else
+    echo "BAD: $mps msgs/sec < 10 msgs/sec"
+    exit 1
+fi
diff --git a/ctdb/tests/simple/53_ctdb_transaction.sh b/ctdb/tests/simple/53_ctdb_transaction.sh
new file mode 100755 (executable)
index 0000000..b99b3a9
--- /dev/null
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+test_info()  # Print this test's description (the here-document below) on stdout.
+{
+    cat <<EOF
+Verify that the ctdb_transaction test succeeds.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Run two copies of ctdb_transaction on each node with a 30 second
+   timeout.
+3. Ensure that all ctdb_transaction processes complete successfully.
+
+Expected results:
+
+* ctdb_transaction runs without error.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+try_command_on_node 0 "$CTDB listnodes"
+num_nodes=$(echo "$out" | wc -l)
+
+if test "x${CTDB_TEST_TIMELIMIT}" == "x" ; then
+       CTDB_TEST_TIMELIMIT=30
+fi
+
+t="$CTDB_TEST_WRAPPER $VALGRIND ctdb_transaction --timelimit=${CTDB_TEST_TIMELIMIT}"
+
+echo "Running ctdb_transaction on all $num_nodes nodes."
+try_command_on_node -v -pq all "$t & $t"
diff --git a/ctdb/tests/simple/54_ctdb_transaction_recovery.sh b/ctdb/tests/simple/54_ctdb_transaction_recovery.sh
new file mode 100755 (executable)
index 0000000..d796e94
--- /dev/null
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+test_info()  # Print this test's description (the here-document below) on stdout.
+{
+    cat <<EOF
+Verify that the ctdb_transaction test succeeds during repeated recoveries.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Start a background loop that triggers 'ctdb recover' repeatedly.
+3. Run two copies of ctdb_transaction on each node (30 second timeout).
+4. Ensure that all ctdb_transaction processes complete successfully.
+
+Expected results:
+
+* ctdb_transaction runs without error.
+EOF
+}
+
+recovery_loop()  # Endlessly ask node 0 to run 'ctdb recover', pausing 2s between rounds.
+{
+       local attempt=1
+
+       while : ; do
+               echo "Recovery $attempt"
+               try_command_on_node 0 $CTDB recover
+               sleep 2
+               attempt=$(( attempt + 1 ))
+       done
+}
+
+recovery_loop_start()  # Run recovery_loop in the background and register a command that kills it.
+{
+       recovery_loop > /tmp/recloop.out &      # loop output goes to a fixed file under /tmp
+       RECLOOP_PID=$!                          # PID of the background loop (not local, so global)
+       ctdb_test_exit_hook_add "kill $RECLOOP_PID >/dev/null 2>&1"  # presumably run at test exit - confirm in integration.bash
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+try_command_on_node 0 "$CTDB listnodes"
+num_nodes=$(echo "$out" | wc -l)
+
+if test "x${CTDB_TEST_TIMELIMIT}" == "x" ; then
+       CTDB_TEST_TIMELIMIT=30
+fi
+
+t="$CTDB_TEST_WRAPPER $VALGRIND ctdb_transaction --timelimit=${CTDB_TEST_TIMELIMIT}"
+
+echo "Starting recovery loop"
+recovery_loop_start
+
+echo "Running ctdb_transaction on all $num_nodes nodes."
+try_command_on_node -v -pq all "$t & $t"
+
diff --git a/ctdb/tests/simple/60_recoverd_missing_ip.sh b/ctdb/tests/simple/60_recoverd_missing_ip.sh
new file mode 100755 (executable)
index 0000000..0734aee
--- /dev/null
@@ -0,0 +1,109 @@
+#!/bin/bash
+
+test_info()  # Print this test's description (the here-document below) on stdout.
+{
+    cat <<EOF
+Verify that the recovery daemon handles unhosted IPs properly.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+select_test_node_and_ips
+
+echo "Running test against node $test_node and IP $test_ip"
+
+# Find the interface
+try_command_on_node $test_node "$CTDB ip -v -Y | awk -F: -v ip=$test_ip '\$2 == ip { print \$4 }'"
+iface="$out"
+
+if [ -z "$TEST_LOCAL_DAEMONS" ] ; then
+    # Find the netmask
+    try_command_on_node $test_node ip addr show to $test_ip
+    mask="${out##*/}"
+    mask="${mask%% *}"
+else
+    mask="24"
+fi
+
+echo "$test_ip/$mask is on $iface"
+
+# Push out the next monitor event so it is less likely to be cancelled
+# and result in services not being restarted properly.
+try_command_on_node $test_node $CTDB eventscript monitor
+
+echo "Deleting IP $test_ip from all nodes"
+try_command_on_node -v $test_node $CTDB delip -n all $test_ip
+
+wait_until_ips_are_on_nodeglob '!' $test_node $test_ip
+
+try_command_on_node -v all $CTDB ip
+
+my_exit_hook ()
+{
+    # Re-enable the 10.interface event script on all nodes; skipped when TEST_LOCAL_DAEMONS is set.
+    [ -z "$TEST_LOCAL_DAEMONS" ] || return 0
+    onnode -q all $CTDB enablescript "10.interface"
+}
+
+ctdb_test_exit_hook_add my_exit_hook
+
+# This forces us to wait until the ipreallocated associated with the
+# delips is complete.
+try_command_on_node $test_node $CTDB sync
+
+# This effectively cancels any monitor event that is in progress and
+# runs a new one
+try_command_on_node $test_node $CTDB eventscript monitor
+
+if [ -z "$TEST_LOCAL_DAEMONS" ] ; then
+    # Stop monitor events from bringing up the link status of an interface
+    try_command_on_node $test_node $CTDB disablescript 10.interface
+fi
+
+echo "Marking interface $iface down on node $test_node"
+try_command_on_node $test_node $CTDB setifacelink $iface down
+
+try_command_on_node $test_node $CTDB clearlog recoverd
+
+echo "Adding IP $test_ip to node $test_node"
+try_command_on_node $test_node $CTDB addip $test_ip/$mask $iface
+
+# Give the recovery daemon enough time to start doing IP verification
+sleep_for 15
+
+try_command_on_node $test_node $CTDB getlog recoverd
+
+msg="Public IP '$test_ip' is not assigned and we could serve it"
+
+if grep "$msg"  <<<"$out" ; then
+    echo "BAD: the recovery daemon noticed that the IP was unhosted"
+    exit 1
+else
+    echo "GOOD: the recovery daemon did not notice that the IP was unhosted"
+fi
+
+try_command_on_node $test_node $CTDB clearlog recoverd
+
+echo "Marking interface $iface up on node $test_node"
+try_command_on_node $test_node $CTDB setifacelink $iface up
+
+wait_until_ips_are_on_nodeglob $test_node $test_ip
+
+try_command_on_node -v $test_node $CTDB getlog recoverd
+
+if grep "$msg" <<<"$out" ; then
+    echo "GOOD: the recovery daemon noticed that the IP was unhosted"
+else
+    echo "BAD: the recovery daemon did not notice that the IP was unhosted"
+    exit 1
+fi
diff --git a/ctdb/tests/simple/70_recoverpdbbyseqnum.sh b/ctdb/tests/simple/70_recoverpdbbyseqnum.sh
new file mode 100755 (executable)
index 0000000..612366c
--- /dev/null
@@ -0,0 +1,232 @@
+#!/bin/bash
+
+test_info()  # Print this test's description (the here-document below) on stdout.
+{
+    cat <<EOF
+The tunable RecoverPDBBySeqNum controls how we perform recovery
+on persistent databases.
+The default is that persistent databases are recovered exactly the same
+way as normal databases. That is that we recover record by record.
+
+If RecoverPDBBySeqNum is set to 1 AND if a record with the key
+"__db_sequence_number__" can be found in the database, then instead we will
+perform the recovery by picking the copy of the database from the node
+that has the highest sequence number and ignore the content on all other
+nodes.
+
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. create a persistent test database
+3. test that RecoverPDBBySeqNum==0 and no seqnum record blends the database
+   during recovery
+4. test that RecoverPDBBySeqNum==0 and seqnum record blends the database
+   during recovery
+5. test that RecoverPDBBySeqNum==1 and no seqnum record blends the database
+   during recovery
+6. test that RecoverPDBBySeqNum==1 and seqnum record does not blend the database
+   during recovery
+
+Expected results:
+
+* that 3,4,5 will blend the databases and that 6 will recover the highest seqnum
+  database
+
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+try_command_on_node 0 "$CTDB listnodes"
+num_nodes=$(echo "$out" | wc -l)
+
+# create a temporary persistent database to test with
+echo create persistent test database persistent_test.tdb
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb attach persistent_test.tdb persistent
+
+
+# set RecoverPDBBySeqNum=0
+echo "setting RecoverPDBBySeqNum to 0"
+try_command_on_node -q all $CTDB_TEST_WRAPPER ctdb setvar RecoverPDBBySeqNum 0
+
+
+
+# 3,
+# If RecoverPDBBySeqNum==0  and no __db_sequence_number__
+# recover record by record
+#
+# wipe database
+echo
+echo test that RecoverPDBBySeqNum==0 and no __db_sequence_number__ blends the database during recovery
+echo wipe the test database
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb wipedb persistent_test.tdb
+
+# add one record to node 0   key==ABC  data==ABC
+TDB=`try_command_on_node -v -q 0 $CTDB_TEST_WRAPPER ctdb getdbmap | grep persistent_test.tdb | sed -e "s/.*path://" -e "s/ .*//"`
+echo "store key(ABC) data(ABC) on node 0"
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb tstore $TDB 0x414243 0x070000000000000000000000000000000000000000000000414243
+#
+# add one record to node 1   key==DEF  data==DEF
+TDB=`try_command_on_node -v -q 1 $CTDB_TEST_WRAPPER ctdb getdbmap | grep persistent_test.tdb | sed -e "s/.*path://" -e "s/ .*//"`
+echo "store key(DEF) data(DEF) on node 1"
+try_command_on_node -q 1 $CTDB_TEST_WRAPPER ctdb tstore $TDB 0x444546 0x070000000000000000000000000000000000000000000000444546
+
+# force a recovery
+echo force a recovery
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb recover
+
+# check that we now have both records on node 0
+num_records=$(try_command_on_node -v -pq 0 $CTDB_TEST_WRAPPER ctdb cattdb persistent_test.tdb | grep key | egrep "ABC|DEF" | wc -l)
+[ $num_records != "2" ] && {
+    echo "BAD: we did not end up with the expected two records after the recovery"
+    exit 1
+}
+echo "OK. databases were blended"
+
+
+
+# 4,
+# If RecoverPDBBySeqNum==0  and __db_sequence_number__
+# recover record by record
+#
+# wipe database
+echo
+echo test that RecoverPDBBySeqNum==0 and __db_sequence_number__ blends the database during recovery
+echo wipe the test database
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb wipedb persistent_test.tdb
+
+echo "add __db_sequence_number__==5 record to all nodes"
+try_command_on_node -v 0 $CTDB_TEST_WRAPPER ctdb nodestatus all | grep pnn | sed -e"s/^pnn://" -e "s/ .*//" | while read PNN; do
+    TDB=`try_command_on_node -v -q $PNN $CTDB_TEST_WRAPPER ctdb getdbmap | grep persistent_test.tdb | sed -e "s/.*path://" -e "s/ .*//"`
+    try_command_on_node -q $PNN $CTDB_TEST_WRAPPER ctdb tstore $TDB 0x5f5f64625f73657175656e63655f6e756d6265725f5f00 0x0700000000000000000000000000000000000000000000000500000000000000
+done
+
+# add one record to node 0   key==ABC  data==ABC
+TDB=`try_command_on_node -v -q 0 $CTDB_TEST_WRAPPER ctdb getdbmap | grep persistent_test.tdb | sed -e "s/.*path://" -e "s/ .*//"`
+echo "store key(ABC) data(ABC) on node 0"
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb tstore $TDB 0x414243 0x070000000000000000000000000000000000000000000000414243
+echo "add __db_sequence_number__==7 record to node 0"
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb tstore $TDB 0x5f5f64625f73657175656e63655f6e756d6265725f5f00 0x0700000000000000000000000000000000000000000000000700000000000000
+
+# add one record to node 1   key==DEF  data==DEF
+TDB=`try_command_on_node -v -q 1 $CTDB_TEST_WRAPPER ctdb getdbmap | grep persistent_test.tdb | sed -e "s/.*path://" -e "s/ .*//"`
+echo "store key(DEF) data(DEF) on node 1"
+try_command_on_node -q 1 $CTDB_TEST_WRAPPER ctdb tstore $TDB 0x444546 0x070000000000000000000000000000000000000000000000444546
+echo "add __db_sequence_number__==8 record to node 1"
+try_command_on_node -q 1 $CTDB_TEST_WRAPPER ctdb tstore $TDB 0x5f5f64625f73657175656e63655f6e756d6265725f5f00 0x0700000000000000000000000000000000000000000000000800000000000000
+
+# force a recovery
+echo force a recovery
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb recover
+
+# check that we now have both records on node 0
+num_records=$(try_command_on_node -v -pq 0 $CTDB_TEST_WRAPPER ctdb cattdb persistent_test.tdb | grep key | egrep "ABC|DEF" | wc -l)
+[ $num_records != "2" ] && {
+    echo "BAD: we did not end up with the expected two records after the recovery"
+    exit 1
+}
+echo "OK. databases were blended"
+
+
+
+# set RecoverPDBBySeqNum=1
+echo
+echo "setting RecoverPDBBySeqNum to 1"
+try_command_on_node -q all $CTDB_TEST_WRAPPER ctdb setvar RecoverPDBBySeqNum 1
+
+
+
+# 5,
+# If RecoverPDBBySeqNum==1  and no __db_sequence_number__
+# recover record by record
+#
+# wipe database
+echo
+echo test that RecoverPDBBySeqNum==1 and no __db_sequence_number__ blends the database during recovery
+echo wipe the test database
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb wipedb persistent_test.tdb
+
+# add one record to node 0   key==ABC  data==ABC
+TDB=`try_command_on_node -v -q 0 $CTDB_TEST_WRAPPER ctdb getdbmap | grep persistent_test.tdb | sed -e "s/.*path://" -e "s/ .*//"`
+echo "store key(ABC) data(ABC) on node 0"
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb tstore $TDB 0x414243 0x070000000000000000000000000000000000000000000000414243
+
+# add one record to node 1   key==DEF  data==DEF
+TDB=`try_command_on_node -v -q 1 $CTDB_TEST_WRAPPER ctdb getdbmap | grep persistent_test.tdb | sed -e "s/.*path://" -e "s/ .*//"`
+echo "store key(DEF) data(DEF) on node 1"
+try_command_on_node -q 1 $CTDB_TEST_WRAPPER ctdb tstore $TDB 0x444546 0x070000000000000000000000000000000000000000000000444546
+
+# force a recovery
+echo force a recovery
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb recover
+
+# check that we now have both records on node 0
+num_records=$(try_command_on_node -v -pq 0 $CTDB_TEST_WRAPPER ctdb cattdb persistent_test.tdb | grep key | egrep "ABC|DEF" | wc -l)
+[ $num_records != "2" ] && {
+    echo "BAD: we did not end up with the expected two records after the recovery"
+    exit 1
+}
+echo "OK. databases were blended"
+
+
+
+# 6,
+# If RecoverPDBBySeqNum==1  and __db_sequence_number__
+# recover whole database
+#
+# wipe database
+echo
+echo test that RecoverPDBBySeqNum==1 and __db_sequence_number__ does not blend the database during recovery
+echo wipe the test database
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb wipedb persistent_test.tdb
+
+echo "add __db_sequence_number__==5 record to all nodes"
+try_command_on_node -v 0 $CTDB_TEST_WRAPPER ctdb nodestatus all | grep pnn | sed -e"s/^pnn://" -e "s/ .*//" | while read PNN; do
+    TDB=`try_command_on_node -v -q $PNN $CTDB_TEST_WRAPPER ctdb getdbmap | grep persistent_test.tdb | sed -e "s/.*path://" -e "s/ .*//"`
+    try_command_on_node -q $PNN $CTDB_TEST_WRAPPER ctdb tstore $TDB 0x5f5f64625f73657175656e63655f6e756d6265725f5f00 0x0700000000000000000000000000000000000000000000000500000000000000
+done
+
+
+# add one record to node 0   key==ABC  data==ABC
+TDB=`try_command_on_node -v -q 0 $CTDB_TEST_WRAPPER ctdb getdbmap | grep persistent_test.tdb | sed -e "s/.*path://" -e "s/ .*//"`
+echo "store key(ABC) data(ABC) on node 0"
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb tstore $TDB 0x414243 0x070000000000000000000000000000000000000000000000414243
+echo "add __db_sequence_number__==7 record to node 0"
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb tstore $TDB 0x5f5f64625f73657175656e63655f6e756d6265725f5f00 0x0700000000000000000000000000000000000000000000000700000000000000
+
+# add one record to node 1   key==DEF  data==DEF
+TDB=`try_command_on_node -v -q 1 $CTDB_TEST_WRAPPER ctdb getdbmap | grep persistent_test.tdb | sed -e "s/.*path://" -e "s/ .*//"`
+echo "store key(DEF) data(DEF) on node 1"
+try_command_on_node -q 1 $CTDB_TEST_WRAPPER ctdb tstore $TDB 0x444546 0x070000000000000000000000000000000000000000000000444546
+echo "add __db_sequence_number__==8 record to node 1"
+try_command_on_node -q 1 $CTDB_TEST_WRAPPER ctdb tstore $TDB 0x5f5f64625f73657175656e63655f6e756d6265725f5f00 0x0700000000000000000000000000000000000000000000000800000000000000
+
+# force a recovery
+echo force a recovery
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb recover
+
+# check that we now have only one of the two records on node 0
+num_records=$(try_command_on_node -v -pq 0 $CTDB_TEST_WRAPPER ctdb cattdb persistent_test.tdb | grep key | egrep "ABC|DEF" | wc -l)
+[ $num_records != "1" ] && {
+    echo "BAD: we did not end up with the expected single record after the recovery"
+    exit 1
+}
+
+echo "OK. databases were not blended"
+
+
+
+# set RecoverPDBBySeqNum back to 0
+echo "setting RecoverPDBBySeqNum back to 0"
+try_command_on_node -q all $CTDB_TEST_WRAPPER ctdb setvar RecoverPDBBySeqNum 0
diff --git a/ctdb/tests/simple/71_ctdb_wipedb.sh b/ctdb/tests/simple/71_ctdb_wipedb.sh
new file mode 100755 (executable)
index 0000000..0cd07cc
--- /dev/null
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+test_info()  # Print this test's description (the here-document below) on stdout.
+{
+    cat <<EOF
+The command 'ctdb wipedb' is used to clear a database across the whole
+cluster.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. create a persistent test database
+3, add some records to node #0 and node #1
+4, perform wipedb on node #0 and verify the database is empty on both node 0 and 1
+
+Expected results:
+
+* that 4 will result in empty database
+
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+try_command_on_node 0 "$CTDB listnodes"
+num_nodes=$(echo "$out" | wc -l)
+
+# create a temporary persistent database to test with
+echo create persistent test database persistent_test.tdb
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb attach persistent_test.tdb persistent
+
+
+# 3,
+# add one record to node 0   key==ABC  data==ABC
+TDB=`try_command_on_node -v -q 0 $CTDB_TEST_WRAPPER ctdb getdbmap | grep persistent_test.tdb | sed -e "s/.*path://" -e "s/ .*//"`
+echo "store key(ABC) data(ABC) on node 0"
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb tstore $TDB 0x414243 0x070000000000000000000000000000000000000000000000414243
+#
+# add one record to node 1   key==DEF  data==DEF
+TDB=`try_command_on_node -v -q 1 $CTDB_TEST_WRAPPER ctdb getdbmap | grep persistent_test.tdb | sed -e "s/.*path://" -e "s/ .*//"`
+echo "store key(DEF) data(DEF) on node 1"
+try_command_on_node -q 1 $CTDB_TEST_WRAPPER ctdb tstore $TDB 0x444546 0x070000000000000000000000000000000000000000000000444546
+
+
+# 4,
+echo wipe the persistent test database
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb wipedb persistent_test.tdb
+echo force a recovery
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb recover
+
+# check that the database is wiped
+num_records=$(try_command_on_node -v -pq 1 $CTDB_TEST_WRAPPER ctdb cattdb persistent_test.tdb | grep key | wc -l)
+[ $num_records != "0" ] && {
+    echo "BAD: we did not end up with an empty database"
+    exit 1
+}
+echo "OK. database was wiped"
+
diff --git a/ctdb/tests/simple/72_update_record_persistent.sh b/ctdb/tests/simple/72_update_record_persistent.sh
new file mode 100755 (executable)
index 0000000..254ce19
--- /dev/null
@@ -0,0 +1,89 @@
+#!/bin/bash
+
+test_info()  # Print this test's description (the here-document below) on stdout.
+{
+    cat <<EOF
+UPDATE_RECORD control should be able to create new records and update
+existing records in a persistent database.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. create a persistent test database
+3. wipe the database to make sure it is empty
+4. create a new record
+5. update the record
+
+Expected results:
+
+* 4 created record found in the tdb
+* 5 updated record found in the tdb
+
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+try_command_on_node 0 "$CTDB listnodes"
+num_nodes=$(echo "$out" | wc -l)
+
+TDB=persistent_test.tdb
+
+# create a temporary persistent database to test with
+echo create persistent test database $TDB
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb attach $TDB persistent
+
+
+# 3,
+echo wipe the persistent test database
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb wipedb $TDB
+echo force a recovery
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb recover
+
+# check that the database is wiped
+num_records=$(try_command_on_node -v -pq 1 $CTDB_TEST_WRAPPER ctdb cattdb $TDB | grep key | wc -l)
+[ $num_records != "0" ] && {
+    echo "BAD: we did not end up with an empty database"
+    exit 1
+}
+echo "OK. database was wiped"
+
+# 4,
+echo Create a new record in the persistent database using UPDATE_RECORD
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb_update_record_persistent  --database=$TDB --record=Update_Record_Persistent --value=FirstValue
+
+try_command_on_node -q 0 "ctdb cattdb $TDB | grep 'FirstValue' | wc -l"  # count records holding FirstValue
+[ "$out" != 1 ] && {  # quoted so that an empty \$out is a failed comparison, not a syntax error
+    echo "BAD: we did not find the record after the create/update"
+    exit 1
+}
+
+# 5,
+echo Modify an existing record in the persistent database using UPDATE_RECORD
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb_update_record_persistent  --database=$TDB --record=Update_Record_Persistent --value=SecondValue
+
+try_command_on_node -q 0 "ctdb cattdb $TDB | grep 'FirstValue' | wc -l"
+[ $out != 0 ] && {
+    echo "BAD: we still found the old record after the modify/update"
+    exit 1
+}
+
+try_command_on_node -q 0 "ctdb cattdb $TDB | grep 'SecondValue' | wc -l"
+[ $out != 1 ] && {
+    echo "BAD: could not find the record after the modify/update"
+    exit 1
+}
+
+
+echo wipe the persistent test databases and clean up
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb wipedb $TDB
diff --git a/ctdb/tests/simple/73_tunable_NoIPTakeover.sh b/ctdb/tests/simple/73_tunable_NoIPTakeover.sh
new file mode 100755 (executable)
index 0000000..eee3da9
--- /dev/null
@@ -0,0 +1,83 @@
+#!/bin/bash
+
+test_info()  # Print this test's description (the here-document below) on stdout.
+{
+    cat <<EOF
+Verify that 'ctdb setvar NoIPTakeover 1' stops IP addresses from being
+failed over onto the node.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. Use 'ctdb setvar NoIPTakeover 1' to set the tunable on node 1.
+3. Use 'ctdb disable' on node 1 and verify that it no longer hosts
+   any public IP addresses.
+4. Use 'ctdb enable' on node 1 and verify that it still hosts none.
+
+Expected results:
+
+* Node 1 does not take over any IP addresses after being re-enabled.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+try_command_on_node 0 "$CTDB listnodes | wc -l"
+num_nodes="$out"
+echo "There are $num_nodes nodes..."
+
+if [ $num_nodes -lt 2 ] ; then
+    echo "Less than 2 nodes!"
+    exit 1
+fi
+
+
+echo "Wait until the ips are reallocated"
+sleep 30
+try_command_on_node -q 0 "$CTDB ipreallocate"
+
+num=`try_command_on_node -v 1 "$CTDB ip" | grep -v Public | egrep " 1$" | wc -l`
+echo "Number of addresses on node 1 : $num"
+
+
+echo "Turning on NoIPTakeover on node 1"
+try_command_on_node -q 1 "$CTDB setvar NoIPTakeover 1"
+try_command_on_node -q 1 "$CTDB ipreallocate"
+
+echo Disable node 1
+try_command_on_node -q 1 "$CTDB disable"
+try_command_on_node -q 1 "$CTDB ipreallocate"
+num=`try_command_on_node -v 1 "$CTDB ip" | grep -v Public | egrep " 1$" | wc -l`
+echo "Number of addresses on node 1 : $num"
+[ "$num" != "0" ] && {
+    echo "BAD: node 1 still hosts ip addresses"
+    exit 1
+}
+
+
+echo "Enable node 1 again"
+try_command_on_node -q 1 "$CTDB enable"
+sleep 30
+try_command_on_node -q 1 "$CTDB ipreallocate"
+try_command_on_node -q 1 "$CTDB ipreallocate"
+num=`try_command_on_node -v 1 "$CTDB ip" | grep -v Public | egrep " 1$" | wc -l`
+echo "Number of addresses on node 1 : $num"
+[ "$num" != "0" ] && {
+    echo "BAD: node took over ip addresses"
+    exit 1
+}
+
+
+echo "OK. ip addresses were not taken over"
+exit 0
diff --git a/ctdb/tests/simple/75_readonly_records_basic.sh b/ctdb/tests/simple/75_readonly_records_basic.sh
new file mode 100755 (executable)
index 0000000..f243ea1
--- /dev/null
@@ -0,0 +1,163 @@
+#!/bin/bash
+
+test_info()  # Print this test's description (the here-document below) on stdout.
+{
+    cat <<EOF
+Readonly records can be activated at runtime using a ctdb command.
+If readonly records are not activated, then any attempt to fetch a readonly
+copy should be automatically upgraded to a read-write fetch_lock().
+
+If readonly delegations are present, then any attempt to acquire a read-write
+fetch_lock will trigger all delegations to be revoked before the fetch lock
+completes.
+
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Verify that the status on all of the ctdb nodes is 'OK'.
+2. create a test database and some records
+3. try to fetch readonly records, this should not result in any delegations
+4. activate readonly support
+5. try to fetch readonly records, this should result in delegations
+6. do a fetchlock  and the delegations should be revoked
+7. try to fetch readonly records, this should result in delegations
+8. do a recovery  and the delegations should be revoked
+
+Expected results:
+
+3. No delegations created when db is not in readonly mode
+4. It is possible to activate readonly support for a database
+5. Delegations should be created
+6. Delegations should be revoked
+8. Delegations should be revoked
+
+
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+try_command_on_node 0 "$CTDB listnodes"
+num_nodes=$(echo "$out" | wc -l)
+
+
+# create a temporary database to test with
+echo create test database test.tdb
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb attach test.tdb
+
+
+# create some records
+try_command_on_node -q all $CTDB_TEST_WRAPPER ctdb_update_record
+
+#
+# 3
+# try readonly requests
+echo Try some readonly fetches, these should all be upgraded to full fetchlocks
+try_command_on_node -q 0,1,2 $CTDB_TEST_WRAPPER "ctdb_fetch_readonly_once </dev/null"
+
+# no delegations should have been created
+numreadonly=`try_command_on_node -v all $CTDB_TEST_WRAPPER ctdb cattdb test.tdb | grep READONLY | wc -l`
+[ "$numreadonly" != "0" ] && {
+    echo "BAD: readonly delegations were created, but the feature is not activated on the database"
+    exit 1
+}
+
+
+#
+# 4
+#
+
+echo Activating ReadOnly record support for test.tdb ...
+# activate readonly support
+try_command_on_node -q all $CTDB_TEST_WRAPPER ctdb setdbreadonly test.tdb
+numreadonly=`try_command_on_node -v 0 $CTDB_TEST_WRAPPER ctdb getdbmap | grep READONLY | wc -l`
+[ "$numreadonly" != "1" ] && {
+    echo BAD: could not activate readonly support for the test database
+    exit 1
+}
+
+
+
+#
+# 5
+#
+
+echo Create some readonly delegations ...
+# fetch record to node 1 and make it dmaster
+try_command_on_node -q 1 $CTDB_TEST_WRAPPER ctdb_update_record
+
+# fetch readonly to node 0
+try_command_on_node -v 0 $CTDB_TEST_WRAPPER "ctdb_fetch_readonly_once </dev/null"
+
+numreadonly=`try_command_on_node -v all $CTDB_TEST_WRAPPER ctdb cattdb test.tdb | grep RO_HAVE | wc -l`
+[ "$numreadonly" != "2" ] && {
+    echo BAD: could not create readonly delegation
+    exit 1
+}
+
+
+
+
+#
+# 6
+#
+
+echo verify that a fetchlock will revoke the delegations ...
+# fetch record to node 1 and make it dmaster
+try_command_on_node -q 1 $CTDB_TEST_WRAPPER ctdb_update_record
+
+numreadonly=`try_command_on_node -v all $CTDB_TEST_WRAPPER ctdb cattdb test.tdb | grep RO_HAVE | wc -l`
+[ "$numreadonly" != "0" ] && {
+    echo BAD: fetchlock did not revoke delegations
+    exit 1
+}
+
+
+#
+# 7
+#
+
+echo Create some readonly delegations ...
+# fetch record to node 1 and make it dmaster
+try_command_on_node -q 1 $CTDB_TEST_WRAPPER ctdb_update_record
+
+# fetch readonly to node 0
+try_command_on_node -v 0 $CTDB_TEST_WRAPPER "ctdb_fetch_readonly_once </dev/null"
+
+numreadonly=`try_command_on_node -v all $CTDB_TEST_WRAPPER ctdb cattdb test.tdb | grep RO_HAVE | wc -l`
+[ "$numreadonly" != "2" ] && {
+    echo BAD: could not create readonly delegation
+    exit 1
+}
+
+
+
+
+#
+# 8
+#
+
+echo verify that a recovery will revoke the delegations ...
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb recover
+
+numreadonly=`try_command_on_node -v all $CTDB_TEST_WRAPPER ctdb cattdb test.tdb | grep RO_HAVE | wc -l`
+[ "$numreadonly" != "0" ] && {
+    echo BAD: recovery did not revoke delegations
+    exit 1
+}
+
+echo OK. test completed successfully
+exit 0
diff --git a/ctdb/tests/simple/76_ctdb_pdb_recovery.sh b/ctdb/tests/simple/76_ctdb_pdb_recovery.sh
new file mode 100755 (executable)
index 0000000..096b9d5
--- /dev/null
@@ -0,0 +1,111 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+The recovery process based on RSN for persistent databases is defective.
+For persistent databases sequence number based recovery method should be
+used. This test checks for the defect in the RSN based recovery method
+for persistent databases and confirms that the same issue is not observed
+when using sequence number based recovery method.
+
+Steps:
+
+1. Create a persistent database
+2. Add a record and update it few times.
+3. Delete the record
+4. Turn off one of the nodes
+5. Add a record with same key.
+6. Turn on the stopped node
+
+Expected results:
+
+* Check that the record is deleted (RSN based recovery) and record is
+  present (sequence number based recovery)
+
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+do_test()
+{
+# Wipe Test database
+echo "wipe test database"
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb wipedb $TESTDB
+
+# Add a record   key=test1 data=value1
+# and update values
+for value in value1 value2 value3 value4 value5 ; do
+       echo "store key(test1) data($value)"
+       try_command_on_node -q 0 "(echo -ne $value > /tmp/test_data)"
+       try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb pstore $TESTDB test1 /tmp/test_data
+done
+
+# Delete record
+echo "delete key(test1)"
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb pdelete $TESTDB test1
+
+# Stop a node
+echo "stop node 1"
+try_command_on_node -q 1 $CTDB_TEST_WRAPPER ctdb stop
+
+wait_until_node_has_status 1 stopped
+
+# Add a record   key=test1 data=newvalue1
+echo "store key(test1) data(newvalue1)"
+try_command_on_node -q 0 "(echo -ne newvalue1 > /tmp/test_data)"
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb pstore $TESTDB test1 /tmp/test_data
+
+# Continue node
+echo "contine node 1"
+try_command_on_node -q 1 $CTDB_TEST_WRAPPER ctdb continue
+
+wait_until_node_has_status 1 notstopped
+
+}
+
+#
+# Main test
+#
+# Runs do_test twice against the same persistent database: first with
+# RecoverPDBBySeqNum set to 0, then with it set to 1.  After each run
+# the check passes only if key test1 can still be fetched on node 0;
+# any failed check sets status=1, which becomes the script's exit code.
+#
+TESTDB="persistent_test.tdb"
+
+status=0
+
+# Create a temporary persistent database to test with
+echo "create persistent test database $TESTDB"
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb attach $TESTDB persistent
+
+echo "set RecoverPDBBySeqNum to 0"
+try_command_on_node -q all $CTDB_TEST_WRAPPER ctdb setvar RecoverPDBBySeqNum 0
+
+# NOTE(review): test_info above says the record is expected to be
+# deleted with RSN based recovery, yet this check treats a missing
+# record as a failure -- confirm which expectation is intended.
+do_test
+if try_command_on_node 0 $CTDB_TEST_WRAPPER ctdb pfetch $TESTDB test1 ; then
+       echo "GOOD: Record was not deleted (recovery by RSN worked)"
+else
+       # pfetch failed, i.e. the record is gone
+       echo "BAD: Record was deleted"
+       status=1
+fi
+
+# Set RecoverPDBBySeqNum = 1
+echo "set RecoverPDBBySeqNum to 1"
+try_command_on_node -q all $CTDB_TEST_WRAPPER ctdb setvar RecoverPDBBySeqNum 1
+
+do_test
+if try_command_on_node 0 $CTDB_TEST_WRAPPER ctdb pfetch $TESTDB test1 ; then
+       echo "GOOD: Record was not deleted (recovery by sequnce number worked)"
+else
+       echo "BAD: Record was deleted"
+       status=1
+fi
+
+exit $status
diff --git a/ctdb/tests/simple/77_ctdb_db_recovery.sh b/ctdb/tests/simple/77_ctdb_db_recovery.sh
new file mode 100755 (executable)
index 0000000..00fa096
--- /dev/null
@@ -0,0 +1,133 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Recovery can under certain circumstances lead to old record copies
+resurrecting: Recovery selects the newest record copy purely by RSN. At
+the end of the recovery, the recovery master is the dmaster for all
+records in all (non-persistent) databases. And the other nodes locally
+hold the complete copy of the databases. The bug is that the recovery
+process does not increment the RSN on the recovery master at the end of
+the recovery. Now clients acting directly on the Recovery master will
+directly change a record's content on the recmaster without migration
+and hence without RSN bump.  So a subsequent recovery can not tell that
+the recmaster's copy is newer than the copies on the other nodes, since
+their RSN is the same. Hence, if the recmaster is not node 0 (or more
+precisely not the active node with the lowest node number), the recovery
+will choose copies from nodes with lower number and stick to these.
+
+Steps:
+
+1. Create a test database
+2. Add a record with value value1 on recovery master
+3. Force a recovery
+4. Update the record with value value2 on recovery master
+5. Force a recovery
+6. Fetch the record
+
+Expected results:
+
+* The record should have value value2 and not value1
+
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+#
+# Main test
+#
+TESTDB="rec_test.tdb"
+
+status=0
+
+# Make sure node 0 is not the recovery master
+echo "find out which node is recmaster"
+try_command_on_node -q any $CTDB_TEST_WRAPPER ctdb recmaster
+recmaster="$out"
+if [ "$recmaster" = "0" ]; then
+    echo "node 0 is recmaster, disable recmasterrole on node 0"
+    #
+    # Note:
+    # It should be sufficient to run "ctdb setrecmasterrole off"
+    # on node 0 and wait for election and recovery to finish.
+    # But there were problems related to this in this automatic
+    # test, so for now use "ctdb stop" and "ctdb continue".
+    #
+    echo "stop node 0"
+    try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb stop
+    wait_until_node_has_status 0 stopped
+    echo "continue node 0"
+    try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb continue
+    wait_until_node_has_status 0 notstopped
+
+    try_command_on_node -q any $CTDB_TEST_WRAPPER ctdb recmaster
+    recmaster="$out"
+    if [ "$recmaster" = "0" ]; then
+       echo "failed to move recmaster to different node"
+       exit 1
+    fi
+fi
+
+echo "Recmaster:$recmaster"
+
+# Create a temporary non-persistent database to test with
+echo "create test database $TESTDB"
+try_command_on_node -q $recmaster $CTDB_TEST_WRAPPER ctdb attach $TESTDB
+
+# Wipe Test database
+echo "wipe test database"
+try_command_on_node -q $recmaster $CTDB_TEST_WRAPPER ctdb wipedb $TESTDB
+
+# Add a record   key=test1 data=value1
+echo "store key(test1) data(value1)"
+try_command_on_node -q $recmaster $CTDB_TEST_WRAPPER ctdb writekey $TESTDB test1 value1
+
+# Fetch a record   key=test1
+echo "read key(test1)"
+try_command_on_node -q $recmaster $CTDB_TEST_WRAPPER ctdb readkey $TESTDB test1
+echo "$out"
+
+# Do a recovery
+echo "force recovery"
+try_command_on_node -q $recmaster $CTDB_TEST_WRAPPER ctdb recover
+
+wait_until_node_has_status $recmaster recovered
+
+# Add a record   key=test1 data=value2
+echo "store key(test1) data(value2)"
+try_command_on_node -q $recmaster $CTDB_TEST_WRAPPER ctdb writekey $TESTDB test1 value2
+
+# Fetch a record   key=test1
+echo "read key(test1)"
+try_command_on_node -q $recmaster $CTDB_TEST_WRAPPER ctdb readkey $TESTDB test1
+echo "$out"
+
+# Do a recovery
+echo "force recovery"
+try_command_on_node -q $recmaster $CTDB_TEST_WRAPPER ctdb recover
+
+wait_until_node_has_status $recmaster recovered
+
+# Verify record   key=test1
+echo "read key(test1)"
+try_command_on_node $recmaster $CTDB_TEST_WRAPPER ctdb readkey $TESTDB test1
+echo "$out"
+if [ "$out" = "Data: size:6 ptr:[value2]" ]; then
+       echo "GOOD: Recovery did not corrupt database"
+else
+       echo "BAD: Recovery corrupted database"
+       status=1
+fi
+
+exit $status
diff --git a/ctdb/tests/simple/80_ctdb_traverse.sh b/ctdb/tests/simple/80_ctdb_traverse.sh
new file mode 100755 (executable)
index 0000000..65a991a
--- /dev/null
@@ -0,0 +1,73 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+Test CTDB cluster wide traverse code.
+
+Prerequisites:
+
+* An active CTDB cluster with at least 2 active nodes.
+
+Steps:
+
+1. Create a test database
+2. Add records on different nodes
+3. Run traverse
+
+Expected results:
+
+* All records are retrieved.
+
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+ctdb_test_init "$@"
+
+set -e
+
+cluster_is_healthy
+
+# Reset configuration
+ctdb_restart_when_done
+
+# The number of lines printed by "listnodes" is used as the node count
+try_command_on_node 0 "$CTDB listnodes"
+num_nodes=$(echo "$out" | wc -l)
+
+num_records=1000
+
+TESTDB="traverse_test.tdb"
+
+echo "create test database $TESTDB"
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb attach $TESTDB
+
+echo "wipe test database $TESTDB"
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb wipedb $TESTDB
+
+# Write the records round-robin across the nodes so that the traverse
+# below has to collect data written on every node.
+echo "Add $num_records records to database"
+i=0
+while [ "$i" -lt "$num_records" ]; do
+       key=$(printf "key-%04x" $i)
+       value="value-$i"
+
+       # $(( )) replaces the deprecated $[ ] arithmetic expansion
+       n=$(( i % num_nodes ))
+       try_command_on_node -q $n $CTDB_TEST_WRAPPER ctdb writekey $TESTDB $key $value
+
+       i=$(( i + 1 ))
+done
+
+echo "Start a traverse and collect records"
+try_command_on_node -q 0 $CTDB_TEST_WRAPPER ctdb catdb $TESTDB
+
+# The record count is taken from the second space-separated field of
+# the last line of the catdb output.
+num_read=$(echo "$out" | tail -n 1 | cut -d\  -f2)
+if [ "$num_read" -eq "$num_records" ]; then
+       echo "GOOD: All $num_records records retrieved"
+       status=0
+else
+       echo "BAD: Only $num_read/$num_records records retrieved"
+       status=1
+fi
+
+exit $status
diff --git a/ctdb/tests/simple/99_daemons_shutdown.sh b/ctdb/tests/simple/99_daemons_shutdown.sh
new file mode 100755 (executable)
index 0000000..3583828
--- /dev/null
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+test_info()
+{
+    cat <<EOF
+If we running local daemons and TEST_CLEANUP is true then shutdown the daemons.
+
+No error if ctdbd is not already running on the cluster.
+
+Prerequisites:
+
+* Nodes must be accessible via 'onnode'.
+EOF
+}
+
+. "${TEST_SCRIPTS_DIR}/integration.bash"
+
+# Do not call ctdb_test_init() here.  It will setup ctdb_test_exit()
+# to run and that will find the daemons missing and restart them!
+
+if [ -n "$TEST_LOCAL_DAEMONS" ] && $TEST_CLEANUP ; then
+    daemons_stop
+fi
diff --git a/ctdb/tests/simple/README b/ctdb/tests/simple/README
new file mode 100644 (file)
index 0000000..3ac738d
--- /dev/null
@@ -0,0 +1,2 @@
+Simple integration tests.  These can be run against a pool of CTDB
+daemons running on the local machine - aka "local daemons".
diff --git a/ctdb/tests/src/ctdb_bench.c b/ctdb/tests/src/ctdb_bench.c
new file mode 100644 (file)
index 0000000..3323589
--- /dev/null
@@ -0,0 +1,262 @@
+/* 
+   simple ctdb benchmark
+
+   Copyright (C) Andrew Tridgell  2006
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "popt.h"
+#include "cmdline.h"
+#include "ctdb_client.h"
+#include "ctdb_private.h"
+
+#include <sys/time.h>
+#include <time.h>
+
+static struct timeval tp1,tp2;
+
+static void start_timer(void)
+{
+       gettimeofday(&tp1,NULL);
+}
+
+static double end_timer(void)
+{
+       gettimeofday(&tp2,NULL);
+       return (tp2.tv_sec + (tp2.tv_usec*1.0e-6)) - 
+               (tp1.tv_sec + (tp1.tv_usec*1.0e-6));
+}
+
+
+static int timelimit = 10;
+static int num_records = 10;
+static int num_nodes;
+
+enum my_functions {FUNC_INCR=1, FUNC_FETCH=2};
+
+/*
+  ctdb call function to increment an integer
+
+  If the record is empty, a new 32-bit counter initialised to 0 is
+  allocated on the call context and used as the new data; otherwise
+  the existing record data is reused as the new data.  In both cases
+  the first 32 bits of the new data are then incremented by one.
+
+  Returns 0 on success, CTDB_ERR_NOMEM if an allocation fails.
+
+  NOTE(review): a non-empty record shorter than 4 bytes is not
+  rejected here -- confirm callers never store one.
+*/
+static int incr_func(struct ctdb_call_info *call)
+{
+       if (call->record_data.dsize == 0) {
+               call->new_data = talloc(call, TDB_DATA);
+               if (call->new_data == NULL) {
+                       return CTDB_ERR_NOMEM;
+               }
+               call->new_data->dptr = talloc_size(call, sizeof(uint32_t));
+               if (call->new_data->dptr == NULL) {
+                       /* never dereference a failed allocation below */
+                       talloc_free(call->new_data);
+                       call->new_data = NULL;
+                       return CTDB_ERR_NOMEM;
+               }
+               call->new_data->dsize = sizeof(uint32_t);
+               *(uint32_t *)call->new_data->dptr = 0;
+       } else {
+               call->new_data = &call->record_data;
+       }
+       (*(uint32_t *)call->new_data->dptr)++;
+       return 0;
+}
+
+/*
+  ctdb call function to fetch a record
+*/
+static int fetch_func(struct ctdb_call_info *call)
+{
+       call->reply_data = &call->record_data;
+       return 0;
+}
+
+
+static int msg_count;
+static int msg_plus, msg_minus;
+
+/*
+  handler for messages in bench_ring()
+
+  The payload is an int giving the direction (+1 or -1) the message
+  travels around the ring.  Each received message bumps the counter
+  passed as private_data, is forwarded unchanged to the neighbouring
+  node in that direction, and is tallied in msg_plus or msg_minus.
+*/
+static void ring_message_handler(struct ctdb_context *ctdb, uint64_t srvid, 
+                                TDB_DATA data, void *private_data)
+{
+       int incr;
+       int *count = (int *)private_data;
+       int dest;
+
+       /* ignore messages too short to carry the direction */
+       if (data.dptr == NULL || data.dsize < sizeof(int)) {
+               return;
+       }
+       incr = *(int *)data.dptr;
+
+       (*count)++;
+       /* adding num_nodes keeps the operand non-negative for incr == -1 */
+       dest = (ctdb_get_pnn(ctdb) + num_nodes + incr) % num_nodes;
+       ctdb_client_send_message(ctdb, dest, srvid, data);
+       if (incr == 1) {
+               msg_plus++;
+       } else {
+               msg_minus++;
+       }
+}
+
+
+static void send_start_messages(struct ctdb_context *ctdb, int incr)
+{
+       /* two messages are injected into the ring, moving
+          in opposite directions */
+       int dest;
+       TDB_DATA data;
+               
+       data.dptr = (uint8_t *)&incr;
+       data.dsize = sizeof(incr);
+
+       dest = (ctdb_get_pnn(ctdb) + num_nodes + incr) % num_nodes;
+       ctdb_client_send_message(ctdb, dest, 0, data);
+}
+
+static void each_second(struct event_context *ev, struct timed_event *te, 
+                                        struct timeval t, void *private_data)
+{
+       struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+
+       /* we kickstart the ring into action by inserting messages from node
+          with pnn 0.
+          it may happen that some other node does not yet have ctdb_bench
+          running in which case the ring is broken and the messages are lost.
+          if so, once every second try again to restart the ring
+       */
+       if (msg_plus == 0) {
+//             printf("no messages recevied, try again to kickstart the ring in forward direction...\n");
+               send_start_messages(ctdb, 1);
+       }
+       if (msg_minus == 0) {
+//             printf("no messages recevied, try again to kickstart the ring in reverse direction...\n");
+               send_start_messages(ctdb, -1);
+       }
+       event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(1, 0), each_second, ctdb);
+}
+
+static void dummy_event(struct event_context *ev, struct timed_event *te, 
+                                        struct timeval t, void *private_data)
+{
+       struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+       event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(1, 0), dummy_event, ctdb);
+}
+
+/*
+  benchmark sending messages in a ring around the nodes
+*/
+static void bench_ring(struct ctdb_context *ctdb, struct event_context *ev)
+{
+       int pnn=ctdb_get_pnn(ctdb);
+
+       if (pnn == 0) {
+               event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(1, 0), each_second, ctdb);
+       } else {
+               event_add_timed(ctdb->ev, ctdb, timeval_current_ofs(1, 0), dummy_event, ctdb);
+       }
+
+       start_timer();
+       while (end_timer() < timelimit) {
+               if (pnn == 0 && msg_count % 10000 == 0 && end_timer() > 0) {
+                       printf("Ring: %.2f msgs/sec (+ve=%d -ve=%d)\r", 
+                              msg_count/end_timer(), msg_plus, msg_minus);
+                       fflush(stdout);
+               }
+               event_loop_once(ev);
+       }
+
+       printf("Ring: %.2f msgs/sec (+ve=%d -ve=%d)\n", 
+              msg_count/end_timer(), msg_plus, msg_minus);
+}
+
+/*
+  main program
+
+  Parses the command line, connects to the local ctdb daemon, attaches
+  to "test.tdb", registers the call functions and the ring message
+  handler, waits until the recovery mode reads 0 and then runs the
+  ring benchmark.
+
+  Returns 0 after the benchmark has run; exits with status 1 on an
+  invalid option, a missing node count, or a failure to connect,
+  attach or register the message handler.
+*/
+int main(int argc, const char *argv[])
+{
+       struct ctdb_context *ctdb;
+       struct ctdb_db_context *ctdb_db;
+
+       struct poptOption popt_options[] = {
+               POPT_AUTOHELP
+               POPT_CTDB_CMDLINE
+               { "timelimit", 't', POPT_ARG_INT, &timelimit, 0, "timelimit", "integer" },
+               { "num-records", 'r', POPT_ARG_INT, &num_records, 0, "num_records", "integer" },
+               { NULL, 'n', POPT_ARG_INT, &num_nodes, 0, "num_nodes", "integer" },
+               POPT_TABLEEND
+       };
+       int opt;
+       const char **extra_argv;
+       int extra_argc = 0;
+       int ret;
+       poptContext pc;
+       struct event_context *ev;
+
+       pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+       /* no option has a non-zero val, so any return other than -1 is an error */
+       while ((opt = poptGetNextOpt(pc)) != -1) {
+               switch (opt) {
+               default:
+                       fprintf(stderr, "Invalid option %s: %s\n", 
+                               poptBadOption(pc, 0), poptStrerror(opt));
+                       exit(1);
+               }
+       }
+
+       /* setup the remaining options for the main program to use */
+       extra_argv = poptGetArgs(pc);
+       if (extra_argv) {
+               extra_argv++;
+               while (extra_argv[extra_argc]) extra_argc++;
+       }
+
+       /* the ring arithmetic takes values modulo num_nodes, so 0 is unusable */
+       if (num_nodes == 0) {
+               printf("You must specify the number of nodes\n");
+               exit(1);
+       }
+
+       ev = event_context_init(NULL);
+
+       /* initialise ctdb */
+       ctdb = ctdb_cmdline_client(ev, timeval_current_ofs(3, 0));
+       if (ctdb == NULL) {
+               exit(1);
+       }
+
+       /* attach to a specific database */
+       ctdb_db = ctdb_attach(ctdb, timeval_current_ofs(2, 0), "test.tdb",
+                             false, 0);
+       if (!ctdb_db) {
+               printf("ctdb_attach failed - %s\n", ctdb_errstr(ctdb));
+               exit(1);
+       }
+
+       /* setup a ctdb call function */
+       ret = ctdb_set_call(ctdb_db, incr_func,  FUNC_INCR);
+       if (ret != 0) {
+               DEBUG(DEBUG_DEBUG,("ctdb_set_call() failed, ignoring return code %d\n", ret));
+       }
+       ret = ctdb_set_call(ctdb_db, fetch_func, FUNC_FETCH);
+       if (ret != 0) {
+               DEBUG(DEBUG_DEBUG,("ctdb_set_call() failed, ignoring return code %d\n", ret));
+       }
+
+       /* without the handler no ring message is ever counted, so report
+          the failure instead of silently returning success */
+       if (ctdb_client_set_message_handler(ctdb, 0, ring_message_handler,&msg_count)) {
+               printf("Failed to set up message handler\n");
+               exit(1);
+       }
+
+       printf("Waiting for cluster\n");
+       while (1) {
+               /* recmode keeps its initial value of 1 unless the control
+                  call overwrites it, so the loop continues until 0 is read */
+               uint32_t recmode=1;
+               ctdb_ctrl_getrecmode(ctdb, ctdb, timeval_zero(), CTDB_CURRENT_NODE, &recmode);
+               if (recmode == 0) break;
+               event_loop_once(ev);
+       }
+
+       bench_ring(ctdb, ev);
+
+       return 0;
+}
diff --git a/ctdb/tests/src/ctdb_fetch.c b/ctdb/tests/src/ctdb_fetch.c
new file mode 100644 (file)
index 0000000..b900efa
--- /dev/null
@@ -0,0 +1,278 @@
+/* 
+   simple ctdb benchmark
+
+   Copyright (C) Andrew Tridgell  2006
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "popt.h"
+#include "cmdline.h"
+
+#include <sys/time.h>
+#include <time.h>
+
+static struct timeval tp1,tp2;
+
+static void start_timer(void)
+{
+       gettimeofday(&tp1,NULL);
+}
+
+static double end_timer(void)
+{
+       gettimeofday(&tp2,NULL);
+       return (tp2.tv_sec + (tp2.tv_usec*1.0e-6)) - 
+               (tp1.tv_sec + (tp1.tv_usec*1.0e-6));
+}
+
+
+static int timelimit = 10;
+static int num_records = 10;
+static int num_nodes;
+static int msg_count;
+
+#define TESTKEY "testkey"
+
+/*
+  fetch a record
+  store a expanded record
+  send a message to next node to tell it to do the same
+*/
+static void bench_fetch_1node(struct ctdb_context *ctdb)
+{
+       TDB_DATA key, data, nulldata;
+       struct ctdb_db_context *ctdb_db;
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       int dest, ret;
+       struct ctdb_record_handle *h;
+
+       key.dptr = discard_const(TESTKEY);
+       key.dsize = strlen(TESTKEY);
+
+       ctdb_db = ctdb_db_handle(ctdb, "test.tdb");
+
+       h = ctdb_fetch_lock(ctdb_db, tmp_ctx, key, &data);
+       if (h == NULL) {
+               printf("Failed to fetch record '%s' on node %d\n", 
+                      (const char *)key.dptr, ctdb_get_pnn(ctdb));
+               talloc_free(tmp_ctx);
+               return;
+       }
+
+       if (data.dsize > 1000) {
+               data.dsize = 0;
+       }
+
+       if (data.dsize == 0) {
+               data.dptr = (uint8_t *)talloc_asprintf(tmp_ctx, "Test data\n");
+       }
+       data.dptr = (uint8_t *)talloc_asprintf_append((char *)data.dptr, 
+                                                     "msg_count=%d on node %d\n",
+                                                     msg_count, ctdb_get_pnn(ctdb));
+       if (data.dptr == NULL) {
+               printf("Failed to create record\n");
+               talloc_free(tmp_ctx);
+               return;
+       }
+       data.dsize = strlen((const char *)data.dptr)+1;
+
+       ret = ctdb_record_store(h, data);
+       talloc_free(h);
+       if (ret != 0) {
+               printf("Failed to store record\n");
+       }
+
+       talloc_free(tmp_ctx);
+
+       /* tell the next node to do the same */
+       nulldata.dptr = NULL;
+       nulldata.dsize = 0;
+
+       dest = (ctdb_get_pnn(ctdb) + 1) % num_nodes;
+       ctdb_client_send_message(ctdb, dest, 0, nulldata);
+}
+
+/*
+  handler for messages in bench_fetch()
+*/
+static void message_handler(struct ctdb_context *ctdb, uint64_t srvid, 
+                           TDB_DATA data, void *private_data)
+{
+       msg_count++;
+       bench_fetch_1node(ctdb);
+}
+
+
+/*
+ * timeout handler - noop
+ */
+static void timeout_handler(struct event_context *ev, struct timed_event *timer,
+                           struct timeval curtime, void *private_data)
+{
+       return;
+}
+
+/*
+  benchmark the following:
+
+  fetch a record
+  store a expanded record
+  send a message to next node to tell it to do the same
+
+*/
+static void bench_fetch(struct ctdb_context *ctdb, struct event_context *ev)
+{
+       int pnn=ctdb_get_pnn(ctdb);
+
+       if (pnn == num_nodes - 1) {
+               bench_fetch_1node(ctdb);
+       }
+       
+       start_timer();
+       event_add_timed(ev, ctdb, timeval_current_ofs(timelimit,0), timeout_handler, NULL);
+
+       while (end_timer() < timelimit) {
+               if (pnn == 0 && msg_count % 100 == 0 && end_timer() > 0) {
+                       printf("Fetch: %.2f msgs/sec\r", msg_count/end_timer());
+                       fflush(stdout);
+               }
+               if (event_loop_once(ev) != 0) {
+                       printf("Event loop failed!\n");
+                       break;
+               }
+       }
+
+       printf("Fetch: %.2f msgs/sec\n", msg_count/end_timer());
+}
+
+/*
+  handler for reconfigure message
+*/
+static void reconfigure_handler(struct ctdb_context *ctdb, uint64_t srvid, 
+                               TDB_DATA data, void *private_data)
+{
+       int *ready = (int *)private_data;
+       *ready = 1;
+}
+
+/*
+  main program
+*/
+int main(int argc, const char *argv[])
+{
+       struct ctdb_context *ctdb;
+       struct ctdb_db_context *ctdb_db;
+
+       struct poptOption popt_options[] = {
+               POPT_AUTOHELP
+               POPT_CTDB_CMDLINE
+               { "timelimit", 't', POPT_ARG_INT, &timelimit, 0, "timelimit", "integer" },
+               { "num-records", 'r', POPT_ARG_INT, &num_records, 0, "num_records", "integer" },
+               { NULL, 'n', POPT_ARG_INT, &num_nodes, 0, "num_nodes", "integer" },
+               POPT_TABLEEND
+       };
+       int opt;
+       const char **extra_argv;
+       int extra_argc = 0;
+       poptContext pc;
+       struct event_context *ev;
+       TDB_DATA key, data;
+       struct ctdb_record_handle *h;
+       int cluster_ready=0;
+
+       pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+       while ((opt = poptGetNextOpt(pc)) != -1) {
+               switch (opt) {
+               default:
+                       fprintf(stderr, "Invalid option %s: %s\n", 
+                               poptBadOption(pc, 0), poptStrerror(opt));
+                       exit(1);
+               }
+       }
+
+       /* talloc_enable_leak_report_full(); */
+
+       /* setup the remaining options for the main program to use */
+       extra_argv = poptGetArgs(pc);
+       if (extra_argv) {
+               extra_argv++;
+               while (extra_argv[extra_argc]) extra_argc++;
+       }
+
+       if (num_nodes == 0) {
+               printf("You must specify the number of nodes\n");
+               exit(1);
+       }
+
+       ev = event_context_init(NULL);
+       tevent_loop_allow_nesting(ev);
+
+       ctdb = ctdb_cmdline_client(ev, timeval_current_ofs(3, 0));
+
+       if (ctdb == NULL) {
+               printf("failed to connect to ctdb daemon.\n");
+               exit(1);
+       }
+
+       ctdb_client_set_message_handler(ctdb, CTDB_SRVID_RECONFIGURE, reconfigure_handler, 
+                                &cluster_ready);
+
+       /* attach to a specific database */
+       ctdb_db = ctdb_attach(ctdb, timeval_current_ofs(2, 0), "test.tdb",
+                             false, 0);
+       if (!ctdb_db) {
+               printf("ctdb_attach failed - %s\n", ctdb_errstr(ctdb));
+               exit(1);
+       }
+
+       ctdb_client_set_message_handler(ctdb, 0, message_handler, &msg_count);
+
+       printf("Waiting for cluster\n");
+       while (1) {
+               uint32_t recmode=1;
+               ctdb_ctrl_getrecmode(ctdb, ctdb, timeval_zero(), CTDB_CURRENT_NODE, &recmode);
+               if (recmode == 0) break;
+               event_loop_once(ev);
+       }
+
+       /* This test has a race condition. If CTDB receives the message from previous
+        * node, before this node has registered for that message, this node will never
+        * receive that message and will block on receive. Sleeping for some time will
+        * hopefully ensure that the test program on all the nodes register for messages.
+        */
+       printf("Sleeping for %d seconds\n", num_nodes);
+       sleep(num_nodes);
+       bench_fetch(ctdb, ev);
+
+       key.dptr = discard_const(TESTKEY);
+       key.dsize = strlen(TESTKEY);
+
+       printf("Fetching final record\n");
+
+       h = ctdb_fetch_lock(ctdb_db, ctdb, key, &data);
+
+       if (h == NULL) {
+               printf("Failed to fetch record '%s' on node %d\n", 
+                      (const char *)key.dptr, ctdb_get_pnn(ctdb));
+               exit(1);
+       }
+
+       printf("DATA:\n%s\n", (char *)data.dptr);
+
+       return 0;
+}
diff --git a/ctdb/tests/src/ctdb_fetch_one.c b/ctdb/tests/src/ctdb_fetch_one.c
new file mode 100644 (file)
index 0000000..ba0e183
--- /dev/null
@@ -0,0 +1,145 @@
+/* 
+   simple ctdb benchmark
+   This test just fetch_locks a record and releases it in a loop.
+
+   Copyright (C) Ronnie Sahlberg 2009
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "popt.h"
+#include "cmdline.h"
+
+#include <sys/time.h>
+#include <time.h>
+
+static int timelimit = 10;
+static int lock_count = 0;
+
+static struct ctdb_db_context *ctdb_db;
+
+#define TESTKEY "testkey"
+
+
+/*
+  SIGALRM handler, fired once per second
+
+  Prints the number of locks counted since the previous alarm, resets
+  that counter and decrements timelimit.  When timelimit reaches zero
+  the process exits with status 0; otherwise another one-second alarm
+  is armed.
+
+  NOTE(review): printf() and exit() are not async-signal-safe per
+  POSIX, and lock_count/timelimit are plain ints shared with the main
+  loop -- acceptable for a benchmark tool, but confirm before reusing
+  this pattern elsewhere.
+*/
+static void alarm_handler(int sig)
+{
+       printf("Locks:%d\n", lock_count);
+       lock_count=0;
+
+       timelimit--;
+       if (timelimit <= 0) {
+               exit(0);
+       }
+       alarm(1);
+}
+
+/*
+       Just try locking/unlocking the same record over and over
+*/
+static void bench_fetch_one_loop(struct ctdb_context *ctdb, struct event_context *ev)
+{
+       TDB_DATA key, data;
+
+       key.dptr = discard_const(TESTKEY);
+       key.dsize = strlen(TESTKEY);
+
+
+       while (1) {
+               TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+               struct ctdb_record_handle *h;
+
+               h = ctdb_fetch_lock(ctdb_db, tmp_ctx, key, &data);
+               if (h == NULL) {
+                       printf("Failed to fetch record '%s' on node %d\n", 
+                               (const char *)key.dptr, ctdb_get_pnn(ctdb));
+                       talloc_free(tmp_ctx);
+                       continue;
+               }
+
+               talloc_free(tmp_ctx);
+               lock_count++;
+       }
+}
+
+/*
+  main program
+*/
+int main(int argc, const char *argv[])
+{
+       struct ctdb_context *ctdb;
+
+       struct poptOption popt_options[] = {
+               POPT_AUTOHELP
+               POPT_CTDB_CMDLINE
+               { "timelimit", 't', POPT_ARG_INT, &timelimit, 0, "timelimit", "integer" },
+               POPT_TABLEEND
+       };
+       int opt;
+       const char **extra_argv;
+       int extra_argc = 0;
+       poptContext pc;
+       struct event_context *ev;
+
+       pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+       while ((opt = poptGetNextOpt(pc)) != -1) {
+               switch (opt) {
+               default:
+                       fprintf(stderr, "Invalid option %s: %s\n", 
+                               poptBadOption(pc, 0), poptStrerror(opt));
+                       exit(1);
+               }
+       }
+
+       /* setup the remaining options for the main program to use */
+       extra_argv = poptGetArgs(pc);
+       if (extra_argv) {
+               extra_argv++;
+               while (extra_argv[extra_argc]) extra_argc++;
+       }
+
+       ev = event_context_init(NULL);
+
+       ctdb = ctdb_cmdline_client(ev, timeval_current_ofs(3, 0));
+
+       if (ctdb == NULL) {
+               printf("failed to connect to ctdb daemon.\n");
+               exit(1);
+       }
+
+       /* attach to a specific database */
+       ctdb_db = ctdb_attach(ctdb, timeval_current_ofs(2, 0), "test.tdb",
+                             false, 0);
+       if (!ctdb_db) {
+               printf("ctdb_attach failed - %s\n", ctdb_errstr(ctdb));
+               exit(1);
+       }
+
+       printf("Waiting for cluster\n");
+       while (1) {
+               uint32_t recmode=1;
+               ctdb_ctrl_getrecmode(ctdb, ctdb, timeval_zero(), CTDB_CURRENT_NODE, &recmode);
+               if (recmode == 0) break;
+               event_loop_once(ev);
+       }
+
+       signal(SIGALRM, alarm_handler);
+       alarm(1);
+
+       bench_fetch_one_loop(ctdb, ev);
+
+       return 0;
+}
diff --git a/ctdb/tests/src/ctdb_fetch_readonly_loop.c b/ctdb/tests/src/ctdb_fetch_readonly_loop.c
new file mode 100644 (file)
index 0000000..5944fb7
--- /dev/null
@@ -0,0 +1,145 @@
+/* 
+   simple ctdb test tool
+   This test takes a read-only fetch lock on a record and releases it, in a loop.
+
+   Copyright (C) Ronnie Sahlberg 2009
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "system/time.h"
+#include "popt.h"
+#include "cmdline.h"
+#include "ctdb_private.h"
+
+static struct ctdb_db_context *ctdb_db;
+
+const char *TESTKEY = "testkey";
+static int count;
+
+/*
+       Take a read-only fetch lock on the TESTKEY record once, then drop it.
+
+       Exits with status 10 when the lock cannot be obtained.  Whenever the
+       wall-clock second differs from the one seen on the previous call, the
+       number of locks taken since the last report is printed.
+       "ev" is accepted but not used in this function.
+*/
+static void fetch_lock_once(struct ctdb_context *ctdb, struct event_context *ev)
+{
+       static time_t prev_sec = 0;
+       static int reported = 0;
+       TALLOC_CTX *scratch = talloc_new(ctdb);
+       struct ctdb_record_handle *handle;
+       TDB_DATA key, data;
+       time_t now;
+
+       key.dsize = strlen(TESTKEY);
+       key.dptr = discard_const(TESTKEY);
+
+       handle = ctdb_fetch_readonly_lock(ctdb_db, scratch, key, &data, true);
+       if (handle == NULL) {
+               printf("Failed to fetch record '%s' on node %d\n",
+                       (const char *)key.dptr, ctdb_get_pnn(ctdb));
+               talloc_free(scratch);
+               exit(10);
+       }
+
+       count++;
+
+       /* report the per-second rate once the second has rolled over */
+       now = time(NULL);
+       if (prev_sec != 0 && prev_sec != now) {
+               printf("count : %d\n", count - reported);
+               reported = count;
+       }
+       prev_sec = now;
+
+       /* the handle was requested with "scratch" as its memory context,
+          so presumably this also releases the record */
+       talloc_free(scratch);
+}
+
+/*
+  main program
+
+  Parses the command line (-r/--record overrides TESTKEY), connects to the
+  ctdb daemon, prints the record name and its lmaster, attaches to
+  "test.tdb", waits until the recovery mode reads back as 0 and then calls
+  fetch_lock_once() forever.
+*/
+int main(int argc, const char *argv[])
+{
+       struct ctdb_context *ctdb;
+       struct event_context *ev;
+       TDB_DATA key;
+
+       struct poptOption popt_options[] = {
+               POPT_AUTOHELP
+               POPT_CTDB_CMDLINE
+               { "record",      'r', POPT_ARG_STRING, &TESTKEY, 0, "record", "string" },
+               POPT_TABLEEND
+       };
+       poptContext pc;
+       const char **extra_argv;
+       int extra_argc = 0;
+       int opt, ret;
+
+       pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+       /* no option has a non-zero "val", so anything returned here other
+          than -1 is a parse error */
+       for (opt = poptGetNextOpt(pc); opt != -1; opt = poptGetNextOpt(pc)) {
+               fprintf(stderr, "Invalid option %s: %s\n",
+                       poptBadOption(pc, 0), poptStrerror(opt));
+               exit(1);
+       }
+
+       /* setup the remaining options for the main program to use */
+       extra_argv = poptGetArgs(pc);
+       if (extra_argv != NULL) {
+               extra_argv++;
+               for (; extra_argv[extra_argc] != NULL; extra_argc++) {
+                       /* just counting */
+               }
+       }
+
+       ev = event_context_init(NULL);
+
+       ctdb = ctdb_cmdline_client(ev, timeval_current_ofs(5, 0));
+       if (ctdb == NULL) {
+               exit(1);
+       }
+
+       key.dsize = strlen(TESTKEY);
+       key.dptr  = discard_const(TESTKEY);
+
+       ret = ctdb_ctrl_getvnnmap(ctdb, timeval_zero(), CTDB_CURRENT_NODE, ctdb, &ctdb->vnn_map);
+       if (ret != 0) {
+               printf("failed to get vnnmap\n");
+               exit(10);
+       }
+       printf("Record:%s\n", TESTKEY);
+       printf("Lmaster : %d\n", ctdb_lmaster(ctdb, &key));
+
+       /* attach to a specific database */
+       ctdb_db = ctdb_attach(ctdb, timeval_current_ofs(5, 0), "test.tdb", false, 0);
+       if (ctdb_db == NULL) {
+               printf("ctdb_attach failed - %s\n", ctdb_errstr(ctdb));
+               exit(1);
+       }
+
+       printf("Waiting for cluster\n");
+       for (;;) {
+               /* stays 1 if the control does not fill it in, so we retry */
+               uint32_t recmode = 1;
+
+               ctdb_ctrl_getrecmode(ctdb, ctdb, timeval_zero(), CTDB_CURRENT_NODE, &recmode);
+               if (recmode != 0) {
+                       event_loop_once(ev);
+                       continue;
+               }
+               break;
+       }
+
+       for (;;) {
+               fetch_lock_once(ctdb, ev);
+       }
+
+       return 0;
+}
diff --git a/ctdb/tests/src/ctdb_fetch_readonly_once.c b/ctdb/tests/src/ctdb_fetch_readonly_once.c
new file mode 100644 (file)
index 0000000..5dc64e0
--- /dev/null
@@ -0,0 +1,117 @@
+/* 
+   simple ctdb test tool
+   This test takes a read-only fetch lock on a record and releases it once.
+
+   Copyright (C) Ronnie Sahlberg 2009
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "popt.h"
+#include <poll.h>
+
+const char *TESTKEY = "testkey";
+
+/*
+       Take a read-only fetch lock on "key" in "ctdb_db" once and free the
+       returned handle again.  Exits with status 1 if no handle is returned.
+*/
+static void fetch_readonly_once(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, TDB_DATA key)
+{
+       struct ctdb_record_handle *handle;
+       TDB_DATA data;
+
+       printf("Trying to readonly fetch lock the record ...\n");
+
+       handle = ctdb_fetch_readonly_lock(ctdb_db, ctdb, key, &data, 1);
+       if (handle != NULL) {
+               talloc_free(handle);
+               printf("Record released.\n");
+               return;
+       }
+
+       fprintf(stderr, "Failed to get readonly lock\n");
+       exit(1);
+}
+
+/*
+  main program
+
+  Parses the command line (-r/--record overrides TESTKEY), connects to the
+  ctdb daemon, attaches to "test.tdb", waits until the recovery mode reads
+  back as 0 and then performs a single read-only fetch lock.
+
+  NOTE(review): the option table here has no POPT_CTDB_CMDLINE entry even
+  though ctdb_cmdline_client() is used - confirm this is intentional.
+*/
+int main(int argc, const char *argv[])
+{
+       struct ctdb_context *ctdb;
+       struct ctdb_db_context *ctdb_db;
+       struct event_context *ev;
+
+       TDB_DATA key;
+
+       struct poptOption popt_options[] = {
+               POPT_AUTOHELP
+               { "record",      'r', POPT_ARG_STRING, &TESTKEY, 0, "record", "string" },
+               POPT_TABLEEND
+       };
+       poptContext pc;
+       const char **extra_argv;
+       int extra_argc = 0;
+       int opt;
+
+       pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+       /* no option has a non-zero "val", so anything returned here other
+          than -1 is a parse error */
+       for (opt = poptGetNextOpt(pc); opt != -1; opt = poptGetNextOpt(pc)) {
+               fprintf(stderr, "Invalid option %s: %s\n",
+                       poptBadOption(pc, 0), poptStrerror(opt));
+               exit(1);
+       }
+
+       /* setup the remaining options for the main program to use */
+       extra_argv = poptGetArgs(pc);
+       if (extra_argv != NULL) {
+               extra_argv++;
+               for (; extra_argv[extra_argc] != NULL; extra_argc++) {
+                       /* just counting */
+               }
+       }
+
+       ev = event_context_init(NULL);
+
+       ctdb = ctdb_cmdline_client(ev, timeval_current_ofs(3, 0));
+       if (ctdb == NULL) {
+               printf("failed to connect to ctdb daemon.\n");
+               exit(1);
+       }
+
+       key.dsize = strlen(TESTKEY);
+       key.dptr  = discard_const(TESTKEY);
+
+       /* attach to a specific database */
+       ctdb_db = ctdb_attach(ctdb, timeval_current_ofs(3, 0), "test.tdb",
+                             false, 0);
+       if (ctdb_db == NULL) {
+               fprintf(stderr, "ctdb_attach failed - %s\n", ctdb_errstr(ctdb));
+               exit(10);
+       }
+
+       printf("Waiting for cluster\n");
+       for (;;) {
+               /* stays 1 if the control does not fill it in, so we retry */
+               uint32_t recmode = 1;
+
+               ctdb_ctrl_getrecmode(ctdb, ctdb, timeval_zero(), CTDB_CURRENT_NODE, &recmode);
+               if (recmode != 0) {
+                       event_loop_once(ev);
+                       continue;
+               }
+               break;
+       }
+
+       fetch_readonly_once(ctdb, ctdb_db, key);
+
+       return 0;
+}
diff --git a/ctdb/tests/src/ctdb_functest.c b/ctdb/tests/src/ctdb_functest.c
new file mode 100644 (file)
index 0000000..16ca4fd
--- /dev/null
@@ -0,0 +1,189 @@
+/* 
+   Tests for tools/ctdb.c and CTDB client stubs
+
+   Copyright (C) Martin Schwenke 2011
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#define CTDB_TEST_OVERRIDE_MAIN
+#include "ctdb_test.c"
+
+/* Run the read_nodemap stub on a zeroed ctdb context and print the result. */
+static void test_read_nodemap(void)
+{
+       struct ctdb_context *scratch_ctdb;
+
+       scratch_ctdb = talloc_zero(NULL, struct ctdb_context);
+       ctdb_test_stubs_read_nodemap(scratch_ctdb);
+       ctdb_test_stubs_print_nodemap(scratch_ctdb);
+       talloc_free(scratch_ctdb);
+}
+
+/* Run the read_ifaces stub on a zeroed ctdb context and print the result. */
+static void test_read_ifaces(void)
+{
+       struct ctdb_context *scratch_ctdb;
+
+       scratch_ctdb = talloc_zero(NULL, struct ctdb_context);
+       ctdb_test_stubs_read_ifaces(scratch_ctdb);
+       ctdb_test_stubs_print_ifaces(scratch_ctdb);
+       talloc_free(scratch_ctdb);
+}
+
+/* Run the read_vnnmap stub on a zeroed ctdb context and print the result. */
+static void test_read_vnnmap(void)
+{
+       struct ctdb_context *scratch_ctdb;
+
+       scratch_ctdb = talloc_zero(NULL, struct ctdb_context);
+       ctdb_test_stubs_read_vnnmap(scratch_ctdb);
+       ctdb_test_stubs_print_vnnmap(scratch_ctdb);
+       talloc_free(scratch_ctdb);
+}
+
+/*
+  Run the fake_setup stub on a zeroed ctdb context, then print whichever of
+  the node map, interface list and vnn map it populated.  Each section gets
+  a heading, and sections are separated by one blank line.
+*/
+static void test_fake_setup(void)
+{
+       struct ctdb_context *ctdb;
+       bool printed_any = false;
+
+       ctdb = talloc_zero(NULL, struct ctdb_context);
+       ctdb_test_stubs_fake_setup(ctdb);
+
+       if (ctdb->nodes != NULL) {
+               if (printed_any) {
+                       printf("\n");
+               }
+               printf("NODEMAP\n");
+               ctdb_test_stubs_print_nodemap(ctdb);
+               printed_any = true;
+       }
+
+       if (ctdb->ifaces != NULL) {
+               if (printed_any) {
+                       printf("\n");
+               }
+               printf("IFACES\n");
+               ctdb_test_stubs_print_ifaces(ctdb);
+               printed_any = true;
+       }
+
+       if (ctdb->vnn_map != NULL) {
+               if (printed_any) {
+                       printf("\n");
+               }
+               printf("VNNMAP\n");
+               ctdb_test_stubs_print_vnnmap(ctdb);
+               printed_any = true;
+       }
+
+       talloc_free(ctdb);
+}
+
+/*
+  Translate one of the special CTDB_* destination constants listed below
+  into its printable name.  Any value not in the table yields "PNN".
+*/
+static const char * decode_pnn_mode(uint32_t pnn_mode)
+{
+       static const struct pnn_mode_name {
+               uint32_t mode;
+               const char *name;
+       } pnn_modes[] = {
+               { CTDB_CURRENT_NODE,        "CURRENT_NODE" },
+               { CTDB_BROADCAST_ALL,       "BROADCAST_ALL" },
+               { CTDB_BROADCAST_VNNMAP,    "BROADCAST_VNNMAP" },
+               { CTDB_BROADCAST_CONNECTED, "BROADCAST_CONNECTED" },
+               { CTDB_MULTICAST,           "MULTICAST" },
+       };
+       const struct pnn_mode_name *entry = pnn_modes;
+       const struct pnn_mode_name *end = pnn_modes + ARRAY_SIZE(pnn_modes);
+
+       /* first match wins, in table order */
+       while (entry != end) {
+               if (entry->mode == pnn_mode) {
+                       return entry->name;
+               }
+               entry++;
+       }
+
+       return "PNN";
+}
+
+/*
+  Print the talloc array "nodes" on one "NODES:" line, followed by a
+  "PNN MODE:" line giving the decoded name and numeric value of pnn_mode.
+*/
+static void print_nodes(uint32_t *nodes, uint32_t pnn_mode)
+{
+       const size_t num_nodes = talloc_array_length(nodes);
+       size_t idx = 0;
+
+       printf("NODES:");
+       while (idx < num_nodes) {
+               printf(" %lu", (unsigned long) nodes[idx]);
+               idx++;
+       }
+       printf("\n");
+
+       printf("PNN MODE: %s (%lu)\n",
+              decode_pnn_mode(pnn_mode), (unsigned long) pnn_mode);
+}
+
+/*
+  Exercise parse_nodestring() against the stub node map.
+
+  nodestring_s: node specification to parse; an empty string is passed on
+                as NULL.
+  dd_ok_s:      "yes" or "true" (either one case-insensitively) sets the
+                dd_ok flag handed to parse_nodestring(); anything else
+                clears it.
+
+  On a successful parse the resulting nodes and PNN mode are printed.
+*/
+static void test_parse_nodestring(const char *nodestring_s,
+                                 const char *dd_ok_s)
+{
+       const char *nodestring;
+       bool dd_ok;
+       struct ctdb_context *ctdb;
+       uint32_t *nodes;
+       uint32_t pnn_mode;
+
+       nodestring = strcmp("", nodestring_s) == 0 ? NULL : nodestring_s;
+
+       /* both spellings are matched the same way; previously only "yes"
+          was case-insensitive */
+       dd_ok = (strcasecmp(dd_ok_s, "yes") == 0 ||
+                strcasecmp(dd_ok_s, "true") == 0);
+
+       ctdb  = talloc_zero(NULL, struct ctdb_context);
+
+       ctdb_test_stubs_read_nodemap(ctdb);
+
+       if (parse_nodestring(ctdb, NULL, nodestring, CTDB_CURRENT_NODE, dd_ok,
+                            &nodes, &pnn_mode)) {
+               print_nodes(nodes, pnn_mode);
+               /* nodes is a talloc array (print_nodes asks talloc for its
+                  length); the NULL second argument above is presumably its
+                  parent, so it would not go away with ctdb */
+               talloc_free(nodes);
+       }
+
+       talloc_free(ctdb);
+}
+
+/* Print the one-line usage message to stderr and exit with status 1. */
+static void usage(void)
+{
+       fputs("usage: ctdb_tool_functest <op>\n", stderr);
+       exit(1);
+}
+
+/*
+  Dispatch on the operation named in argv[1].
+
+  The log level defaults to DEBUG_DEBUG and can be overridden through the
+  CTDB_TEST_LOGLEVEL environment variable.  "parse_nodestring" takes two
+  further arguments; every other operation takes none.  Anything else ends
+  in usage().
+*/
+int main(int argc, const char *argv[])
+{
+       const char *loglevel_env = getenv("CTDB_TEST_LOGLEVEL");
+       const char *op = (argc >= 2) ? argv[1] : "";
+       const bool no_extra_args = (argc == 2);
+
+       LogLevel = DEBUG_DEBUG;
+       if (loglevel_env != NULL) {
+               LogLevel = atoi(loglevel_env);
+       }
+
+       if (argc < 2) {
+               usage();
+       }
+
+       if (no_extra_args && strcmp(op, "read_nodemap") == 0) {
+               test_read_nodemap();
+       } else if (no_extra_args && strcmp(op, "read_ifaces") == 0) {
+               test_read_ifaces();
+       } else if (no_extra_args && strcmp(op, "read_vnnmap") == 0) {
+               test_read_vnnmap();
+       } else if (no_extra_args && strcmp(op, "fake_setup") == 0) {
+               test_fake_setup();
+       } else if (argc == 4 && strcmp(op, "parse_nodestring") == 0) {
+               test_parse_nodestring(argv[2], argv[3]);
+       } else {
+               usage();
+       }
+
+       return 0;
+}
diff --git a/ctdb/tests/src/ctdb_lock_tdb.c b/ctdb/tests/src/ctdb_lock_tdb.c
new file mode 100644 (file)
index 0000000..ad2a329
--- /dev/null
@@ -0,0 +1,42 @@
+#include <stdio.h>
+#include <fcntl.h>
+
+#include "includes.h"
+
+const char *tdb_file;
+TDB_CONTEXT *tdb;
+
+/* Set by signal_handler(), polled by main(). */
+static volatile sig_atomic_t got_signal;
+
+/*
+  SIGINT handler: only records that the signal arrived.
+
+  tdb_close() is not among the functions POSIX lists as async-signal-safe,
+  and closing the database from here could pull it out from under a
+  tdb_lockall() still running in main(), so the close is left to main().
+*/
+void signal_handler(int signum)
+{
+       (void)signum;
+       got_signal = 1;
+}
+
+
+/*
+  Open the TDB file named in argv[1], take the all-record lock and hold it
+  until SIGINT arrives (or roughly 999999 seconds pass), then close the
+  database and exit with status 0.  Exits with status 1 on a usage error
+  or if the database cannot be opened or locked.
+*/
+int
+main(int argc, char *argv[])
+{
+       int remaining;
+
+       if (argc != 2) {
+               printf("Usage: %s <tdb file>\n", argv[0]);
+               exit(1);
+       }
+
+       tdb_file = argv[1];
+
+       tdb = tdb_open(tdb_file, 0, 0, O_RDWR, 0);
+       if (tdb == NULL) {
+               fprintf(stderr, "Failed to open TDB file %s\n", tdb_file);
+               exit(1);
+       }
+
+       signal(SIGINT, signal_handler);
+
+       if (tdb_lockall(tdb) != 0) {
+               fprintf(stderr, "Failed to lock database %s\n", tdb_file);
+               tdb_close(tdb);
+               exit(1);
+       }
+
+       /* sleep in one-second slices so that a SIGINT delivered just before
+          a sleep() call is still noticed within a second */
+       for (remaining = 999999; remaining > 0 && !got_signal; remaining--) {
+               sleep(1);
+       }
+
+       tdb_close(tdb);
+
+       return 0;
+}
diff --git a/ctdb/tests/src/ctdb_persistent.c b/ctdb/tests/src/ctdb_persistent.c
new file mode 100644 (file)
index 0000000..0bf92b3
--- /dev/null
@@ -0,0 +1,268 @@
+/* 
+   simple tool to test persistent databases
+
+   Copyright (C) Andrew Tridgell  2006-2007
+   Copyright (c) Ronnie sahlberg  2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "popt.h"
+#include "cmdline.h"
+
+#include <sys/time.h>
+#include <time.h>
+
+static struct timeval tp1,tp2;
+
+/* Record the current time in tp1 as the start of the measured interval. */
+static void start_timer(void)
+{
+       struct timeval *start = &tp1;
+
+       gettimeofday(start, NULL);
+}
+
+/*
+  Record the current time in tp2 and return the seconds elapsed since the
+  time stored in tp1, as a double with microsecond resolution.
+*/
+static double end_timer(void)
+{
+       gettimeofday(&tp2, NULL);
+
+       return (tp2.tv_sec + (tp2.tv_usec*1.0e-6))
+               - (tp1.tv_sec + (tp1.tv_usec*1.0e-6));
+}
+
+/* how long test_store_records() keeps running, in seconds (-t/--timelimit) */
+static int timelimit = 10;
+
+/* this node's PNN, set in main(); also used as our index into the counters */
+static unsigned int pnn;
+
+/* copy of the counter array as last seen by check_counters() */
+static TDB_DATA old_data;
+
+/* cleared by check_counters() on any detected error; decides the verdict
+   printed by main() on node 0 */
+static int success = true;
+
+/*
+  Timed-event callback: print the counters currently held in old_data,
+  prefixed with our pid, then re-register itself to fire one second later.
+  private_data is the ctdb context.
+*/
+static void each_second(struct event_context *ev, struct timed_event *te, 
+                                        struct timeval t, void *private_data)
+{
+       struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+       const uint32_t *snapshot;
+       size_t slots, idx;
+
+       printf("[%4u] Counters: ", getpid());
+
+       snapshot = (uint32_t *)old_data.dptr;
+       slots = old_data.dsize/sizeof(uint32_t);
+       idx = 0;
+       while (idx < slots) {
+               printf("%6u ", snapshot[idx]);
+               idx++;
+       }
+       printf("\n");
+
+       event_add_timed(ev, ctdb, timeval_current_ofs(1, 0), each_second, ctdb);
+}
+
+/*
+  Compare the counter array in "data" with the copy kept in old_data, flag
+  any counter that went backwards (clearing "success"), then store "data"
+  as the new reference copy.
+
+  ctdb: talloc parent for the reference copy.
+  data: record contents, treated as an array of uint32_t counters.
+*/
+static void check_counters(struct ctdb_context *ctdb, TDB_DATA data)
+{
+       unsigned int i;
+       size_t old_n, new_n, cmp_n;
+       uint32_t *counters, *old_counters;
+       unsigned char *tmp_dptr;
+
+       counters     = (uint32_t *)data.dptr;
+       old_counters = (uint32_t *)old_data.dptr;
+
+       /* only compare slots that exist in both arrays, so a record that is
+          shorter than the previous one is never read past its end */
+       old_n = old_data.dsize/sizeof(uint32_t);
+       new_n = data.dsize/sizeof(uint32_t);
+       cmp_n = (old_n < new_n) ? old_n : new_n;
+
+       /* check that all the counters are monotonic increasing */
+       for (i=0; i<cmp_n; i++) {
+               if (counters[i]<old_counters[i]) {
+                       printf("[%4u] ERROR: counters has decreased for node %u  From %u to %u\n", 
+                              getpid(), i, old_counters[i], counters[i]);
+                       success = false;
+               }
+       }
+
+       if (data.dsize == 0) {
+               /* a zero-size realloc would free the buffer and return NULL,
+                  which is not an error; just drop the reference copy */
+               talloc_free(old_data.dptr);
+               old_data.dptr = NULL;
+               old_data.dsize = 0;
+               return;
+       }
+
+       if (old_data.dsize != data.dsize) {
+               tmp_dptr = talloc_realloc_size(ctdb, old_data.dptr, data.dsize);
+               if (tmp_dptr == NULL) {
+                       printf("[%4u] ERROR: talloc_realloc_size failed.\n", getpid());
+                       success = false;
+                       return;
+               }
+               /* record the new size only once the buffer really has it */
+               old_data.dptr = tmp_dptr;
+               old_data.dsize = data.dsize;
+       }
+
+       memcpy(old_data.dptr, data.dptr, data.dsize);
+}
+
+
+
+/*
+  For "timelimit" seconds, repeatedly run a transaction on "persistent.tdb"
+  that fetches the "testkey" record, grows it if needed so it has a
+  uint32_t slot for this node's pnn, increments that slot, stores the
+  record and commits.  On node 0 the counters are also passed to
+  check_counters() after every round.
+
+  Returns early if a transaction cannot be started or memory runs out;
+  exits with status 1 if the fetch or the store fails.  A failed commit is
+  only logged.  "ev" is not used here.
+*/
+static void test_store_records(struct ctdb_context *ctdb, struct event_context *ev)
+{
+       TDB_DATA key;
+       struct ctdb_db_context *ctdb_db;
+
+       ctdb_db = ctdb_db_handle(ctdb, "persistent.tdb");
+
+       key.dptr = discard_const("testkey");
+       key.dsize = strlen((const char *)key.dptr)+1;
+
+       start_timer();
+       while (end_timer() < timelimit) {
+               TDB_DATA data;
+               TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+               struct ctdb_transaction_handle *h;
+               int ret;
+               uint32_t *counters;
+               size_t needed = sizeof(uint32_t) * (pnn+1);
+
+               h = ctdb_transaction_start(ctdb_db, tmp_ctx);
+               if (h == NULL) {
+                       printf("Failed to start transaction on node %d\n",
+                              ctdb_get_pnn(ctdb));
+                       talloc_free(tmp_ctx);
+                       return;
+               }
+
+               ret = ctdb_transaction_fetch(h, tmp_ctx, key, &data);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to fetch record\n"));
+                       exit(1);
+               }
+
+               if (data.dsize < needed) {
+                       unsigned char *grown = talloc_zero_size(tmp_ctx, needed);
+
+                       /* copy only after the allocation is known to have
+                          worked; on failure the NULL check below reports it */
+                       if (grown != NULL) {
+                               if (data.dptr != NULL && data.dsize > 0) {
+                                       memcpy(grown, data.dptr, data.dsize);
+                               }
+                               talloc_free(data.dptr);
+                               data.dsize = needed;
+                       }
+                       data.dptr = grown;
+               }
+
+               if (data.dptr == NULL) {
+                       printf("Failed to realloc array\n");
+                       talloc_free(tmp_ctx);
+                       return;
+               }
+
+               counters = (uint32_t *)data.dptr;
+
+               /* bump our counter */
+               counters[pnn]++;
+
+               ret = ctdb_transaction_store(h, key, data);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to store record\n"));
+                       exit(1);
+               }
+
+               ret = ctdb_transaction_commit(h);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to commit transaction\n"));
+                       //exit(1);
+               }
+
+               /* store the counters and verify that they are sane */
+               if (pnn == 0) {
+                       check_counters(ctdb, data);
+               }
+
+               talloc_free(tmp_ctx);
+       }
+
+}
+
+/*
+  main program
+
+  Options: -t/--timelimit <seconds>, -u/--unsafe-writes (attach with
+  TDB_NOSYNC), plus the common ctdb command line options.
+
+  Connects to the daemon, attaches to the persistent database
+  "persistent.tdb", waits until the recovery mode reads back as 0 and runs
+  test_store_records().  Node 0 also prints the counters once per second
+  and reports the overall verdict; its exit status is 1 if a check failed.
+*/
+int main(int argc, const char *argv[])
+{
+       struct ctdb_context *ctdb;
+       struct ctdb_db_context *ctdb_db;
+       int unsafe_writes = 0;
+       struct poptOption popt_options[] = {
+               POPT_AUTOHELP
+               POPT_CTDB_CMDLINE
+               { "timelimit", 't', POPT_ARG_INT, &timelimit, 0, "timelimit", "integer" },
+               { "unsafe-writes", 'u', POPT_ARG_NONE, &unsafe_writes, 0, "do not use tdb transactions when writing", NULL },
+               POPT_TABLEEND
+       };
+       int opt;
+       const char **extra_argv;
+       int extra_argc = 0;
+       poptContext pc;
+       struct event_context *ev;
+
+       setlinebuf(stdout);
+
+       pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+       while ((opt = poptGetNextOpt(pc)) != -1) {
+               switch (opt) {
+               default:
+                       fprintf(stderr, "Invalid option %s: %s\n", 
+                               poptBadOption(pc, 0), poptStrerror(opt));
+                       exit(1);
+               }
+       }
+
+       /* setup the remaining options for the main program to use */
+       extra_argv = poptGetArgs(pc);
+       if (extra_argv) {
+               extra_argv++;
+               while (extra_argv[extra_argc]) extra_argc++;
+       }
+
+       ev = event_context_init(NULL);
+
+       ctdb = ctdb_cmdline_client(ev, timeval_current_ofs(3, 0));
+       if (ctdb == NULL) {
+               printf("Could not attach to daemon\n");
+               return 1;
+       }
+
+       /* attach to a specific database; only the tdb flags depend on
+          --unsafe-writes */
+       ctdb_db = ctdb_attach(ctdb, timeval_current_ofs(2, 0),
+                             "persistent.tdb", true,
+                             (unsafe_writes == 1) ? TDB_NOSYNC : 0);
+
+       if (!ctdb_db) {
+               printf("ctdb_attach failed - %s\n", ctdb_errstr(ctdb));
+               exit(1);
+       }
+
+       printf("Waiting for cluster\n");
+       while (1) {
+               /* stays 1 if the control does not fill it in, so we retry */
+               uint32_t recmode=1;
+               ctdb_ctrl_getrecmode(ctdb, ctdb, timeval_zero(), CTDB_CURRENT_NODE, &recmode);
+               if (recmode == 0) break;
+               event_loop_once(ev);
+       }
+
+       pnn = ctdb_get_pnn(ctdb);
+       /* timelimit is a signed int, so it is printed with %d */
+       printf("Starting test on node %u. running for %d seconds\n", pnn, timelimit);
+
+       if (pnn == 0) {
+               event_add_timed(ev, ctdb, timeval_current_ofs(1, 0), each_second, ctdb);
+       }
+
+       test_store_records(ctdb, ev);
+
+       if (pnn == 0) {
+               if (success != true) {
+                       printf("The test FAILED\n");
+                       return 1;
+               } else {
+                       printf("SUCCESS!\n");
+               }
+       }
+       return 0;
+}
diff --git a/ctdb/tests/src/ctdb_porting_tests.c b/ctdb/tests/src/ctdb_porting_tests.c
new file mode 100644 (file)
index 0000000..0c43451
--- /dev/null
@@ -0,0 +1,305 @@
+/*
+   Test porting lib (common/system_*.c)
+
+   Copyright (C) Mathieu Parent 2013
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "include/ctdb_private.h"
+#include "system/filesys.h"
+#include "popt.h"
+#include "cmdline.h"
+
+/* State shared between the socket helpers, the forked helper and the tests. */
+static struct {
+       /* path of the unix domain socket; overridable with --socket */
+       const char *socketname;
+       /* not referenced anywhere else in this file */
+       const char *debuglevel;
+       /* pid of the child created by fork_helper() */
+       pid_t helper_pid;
+       /* listening socket created by socket_server_create() */
+       int socket;
+       /* number of tests that passed / that were started */
+       int successcount;
+       int testcount;
+} globals = {
+       .socketname = "/tmp/test.sock"
+};
+
+
+
+/*
+  Socket functions
+*/
+/*
+  create a unix domain socket and bind it
+
+  The socket is bound to globals.socketname, restricted to mode 0700 and
+  owned by the effective uid/gid, and put into listening state.  The
+  descriptor is stored in globals.socket.
+
+  Returns 0 on success, -1 on failure (globals.socket is then -1).
+*/
+static int socket_server_create(void)
+{
+       struct sockaddr_un addr;
+
+       /* a name that does not fit into sun_path including its terminating
+          NUL would be bound truncated/unterminated; refuse it instead */
+       if (strlen(globals.socketname) >= sizeof(addr.sun_path)) {
+               DEBUG(DEBUG_CRIT,("Socket name '%s' is too long\n", globals.socketname));
+               globals.socket = -1;
+               return -1;
+       }
+
+       globals.socket = socket(AF_UNIX, SOCK_STREAM, 0);
+       if (globals.socket == -1) {
+               DEBUG(DEBUG_CRIT,("Unable to create server socket: %s\n", strerror(errno)));
+               return -1;
+       }
+
+       set_close_on_exec(globals.socket);
+       //set_nonblocking(globals.socket);
+
+       memset(&addr, 0, sizeof(addr));
+       addr.sun_family = AF_UNIX;
+       /* addr is zeroed above, so copying at most size-1 bytes keeps the
+          path NUL-terminated */
+       strncpy(addr.sun_path, globals.socketname, sizeof(addr.sun_path) - 1);
+
+       if (bind(globals.socket, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
+               DEBUG(DEBUG_CRIT,("Unable to bind on socket '%s': %s\n", globals.socketname, strerror(errno)));
+               goto failed;
+       }
+
+       if (chown(globals.socketname, geteuid(), getegid()) != 0 ||
+               chmod(globals.socketname, 0700) != 0) {
+               DEBUG(DEBUG_CRIT,("Unable to secure socket '%s': %s\n", globals.socketname, strerror(errno)));
+               goto failed;
+       }
+
+
+       if (listen(globals.socket, 100) != 0) {
+               DEBUG(DEBUG_CRIT,("Unable to listen on socket '%s': %s\n", globals.socketname, strerror(errno)));
+               goto failed;
+       }
+       return 0;
+
+failed:
+       close(globals.socket);
+       globals.socket = -1;
+       return -1;
+}
+
+/*
+  Block in accept() on the listening socket until a peer connects.
+  Returns the connected descriptor (marked close-on-exec), or -1 on error.
+*/
+static int socket_server_wait_peer(void)
+{
+       struct sockaddr_un peer_addr;
+       socklen_t peer_len = sizeof(peer_addr);
+       int peer_fd;
+
+       memset(&peer_addr, 0, sizeof(peer_addr));
+
+       peer_fd = accept(globals.socket, (struct sockaddr *)&peer_addr, &peer_len);
+       if (peer_fd != -1) {
+               //set_nonblocking(peer_fd);
+               set_close_on_exec(peer_fd);
+               return peer_fd;
+       }
+
+       DEBUG(DEBUG_CRIT,("Unable to accept on ctdb socket '%s': %s\n", globals.socketname, strerror(errno)));
+       return -1;
+}
+
+/*
+  Close the listening socket and then remove its file system entry.
+  Returns 0 on success; -1 as soon as either step fails (the unlink is
+  not attempted when the close fails).
+*/
+static int socket_server_close(void)
+{
+       int rc;
+
+       rc = close(globals.socket);
+       if (rc == -1) {
+               DEBUG(DEBUG_CRIT,("Unable to close server socket: %s\n", strerror(errno)));
+               return -1;
+       }
+
+       rc = unlink(globals.socketname);
+       if (rc == -1) {
+               DEBUG(DEBUG_CRIT,("Unable to remove server socket: %s\n", strerror(errno)));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  Create a unix domain stream socket and connect it to globals.socketname.
+  Returns the connected descriptor, or -1 on failure.
+*/
+static int socket_client_connect(void)
+{
+       struct sockaddr_un addr;
+       int client;
+
+       /* a name that does not fit into sun_path including its terminating
+          NUL would be used truncated/unterminated; refuse it instead */
+       if (strlen(globals.socketname) >= sizeof(addr.sun_path)) {
+               DEBUG(DEBUG_CRIT,("Socket name '%s' is too long\n", globals.socketname));
+               return -1;
+       }
+
+       client = socket(AF_UNIX, SOCK_STREAM, 0);
+       if (client == -1) {
+               DEBUG(DEBUG_CRIT,("Unable to create client socket: %s\n", strerror(errno)));
+               return -1;
+       }
+
+       memset(&addr, 0, sizeof(addr));
+       addr.sun_family = AF_UNIX;
+       /* addr is zeroed above, so copying at most size-1 bytes keeps the
+          path NUL-terminated */
+       strncpy(addr.sun_path, globals.socketname, sizeof(addr.sun_path) - 1);
+       if (connect(client, (struct sockaddr *)&addr, sizeof(addr))==-1) {
+               DEBUG(DEBUG_CRIT,("Unable to connect to '%s': %s\n", globals.socketname, strerror(errno)));
+               close(client);
+               return -1;
+       }
+
+       return client;
+}
+
+static int socket_client_write(int client)
+{
+       if (write(client, "\0", 1) == -1) {
+               DEBUG(DEBUG_CRIT,("Unable to write to client socket: %s\n", strerror(errno)));
+               return -1;
+       }
+       return 0;
+}
+
+static int socket_client_close(int client)
+{
+       if (close(client) == -1) {
+               DEBUG(DEBUG_CRIT,("Unable to close client socket: %s\n", strerror(errno)));
+               return -1;
+       }
+       return 0;
+}
+
+/*
+  forked program
+
+  Fork a helper child that connects to the server socket, writes one byte,
+  stays alive for max_rounds seconds and then exits.  The child's pid is
+  stored in globals.helper_pid.
+
+  Returns 0 in the parent on success, -1 if fork() fails.  The child never
+  returns from this function; it exits with status 1 if it cannot connect
+  or write, and 0 otherwise.
+*/
+static int fork_helper(void)
+{
+       pid_t pid;
+       int i, client, max_rounds = 10;
+
+       pid = fork();
+       if (pid == -1) {
+               DEBUG(DEBUG_CRIT,("Unable to fork: %s\n", strerror(errno)));
+               return -1;
+       }
+       if (pid == 0) { // Child
+               client = socket_client_connect();
+               if (client == -1) {
+                       /* nothing useful can be done without a connection;
+                          the failure was already logged */
+                       exit(1);
+               }
+               if (socket_client_write(client) != 0) {
+                       socket_client_close(client);
+                       exit(1);
+               }
+               for (i = 1 ; i <= max_rounds ; i++ ) {
+                       DEBUG(DEBUG_DEBUG,("Child process waiting ( %d/%d)\n", i, max_rounds));
+                       sleep(1);
+               }
+               socket_client_close(client);
+               exit(0);
+       } else {
+               globals.helper_pid = pid;
+       }
+       return 0;
+}
+
+/*
+  tests
+*/
+/*
+  Check that ctdb_sys_check_iface_exists() reports the made-up interface
+  name "fake" as not existing.
+
+  Returns 0 if the test passed, -1 otherwise; updates the test counters.
+*/
+int test_ctdb_sys_check_iface_exists(void)
+{
+       /* a string literal is enough here; the heap copy used previously
+          was never freed */
+       const char *fakename = "fake";
+       bool test;
+
+       globals.testcount++;
+
+       test = ctdb_sys_check_iface_exists(fakename);
+       if(test == true) {
+               DEBUG(DEBUG_CRIT,("Test failed: Fake interface detected: %s\n", fakename));
+               return -1;
+       }
+       DEBUG(DEBUG_INFO,("Test OK: Fake interface not detected: %s\n", fakename));
+       globals.successcount++;
+       return 0;
+}
+
+/*
+  Accept the helper's connection and check that ctdb_get_peer_pid()
+  yields a positive process id for it.
+
+  Returns 0 if the test passed, -1 otherwise; updates the test counters.
+*/
+int test_ctdb_get_peer_pid(void)
+{
+       int ret;
+       int fd;
+       pid_t peer_pid = 0;
+
+       globals.testcount++;
+
+       fd = socket_server_wait_peer();
+       if (fd == -1) {
+               /* accept() failure was already logged by the helper */
+               DEBUG(DEBUG_CRIT,("Test failed: No peer connection\n"));
+               return -1;
+       }
+
+       ret = ctdb_get_peer_pid(fd, &peer_pid);
+       /* the connection is only needed for the lookup above */
+       close(fd);
+
+       if (ret == -1) {
+               DEBUG(DEBUG_CRIT,("Test failed: Unable to get peer process id\n"));
+               return -1;
+       }
+       if (peer_pid <= 0) {
+               DEBUG(DEBUG_CRIT,("Test failed: Invalid peer process id: %d\n", peer_pid));
+               return -1;
+       }
+       DEBUG(DEBUG_INFO,("Test OK: Peer process id: %d\n", peer_pid));
+       globals.successcount++;
+       return 0;
+}
+
+/*
+  Check that ctdb_get_process_name() returns a name other than "unknown"
+  for the helper process.
+
+  Returns 0 if the test passed, -1 otherwise; updates the test counters.
+
+  NOTE(review): ownership of the returned string is not visible from
+  here, so it is not freed - confirm against ctdb_get_process_name().
+*/
+int test_ctdb_get_process_name(void)
+{
+       char *process_name = NULL;
+
+       globals.testcount++;
+
+       process_name = ctdb_get_process_name(globals.helper_pid);
+       if ((process_name == NULL) || !strcmp(process_name, "unknown")) {
+               /* never hand a NULL pointer to %s */
+               DEBUG(DEBUG_CRIT,("Test failed: Invalid process name of %d: %s\n", globals.helper_pid,
+                                 process_name != NULL ? process_name : "(null)"));
+               return -1;
+       }
+       DEBUG(DEBUG_INFO,("Test OK: Name of PID=%d: %s\n", globals.helper_pid, process_name));
+       globals.successcount++;
+       return 0;
+}
+
+/*
+  main program
+
+  Options: --socket <filename> overrides the unix socket path.
+
+  Creates the server socket, forks the helper, runs the tests, closes the
+  socket and prints a summary.  The exit status is 0 only if every test
+  that was started succeeded, and 1 otherwise (including setup failures).
+*/
+int main(int argc, const char *argv[])
+{
+       struct poptOption popt_options[] = {
+               POPT_AUTOHELP
+               { "socket", 0, POPT_ARG_STRING, &globals.socketname, 0, "local socket name", "filename" },
+               POPT_TABLEEND
+       };
+       int opt;
+       const char **extra_argv;
+       int extra_argc = 0;
+       poptContext pc;
+
+       LogLevel = DEBUG_INFO;
+
+       pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+       while ((opt = poptGetNextOpt(pc)) != -1) {
+               switch (opt) {
+               default:
+                       fprintf(stderr, "Invalid option %s: %s\n",
+                               poptBadOption(pc, 0), poptStrerror(opt));
+                       exit(1);
+               }
+       }
+
+       /* setup the remaining options for the main program to use */
+       extra_argv = poptGetArgs(pc);
+       if (extra_argv) {
+               extra_argv++;
+               while (extra_argv[extra_argc]) extra_argc++;
+       }
+
+       if (globals.socketname == NULL) {
+               DEBUG(DEBUG_CRIT,("Socket name is undefined\n"));
+               exit(1);
+       }
+       if (socket_server_create()) {
+               DEBUG(DEBUG_CRIT,("Socket error: exiting\n"));
+               exit(1);
+       }
+       if (fork_helper()) {
+               DEBUG(DEBUG_CRIT,("Forking error: exiting\n"));
+               exit(1);
+       }
+       /* FIXME: Test tcp_checksum6, tcp_checksum */
+       /* FIXME: Test ctdb_sys_send_arp, ctdb_sys_send_tcp */
+       /* FIXME: Test ctdb_sys_{open,close}_capture_socket, ctdb_sys_read_tcp_packet */
+       test_ctdb_sys_check_iface_exists();
+       test_ctdb_get_peer_pid();
+       test_ctdb_get_process_name();
+       /* FIXME: Test ctdb_get_lock_info, ctdb_get_blocker_pid*/
+
+       socket_server_close();
+
+       DEBUG(DEBUG_INFO,("%d/%d tests successfull\n", globals.successcount, globals.testcount));
+
+       /* let callers see failures through the exit status as well */
+       return (globals.successcount == globals.testcount) ? 0 : 1;
+}
diff --git a/ctdb/tests/src/ctdb_randrec.c b/ctdb/tests/src/ctdb_randrec.c
new file mode 100644 (file)
index 0000000..60d233b
--- /dev/null
@@ -0,0 +1,201 @@
+/* 
+   create a lot of random records, both current records and deleted records
+
+   Copyright (C) Andrew Tridgell  2008
+       Ronnie sahlberg 2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "popt.h"
+#include "cmdline.h"
+#include "ctdb_private.h"
+
+#include <sys/time.h>
+#include <time.h>
+
+static struct timeval tp1,tp2;
+
+/* Remember the current wall-clock time in tp1 as the start of a
+ * measurement interval. */
+static void start_timer(void)
+{
+       gettimeofday(&tp1, NULL);
+}
+
+/* Store the current time in tp2 and return the number of seconds
+ * (including the microsecond fraction) between tp1 and tp2. */
+static double end_timer(void)
+{
+       double begin_secs, end_secs;
+
+       gettimeofday(&tp2, NULL);
+
+       end_secs = tp2.tv_sec + (tp2.tv_usec * 1.0e-6);
+       begin_secs = tp1.tv_sec + (tp1.tv_usec * 1.0e-6);
+
+       return end_secs - begin_secs;
+}
+
+static int num_records = 10;
+static int delete_pct = 75;
+static int base_rec;
+
+/*
+  Loop forever over randomly chosen keys in [0, num_records).  For each
+  key the record is fetch-locked and then either
+   - stored empty and scheduled for deletion (delete_pct percent of the
+     iterations), or
+   - stored as a zero-filled buffer sizeof(r) bytes longer than the data
+     that was just fetched.
+  A throughput line is printed every 1000 iterations.
+
+  Returns only when num_records is not positive or a fetch-lock fails;
+  allocation failures terminate the process.  ev is not used.
+*/
+static void store_records(struct ctdb_context *ctdb, struct event_context *ev)
+{
+       TDB_DATA key, data;
+       struct ctdb_db_context *ctdb_db;
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       int ret;
+       struct ctdb_record_handle *h;
+       uint32_t i=0;
+
+       /* "random() % num_records" below needs a positive divisor */
+       if (num_records <= 0) {
+               printf("num_records must be greater than 0\n");
+               talloc_free(tmp_ctx);
+               return;
+       }
+
+       ctdb_db = ctdb_db_handle(ctdb, "test.tdb");
+
+       srandom(time(NULL) ^ getpid());
+
+       start_timer();
+
+       printf("working with %d records\n", num_records);
+       while (1) {
+               unsigned r = random() % num_records;
+               key.dptr = (uint8_t *)&r;
+               key.dsize = sizeof(r);
+
+               h = ctdb_fetch_lock(ctdb_db, tmp_ctx, key, &data);
+               if (h == NULL) {
+                       /* the key is a raw integer, not a C string, so
+                        * it must not be printed with %s */
+                       printf("Failed to fetch record '%u' on node %d\n",
+                              r, ctdb_get_pnn(ctdb));
+                       talloc_free(tmp_ctx);
+                       return;
+               }
+
+               if (random() % 100 < delete_pct) {
+                       data.dptr = NULL;
+                       data.dsize = 0;
+               } else {
+                       /* allocated on h, so it goes away with the handle */
+                       data.dptr = talloc_zero_size(h, data.dsize + sizeof(r));
+                       if (data.dptr == NULL) {
+                               printf("out of memory\n");
+                               exit(1);
+                       }
+                       data.dsize += sizeof(r);
+               }
+
+               ret = ctdb_record_store(h, data);
+               if (ret != 0) {
+                       printf("Failed to store record\n");
+               }
+
+               /* an empty record was stored: ask the daemon to schedule
+                * it for deletion */
+               if (data.dptr == NULL && data.dsize == 0) {
+                       struct ctdb_control_schedule_for_deletion *dd;
+                       TDB_DATA indata;
+                       int32_t status;
+
+                       /* fixed part of the control plus the trailing key */
+                       indata.dsize = offsetof(struct ctdb_control_schedule_for_deletion, key) + key.dsize;
+                       indata.dptr = talloc_zero_array(ctdb, uint8_t, indata.dsize);
+                       if (indata.dptr == NULL) {
+                               printf("out of memory\n");
+                               exit(1);
+                       }
+                       dd = (struct ctdb_control_schedule_for_deletion *)(void *)indata.dptr;
+                       dd->db_id = ctdb_db->db_id;
+                       dd->hdr = *ctdb_header_from_record_handle(h);
+                       dd->keylen = key.dsize;
+                       memcpy(dd->key, key.dptr, key.dsize);
+
+                       ret = ctdb_control(ctdb,
+                                          CTDB_CURRENT_NODE,
+                                          ctdb_db->db_id,
+                                          CTDB_CONTROL_SCHEDULE_FOR_DELETION,
+                                          0, /* flags */
+                                          indata,
+                                          NULL, /* mem_ctx */
+                                          NULL, /* outdata */
+                                          &status,
+                                          NULL, /* timeout : NULL == wait forever */
+                                          NULL); /* error message */
+
+                       talloc_free(indata.dptr);
+
+                       if (ret != 0 || status != 0) {
+                               DEBUG(DEBUG_ERR, (__location__ " Error sending "
+                                                 "SCHEDULE_FOR_DELETION "
+                                                 "control.\n"));
+                       }
+               }
+
+               talloc_free(h);
+
+               if (i % 1000 == 0) {
+                       printf("%7.0f recs/second   %u total\r", 1000.0 / end_timer(), i);
+                       fflush(stdout);
+                       start_timer();
+               }
+               i++;
+       }
+
+       /* not reached: the loop above only leaves via return/exit */
+       talloc_free(tmp_ctx);
+}
+
+/*
+  main program
+*/
+int main(int argc, const char *argv[])
+{
+       struct poptOption popt_options[] = {
+               POPT_AUTOHELP
+               POPT_CTDB_CMDLINE
+               { "num-records", 'r', POPT_ARG_INT, &num_records, 0, "num_records", "integer" },
+               { "base-rec", 'b', POPT_ARG_INT, &base_rec, 0, "base_rec", "integer" },
+               { "delete-pct", 'p', POPT_ARG_INT, &delete_pct, 0, "delete_pct", "integer" },
+               POPT_TABLEEND
+       };
+       struct ctdb_context *ctdb;
+       struct ctdb_db_context *ctdb_db;
+       struct event_context *ev;
+       const char **extra_argv;
+       int extra_argc = 0;
+       poptContext pc;
+       int opt;
+
+       pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+       /* any return value other than -1 is treated as a parse error */
+       for (opt = poptGetNextOpt(pc); opt != -1; opt = poptGetNextOpt(pc)) {
+               fprintf(stderr, "Invalid option %s: %s\n",
+                       poptBadOption(pc, 0), poptStrerror(opt));
+               exit(1);
+       }
+
+       /* count the leftover arguments, stepping over the first one */
+       extra_argv = poptGetArgs(pc);
+       if (extra_argv != NULL) {
+               extra_argv++;
+               while (extra_argv[extra_argc] != NULL) {
+                       extra_argc++;
+               }
+       }
+
+       ev = event_context_init(NULL);
+
+       ctdb = ctdb_cmdline_client(ev, timeval_current_ofs(3, 0));
+       if (ctdb == NULL) {
+               printf("failed to connect to daemon\n");
+               exit(1);
+       }
+
+       /* attach to the database that store_records() works on */
+       ctdb_db = ctdb_attach(ctdb, timeval_current_ofs(2, 0), "test.tdb",
+                             false, 0);
+       if (ctdb_db == NULL) {
+               printf("ctdb_attach failed - %s\n", ctdb_errstr(ctdb));
+               exit(1);
+       }
+
+       store_records(ctdb, ev);
+
+       return 0;
+}
diff --git a/ctdb/tests/src/ctdb_store.c b/ctdb/tests/src/ctdb_store.c
new file mode 100644 (file)
index 0000000..6920343
--- /dev/null
@@ -0,0 +1,163 @@
+/* 
+   simple tool to create a lot of records on a tdb and to read them out
+
+   Copyright (C) Andrew Tridgell  2006
+       Ronnie sahlberg 2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "popt.h"
+#include "cmdline.h"
+
+#include <sys/time.h>
+#include <time.h>
+
+static int num_records = 10;
+static int base_rec;
+
+/*
+  Create num_records records keyed base_rec .. base_rec+num_records-1,
+  each holding the 32-bit loop counter, then fetch-lock all of them
+  again once per second, printing a '.' after every pass.
+
+  Returns only when a fetch-lock fails.  ev is not used.
+*/
+static void store_records(struct ctdb_context *ctdb, struct event_context *ev)
+{
+       TDB_DATA key, data;
+       struct ctdb_db_context *ctdb_db;
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       int ret;
+       struct ctdb_record_handle *h;
+       uint32_t i;
+
+       ctdb_db = ctdb_db_handle(ctdb, "test.tdb");
+
+       printf("creating %d records\n", num_records);
+       for (i=0;i<num_records;i++) {
+               int r = base_rec + i;
+               key.dptr = (uint8_t *)&r;
+               key.dsize = sizeof(uint32_t);
+
+               h = ctdb_fetch_lock(ctdb_db, tmp_ctx, key, &data);
+               if (h == NULL) {
+                       /* the key is a raw integer, not a C string, so
+                        * it must not be printed with %s */
+                       printf("Failed to fetch record '%d' on node %d\n",
+                              r, ctdb_get_pnn(ctdb));
+                       talloc_free(tmp_ctx);
+                       return;
+               }
+
+               /* the stored value is the loop counter itself */
+               data.dptr = (uint8_t *)&i;
+               data.dsize = sizeof(uint32_t);
+
+               ret = ctdb_record_store(h, data);
+               talloc_free(h);
+               if (ret != 0) {
+                       printf("Failed to store record\n");
+               }
+               if (i % 1000 == 0) {
+                       printf("%u\r", i);
+                       fflush(stdout);
+               }
+       }
+
+       printf("fetching all %d records\n", num_records);
+       while (1) {
+               for (i=0;i<num_records;i++) {
+                       int r = base_rec + i;
+                       key.dptr = (uint8_t *)&r;
+                       key.dsize = sizeof(uint32_t);
+
+                       h = ctdb_fetch_lock(ctdb_db, tmp_ctx, key, &data);
+                       if (h == NULL) {
+                               printf("Failed to fetch record '%d' on node %d\n",
+                                      r, ctdb_get_pnn(ctdb));
+                               talloc_free(tmp_ctx);
+                               return;
+                       }
+                       talloc_free(h);
+               }
+               sleep(1);
+               printf(".");
+               fflush(stdout);
+       }
+
+       /* not reached: the loop above only leaves via return */
+       talloc_free(tmp_ctx);
+}
+
+/*
+  main program
+*/
+int main(int argc, const char *argv[])
+{
+       struct poptOption popt_options[] = {
+               POPT_AUTOHELP
+               POPT_CTDB_CMDLINE
+               { "num-records", 'r', POPT_ARG_INT, &num_records, 0, "num_records", "integer" },
+               { "base-rec", 'b', POPT_ARG_INT, &base_rec, 0, "base_rec", "integer" },
+               POPT_TABLEEND
+       };
+       struct ctdb_context *ctdb;
+       struct ctdb_db_context *ctdb_db;
+       struct event_context *ev;
+       const char **extra_argv;
+       int extra_argc = 0;
+       poptContext pc;
+       int opt;
+
+       pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+       /* any return value other than -1 is treated as a parse error */
+       for (opt = poptGetNextOpt(pc); opt != -1; opt = poptGetNextOpt(pc)) {
+               fprintf(stderr, "Invalid option %s: %s\n",
+                       poptBadOption(pc, 0), poptStrerror(opt));
+               exit(1);
+       }
+
+       /* talloc_enable_leak_report_full(); */
+
+       /* count the leftover arguments, stepping over the first one */
+       extra_argv = poptGetArgs(pc);
+       if (extra_argv != NULL) {
+               extra_argv++;
+               while (extra_argv[extra_argc] != NULL) {
+                       extra_argc++;
+               }
+       }
+
+       ev = event_context_init(NULL);
+
+       ctdb = ctdb_cmdline_client(ev, timeval_current_ofs(3, 0));
+       if (ctdb == NULL) {
+               printf("failed to connect to ctdb daemon.\n");
+               exit(1);
+       }
+
+       /* attach to the database that store_records() works on */
+       ctdb_db = ctdb_attach(ctdb, timeval_current_ofs(2, 0), "test.tdb", false, 0);
+       if (ctdb_db == NULL) {
+               printf("ctdb_attach failed - %s\n", ctdb_errstr(ctdb));
+               exit(1);
+       }
+
+       /* run the event loop until the reported recovery mode is 0;
+        * recmode starts at 1 so a failed query simply retries */
+       printf("Waiting for cluster\n");
+       for (;;) {
+               uint32_t recmode = 1;
+
+               ctdb_ctrl_getrecmode(ctdb, ctdb, timeval_zero(), CTDB_CURRENT_NODE, &recmode);
+               if (recmode == 0) {
+                       break;
+               }
+               event_loop_once(ev);
+       }
+
+       store_records(ctdb, ev);
+
+       return 0;
+}
diff --git a/ctdb/tests/src/ctdb_takeover_tests.c b/ctdb/tests/src/ctdb_takeover_tests.c
new file mode 100644 (file)
index 0000000..7fd989e
--- /dev/null
@@ -0,0 +1,637 @@
+/* 
+   Tests for ctdb_takeover.c
+
+   Copyright (C) Martin Schwenke 2011
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "ctdbd_test.c"
+
+/* This is lazy... but it is test code! */
+#define CTDB_TEST_MAX_NODES 256
+#define CTDB_TEST_MAX_IPS 1024
+
+/* Format of each line is "IP pnn" - the separator has to be at least
+ * 1 space (not a tab or whatever - a space!).
+ */
+static struct ctdb_public_ip_list *
+read_ctdb_public_ip_list(TALLOC_CTX *ctx)
+{
+       char line[1024];
+       ctdb_sock_addr addr;
+       char *sep;
+       int pnn;
+       struct ctdb_public_ip_list *head = NULL;
+       struct ctdb_public_ip_list *tail = NULL;
+       struct ctdb_public_ip_list *node;
+
+       /* Reads stdin to EOF; returns the parsed entries in input
+        * order, allocated on ctx, or NULL if none parsed. */
+       while (fgets(line, sizeof(line), stdin) != NULL) {
+
+               sep = strchr(line, ' ');
+               if (sep == NULL) {
+                       /* Only an IP address: drop the newline and
+                        * default the PNN to -1 */
+                       sep = strchr(line, '\n');
+                       if (sep != NULL) {
+                               *sep = '\0';
+                       }
+                       pnn = -1;
+               } else {
+                       /* Cut the line after the address; the PNN
+                        * (possibly after more spaces) follows */
+                       *sep = '\0';
+                       pnn = (int) strtol(sep + 1, (char **) NULL, 10);
+               }
+
+               if (!parse_ip(line, NULL, 0, &addr)) {
+                       DEBUG(DEBUG_ERR, (__location__ " ERROR, bad address :%s\n", line));
+                       continue;
+               }
+
+               /* Append a new entry at the tail of the list */
+               node = talloc(ctx, struct ctdb_public_ip_list);
+               if (tail == NULL) {
+                       head = node;
+               } else {
+                       tail->next = node;
+               }
+               tail = node;
+
+               tail->next = NULL;
+               tail->pnn = pnn;
+               memcpy(&(tail->addr), &addr, sizeof(addr));
+       }
+
+       return head;
+}
+
+/* Print one "ADDRESS PNN" line on stdout for every entry in ips. */
+void print_ctdb_public_ip_list(struct ctdb_public_ip_list * ips)
+{
+       struct ctdb_public_ip_list *cur;
+
+       for (cur = ips; cur != NULL; cur = cur->next) {
+               printf("%s %d\n", ctdb_addr_to_str(&(cur->addr)), cur->pnn);
+       }
+}
+
+/* Read some IPs from stdin, 1 per line, parse them and then print
+ * them back out. */
+void ctdb_test_read_ctdb_public_ip_list(void)
+{
+       /* the parsed list lives on tmp_ctx and is released with it */
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       struct ctdb_public_ip_list *parsed = read_ctdb_public_ip_list(tmp_ctx);
+
+       print_ctdb_public_ip_list(parsed);
+
+       talloc_free(tmp_ctx);
+}
+
+/* Format of each line is "IP CURRENT_PNN ALLOWED_PNN,...".
+ */
+/* Read "IP CURRENT_PNN ALLOWED_PNN,..." lines from stdin until EOF or
+ * an empty line.
+ *
+ * On return *all_ips is the list of every address read (allocated on
+ * ctx, NULL if none) and *avail is an array of CTDB_TEST_MAX_NODES
+ * pointers: entry n lists the addresses that named node n in their
+ * ALLOWED_PNN field.  Nodes below numnodes that were never named share
+ * one list containing all addresses.
+ *
+ * Exits the process if more than CTDB_TEST_MAX_IPS addresses are read
+ * or an ALLOWED_PNN value is outside [0, CTDB_TEST_MAX_NODES).
+ * Always returns true.
+ */
+static bool
+read_ctdb_public_ip_info(TALLOC_CTX *ctx,
+                        int numnodes,
+                        struct ctdb_public_ip_list ** all_ips,
+                        struct ctdb_all_public_ips *** avail)
+{
+       char line[1024];
+       ctdb_sock_addr addr;
+       char *t, *tok;
+       struct ctdb_public_ip_list * ta;
+       int pnn, numips, curr, n, i;
+       struct ctdb_all_public_ips * a;
+
+       struct ctdb_public_ip_list *last = NULL;
+
+       *avail = talloc_array_size(ctx, sizeof(struct ctdb_all_public_ips *), CTDB_TEST_MAX_NODES);
+       memset(*avail, 0,
+              sizeof(struct ctdb_all_public_ips *) * CTDB_TEST_MAX_NODES);
+
+       numips = 0;
+       *all_ips = NULL;
+       while (fgets(line, sizeof(line), stdin) != NULL) {
+
+               /* Get rid of pesky newline */
+               if ((t = strchr(line, '\n')) != NULL) {
+                       *t = '\0';
+               }
+
+               /* Exit on an empty line */
+               if (line[0] == '\0') {
+                       break;
+               }
+
+               /* Get the IP address */
+               tok = strtok(line, " \t");
+               if (tok == NULL) {
+                       DEBUG(DEBUG_ERR, (__location__ " WARNING, bad line ignored :%s\n", line));
+                       continue;
+               }
+
+               if (!parse_ip(tok, NULL, 0, &addr)) {
+                       DEBUG(DEBUG_ERR, (__location__ " ERROR, bad address :%s\n", tok));
+                       continue;
+               }
+
+               numips++;
+               if (numips > CTDB_TEST_MAX_IPS) {
+                       DEBUG(DEBUG_ERR, ("ERROR: Exceeding CTDB_TEST_MAX_IPS: %d\n", CTDB_TEST_MAX_IPS));
+                       exit(1);
+               }
+
+               /* Get the PNN */
+               pnn = -1;
+               tok = strtok(NULL, " \t");
+               if (tok != NULL) {
+                       pnn = (int) strtol(tok, (char **) NULL, 10);
+               }
+
+               /* Add address + pnn to all_ips */
+               if (last == NULL) {
+                       last = talloc(ctx, struct ctdb_public_ip_list);
+               } else {
+                       last->next = talloc(ctx, struct ctdb_public_ip_list);
+                       last = last->next;
+               }
+               last->next = NULL;
+               last->pnn = pnn;
+               memcpy(&(last->addr), &addr, sizeof(addr));
+               if (*all_ips == NULL) {
+                       *all_ips = last;
+               }
+
+               /* Optional third field; a '#' ends the field */
+               tok = strtok(NULL, " \t#");
+               if (tok == NULL) {
+                       continue;
+               }
+
+               /* Handle allowed nodes for addr.  Restarting strtok()
+                * on tok is fine because it is the last field used. */
+               t = strtok(tok, ",");
+               while (t != NULL) {
+                       n = (int) strtol(t, (char **) NULL, 10);
+                       /* n indexes *avail, which has exactly
+                        * CTDB_TEST_MAX_NODES slots */
+                       if (n < 0 || n >= CTDB_TEST_MAX_NODES) {
+                               DEBUG(DEBUG_ERR, ("ERROR: allowed PNN out of range: %s\n", t));
+                               exit(1);
+                       }
+                       if ((*avail)[n] == NULL) {
+                               (*avail)[n] = talloc_array(ctx, struct ctdb_all_public_ips, CTDB_TEST_MAX_IPS);
+                               (*avail)[n]->num = 0;
+                       }
+                       curr = (*avail)[n]->num;
+                       (*avail)[n]->ips[curr].pnn = pnn;
+                       memcpy(&((*avail)[n]->ips[curr].addr),
+                              &addr, sizeof(addr));
+                       (*avail)[n]->num++;
+                       t = strtok(NULL, ",");
+               }
+
+       }
+
+       /* Build list of all allowed IPs */
+       a = talloc_array(ctx, struct ctdb_all_public_ips, CTDB_TEST_MAX_IPS);
+       a->num = numips;
+       for (ta = *all_ips, i=0; ta != NULL && i < numips ; ta = ta->next, i++) {
+               a->ips[i].pnn = ta->pnn;
+               memcpy(&(a->ips[i].addr), &(ta->addr), sizeof(ta->addr));
+       }
+
+       /* Assign it to any nodes that don't have a list assigned */
+       for (n = 0; n < numnodes; n++) {
+               if ((*avail)[n] == NULL) {
+                       (*avail)[n] = a;
+               }
+       }
+
+       return true;
+}
+
+/* For each of the first numnodes entries of avail that has at least
+ * one address, print "N: addr, addr, ..." on its own line. */
+void print_ctdb_available_ips(int numnodes, struct ctdb_all_public_ips **avail)
+{
+       int n, i;
+       const char *sep;
+
+       for (n = 0; n < numnodes; n++) {
+               /* nodes without any addresses produce no output */
+               if (avail[n] == NULL || avail[n]->num <= 0) {
+                       continue;
+               }
+
+               printf("%d:", n);
+               for (i = 0; i < avail[n]->num; i++) {
+                       if (i == 0) {
+                               sep = " ";
+                       } else {
+                               sep = ", ";
+                       }
+                       printf("%s%s", sep,
+                              ctdb_addr_to_str(&(avail[n]->ips[i].addr)));
+               }
+               printf("\n");
+       }
+}
+
+/* Count the comma-separated entries in nodestates, read the IP info
+ * from stdin for that many nodes and print what was parsed. */
+void ctdb_test_read_ctdb_public_ip_info(const char nodestates[])
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       struct ctdb_all_public_ips **avail;
+       struct ctdb_public_ip_list *l;
+       char *ns, *tok;
+       int numnodes = 0;
+
+       /* strtok() writes to its argument, so tokenise a private copy */
+       ns = talloc_strdup(tmp_ctx, nodestates);
+
+       /* only the number of entries matters here, not their values */
+       for (tok = strtok(ns, ","); tok != NULL; tok = strtok(NULL, ",")) {
+               numnodes++;
+               if (numnodes > CTDB_TEST_MAX_NODES) {
+                       DEBUG(DEBUG_ERR, ("ERROR: Exceeding CTDB_TEST_MAX_NODES: %d\n", CTDB_TEST_MAX_NODES));
+                       exit(1);
+               }
+       }
+
+       read_ctdb_public_ip_info(tmp_ctx, numnodes, &l, &avail);
+
+       print_ctdb_public_ip_list(l);
+       print_ctdb_available_ips(numnodes, avail);
+
+       talloc_free(tmp_ctx);
+}
+
+/* Read 2 IPs from stdin, calculate the IP distance and print it. */
+void ctdb_test_ip_distance(void)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       struct ctdb_public_ip_list *first, *second;
+
+       first = read_ctdb_public_ip_list(tmp_ctx);
+       second = (first != NULL) ? first->next : NULL;
+
+       /* nothing is printed when fewer than 2 addresses were read */
+       if (second != NULL) {
+               uint32_t distance = ip_distance(&(first->addr), &(second->addr));
+
+               printf ("%lu\n", (unsigned long) distance);
+       }
+
+       talloc_free(tmp_ctx);
+}
+
+/* Read some IPs from stdin, calculate the sum of the squares of the
+ * IP distances between the 1st argument and those read that are on
+ * the given node. The given IP must be one of the ones in the list.  */
+void ctdb_test_ip_distance_2_sum(const char ip[], int pnn)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       struct ctdb_public_ip_list *l;
+       struct ctdb_public_ip_list *t;
+       ctdb_sock_addr addr;
+       uint32_t distance;
+
+       l = read_ctdb_public_ip_list(tmp_ctx);
+
+       /* need at least one address on stdin and a parsable ip argument */
+       if (l == NULL || !parse_ip(ip, NULL, 0, &addr)) {
+               fprintf(stderr, "BAD INPUT");
+               exit(1);
+       }
+
+       /* find the entry for the specified IP */
+       t = l;
+       while (t != NULL && !ctdb_same_ip(&(t->addr), &addr)) {
+               t = t->next;
+       }
+
+       if (t == NULL) {
+               fprintf(stderr, "IP NOT PRESENT IN LIST");
+               exit(1);
+       }
+
+       distance = ip_distance_2_sum(&(t->addr), l, pnn);
+       printf ("%lu\n", (unsigned long) distance);
+
+       talloc_free(tmp_ctx);
+}
+
+/* Read some IPs from stdin, calculate the sum of the squares of the
+ * IP distances between the first and the rest, and print it. */
+void ctdb_test_lcp2_imbalance(int pnn)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       struct ctdb_public_ip_list *l = read_ctdb_public_ip_list(tmp_ctx);
+       uint32_t imbalance = lcp2_imbalance(l, pnn);
+
+       /* print the value lcp2_imbalance() computed for node pnn */
+       printf ("%lu\n", (unsigned long) imbalance);
+
+       talloc_free(tmp_ctx);
+}
+
+/* Return an array of numnodes values (allocated on tmp_ctx) taken
+ * from the environment variable named by tunable:
+ *   - variable unset: all values are 0
+ *   - variable is exactly "1": all values are 1
+ *   - otherwise: a comma-separated list with exactly one value per
+ *     node, each parsed by strtol() with base 0
+ * Exits the process if the list does not contain exactly numnodes
+ * values.
+ */
+static uint32_t *get_tunable_values(TALLOC_CTX *tmp_ctx,
+                                   int numnodes,
+                                   const char *tunable)
+{
+       int i;
+       char *tok;
+       char *copy;
+       uint32_t *tvals = talloc_zero_array(tmp_ctx, uint32_t, numnodes);
+       char *t = getenv(tunable);
+
+       if (t == NULL) {
+               return tvals;
+       }
+
+       if (strcmp(t, "1") == 0) {
+               for (i=0; i<numnodes; i++) {
+                       tvals[i] = 1;
+               }
+               return tvals;
+       }
+
+       /* strtok() writes NULs into its argument: tokenise a copy so
+        * the environment string itself is left intact */
+       copy = talloc_strdup(tmp_ctx, t);
+       if (copy == NULL) {
+               fprintf(stderr, "ERROR: out of memory\n");
+               exit(1);
+       }
+
+       i = 0;
+       for (tok = strtok(copy, ","); tok != NULL; tok = strtok(NULL, ",")) {
+               /* check before storing so tvals[] is never overrun */
+               if (i >= numnodes) {
+                       fprintf(stderr, "ERROR: Wrong number of values in %s\n", tunable);
+                       exit(1);
+               }
+               tvals[i] = (uint32_t) strtol(tok, NULL, 0);
+               i++;
+       }
+       if (i != numnodes) {
+               fprintf(stderr, "ERROR: Wrong number of values in %s\n", tunable);
+               exit(1);
+       }
+
+       talloc_free(copy);
+
+       return tvals;
+}
+
+/* Return an array of numnodes runstates allocated on tmp_ctx.  Every
+ * node is CTDB_RUNSTATE_RUNNING unless CTDB_TEST_RUNSTATE is set, in
+ * which case the values come from get_tunable_values(). */
+static enum ctdb_runstate *get_runstate(TALLOC_CTX *tmp_ctx,
+                                       int numnodes)
+{
+       enum ctdb_runstate *runstate =
+               talloc_zero_array(tmp_ctx, enum ctdb_runstate, numnodes);
+       uint32_t *tvals;
+       int i;
+
+       if (getenv("CTDB_TEST_RUNSTATE") != NULL) {
+               tvals = get_tunable_values(tmp_ctx, numnodes, "CTDB_TEST_RUNSTATE");
+               for (i = 0; i < numnodes; i++) {
+                       runstate[i] = (enum ctdb_runstate) tvals[i];
+               }
+               /* the raw values are no longer needed */
+               talloc_free(tvals);
+               return runstate;
+       }
+
+       for (i = 0; i < numnodes; i++) {
+               runstate[i] = CTDB_RUNSTATE_RUNNING;
+       }
+
+       return runstate;
+}
+
+/* Fake up enough CTDB state to be able to run the IP allocation
+ * algorithm.  Usually this sets up some standard state, sets the node
+ * states from the command-line and reads the current IP layout from
+ * stdin.
+ *
+ * However, if read_ips_for_multiple_nodes is true then each node's
+ * idea of the IP layout is read separately from stdin.  In this mode
+ * it doesn't make much sense to use read_ctdb_public_ip_info's
+ * optional ALLOWED_PNN,... list in the input, since each node is
+ * being handled separately anyway.  IPs for each node are separated
+ * by a blank line.  This mode is for testing weird behaviours where
+ * the IP layouts differs across nodes and we want to improve
+ * create_merged_ip_list(), so should only be used in tests of
+ * ctdb_takeover_run_core().  Yes, it is a hack...  :-)
+ */
+/* Parameters:
+ *   nodestates - comma-separated node flags, one per node, each parsed
+ *                by strtol() with base 0; at most CTDB_TEST_MAX_NODES
+ *   ctdb       - receives a newly allocated, zeroed context that also
+ *                owns everything else allocated here
+ *   all_ips    - receives the IP list read from stdin
+ *   ipflags    - receives the result of set_ipflags_internal()
+ *   read_ips_for_multiple_nodes - if true, IP info is read from stdin
+ *                once per node instead of once in total
+ * Exits the process on too many nodes or an unknown CTDB_IP_ALGORITHM.
+ */
+void ctdb_test_init(const char nodestates[],
+                   struct ctdb_context **ctdb,
+                   struct ctdb_public_ip_list **all_ips,
+                   struct ctdb_ipflags **ipflags,
+                   bool read_ips_for_multiple_nodes)
+{
+       struct ctdb_all_public_ips **avail;
+       int i, numnodes;
+       uint32_t nodeflags[CTDB_TEST_MAX_NODES];
+       char *tok, *ns, *t;
+       struct ctdb_node_map *nodemap;
+       uint32_t *tval_noiptakeover;
+       uint32_t *tval_noiptakeoverondisabled;
+       enum ctdb_runstate *runstate;
+
+       *ctdb = talloc_zero(NULL, struct ctdb_context);
+
+       /* Avoid that const */
+       ns = talloc_strdup(*ctdb, nodestates);
+
+       numnodes = 0;
+       tok = strtok(ns, ",");
+       while (tok != NULL) {
+               /* check before storing so nodeflags[] is never overrun */
+               if (numnodes >= CTDB_TEST_MAX_NODES) {
+                       DEBUG(DEBUG_ERR, ("ERROR: Exceeding CTDB_TEST_MAX_NODES: %d\n", CTDB_TEST_MAX_NODES));
+                       exit(1);
+               }
+               nodeflags[numnodes] = (uint32_t) strtol(tok, NULL, 0);
+               numnodes++;
+               tok = strtok(NULL, ",");
+       }
+
+       /* Fake things up... */
+       (*ctdb)->num_nodes = numnodes;
+
+       /* Default to LCP2 */
+       (*ctdb)->tunable.lcp2_public_ip_assignment = 1;
+       (*ctdb)->tunable.deterministic_public_ips = 0;
+       (*ctdb)->tunable.disable_ip_failover = 0;
+       (*ctdb)->tunable.no_ip_failback = 0;
+
+       /* CTDB_IP_ALGORITHM selects one of "lcp2", "nondet" or "det" */
+       if ((t = getenv("CTDB_IP_ALGORITHM"))) {
+               if (strcmp(t, "lcp2") == 0) {
+                       (*ctdb)->tunable.lcp2_public_ip_assignment = 1;
+               } else if (strcmp(t, "nondet") == 0) {
+                       (*ctdb)->tunable.lcp2_public_ip_assignment = 0;
+               } else if (strcmp(t, "det") == 0) {
+                       (*ctdb)->tunable.lcp2_public_ip_assignment = 0;
+                       (*ctdb)->tunable.deterministic_public_ips = 1;
+               } else {
+                       fprintf(stderr, "ERROR: unknown IP algorithm %s\n", t);
+                       exit(1);
+               }
+       }
+
+       /* Per-node tunable values and runstates from the environment */
+       tval_noiptakeover = get_tunable_values(*ctdb, numnodes,
+                                              "CTDB_SET_NoIPTakeover");
+       tval_noiptakeoverondisabled =
+               get_tunable_values(*ctdb, numnodes,
+                                  "CTDB_SET_NoIPHostOnAllDisabled");
+
+       runstate = get_runstate(*ctdb, numnodes);
+
+       nodemap =  talloc_array(*ctdb, struct ctdb_node_map, numnodes);
+       nodemap->num = numnodes;
+
+       if (!read_ips_for_multiple_nodes) {
+               read_ctdb_public_ip_info(*ctdb, numnodes, all_ips, &avail);
+       }
+
+       (*ctdb)->nodes = talloc_array(*ctdb, struct ctdb_node *, numnodes); // FIXME: bogus size, overkill
+
+       for (i=0; i < numnodes; i++) {
+               nodemap->nodes[i].pnn = i;
+               nodemap->nodes[i].flags = nodeflags[i];
+               /* nodemap->nodes[i].sockaddr is uninitialised */
+
+               /* In multi-node mode each node reads its own view of
+                * the layout; all_ips keeps the last one read */
+               if (read_ips_for_multiple_nodes) {
+                       read_ctdb_public_ip_info(*ctdb, numnodes,
+                                                all_ips, &avail);
+               }
+
+               (*ctdb)->nodes[i] = talloc(*ctdb, struct ctdb_node);
+               (*ctdb)->nodes[i]->pnn = i;
+               (*ctdb)->nodes[i]->flags = nodeflags[i];
+               (*ctdb)->nodes[i]->available_public_ips = avail[i];
+               (*ctdb)->nodes[i]->known_public_ips = avail[i];
+       }
+
+       *ipflags = set_ipflags_internal(*ctdb, *ctdb, nodemap,
+                                       tval_noiptakeover,
+                                       tval_noiptakeoverondisabled,
+                                       runstate);
+}
+
+/* IP layout is read from stdin. */
+void ctdb_test_lcp2_allocate_unassigned(const char nodestates[])
+{
+       struct ctdb_context *ctdb;
+       struct ctdb_public_ip_list *all_ips;
+       struct ctdb_ipflags *ipflags;
+       uint32_t *lcp2_imbalances;
+       bool *newly_healthy;
+
+       /* fake up the cluster state, reading a single IP layout */
+       ctdb_test_init(nodestates, &ctdb, &all_ips, &ipflags, false);
+
+       lcp2_init(ctdb, ipflags, all_ips, NULL, &lcp2_imbalances, &newly_healthy);
+       lcp2_allocate_unassigned(ctdb, ipflags, all_ips, lcp2_imbalances);
+
+       /* show the resulting layout */
+       print_ctdb_public_ip_list(all_ips);
+
+       talloc_free(ctdb);
+}
+
+/* IP layout is read from stdin. */
+void ctdb_test_lcp2_failback(const char nodestates[])
+{
+       struct ctdb_context *ctdb;
+       struct ctdb_public_ip_list *all_ips;
+       struct ctdb_ipflags *ipflags;
+       uint32_t *lcp2_imbalances;
+       bool *newly_healthy;
+
+       /* fake up the cluster state, reading a single IP layout */
+       ctdb_test_init(nodestates, &ctdb, &all_ips, &ipflags, false);
+
+       lcp2_init(ctdb, ipflags, all_ips, NULL, &lcp2_imbalances, &newly_healthy);
+       lcp2_failback(ctdb, ipflags, all_ips, lcp2_imbalances, newly_healthy);
+
+       /* show the resulting layout */
+       print_ctdb_public_ip_list(all_ips);
+
+       talloc_free(ctdb);
+}
+
+/* IP layout is read from stdin. */
+/* NOTE(review): this performs the same call sequence as
+ * ctdb_test_lcp2_failback() - confirm whether that is intended. */
+void ctdb_test_lcp2_failback_loop(const char nodestates[])
+{
+       struct ctdb_context *ctdb;
+       struct ctdb_public_ip_list *all_ips;
+       struct ctdb_ipflags *ipflags;
+       uint32_t *lcp2_imbalances;
+       bool *newly_healthy;
+
+       /* fake up the cluster state, reading a single IP layout */
+       ctdb_test_init(nodestates, &ctdb, &all_ips, &ipflags, false);
+
+       lcp2_init(ctdb, ipflags, all_ips, NULL, &lcp2_imbalances, &newly_healthy);
+       lcp2_failback(ctdb, ipflags, all_ips, lcp2_imbalances, newly_healthy);
+
+       /* show the resulting layout */
+       print_ctdb_public_ip_list(all_ips);
+
+       talloc_free(ctdb);
+}
+
+/* IP layout is read from stdin.  See comment for ctdb_test_init() for
+ * explanation of read_ips_for_multiple_nodes.
+ */
+void ctdb_test_ctdb_takeover_run_core(const char nodestates[],
+                                     bool read_ips_for_multiple_nodes)
+{
+       struct ctdb_ipflags *ipflags;
+       struct ctdb_public_ip_list *all_ips;
+       struct ctdb_context *ctdb;
+
+       /* fake up the cluster state from nodestates and stdin */
+       ctdb_test_init(nodestates, &ctdb, &all_ips, &ipflags,
+                      read_ips_for_multiple_nodes);
+
+       /* all_ips is passed by address, then printed afterwards */
+       ctdb_takeover_run_core(ctdb, ipflags, &all_ips, NULL);
+       print_ctdb_public_ip_list(all_ips);
+
+       talloc_free(ctdb);
+}
+
+/* Print a one-line usage message on stderr and exit with status 1. */
+void usage(void)
+{
+       fputs("usage: ctdb_takeover_tests <op>\n", stderr);
+       exit(1);
+}
+
+/* Dispatch on argv[1] to one of the ctdb_test_*() functions; any
+ * unknown operation or wrong argument count ends up in usage(). */
+int main(int argc, const char *argv[])
+{
+       const char *level = getenv("CTDB_TEST_LOGLEVEL");
+       const char *op;
+
+       /* debug-level logging unless CTDB_TEST_LOGLEVEL overrides it */
+       LogLevel = DEBUG_DEBUG;
+       if (level != NULL) {
+               LogLevel = atoi(level);
+       }
+
+       if (argc < 2) {
+               usage();
+       }
+       op = argv[1];
+
+       if (strcmp(op, "ip_list") == 0) {
+               ctdb_test_read_ctdb_public_ip_list();
+       } else if (strcmp(op, "ip_info") == 0 && argc == 3) {
+               ctdb_test_read_ctdb_public_ip_info(argv[2]);
+       } else if (strcmp(op, "ip_distance") == 0) {
+               ctdb_test_ip_distance();
+       } else if (strcmp(op, "ip_distance_2_sum") == 0 && argc == 4) {
+               ctdb_test_ip_distance_2_sum(argv[2], atoi(argv[3]));
+       } else if (strcmp(op, "lcp2_imbalance") == 0 && argc >= 3) {
+               ctdb_test_lcp2_imbalance(atoi(argv[2]));
+       } else if (strcmp(op, "lcp2_allocate_unassigned") == 0 && argc == 3) {
+               ctdb_test_lcp2_allocate_unassigned(argv[2]);
+       } else if (strcmp(op, "lcp2_failback") == 0 && argc == 3) {
+               ctdb_test_lcp2_failback(argv[2]);
+       } else if (strcmp(op, "lcp2_failback_loop") == 0 && argc == 3) {
+               ctdb_test_lcp2_failback_loop(argv[2]);
+       } else if (strcmp(op, "ctdb_takeover_run_core") == 0 && argc == 3) {
+               ctdb_test_ctdb_takeover_run_core(argv[2], false);
+       } else if (strcmp(op, "ctdb_takeover_run_core") == 0 &&
+                  argc == 4 &&
+                  strcmp(argv[3], "multi") == 0) {
+               /* a trailing "multi" selects one IP layout per node */
+               ctdb_test_ctdb_takeover_run_core(argv[2], true);
+       } else {
+               usage();
+       }
+
+       return 0;
+}
diff --git a/ctdb/tests/src/ctdb_test.c b/ctdb/tests/src/ctdb_test.c
new file mode 100644 (file)
index 0000000..bbb51bd
--- /dev/null
@@ -0,0 +1,104 @@
+/*
+   ctdb test include file
+
+   Copyright (C) Martin Schwenke  2011
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CTDBD_TEST_C
+#define _CTDBD_TEST_C
+
+#ifdef CTDB_TEST_OVERRIDE_MAIN
+
+/* The including test provides its own main() and usage(): rename the
+ * ones defined in tools/ctdb.c (included below) to main_foobar() and
+ * usage_foobar() */
+#define main(argc, argv) main_foobar(argc, argv)
+#define usage usage_foobar
+
+#endif /* CTDB_TEST_OVERRIDE_MAIN */
+
+#define ctdb_cmdline_client(x, y) \
+       ctdb_cmdline_client_stub(x, y)
+#define ctdb_ctrl_getnodemap(ctdb, timelimit, pnn, tmp_ctx, nodemap) \
+       ctdb_ctrl_getnodemap_stub(ctdb, timelimit, pnn, tmp_ctx, nodemap)
+#define ctdb_ctrl_get_ifaces(ctdb, timelimit, pnn, tmp_ctx, ifaces) \
+       ctdb_ctrl_get_ifaces_stub(ctdb, timelimit, pnn, tmp_ctx, ifaces)
+#define ctdb_ctrl_getpnn(ctdb, timelimit, pnn) \
+       ctdb_ctrl_getpnn_stub(ctdb, timelimit, pnn)
+#define ctdb_ctrl_getrecmode(ctdb, tmp_ctx, timelimit, pnn, recmode) \
+       ctdb_ctrl_getrecmode_stub(ctdb, tmp_ctx, timelimit, pnn, recmode)
+#define ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, timelimit, pnn, recmaster) \
+       ctdb_ctrl_getrecmaster_stub(ctdb, tmp_ctx, timelimit, pnn, recmaster)
+#define ctdb_ctrl_getvnnmap(ctdb, timelimit, pnn, tmp_ctx, vnnmap) \
+       ctdb_ctrl_getvnnmap_stub(ctdb, timelimit, pnn, tmp_ctx, vnnmap)
+#define ctdb_ctrl_getdebseqnum(ctdb, timelimit, pnn, db_id, seqnum) \
+       ctdb_ctrl_getvnnmap_stub(ctdb, timelimit, pnn, db_id, seqnum) /* NOTE(review): this expands to the getvnnmap stub, which looks like a copy/paste slip, and "getdebseqnum" may be a typo for "getdbseqnum" - confirm the intended name and stub */
+#define ctdb_client_check_message_handlers(ctdb, ids, argc, result) \
+       ctdb_client_check_message_handlers_stub(ctdb, ids, argc, result)
+#define ctdb_ctrl_getcapabilities(ctdb, timeout, destnode, capabilities) \
+       ctdb_ctrl_getcapabilities_stub(ctdb, timeout, destnode, capabilities)
+
+#include "tools/ctdb.c"
+
+#ifndef CTDB_TEST_USE_MAIN /* NOTE(review): the main/usage renames earlier in this file are guarded by CTDB_TEST_OVERRIDE_MAIN, not by this macro - confirm which name is intended */
+#undef main
+#undef usage
+#endif /* CTDB_TEST_USE_MAIN */
+
+#undef ctdb_cmdline_client
+
+#include "common/cmdline.c"
+
+#undef ctdb_ctrl_getnodemap
+#undef ctdb_ctrl_get_ifaces 
+#undef ctdb_ctrl_getpnn
+#undef ctdb_ctrl_getrecmode
+#undef ctdb_ctrl_getrecmaster
+#undef ctdb_ctrl_getvnnmap
+#undef ctdb_ctrl_getdebseqnum
+#undef ctdb_client_check_message_handlers
+#undef ctdb_ctrl_getcapabilities
+
+#undef TIMELIMIT
+#include "tools/ctdb_vacuum.c"
+
+/* UTIL_OBJ */
+#include "lib/util/idtree.c"
+#include "lib/util/db_wrap.c"
+#include "lib/util/strlist.c"
+#include "lib/util/util.c"
+#include "lib/util/util_time.c"
+#include "lib/util/util_file.c"
+#include "lib/util/fault.c"
+#include "lib/util/substitute.c"
+#include "lib/util/signal.c"
+
+/* CTDB_COMMON_OBJ */
+#include "common/ctdb_io.c"
+#include "common/ctdb_util.c"
+#include "common/ctdb_ltdb.c"
+#include "common/ctdb_message.c"
+#include "lib/util/debug.c"
+#include "common/rb_tree.c"
+#include "common/system_common.c"
+#include "common/ctdb_logging.c"
+#include "common/ctdb_fork.c"
+
+/* CTDB_CLIENT_OBJ */
+#include "client/ctdb_client.c"
+
+/* TEST STUBS */
+#include "ctdb_test_stubs.c"
+
+#endif /* _CTDBD_TEST_C */
diff --git a/ctdb/tests/src/ctdb_test_stubs.c b/ctdb/tests/src/ctdb_test_stubs.c
new file mode 100644 (file)
index 0000000..456b1fd
--- /dev/null
@@ -0,0 +1,529 @@
+/*
+   Test stubs and support functions for some CTDB client functions
+
+   Copyright (C) Martin Schwenke  2011
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* Read a nodemap from stdin.  Each line looks like:
+ *  <PNN> <IP> <FLAGS> [RECMASTER] [CURRENT]
+ * EOF or a blank line terminates input.
+ *
+ * The previously loaded node array is discarded first.  A line with a
+ * missing field or an unparseable IP address is reported and skipped.
+ * CURRENT sets ctdb->pnn to the line's PNN, RECMASTER sets
+ * ctdb->recovery_master to it.  Exits on allocation failure.
+ */
+void ctdb_test_stubs_read_nodemap(struct ctdb_context *ctdb)
+{
+       char line[1024];
+
+       TALLOC_FREE(ctdb->nodes);
+       ctdb->pnn = -1;
+       ctdb->num_nodes = 0;
+
+       ctdb->nodes = NULL;
+
+       while ((fgets(line, sizeof(line), stdin) != NULL) &&
+              (line[0] != '\n')) {
+               uint32_t pnn, flags;
+               char *tok, *t;
+               const char *ip_tok;
+               const char *ip;
+               struct ctdb_node *node;
+               ctdb_sock_addr saddr;
+
+               /* Get rid of pesky newline */
+               if ((t = strchr(line, '\n')) != NULL) {
+                       *t = '\0';
+               }
+
+               /* Get PNN */
+               tok = strtok(line, " \t");
+               if (tok == NULL) {
+                       DEBUG(DEBUG_ERR, (__location__ " WARNING, bad line (PNN) ignored \"%s\"\n", line));
+                       continue;
+               }
+               pnn = (uint32_t)strtoul(tok, NULL, 0);
+
+               /* Get IP: only validated here, copied once the whole line
+                * is known to be good so a bad line does not leak a copy */
+               tok = strtok(NULL, " \t");
+               if (tok == NULL) {
+                       DEBUG(DEBUG_ERR, (__location__ " WARNING, bad line (no IP) ignored \"%s\"\n", line));
+                       continue;
+               }
+               if (!parse_ip(tok, NULL, 0, &saddr)) {
+                       DEBUG(DEBUG_ERR, (__location__ " WARNING, bad line (IP) ignored \"%s\"\n", line));
+                       continue;
+               }
+               ip_tok = tok;
+
+               /* Get flags */
+               tok = strtok(NULL, " \t");
+               if (tok == NULL) {
+                       DEBUG(DEBUG_ERR, (__location__ " WARNING, bad line (flags) ignored \"%s\"\n", line));
+                       continue;
+               }
+               flags = (uint32_t)strtoul(tok, NULL, 0);
+
+               /* Optional trailing markers, in any order */
+               tok = strtok(NULL, " \t");
+               while (tok != NULL) {
+                       if (strcmp(tok, "CURRENT") == 0) {
+                               ctdb->pnn = pnn;
+                       } else if (strcmp(tok, "RECMASTER") == 0) {
+                               ctdb->recovery_master = pnn;
+                       }
+                       tok = strtok(NULL, " \t");
+               }
+
+               ip = talloc_strdup(ctdb, ip_tok);
+               if (ip == NULL) {
+                       DEBUG(DEBUG_ERR, ("OOM allocating node address\n"));
+                       exit (1);
+               }
+
+               ctdb->nodes = talloc_realloc(ctdb, ctdb->nodes, struct ctdb_node *, ctdb->num_nodes + 1);
+               if (ctdb->nodes == NULL) {
+                       DEBUG(DEBUG_ERR, ("OOM allocating nodes array\n"));
+                       exit (1);
+               }
+               node = talloc_zero(ctdb, struct ctdb_node);
+               if (node == NULL) {
+                       DEBUG(DEBUG_ERR, ("OOM allocating node structure\n"));
+                       exit (1);
+               }
+
+               node->ctdb = ctdb;
+               node->name = "fakectdb";
+               node->pnn = pnn;
+               node->address.address = ip;
+               node->address.port = 0;
+               node->flags = flags;
+
+               ctdb->nodes[ctdb->num_nodes] = node;
+               ctdb->num_nodes++;
+       }
+}
+
+/* Print one line per node: the PNN, the flags in hex, then a CURRENT
+ * and/or RECMASTER marker when the node's PNN equals ctdb->pnn or
+ * ctdb->recovery_master respectively.
+ */
+void ctdb_test_stubs_print_nodemap(struct ctdb_context *ctdb)
+{
+       int idx = 0;
+
+       while (idx < ctdb->num_nodes) {
+               struct ctdb_node *node = ctdb->nodes[idx];
+               const char *current = "";
+               const char *recmaster = "";
+
+               if (node->pnn == ctdb->pnn) {
+                       current = "\tCURRENT";
+               }
+               if (node->pnn == ctdb->recovery_master) {
+                       recmaster = "\tRECMASTER";
+               }
+
+               printf("%ld\t0x%lx%s%s\n",
+                      (unsigned long) node->pnn,
+                      (unsigned long) node->flags,
+                      current,
+                      recmaster);
+               idx++;
+       }
+}
+
+/* Read interfaces information.  Same format as "ctdb ifaces -Y"
+ * output:
+ *   :Name:LinkStatus:References:
+ *   :eth2:1:4294967294
+ *   :eth1:1:4294967292
+ */
+
+struct ctdb_iface {
+       struct ctdb_iface *prev, *next; /* list links; entries are added with DLIST_ADD() */
+       const char *name;               /* interface name as read from the input line */
+       bool link_up;                   /* true when the parsed LinkStatus field was non-zero */
+       uint32_t references;            /* parsed References field */
+};
+
+/* Read interface definitions from stdin, one per line, and add each one
+ * to the ctdb->ifaces list with DLIST_ADD().
+ *
+ * The header line ":Name:LinkStatus:References:" is skipped.  EOF or a
+ * blank line terminates input.  A line with fewer than three
+ * colon-separated fields is reported and skipped.  Exits on allocation
+ * failure.
+ */
+void ctdb_test_stubs_read_ifaces(struct ctdb_context *ctdb)
+{
+       char line[1024];
+       struct ctdb_iface *iface;
+
+       while ((fgets(line, sizeof(line), stdin) != NULL) &&
+              (line[0] != '\n')) {
+               uint16_t link_state;
+               uint32_t references;
+               char *tok, *t, *name;
+
+               /* Get rid of pesky newline */
+               if ((t = strchr(line, '\n')) != NULL) {
+                       *t = '\0';
+               }
+
+               if (strcmp(line, ":Name:LinkStatus:References:") == 0) {
+                       continue;
+               }
+
+               /* name: strtok() skips leading delimiters, so the leading
+                * colon needs no separate call */
+               tok = strtok(line, ":");
+               if (tok == NULL) {
+                       DEBUG(DEBUG_ERR, (__location__ " WARNING, bad line ignored \"%s\"\n", line));
+                       continue;
+               }
+               name = tok;
+
+               /* link_state */
+               tok = strtok(NULL, ":");
+               if (tok == NULL) {
+                       DEBUG(DEBUG_ERR, (__location__ " WARNING, bad line ignored \"%s\"\n", line));
+                       continue;
+               }
+               link_state = (uint16_t)strtoul(tok, NULL, 0);
+
+               /* references... */
+               tok = strtok(NULL, ":");
+               if (tok == NULL) {
+                       DEBUG(DEBUG_ERR, (__location__ " WARNING, bad line ignored \"%s\"\n", line));
+                       continue;
+               }
+               references = (uint32_t)strtoul(tok, NULL, 0);
+
+               iface = talloc_zero(ctdb, struct ctdb_iface);
+
+               if (iface == NULL) {
+                       DEBUG(DEBUG_ERR, ("OOM allocating iface\n"));
+                       exit (1);
+               }
+
+               iface->name = talloc_strdup(iface, name);
+               if (iface->name == NULL) {
+                       /* the name is later handed to strcpy(), so it must
+                        * never be NULL */
+                       DEBUG(DEBUG_ERR, ("OOM allocating iface name\n"));
+                       exit (1);
+               }
+               iface->link_up = link_state;
+               iface->references = references;
+
+               DLIST_ADD(ctdb->ifaces, iface);
+       }
+}
+
+/* Print the ":Name:LinkStatus:References:" header followed by one
+ * colon-delimited line for every entry of ctdb->ifaces, in list order.
+ */
+void ctdb_test_stubs_print_ifaces(struct ctdb_context *ctdb)
+{
+       struct ctdb_iface *cur = ctdb->ifaces;
+
+       printf(":Name:LinkStatus:References:\n");
+       while (cur != NULL) {
+               printf(":%s:%u:%u:\n",
+                      cur->name, cur->link_up, cur->references);
+               cur = cur->next;
+       }
+}
+
+/* Read vnn map.
+ * Input format, one value per line:
+ *   <GENERATION>
+ *   <LMASTER0>
+ *   <LMASTER1>
+ *   ...
+ */
+
+/*
+struct ctdb_vnn_map {
+       uint32_t generation;
+       uint32_t size;
+       uint32_t *map;
+};
+*/
+/* Read a VNN map from stdin into a freshly allocated ctdb->vnn_map.
+ *
+ * The first line is the generation; every following line appends one
+ * entry to the map array.  EOF or a blank line terminates input.  Exits
+ * on allocation failure.
+ */
+void ctdb_test_stubs_read_vnnmap(struct ctdb_context *ctdb)
+{
+       char line[1024];
+       /* Track "generation already read" explicitly: comparing against
+        * INVALID_GENERATION would misparse an input whose generation
+        * happens to equal that value. */
+       bool have_generation = false;
+
+       TALLOC_FREE(ctdb->vnn_map);
+
+       ctdb->vnn_map = talloc_zero(ctdb, struct ctdb_vnn_map);
+       if (ctdb->vnn_map == NULL) {
+               DEBUG(DEBUG_ERR, ("OOM allocating vnnmap\n"));
+               exit (1);
+       }
+       ctdb->vnn_map->generation = INVALID_GENERATION;
+       ctdb->vnn_map->size = 0;
+       ctdb->vnn_map->map = NULL;
+
+       while ((fgets(line, sizeof(line), stdin) != NULL) &&
+              (line[0] != '\n')) {
+               uint32_t n;
+               char *t;
+
+               /* Get rid of pesky newline */
+               if ((t = strchr(line, '\n')) != NULL) {
+                       *t = '\0';
+               }
+
+               /* strtoul: values above LONG_MAX must not be clamped where
+                * long is 32 bits wide */
+               n = (uint32_t) strtoul(line, NULL, 0);
+
+               /* generation */
+               if (!have_generation) {
+                       ctdb->vnn_map->generation = n;
+                       have_generation = true;
+                       continue;
+               }
+
+               ctdb->vnn_map->map = talloc_realloc(ctdb, ctdb->vnn_map->map, uint32_t, ctdb->vnn_map->size + 1);
+               if (ctdb->vnn_map->map == NULL) {
+                       DEBUG(DEBUG_ERR, ("OOM allocating vnn_map->map\n"));
+                       exit (1);
+               }
+
+               ctdb->vnn_map->map[ctdb->vnn_map->size] = n;
+               ctdb->vnn_map->size++;
+       }
+}
+
+/* Print the VNN map held in ctdb->vnn_map: the generation on the first
+ * line, then one map entry per line.
+ */
+void ctdb_test_stubs_print_vnnmap(struct ctdb_context *ctdb)
+{
+       uint32_t i;
+
+       /* The values are unsigned 32-bit (they are parsed into uint32_t
+        * when read), so print them with %u; %d shows large values as
+        * negative numbers. */
+       printf("%u\n", (unsigned int) ctdb->vnn_map->generation);
+       for (i = 0; i < ctdb->vnn_map->size; i++) {
+               printf("%u\n", (unsigned int) ctdb->vnn_map->map[i]);
+       }
+}
+
+/* Populate the fake context from stdin.
+ *
+ * Each line read here must be a section name - NODEMAP, IFACES or
+ * VNNMAP - and is followed by that section's body, which the matching
+ * reader consumes.  Any other line is reported on stdout and ends the
+ * program with status 1.  Reading stops at EOF.
+ */
+void ctdb_test_stubs_fake_setup(struct ctdb_context *ctdb)
+{
+       char section[1024];
+
+       for (;;) {
+               char *nl;
+
+               if (fgets(section, sizeof(section), stdin) == NULL) {
+                       break;
+               }
+
+               /* Strip the trailing newline, if any */
+               nl = strchr(section, '\n');
+               if (nl != NULL) {
+                       *nl = '\0';
+               }
+
+               if (strcmp(section, "NODEMAP") == 0) {
+                       ctdb_test_stubs_read_nodemap(ctdb);
+                       continue;
+               }
+               if (strcmp(section, "IFACES") == 0) {
+                       ctdb_test_stubs_read_ifaces(ctdb);
+                       continue;
+               }
+               if (strcmp(section, "VNNMAP") == 0) {
+                       ctdb_test_stubs_read_vnnmap(ctdb);
+                       continue;
+               }
+
+               printf("Unknown line %s\n", section);
+               exit(1);
+       }
+}
+
+/* Support... */
+/* Report whether the node whose PNN equals ctdb->pnn is usable, i.e.
+ * has neither NODE_FLAGS_DISCONNECTED nor NODE_FLAGS_DELETED set.
+ * Returns false when no node has that PNN.
+ */
+static bool current_node_is_connected (struct ctdb_context *ctdb)
+{
+       int n;
+
+       for (n = 0; n < ctdb->num_nodes; n++) {
+               struct ctdb_node *node = ctdb->nodes[n];
+
+               if (node->pnn != ctdb->pnn) {
+                       continue;
+               }
+               /* First match decides */
+               return !(node->flags &
+                        (NODE_FLAGS_DISCONNECTED | NODE_FLAGS_DELETED));
+       }
+
+       /* Shouldn't really happen, so flag an error */
+       return false;
+}
+
+/* Stubs... */
+
+/* Stand-in for ctdb_cmdline_client(): instead of connecting to a daemon,
+ * allocate a zeroed context, give it the socket name "fake" and fill it
+ * from the description on stdin via ctdb_test_stubs_fake_setup().
+ *
+ * ev and req_timeout are unused.  Returns NULL when the context cannot
+ * be allocated.
+ */
+struct ctdb_context *ctdb_cmdline_client_stub(struct tevent_context *ev,
+                                             struct timeval req_timeout)
+{
+       struct ctdb_context *ctdb;
+
+       ctdb = talloc_zero(NULL, struct ctdb_context);
+       if (ctdb == NULL) {
+               /* Do not hand a NULL context to the setup code below */
+               DEBUG(DEBUG_ERR, (__location__ " OOM allocating ctdb context\n"));
+               return NULL;
+       }
+
+       ctdb_set_socketname(ctdb, "fake");
+
+       ctdb_test_stubs_fake_setup(ctdb);
+
+       return ctdb;
+}
+
+/* Copied from ctdb_recover.c
+ *
+ * Fill outdata with a struct ctdb_node_map describing every entry of
+ * ctdb->nodes: address, PNN and flags.  The buffer is allocated as a
+ * talloc child of outdata.  An address that cannot be parsed is only
+ * logged.  Exits on allocation failure, otherwise returns 0.
+ */
+int
+ctdb_control_getnodemap(struct ctdb_context *ctdb, uint32_t opcode, TDB_DATA indata, TDB_DATA *outdata)
+{
+       struct ctdb_node_map *node_map;
+       uint32_t num_nodes;
+       uint32_t i;
+
+       CHECK_CONTROL_DATA_SIZE(0);
+
+       num_nodes = ctdb->num_nodes;
+
+       /* Header plus one entry per node */
+       outdata->dsize = offsetof(struct ctdb_node_map, nodes) + num_nodes*sizeof(struct ctdb_node_and_flags);
+       outdata->dptr  = (unsigned char *)talloc_zero_size(outdata, outdata->dsize);
+       if (outdata->dptr == NULL) {
+               DEBUG(DEBUG_ALERT, (__location__ " Failed to allocate nodemap array\n"));
+               exit(1);
+       }
+
+       node_map = (struct ctdb_node_map *)outdata->dptr;
+       node_map->num = num_nodes;
+
+       i = 0;
+       while (i < num_nodes) {
+               struct ctdb_node *src = ctdb->nodes[i];
+
+               /* TODO: pass in the correct interface here */
+               if (!parse_ip(src->address.address, NULL, 0,
+                             &node_map->nodes[i].addr)) {
+                       DEBUG(DEBUG_ERR, (__location__ " Failed to parse %s into a sockaddr\n", src->address.address));
+               }
+
+               node_map->nodes[i].pnn   = src->pnn;
+               node_map->nodes[i].flags = src->flags;
+               i++;
+       }
+
+       return 0;
+}
+
+/* Stand-in for ctdb_ctrl_getnodemap(): answer from the fake context.
+ *
+ * Returns -1 when the current node is disconnected or deleted, or when
+ * the temporary buffer cannot be allocated.  On success *nodemap is a
+ * talloc child of mem_ctx.  timeout and destnode are unused.
+ */
+int
+ctdb_ctrl_getnodemap_stub(struct ctdb_context *ctdb,
+                         struct timeval timeout, uint32_t destnode,
+                         TALLOC_CTX *mem_ctx,
+                         struct ctdb_node_map **nodemap)
+{
+       int ret;
+
+       TDB_DATA indata;
+       TDB_DATA *outdata;
+
+       if (!current_node_is_connected(ctdb)) {
+               return -1;
+       }
+
+       indata.dsize = 0;
+       indata.dptr = NULL;
+
+       /* ctdb_control_getnodemap() allocates its result as a child of
+        * outdata, so outdata has to be a talloc object itself */
+       outdata = talloc_zero(mem_ctx, TDB_DATA);
+       if (outdata == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " OOM allocating outdata\n"));
+               return -1;
+       }
+
+       ret = ctdb_control_getnodemap(ctdb, CTDB_CONTROL_GET_NODEMAP,
+                                     indata, outdata);
+
+       if (ret == 0) {
+               /* Hand the map to the caller's context ... */
+               *nodemap = (struct ctdb_node_map *)
+                       talloc_steal(mem_ctx, outdata->dptr);
+       }
+
+       /* ... and drop the wrapper instead of leaking it */
+       talloc_free(outdata);
+
+       return ret;
+}
+
+/* Stand-in for ctdb_ctrl_getvnnmap(): return a copy of the fake
+ * context's VNN map.
+ *
+ * On success *vnnmap is a talloc child of mem_ctx and 0 is returned.
+ * Returns -1 when no VNN map has been loaded.  Exits on allocation
+ * failure.  timeout and destnode are unused.
+ */
+int
+ctdb_ctrl_getvnnmap_stub(struct ctdb_context *ctdb,
+                        struct timeval timeout, uint32_t destnode,
+                        TALLOC_CTX *mem_ctx, struct ctdb_vnn_map **vnnmap)
+{
+       struct ctdb_vnn_map *src = ctdb->vnn_map;
+       struct ctdb_vnn_map *copy;
+
+       if (src == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " no VNN map loaded\n"));
+               return -1;
+       }
+
+       copy = talloc(mem_ctx, struct ctdb_vnn_map);
+       if (copy == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " OOM\n"));
+               exit (1);
+       }
+
+       copy->map = talloc_array(copy, uint32_t, src->size);
+       if (copy->map == NULL && src->size > 0) {
+               DEBUG(DEBUG_ERR, (__location__ " OOM\n"));
+               exit (1);
+       }
+
+       copy->generation = src->generation;
+       copy->size = src->size;
+       if (src->size > 0) {
+               /* src->map is NULL for an empty map; skip the copy then */
+               memcpy(copy->map, src->map, sizeof(uint32_t) * src->size);
+       }
+
+       *vnnmap = copy;
+
+       return 0;
+}
+
+int
+ctdb_ctrl_getrecmode_stub(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
+                         struct timeval timeout, uint32_t destnode,
+                         uint32_t *recmode)
+{
+       *recmode = ctdb->recovery_mode; /* answer from the fake context; mem_ctx, timeout and destnode are unused */
+
+       return 0;
+}
+
+int
+ctdb_ctrl_getrecmaster_stub(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
+                           struct timeval timeout, uint32_t destnode,
+                           uint32_t *recmaster)
+{
+       *recmaster = ctdb->recovery_master; /* answer from the fake context; mem_ctx, timeout and destnode are unused */
+
+       return 0;
+}
+
+/* Stand-in for ctdb_ctrl_getpnn().
+ *
+ * Returns -1 when the current node is disconnected or deleted.
+ * Otherwise returns destnode itself, except that CTDB_CURRENT_NODE is
+ * resolved to ctdb->pnn.  timeout is unused.
+ */
+int
+ctdb_ctrl_getpnn_stub(struct ctdb_context *ctdb, struct timeval timeout,
+                     uint32_t destnode)
+{
+       if (!current_node_is_connected(ctdb)) {
+               return -1;
+       }
+
+       if (destnode != CTDB_CURRENT_NODE) {
+               return destnode;
+       }
+
+       return ctdb->pnn;
+}
+
+/* From ctdb_takeover.c
+ *
+ * Fill outdata with a struct ctdb_control_get_ifaces holding one entry
+ * (name, link state, reference count) per element of ctdb->ifaces.
+ * The buffer is allocated as a talloc child of outdata.  c is unused.
+ * Returns 0 on success.
+ */
+int32_t ctdb_control_get_ifaces(struct ctdb_context *ctdb,
+                               struct ctdb_req_control *c,
+                               TDB_DATA *outdata)
+{
+       struct ctdb_control_get_ifaces *ifaces;
+       struct ctdb_iface *cur;
+       int count = 0;
+       int idx = 0;
+       int len;
+
+       /* First pass: count the interfaces */
+       for (cur = ctdb->ifaces; cur != NULL; cur = cur->next) {
+               count++;
+       }
+
+       len = offsetof(struct ctdb_control_get_ifaces, ifaces) +
+               count*sizeof(struct ctdb_control_iface_info);
+       ifaces = talloc_zero_size(outdata, len);
+       CTDB_NO_MEMORY(ctdb, ifaces);
+
+       /* Second pass: copy each interface into its slot */
+       cur = ctdb->ifaces;
+       while (cur != NULL) {
+               /* NOTE(review): strcpy() is unbounded and the size of the
+                * destination field is not visible here - confirm that
+                * interface names always fit */
+               strcpy(ifaces->ifaces[idx].name, cur->name);
+               ifaces->ifaces[idx].link_state = cur->link_up;
+               ifaces->ifaces[idx].references = cur->references;
+               idx++;
+               cur = cur->next;
+       }
+       ifaces->num = idx;
+
+       /* idx equals count here, so len already is the final size */
+       outdata->dsize = len;
+       outdata->dptr  = (uint8_t *)ifaces;
+
+       return 0;
+}
+
+/* Stand-in for ctdb_ctrl_get_ifaces(): answer from the fake context.
+ *
+ * Returns -1 when the current node is disconnected or deleted, or when
+ * the temporary buffer cannot be allocated.  On success *ifaces points
+ * at data allocated below mem_ctx.  timeout and destnode are unused.
+ */
+int
+ctdb_ctrl_get_ifaces_stub(struct ctdb_context *ctdb,
+                         struct timeval timeout, uint32_t destnode,
+                         TALLOC_CTX *mem_ctx,
+                         struct ctdb_control_get_ifaces **ifaces)
+{
+       TDB_DATA *outdata;
+       int ret;
+
+       if (!current_node_is_connected(ctdb)) {
+               return -1;
+       }
+
+       outdata = talloc(mem_ctx, TDB_DATA);
+       if (outdata == NULL) {
+               /* ctdb_control_get_ifaces() writes through outdata */
+               DEBUG(DEBUG_ERR, (__location__ " OOM allocating outdata\n"));
+               return -1;
+       }
+
+       ret = ctdb_control_get_ifaces(ctdb, NULL, outdata);
+
+       if (ret == 0) {
+               *ifaces = (struct ctdb_control_get_ifaces *)outdata->dptr;
+       }
+
+       return ret;
+}
+
+int ctdb_client_check_message_handlers_stub(struct ctdb_context *ctdb,
+                                           uint64_t *ids, uint32_t num,
+                                           uint8_t *result)
+{
+       DEBUG(DEBUG_ERR, (__location__ " NOT IMPLEMENTED\n")); /* placeholder: logs and always fails; no argument is read */
+       return -1;
+}
+
+int ctdb_ctrl_getcapabilities_stub(struct ctdb_context *ctdb,
+                                  struct timeval timeout, uint32_t destnode,
+                                  uint32_t *capabilities)
+{
+       *capabilities = CTDB_CAP_RECMASTER|CTDB_CAP_LMASTER|CTDB_CAP_NATGW; /* same fixed answer for every destnode */
+       return 0;
+}
diff --git a/ctdb/tests/src/ctdb_trackingdb_test.c b/ctdb/tests/src/ctdb_trackingdb_test.c
new file mode 100644 (file)
index 0000000..ee473c0
--- /dev/null
@@ -0,0 +1,135 @@
+/* 
+   simple trackingdb test tool
+
+   This program is used to test the functions to manipulate and enumerate
+   the trackingdb records :
+       ctdb_trackingdb_add_pnn()
+       ctdb_trackingdb_traverse()
+
+   Copyright (C) Ronnie Sahlberg 2009
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "system/time.h"
+#include "popt.h"
+#include "cmdline.h"
+#include "ctdb_private.h"
+#include "db_wrap.h"
+
+#define MAXINDEX 64
+char indices[MAXINDEX];
+
+/* Traverse callback: tick off node pnn in the flag array passed as
+ * private_data.  A node whose flag is not set was never added, which is
+ * reported and ends the program with status 10.  ctdb is unused.
+ */
+void vn_cb(struct ctdb_context *ctdb, uint32_t pnn, void *private_data)
+{
+       char *added = private_data;
+
+       printf("Callback for node %d\n", pnn);
+       if (added[pnn] != 0) {
+               /* Seen: clear the flag so leftovers can be detected later */
+               added[pnn] = 0;
+               return;
+       }
+
+       printf("ERROR, node %d from callback was never added\n", pnn);
+       exit(10);
+}
+
+/* Traverse the tracking data and check that the callback fired for
+ * every node recorded in indices[].
+ *
+ * vn_cb() clears the flag of each node it is called for, so any flag
+ * still set afterwards marks a node the traverse missed.  That is a
+ * test failure and ends the program with status 10.
+ */
+void verify_nodes(struct ctdb_context *ctdb, TDB_DATA data)
+{
+       int i;
+
+       printf("Verify the nodes\n");
+       ctdb_trackingdb_traverse(ctdb, data, vn_cb, indices);
+       for(i = 0; i < MAXINDEX; i++) {
+               if (indices[i] != 0) {
+                       printf("Callback for %d was never invoked\n", i);
+                       /* non-zero: exit(0) here would report success */
+                       exit(10);
+               }
+       }
+}
+
+       
+       
+/* Add node pnn to the tracking data and remember it in indices[] so
+ * the later traverse can be checked.  Ends the program with status 10
+ * when the node cannot be added.
+ */
+void add_node(struct ctdb_context *ctdb, TDB_DATA *data, int pnn)
+{
+       printf("Add node %d\n", pnn);
+
+       if (ctdb_trackingdb_add_pnn(ctdb, data, pnn) != 0) {
+               printf("Failed to add tracking db data\n");
+               exit(10);
+       }
+
+       indices[pnn] = 1;
+}
+
+/* Add ten randomly chosen nodes (indexes below MAXINDEX) to an empty
+ * tracking record, then verify that a traverse reports them.
+ */
+static void trackdb_test(struct ctdb_context *ctdb)
+{
+       TDB_DATA data = {NULL,0};
+       int remaining = 10;
+
+       printf("Add 10 nodes\n");
+       srandom(time(NULL));
+       while (remaining-- > 0) {
+               add_node(ctdb, &data, random()%MAXINDEX);
+       }
+
+       verify_nodes(ctdb, data);
+       printf("OK all seems well\n");
+}
+
+/*
+  main program: parse the command line, obtain a ctdb context and run
+  the tracking db test
+*/
+int main(int argc, const char *argv[])
+{
+       struct poptOption popt_options[] = {
+               POPT_AUTOHELP
+               POPT_CTDB_CMDLINE
+               POPT_TABLEEND
+       };
+       struct ctdb_context *ctdb;
+       struct event_context *ev;
+       const char **extra_argv;
+       int extra_argc = 0;
+       poptContext pc;
+       int opt;
+
+       pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+       /* Anything other than -1 is treated as an invalid option */
+       opt = poptGetNextOpt(pc);
+       if (opt != -1) {
+               fprintf(stderr, "Invalid option %s: %s\n", 
+                       poptBadOption(pc, 0), poptStrerror(opt));
+               exit(1);
+       }
+
+       /* setup the remaining options for the main program to use */
+       extra_argv = poptGetArgs(pc);
+       if (extra_argv != NULL) {
+               extra_argv++;
+               for (; extra_argv[extra_argc] != NULL; extra_argc++) {
+                       /* just counting */
+               }
+       }
+
+       ev = event_context_init(NULL);
+
+       ctdb = ctdb_cmdline_client(ev, timeval_current_ofs(5, 0));
+       if (ctdb == NULL) {
+               exit(1);
+       }
+
+       trackdb_test(ctdb);
+
+       return 0;
+}
diff --git a/ctdb/tests/src/ctdb_transaction.c b/ctdb/tests/src/ctdb_transaction.c
new file mode 100644 (file)
index 0000000..78a63f1
--- /dev/null
@@ -0,0 +1,300 @@
+/* 
+   simple tool to test persistent databases
+
+   Copyright (C) Andrew Tridgell  2006-2007
+   Copyright (c) Ronnie sahlberg  2007
+   Copyright (C) Michael Adam     2009
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "popt.h"
+#include "cmdline.h"
+
+#include <sys/time.h>
+#include <time.h>
+
+static struct timeval tp1,tp2;
+
+static void start_timer(void) /* store the current time in tp1 */
+{
+       gettimeofday(&tp1,NULL);
+}
+
+static double end_timer(void) /* store the current time in tp2; return tp2 - tp1 in seconds */
+{
+       gettimeofday(&tp2,NULL);
+       return (tp2.tv_sec + (tp2.tv_usec*1.0e-6)) - 
+               (tp1.tv_sec + (tp1.tv_usec*1.0e-6));
+}
+
+static int timelimit = 10;
+static int delay = 0;
+static int verbose = 0;
+
+static unsigned int pnn;
+
+static TDB_DATA old_data;
+
+static bool success = false;
+
+/* Print one line with this process's pid followed by every 32-bit
+ * counter currently stored in old_data.
+ */
+static void print_counters(void)
+{
+       const uint32_t *values = (uint32_t *)old_data.dptr;
+       const size_t count = old_data.dsize/sizeof(uint32_t);
+       size_t idx;
+
+       printf("[%4u] Counters: ", getpid());
+       for (idx = 0; idx < count; idx++) {
+               printf("%6u ", values[idx]);
+       }
+       printf("\n");
+}
+
+static void each_second(struct event_context *ev, struct timed_event *te,
+                                        struct timeval t, void *private_data)
+{
+       struct ctdb_context *ctdb = talloc_get_type(private_data, struct ctdb_context);
+
+       print_counters();
+
+       event_add_timed(ev, ctdb, timeval_current_ofs(1, 0), each_second, ctdb); /* re-arm: schedule this handler again with a one second offset */
+}
+
+/* Compare the counters in data with the previously remembered ones,
+ * remember data as the new reference and update the global verdict.
+ *
+ * A counter that is smaller than its remembered value is reported.
+ * Only the counters present in both arrays are compared.  Once a
+ * decrease has been seen, success stays false for the rest of the run;
+ * otherwise it is set to true.  Exits when the reference copy cannot
+ * be resized.
+ */
+static void check_counters(struct ctdb_context *ctdb, TDB_DATA data)
+{
+       /* sticky: a later clean check must not hide an earlier failure */
+       static bool seen_decrease = false;
+       size_t i, num_old, num_new, num_cmp;
+       uint32_t *counters, *old_counters;
+       bool monotonous = true;
+
+       counters     = (uint32_t *)data.dptr;
+       old_counters = (uint32_t *)old_data.dptr;
+
+       num_old = old_data.dsize/sizeof(uint32_t);
+       num_new = data.dsize/sizeof(uint32_t);
+       /* never read past the end of the shorter array */
+       num_cmp = (num_new < num_old) ? num_new : num_old;
+
+       /* check that all the counters are monotonic increasing */
+       for (i=0; i<num_cmp; i++) {
+               if (counters[i]<old_counters[i]) {
+                       printf("[%4u] ERROR: counters has decreased for node %u  From %u to %u\n", 
+                              getpid(), (unsigned int)i, old_counters[i], counters[i]);
+                       monotonous = false;
+               }
+       }
+
+       if (old_data.dsize != data.dsize) {
+               old_data.dsize = data.dsize;
+               old_data.dptr = talloc_realloc_size(ctdb, old_data.dptr, old_data.dsize);
+               if (old_data.dptr == NULL && old_data.dsize != 0) {
+                       DEBUG(DEBUG_ERR, ("Failed to resize counter copy\n"));
+                       exit(1);
+               }
+       }
+
+       if (data.dsize > 0) {
+               memcpy(old_data.dptr, data.dptr, data.dsize);
+       }
+       if (verbose) print_counters();
+
+       if (!monotonous) {
+               seen_decrease = true;
+       }
+       success = !seen_decrease;
+}
+
+
+/* Sleep for sec seconds, one second at a time.  In verbose mode a dot
+ * is printed before each second and a newline at the end.
+ */
+static void do_sleep(unsigned int sec)
+{
+       unsigned int remaining = sec;
+
+       while (remaining > 0) {
+               if (verbose) {
+                       printf(".");
+               }
+               sleep(1);
+               remaining--;
+       }
+
+       if (verbose) {
+               printf("\n");
+       }
+}
+
+/* Repeatedly run a transaction on "transaction.tdb" that increments
+ * this node's slot (index pnn) in the counter array stored under the
+ * key "testkey".
+ *
+ * The loop runs until timelimit seconds have passed, or forever when
+ * timelimit is 0.  The array is grown with zeroed slots when it is too
+ * short to hold this node's slot.  After each commit the counters are
+ * checked when running verbose or on node 0.  Returns early when the
+ * database handle, a transaction or the grown array cannot be
+ * obtained; exits when fetch, store or commit fail.  ev is unused.
+ */
+static void test_store_records(struct ctdb_context *ctdb, struct event_context *ev)
+{
+       TDB_DATA key;
+       struct ctdb_db_context *ctdb_db;
+       int ret;
+       uint32_t *counters;
+
+       ctdb_db = ctdb_db_handle(ctdb, "transaction.tdb");
+       if (ctdb_db == NULL) {
+               DEBUG(DEBUG_ERR, ("Failed to get handle for transaction.tdb\n"));
+               return;
+       }
+
+       key.dptr = discard_const("testkey");
+       key.dsize = strlen((const char *)key.dptr)+1;
+
+       start_timer();
+       while ((timelimit == 0) || (end_timer() < timelimit)) {
+               TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+               TDB_DATA data;
+               struct ctdb_transaction_handle *h;
+               size_t needed = sizeof(uint32_t) * (pnn+1);
+
+               if (verbose) DEBUG(DEBUG_ERR, ("starting transaction\n"));
+               h = ctdb_transaction_start(ctdb_db, tmp_ctx);
+               if (h == NULL) {
+                       DEBUG(DEBUG_ERR, ("Failed to start transaction on node %d\n",
+                              ctdb_get_pnn(ctdb)));
+                       talloc_free(tmp_ctx);
+                       return;
+               }
+               if (verbose) DEBUG(DEBUG_ERR, ("transaction started\n"));
+               do_sleep(delay);
+
+               if (verbose) DEBUG(DEBUG_ERR, ("calling transaction_fetch\n"));
+               ret = ctdb_transaction_fetch(h, tmp_ctx, key, &data);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to fetch record\n"));
+                       exit(1);
+               }
+               if (verbose) DEBUG(DEBUG_ERR, ("fetched data ok\n"));
+               do_sleep(delay);
+
+               if (data.dsize < needed) {
+                       /* Grow the array so it has a slot for this node.
+                        * Check the allocation before copying into it. */
+                       unsigned char *grown = talloc_zero_size(tmp_ctx, needed);
+
+                       if (grown == NULL) {
+                               DEBUG(DEBUG_ERR, ("Failed to realloc array\n"));
+                               talloc_free(tmp_ctx);
+                               return;
+                       }
+                       if (data.dsize > 0) {
+                               memcpy(grown, data.dptr, data.dsize);
+                       }
+                       talloc_free(data.dptr);
+
+                       data.dptr = grown;
+                       data.dsize = needed;
+               }
+
+               if (data.dptr == NULL) {
+                       DEBUG(DEBUG_ERR, ("Failed to realloc array\n"));
+                       talloc_free(tmp_ctx);
+                       return;
+               }
+
+               counters = (uint32_t *)data.dptr;
+
+               /* bump our counter */
+               counters[pnn]++;
+
+               if (verbose) DEBUG(DEBUG_ERR, ("calling transaction_store\n"));
+               ret = ctdb_transaction_store(h, key, data);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to store record\n"));
+                       exit(1);
+               }
+               if (verbose) DEBUG(DEBUG_ERR, ("stored data ok\n"));
+               do_sleep(delay);
+
+               if (verbose) DEBUG(DEBUG_ERR, ("calling transaction_commit\n"));
+               ret = ctdb_transaction_commit(h);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to commit transaction\n"));
+                       check_counters(ctdb, data);
+                       exit(1);
+               }
+               if (verbose) DEBUG(DEBUG_ERR, ("transaction committed\n"));
+
+               /* store the counters and verify that they are sane */
+               if (verbose || (pnn == 0)) {
+                       check_counters(ctdb, data);
+               }
+
+               do_sleep(delay);
+
+               talloc_free(tmp_ctx);
+       }
+
+}
+
+/*
+  main program: parse the command line, attach to "transaction.tdb",
+  wait until the recovery mode reads 0 and run the counter test.
+  Returns 1 when the daemon cannot be reached or, on node 0 or in
+  verbose mode, when the test failed.
+*/
+int main(int argc, const char *argv[])
+{
+       struct ctdb_context *ctdb;
+       struct ctdb_db_context *ctdb_db;
+       int unsafe_writes = 0;
+       struct poptOption popt_options[] = {
+               POPT_AUTOHELP
+               POPT_CTDB_CMDLINE
+               { "timelimit", 't', POPT_ARG_INT, &timelimit, 0, "timelimit", "integer" },
+               { "delay", 'D', POPT_ARG_INT, &delay, 0, "delay (in seconds) between operations", "integer" },
+               { "verbose", 'v', POPT_ARG_NONE,  &verbose, 0, "switch on verbose mode", NULL },
+               { "unsafe-writes", 'u', POPT_ARG_NONE, &unsafe_writes, 0, "do not use tdb transactions when writing", NULL },
+               POPT_TABLEEND
+       };
+       int opt;
+       const char **extra_argv;
+       int extra_argc = 0;
+       poptContext pc;
+       struct event_context *ev;
+
+       pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+       while ((opt = poptGetNextOpt(pc)) != -1) {
+               switch (opt) {
+               default:
+                       fprintf(stderr, "Invalid option %s: %s\n", 
+                               poptBadOption(pc, 0), poptStrerror(opt));
+                       exit(1);
+               }
+       }
+
+       /* Choose the stdout buffering only now: verbose is set by the
+        * option parsing above, so testing it any earlier always sees 0 */
+       if (verbose) {
+               setbuf(stdout, (char *)NULL); /* don't buffer */
+       } else {
+               setlinebuf(stdout);
+       }
+
+       /* setup the remaining options for the main program to use */
+       extra_argv = poptGetArgs(pc);
+       if (extra_argv) {
+               extra_argv++;
+               while (extra_argv[extra_argc]) extra_argc++;
+       }
+
+       ev = event_context_init(NULL);
+
+       ctdb = ctdb_cmdline_client(ev, timeval_current_ofs(3, 0));
+       if (ctdb == NULL) {
+               DEBUG(DEBUG_ERR, ("Could not attach to daemon\n"));
+               return 1;
+       }
+
+       /* attach to a specific database */
+       if (unsafe_writes == 1) {
+               ctdb_db = ctdb_attach(ctdb, timeval_current_ofs(2, 0),
+                                     "transaction.tdb", true, TDB_NOSYNC);
+       } else {
+               ctdb_db = ctdb_attach(ctdb, timeval_current_ofs(2, 0),
+                                     "transaction.tdb", true, 0);
+       }
+
+       if (!ctdb_db) {
+               DEBUG(DEBUG_ERR, ("ctdb_attach failed - %s\n", ctdb_errstr(ctdb)));
+               exit(1);
+       }
+
+       DEBUG(DEBUG_ERR, ("Waiting for cluster\n"));
+       while (1) {
+               /* stays 1 when the control fails, so the loop keeps polling */
+               uint32_t recmode=1;
+               ctdb_ctrl_getrecmode(ctdb, ctdb, timeval_zero(), CTDB_CURRENT_NODE, &recmode);
+               if (recmode == 0) break;
+               event_loop_once(ev);
+       }
+
+       pnn = ctdb_get_pnn(ctdb);
+       printf("Starting test on node %u. running for %u seconds. sleep delay: %u seconds.\n", pnn, timelimit, delay);
+
+       if (!verbose && (pnn == 0)) {
+               event_add_timed(ev, ctdb, timeval_current_ofs(1, 0), each_second, ctdb);
+       }
+
+       test_store_records(ctdb, ev);
+
+       if (verbose || (pnn == 0)) {
+               if (success != true) {
+                       printf("The test FAILED\n");
+                       return 1;
+               } else {
+                       printf("SUCCESS!\n");
+               }
+       }
+       return 0;
+}
diff --git a/ctdb/tests/src/ctdb_traverse.c b/ctdb/tests/src/ctdb_traverse.c
new file mode 100644 (file)
index 0000000..5b37ed9
--- /dev/null
@@ -0,0 +1,116 @@
+/* 
+   simple tool to traverse a ctdb database over and over and over
+
+   Copyright (C) Andrew Tridgell  2006
+       Ronnie sahlberg 2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "popt.h"
+#include "cmdline.h"
+
+#include <sys/time.h>
+#include <time.h>
+
+static const char *dbname = "test.tdb";
+
+static int traverse_callback(struct ctdb_context *ctdb, TDB_DATA key, TDB_DATA data, void *private_data)
+{
+       /* private_data is the caller's uint32_t tally; ctdb, key and data are unused */
+       uint32_t *tally = (uint32_t *)private_data;
+       *tally += 1;
+       return 0;
+}
+
+static void traverse_loop(struct ctdb_context *ctdb, struct ctdb_db_context *ctdb_db, struct event_context *ev)
+{
+       uint32_t count = 0;     /* incremented by traverse_callback once per record */
+
+       /* One pass over ctdb_db: count the records and report the total. */
+       printf("traversing database\n");
+       ctdb_traverse(ctdb_db, traverse_callback, &count);
+       printf("traversed %u records\n", count);     /* count is unsigned: %u, not %d */
+}
+
+/*
+  main program: attach to the selected database and traverse it forever
+*/
+int main(int argc, const char *argv[])
+{
+       struct poptOption popt_options[] = {
+               POPT_AUTOHELP
+               POPT_CTDB_CMDLINE
+               { "database", 0, POPT_ARG_STRING, &dbname, 0, "database to traverse", "name" },
+               POPT_TABLEEND
+       };
+       struct event_context *event_ctx;
+       struct ctdb_context *client;
+       struct ctdb_db_context *db;
+       poptContext popt_ctx;
+       const char **extra_argv;
+       int extra_argc = 0;
+       int rc;
+
+       popt_ctx = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+       /* every return value other than -1 is reported as a bad option */
+       rc = poptGetNextOpt(popt_ctx);
+       if (rc != -1) {
+               fprintf(stderr, "Invalid option %s: %s\n",
+                       poptBadOption(popt_ctx, 0), poptStrerror(rc));
+               exit(1);
+       }
+
+       /* talloc_enable_leak_report_full(); */
+
+       /* count the non-option arguments, skipping the first entry */
+       extra_argv = poptGetArgs(popt_ctx);
+       if (extra_argv != NULL) {
+               for (extra_argv++; extra_argv[extra_argc] != NULL; extra_argc++) {
+               }
+       }
+
+       event_ctx = event_context_init(NULL);
+
+       client = ctdb_cmdline_client(event_ctx, timeval_current_ofs(3, 0));
+       if (!client) {
+               exit(1);
+       }
+
+       /* attach to the database named by --database (default "test.tdb") */
+       db = ctdb_attach(client, timeval_current_ofs(2, 0), dbname, false, 0);
+       if (db == NULL) {
+               printf("ctdb_attach failed - %s\n", ctdb_errstr(client));
+               exit(1);
+       }
+
+       /* poll the current node's recovery mode until it reads 0 */
+       printf("Waiting for cluster\n");
+       for (;;) {
+               uint32_t recmode = 1;
+               ctdb_ctrl_getrecmode(client, client, timeval_zero(), CTDB_CURRENT_NODE, &recmode);
+               if (recmode == 0) break;
+               event_loop_once(event_ctx);
+       }
+
+       /* endless loop: the return below is never reached */
+       for (;;) {
+               traverse_loop(client, db, event_ctx);
+       }
+
+       return 0;
+}
diff --git a/ctdb/tests/src/ctdb_update_record.c b/ctdb/tests/src/ctdb_update_record.c
new file mode 100644 (file)
index 0000000..6eff1d0
--- /dev/null
@@ -0,0 +1,160 @@
+/* 
+   simple ctdb test tool
+   This test just fetch_locks a record, bumps the RSN and then writes new content
+
+   Copyright (C) Ronnie Sahlberg 2009
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "popt.h"
+#include "cmdline.h"
+#include "ctdb_private.h"
+
+static struct ctdb_db_context *ctdb_db;
+
+#define TESTKEY "testkey"
+
+
+/*
+       Fetch-lock TESTKEY once, write it back with RSN+10 and fetch-lock it again; ev and generation are unused
+*/
+static void fetch_lock_once(struct ctdb_context *ctdb, struct event_context *ev, uint32_t generation)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       TDB_DATA key, data;
+       struct ctdb_record_handle *h;
+       struct ctdb_ltdb_header *header;
+       int ret;
+
+       key.dptr = discard_const(TESTKEY);
+       key.dsize = strlen(TESTKEY);
+       printf("Trying to fetch lock the record ...\n");
+       h = ctdb_fetch_readonly_lock(ctdb_db, tmp_ctx, key, &data, false);
+       if (h == NULL) {
+               printf("Failed to fetch record '%s' on node %d\n",
+                       (const char *)key.dptr, ctdb_get_pnn(ctdb));
+               talloc_free(tmp_ctx);
+               exit(10);
+       }
+
+       printf("Record fetchlocked.\n");
+       /* work on a private copy of the header: it is still needed after h is freed */
+       header = talloc_memdup(tmp_ctx, ctdb_header_from_record_handle(h), sizeof(*header));
+       if (header == NULL) { printf("Failed to copy record header\n"); exit(1); }
+       printf("RSN:%d\n", (int)header->rsn);
+       talloc_free(h);
+       printf("Record released.\n");
+
+       printf("Write new record with RSN+10\n");
+       header->rsn += 10;
+       data.dptr = (void *)talloc_asprintf(tmp_ctx, "%d", (int)header->rsn);
+       if (data.dptr == NULL) { printf("Failed to format record data\n"); exit(1); }
+       data.dsize = strlen((char *)data.dptr);
+       ret = ctdb_ctrl_updaterecord(ctdb, ctdb, timeval_zero(), CTDB_CURRENT_NODE, ctdb_db, key, header, data);
+       if (ret != 0) {
+               printf("Failed to writerecord,  ret==%d\n", ret);
+               exit(1);
+       }
+
+       printf("re-fetch the record\n");
+       h = ctdb_fetch_lock(ctdb_db, tmp_ctx, key, &data);
+       if (h == NULL) {
+               printf("Failed to fetch record '%s' on node %d\n",
+                       (const char *)key.dptr, ctdb_get_pnn(ctdb));
+               talloc_free(tmp_ctx);
+               exit(10);
+       }
+
+       printf("Record fetchlocked.\n");
+       header = talloc_memdup(tmp_ctx, ctdb_header_from_record_handle(h), sizeof(*header));
+       if (header == NULL) { printf("Failed to copy record header\n"); exit(1); }
+       printf("RSN:%d\n", (int)header->rsn);
+       talloc_free(h);
+       printf("Record released.\n");
+       talloc_free(tmp_ctx);
+}
+
+/*
+  main program: attach to test.tdb and run fetch_lock_once() a single time
+*/
+int main(int argc, const char *argv[])
+{
+       struct poptOption popt_options[] = {
+               POPT_AUTOHELP
+               POPT_CTDB_CMDLINE
+               POPT_TABLEEND
+       };
+       struct ctdb_vnn_map *vnnmap = NULL;
+       struct event_context *event_ctx;
+       struct ctdb_context *client;
+       poptContext popt_ctx;
+       const char **extra_argv;
+       int extra_argc = 0;
+       int rc;
+
+       popt_ctx = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+       /* every return value other than -1 is reported as a bad option */
+       rc = poptGetNextOpt(popt_ctx);
+       if (rc != -1) {
+               fprintf(stderr, "Invalid option %s: %s\n",
+                       poptBadOption(popt_ctx, 0), poptStrerror(rc));
+               exit(1);
+       }
+
+       /* count the non-option arguments, skipping the first entry */
+       extra_argv = poptGetArgs(popt_ctx);
+       if (extra_argv != NULL) {
+               for (extra_argv++; extra_argv[extra_argc] != NULL; extra_argc++) {
+               }
+       }
+
+       event_ctx = event_context_init(NULL);
+
+       client = ctdb_cmdline_client(event_ctx, timeval_current_ofs(5, 0));
+       if (!client) {
+               exit(1);
+       }
+
+       /* attach to test.tdb; the handle lands in the file-level ctdb_db */
+       ctdb_db = ctdb_attach(client, timeval_current_ofs(5, 0), "test.tdb", false, 0);
+       if (ctdb_db == NULL) {
+               printf("ctdb_attach failed - %s\n", ctdb_errstr(client));
+               exit(1);
+       }
+
+       /* poll the current node's recovery mode until it reads 0 */
+       printf("Waiting for cluster\n");
+       for (;;) {
+               uint32_t recmode = 1;
+
+               ctdb_ctrl_getrecmode(client, client, timeval_zero(), CTDB_CURRENT_NODE, &recmode);
+               if (recmode == 0) break;
+               event_loop_once(event_ctx);
+       }
+
+       /* the generation of the local vnnmap is printed and passed to the test */
+       if (ctdb_ctrl_getvnnmap(client, timeval_zero(), CTDB_CURRENT_NODE, client, &vnnmap) != 0) {
+               printf("Unable to get vnnmap from local node\n");
+               exit(1);
+       }
+       printf("Current Generation %d\n", (int)vnnmap->generation);
+
+       fetch_lock_once(client, event_ctx, vnnmap->generation);
+
+       return 0;
+}
diff --git a/ctdb/tests/src/ctdb_update_record_persistent.c b/ctdb/tests/src/ctdb_update_record_persistent.c
new file mode 100644 (file)
index 0000000..a0bb383
--- /dev/null
@@ -0,0 +1,138 @@
+/* 
+   simple ctdb test tool
+   This test just creates/updates a record in a persistent database
+
+   Copyright (C) Ronnie Sahlberg 2012
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "lib/util/db_wrap.h"
+#include "system/filesys.h"
+#include "popt.h"
+#include "cmdline.h"
+#include "ctdb_private.h"
+/* Store value under the key record in ctdb_db via the updaterecord control, using the
+   header of the locally stored copy (all zero if there is none) with its RSN bumped by one. */
+static void update_once(struct ctdb_context *ctdb, struct event_context *ev, struct ctdb_db_context *ctdb_db, char *record, char *value)
+{
+       TDB_DATA key, data, olddata;
+       struct ctdb_ltdb_header header;
+
+       memset(&header, 0, sizeof(header));
+       key.dptr  = (uint8_t *)record;
+       key.dsize = strlen(record);
+       data.dptr  = (uint8_t *)value;
+       data.dsize = strlen(value);
+
+       /* reuse the stored header only when the record is big enough to contain one */
+       olddata = tdb_fetch(ctdb_db->ltdb->tdb, key);
+       if (olddata.dptr != NULL && olddata.dsize >= sizeof(header)) {
+               memcpy(&header, olddata.dptr, sizeof(header));
+       }
+       free(olddata.dptr);     /* tdb_fetch returns malloc'ed memory owned by the caller */
+       header.rsn++;
+
+       if (ctdb_ctrl_updaterecord(ctdb, ctdb, timeval_zero(), CTDB_CURRENT_NODE, ctdb_db, key, &header, data) != 0) {
+               printf("Failed to update record\n");
+               exit(1);
+       }
+}
+
+/*
+  main program: write --value under --record in the persistent --database
+*/
+int main(int argc, const char *argv[])
+{
+       char *test_db = NULL;
+       char *record = NULL;
+       char *value = NULL;
+       struct poptOption popt_options[] = {
+               POPT_AUTOHELP
+               POPT_CTDB_CMDLINE
+               { "database",      'D', POPT_ARG_STRING, &test_db, 0, "database", "string" },
+               { "record",      'R', POPT_ARG_STRING, &record, 0, "record", "string" },
+               { "value",      'V', POPT_ARG_STRING, &value, 0, "value", "string" },
+               POPT_TABLEEND
+       };
+       struct event_context *event_ctx;
+       struct ctdb_context *client;
+       struct ctdb_db_context *db;
+       poptContext popt_ctx;
+       const char **extra_argv;
+       int extra_argc = 0;
+       int rc;
+
+       popt_ctx = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+       /* every return value other than -1 is reported as a bad option */
+       rc = poptGetNextOpt(popt_ctx);
+       if (rc != -1) {
+               fprintf(stderr, "Invalid option %s: %s\n",
+                       poptBadOption(popt_ctx, 0), poptStrerror(rc));
+               exit(1);
+       }
+
+       /* count the non-option arguments, skipping the first entry */
+       extra_argv = poptGetArgs(popt_ctx);
+       if (extra_argv != NULL) {
+               for (extra_argv++; extra_argv[extra_argc] != NULL; extra_argc++) {
+               }
+       }
+
+       event_ctx = event_context_init(NULL);
+
+       client = ctdb_cmdline_client(event_ctx, timeval_current_ofs(5, 0));
+       if (!client) {
+               exit(1);
+       }
+
+       /* all three options are mandatory; they are checked in this order */
+       if (!test_db) {
+               fprintf(stderr, "You must specify the database\n");
+               exit(10);
+       }
+       if (!record) {
+               fprintf(stderr, "You must specify the record\n");
+               exit(10);
+       }
+       if (!value) {
+               fprintf(stderr, "You must specify the value\n");
+               exit(10);
+       }
+
+       /* attach to the requested database */
+       db = ctdb_attach(client, timeval_current_ofs(5, 0), test_db, true, 0);
+       if (db == NULL) {
+               printf("ctdb_attach failed - %s\n", ctdb_errstr(client));
+               exit(1);
+       }
+
+       /* poll the current node's recovery mode until it reads 0 */
+       printf("Waiting for cluster\n");
+       for (;;) {
+               uint32_t recmode = 1;
+
+               ctdb_ctrl_getrecmode(client, client, timeval_zero(), CTDB_CURRENT_NODE, &recmode);
+               if (recmode == 0) {
+                       break;
+               }
+               event_loop_once(event_ctx);
+       }
+
+       update_once(client, event_ctx, db, record, value);
+
+       return 0;
+}
diff --git a/ctdb/tests/src/ctdbd_test.c b/ctdb/tests/src/ctdbd_test.c
new file mode 100644 (file)
index 0000000..fb29ba8
--- /dev/null
@@ -0,0 +1,90 @@
+/*
+   ctdbd test include file
+
+   Copyright (C) Martin Schwenke  2011
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CTDBD_TEST_C
+#define _CTDBD_TEST_C
+
+#include "includes.h"
+#include "tdb.h"
+#include "ctdb_private.h"
+
+/*
+ * Need these, since they're defined in ctdbd.c but we can't include
+ * that.
+ */
+int script_log_level;
+bool fast_start;
+
+/* UTIL_OBJ */
+#include "lib/util/idtree.c"
+#include "lib/util/db_wrap.c"
+#include "lib/util/strlist.c"
+#include "lib/util/util.c"
+#include "lib/util/util_time.c"
+#include "lib/util/util_file.c"
+#include "lib/util/fault.c"
+#include "lib/util/substitute.c"
+#include "lib/util/signal.c"
+
+/* CTDB_COMMON_OBJ */
+#include "common/ctdb_io.c"
+#include "common/ctdb_util.c"
+#include "common/ctdb_ltdb.c"
+#include "common/ctdb_message.c"
+#include "common/cmdline.c"
+#include "lib/util/debug.c"
+#include "common/rb_tree.c"
+#include "common/system_common.c"
+#include "common/ctdb_logging.c"
+#include "common/ctdb_fork.c"
+
+/* CTDB_SERVER_OBJ */
+#include "server/ctdb_daemon.c"
+#include "server/ctdb_recoverd.c"
+#include "server/ctdb_recover.c"
+#include "server/ctdb_freeze.c"
+#include "server/ctdb_tunables.c"
+#include "server/ctdb_monitor.c"
+#include "server/ctdb_server.c"
+#include "server/ctdb_control.c"
+#include "server/ctdb_call.c"
+#include "server/ctdb_ltdb_server.c"
+#include "server/ctdb_traverse.c"
+#include "server/eventscript.c"
+#include "server/ctdb_takeover.c"
+#include "server/ctdb_serverids.c"
+#include "server/ctdb_persistent.c"
+#include "server/ctdb_keepalive.c"
+#include "server/ctdb_logging.c"
+#include "server/ctdb_uptime.c"
+#include "server/ctdb_vacuum.c"
+#include "server/ctdb_banning.c"
+#include "server/ctdb_statistics.c"
+#include "server/ctdb_update_record.c"
+#include "server/ctdb_lock.c"
+
+/* CTDB_CLIENT_OBJ */
+#include "client/ctdb_client.c"
+
+/* CTDB_TCP_OBJ */
+#include "tcp/tcp_connect.c"
+#include "tcp/tcp_io.c"
+#include "tcp/tcp_init.c"
+
+#endif /* _CTDBD_TEST_C */
diff --git a/ctdb/tests/src/rb_perftest.c b/ctdb/tests/src/rb_perftest.c
new file mode 100644 (file)
index 0000000..1760cd1
--- /dev/null
@@ -0,0 +1,123 @@
+/* 
+   simple rb vs dlist benchmark
+
+   Copyright (C) Ronnie Sahlberg 2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "lib/events/events.h"
+#include "lib/util/dlinklist.h"
+#include "system/filesys.h"
+#include "popt.h"
+#include "cmdline.h"
+
+#include <sys/time.h>
+#include <time.h>
+#include "common/rb_tree.h"
+
+static struct timeval tp1,tp2;
+/* record the current time of day in tp1 as the start of a measurement */
+static void start_timer(void)
+{
+       gettimeofday(&tp1,NULL);
+}
+/* record the current time in tp2 and return the seconds elapsed since tp1 */
+static double end_timer(void)
+{
+       double start_sec = tp1.tv_sec + (tp1.tv_usec*1.0e-6);
+       gettimeofday(&tp2, NULL);
+       return (tp2.tv_sec + (tp2.tv_usec*1.0e-6)) - start_sec;
+}
+
+
+static int num_records = 1000;
+
+/* bare doubly-linked node; main links these with DLIST_ADD / DLIST_ADD_AFTER */
+struct list_node {
+       struct list_node *prev, *next;
+};
+
+/*
+  main program: time num_records tree inserts against tail-appends to a dlist
+*/
+int main(int argc, const char *argv[])
+{
+       struct poptOption popt_options[] = {
+               POPT_AUTOHELP
+               POPT_CTDB_CMDLINE
+               { "num-records", 'r', POPT_ARG_INT, &num_records, 0, "num_records", "integer" },
+               POPT_TABLEEND
+       };
+       int opt;
+       const char **extra_argv;
+       int extra_argc = 0;
+       poptContext pc;
+       struct event_context *ev;
+       double elapsed;
+       int i;
+       trbt_tree_t *tree;
+       struct list_node *list, *list_new, *list_head=NULL;
+
+       pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+       while ((opt = poptGetNextOpt(pc)) != -1) {
+               switch (opt) {
+               default:
+                       fprintf(stderr, "Invalid option %s: %s\n",
+                               poptBadOption(pc, 0), poptStrerror(opt));
+                       exit(1);
+               }
+       }
+
+       /* setup the remaining options for the main program to use */
+       extra_argv = poptGetArgs(pc);
+       if (extra_argv) {
+               extra_argv++;
+               while (extra_argv[extra_argc]) extra_argc++;
+       }
+
+       ev = event_context_init(NULL);
+
+       /* benchmark 1: insert keys 0..num_records-1 into the rb tree */
+       printf("testing tree insert for %d records\n", num_records);
+       /* trbt_create takes (memctx, flags), as called in tests/src/rb_test.c */
+       tree = trbt_create(NULL, 0);
+       start_timer();
+       for (i=0;i<num_records;i++) {
+               trbt_insert32(tree, i, NULL);
+       }
+       elapsed=end_timer();
+       printf("%f seconds\n",(float)elapsed);
+
+       /* benchmark 2: walk the whole list to find the tail before every append */
+       printf("testing dlist (worst case) add to tail for %d records\n", num_records);
+       list_new=talloc(NULL, struct list_node);
+       DLIST_ADD(list_head, list_new);
+       start_timer();
+       for (i=0;i<num_records;i++) {
+               for(list=list_head;list->next;list=list->next) {
+                       /* the events code does a timeval_compare */
+                       timeval_compare(&tp1, &tp2);
+               }
+
+               list_new=talloc(NULL, struct list_node);
+               DLIST_ADD_AFTER(list_head, list_new, list);
+       }
+       elapsed=end_timer();
+       printf("%f seconds\n",(float)elapsed);
+
+       return 0;
+}
diff --git a/ctdb/tests/src/rb_test.c b/ctdb/tests/src/rb_test.c
new file mode 100644 (file)
index 0000000..092732b
--- /dev/null
@@ -0,0 +1,347 @@
+/* 
+   simple rb test tool
+
+   Copyright (C) Ronnie Sahlberg 2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "lib/util/dlinklist.h"
+#include "system/filesys.h"
+#include "popt.h"
+#include "cmdline.h"
+
+#include <sys/time.h>
+#include <time.h>
+#include "common/rb_tree.h"
+
+static struct timeval tp1,tp2;
+/* record the current time of day in tp1 as the start of a measurement */
+static void start_timer(void)
+{
+       gettimeofday(&tp1,NULL);
+}
+/* record the current time in tp2 and return the seconds elapsed since tp1 */
+static double end_timer(void)
+{
+       double start_sec = tp1.tv_sec + (tp1.tv_usec*1.0e-6);
+       gettimeofday(&tp2, NULL);
+       return (tp2.tv_sec + (tp2.tv_usec*1.0e-6)) - start_sec;
+}
+
+int num_records=5;
+/* Bump the uint32_t that d points at, or the one at p when d is NULL, and return it.
+   NOTE(review): presumably p is the insert parameter and d the data already stored
+   under the key (NULL on first insert) - confirm against common/rb_tree.h. */
+void *callback(void *p, void *d)
+{
+       uint32_t *counter = (uint32_t *)p;
+
+       if (d != NULL) {
+               counter = (uint32_t *)d;
+       }
+       *counter += 1;
+       return counter;
+}
+/* insert callback for the random test: returns p unchanged and ignores d */
+void *random_add(void *p, void *d)
+{
+       return p;
+}
+/* traverse callback: print the uint32_t found at d; p is ignored. Always returns 0. */
+int traverse(void *p, void *d)
+{
+       const uint32_t *value = d;
+
+       printf("traverse data:%d\n", *value);
+       return 0;
+}
+/* traverse callback: print the string at d followed by three spaces; p is ignored */
+int random_traverse(void *p, void *d)
+{
+       printf("%s   ",(char *)d);
+       return 0;
+}
+/* sum of a*100+b*10+c over the "a.b.c" strings visited by traverse_checksum() */
+static uint32_t calc_checksum = 0;
+int traverse_checksum(void *p, void *d)
+{
+       int i = 0, j = 0, k = 0;
+       if (sscanf(d, "%d.%d.%d", &i, &j, &k) == 3) {   /* skip entries that do not parse */
+               calc_checksum += i*100+j*10+k;
+       }
+       return 0;
+}
+/* traverse callback: add one to the int counter passed in p; d is ignored */
+int count_traverse(void *p, void *d)
+{
+       int *visited = (int *)p;
+       *visited += 1;
+       return 0;
+}
+/* counts like count_traverse but returns -1; main expects that to stop the traverse */
+int count_traverse_abort(void *p, void *d)
+{
+       int *visited = (int *)p;
+       ++*visited;
+       return -1;
+}
+
+/*
+  main program: runs the insert32, insertarray32 and random insert/delete tests in turn
+*/
+int main(int argc, const char *argv[])
+{
+       struct poptOption popt_options[] = {
+               POPT_AUTOHELP
+               POPT_CTDB_CMDLINE
+               { "num-records", 'r', POPT_ARG_INT, &num_records, 0, "num_records", "integer" },
+               POPT_TABLEEND
+       };
+       int opt, traverse_count;
+       const char **extra_argv;
+       int extra_argc = 0;
+       poptContext pc;
+       int i,j,k;
+       trbt_tree_t *tree;
+       uint32_t *data;
+       uint32_t key[3];
+       uint32_t key1[3] = {0,10,20};
+       uint32_t key2[3] = {0,10,21};
+       uint32_t key3[3] = {0,11,20};
+       uint32_t key4[3] = {2,10,20};
+       TALLOC_CTX *memctx;
+       uint32_t **u32array;
+       uint32_t checksum;
+
+       pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+       while ((opt = poptGetNextOpt(pc)) != -1) {
+               switch (opt) {
+               default:
+                       fprintf(stderr, "Invalid option %s: %s\n",
+                               poptBadOption(pc, 0), poptStrerror(opt));
+                       exit(1);
+               }
+       }
+
+       /* setup the remaining options for the main program to use */
+       extra_argv = poptGetArgs(pc);
+       if (extra_argv) {
+               extra_argv++;
+               while (extra_argv[extra_argc]) extra_argc++;
+       }
+
+       printf("testing trbt_insert32_callback for %d records\n", num_records);
+       memctx   = talloc_new(NULL);
+       u32array = talloc_array(memctx, uint32_t *, num_records);
+       tree = trbt_create(memctx, 0);
+       for (i=0; i<num_records; i++) {
+               u32array[i]  = talloc(u32array, uint32_t);
+               *u32array[i] = 0;
+               trbt_insert32_callback(tree, i, callback, u32array[i]);
+       }
+       for (i=3; i<num_records; i++) {
+               trbt_insert32_callback(tree, i, callback, NULL);
+       }
+
+       printf("first 3 keys should have data==1\n");
+       printf("the rest of the keys should have data==2\n");
+       for (i=0; i<num_records; i++) {
+               data = trbt_lookup32(tree, i);
+               printf("key:%d data:%d\n", i, *data);
+       }
+//     talloc_report_full(tree, stdout);
+//     talloc_report_full(memctx, stdout);
+//     print_tree(tree);
+
+       printf("deleting key 2\n");
+       if (num_records > 2) talloc_free(u32array[2]);  /* u32array has only num_records slots */
+//     talloc_report_full(tree, stdout);
+//     talloc_report_full(memctx, stdout);
+//     print_tree(tree);
+
+       printf("deleting key 1\n");
+       if (num_records > 1) talloc_free(u32array[1]);  /* same bound check as for key 2 */
+//     talloc_report_full(tree, stdout);
+//     talloc_report_full(memctx, stdout);
+//     print_tree(tree);
+
+       printf("freeing tree\n");
+       talloc_report_full(memctx, stdout);
+       talloc_free(memctx);
+
+       /* second test: keys made of three uint32_t values */
+       printf("testing trbt_insertarray32_callback\n");
+       memctx   = talloc_new(NULL);
+       tree = trbt_create(memctx, 0);
+       u32array = talloc_array(memctx, uint32_t *, 4);
+       for (i=0;i<4;i++) {
+               u32array[i]  = talloc(u32array, uint32_t);
+               *u32array[i] = 0;
+       }
+       trbt_insertarray32_callback(tree, 3, key1, callback, u32array[0]);
+       trbt_insertarray32_callback(tree, 3, key1, callback, u32array[0]);
+       trbt_insertarray32_callback(tree, 3, key2, callback, u32array[1]);
+       trbt_insertarray32_callback(tree, 3, key3, callback, u32array[2]);
+       trbt_insertarray32_callback(tree, 3, key2, callback, u32array[1]);
+       trbt_insertarray32_callback(tree, 3, key1, callback, u32array[0]);
+
+       data = trbt_lookuparray32(tree, 3, key1);
+       printf("key1 dataptr:%p == %d\n",data,data?*data:-1);
+       data = trbt_lookuparray32(tree, 3, key2);
+       printf("key2 dataptr:%p == %d\n",data,data?*data:-1);
+       data = trbt_lookuparray32(tree, 3, key3);
+       printf("key3 dataptr:%p == %d\n",data,data?*data:-1);
+       data = trbt_lookuparray32(tree, 3, key4);
+       printf("key4 dataptr:%p == %d\n",data,data?*data:-1);
+       trbt_traversearray32(tree, 3, traverse, NULL);
+
+       printf("\ndeleting key4\n");
+       talloc_free(trbt_lookuparray32(tree, 3, key4));
+       data = trbt_lookuparray32(tree, 3, key1);
+       printf("key1 dataptr:%p == %d\n",data,data?*data:-1);
+       data = trbt_lookuparray32(tree, 3, key2);
+       printf("key2 dataptr:%p == %d\n",data,data?*data:-1);
+       data = trbt_lookuparray32(tree, 3, key3);
+       printf("key3 dataptr:%p == %d\n",data,data?*data:-1);
+       data = trbt_lookuparray32(tree, 3, key4);
+       printf("key4 dataptr:%p == %d\n",data,data?*data:-1);
+       trbt_traversearray32(tree, 3, traverse, NULL);
+
+       printf("\ndeleting key2\n");
+       talloc_free(trbt_lookuparray32(tree, 3, key2));
+       data = trbt_lookuparray32(tree, 3, key1);
+       printf("key1 dataptr:%p == %d\n",data,data?*data:-1);
+       data = trbt_lookuparray32(tree, 3, key2);
+       printf("key2 dataptr:%p == %d\n",data,data?*data:-1);
+       data = trbt_lookuparray32(tree, 3, key3);
+       printf("key3 dataptr:%p == %d\n",data,data?*data:-1);
+       data = trbt_lookuparray32(tree, 3, key4);
+       printf("key4 dataptr:%p == %d\n",data,data?*data:-1);
+       trbt_traversearray32(tree, 3, traverse, NULL);
+
+       printf("\ndeleting key3\n");
+       talloc_free(trbt_lookuparray32(tree, 3, key3));
+       data = trbt_lookuparray32(tree, 3, key1);
+       printf("key1 dataptr:%p == %d\n",data,data?*data:-1);
+       data = trbt_lookuparray32(tree, 3, key2);
+       printf("key2 dataptr:%p == %d\n",data,data?*data:-1);
+       data = trbt_lookuparray32(tree, 3, key3);
+       printf("key3 dataptr:%p == %d\n",data,data?*data:-1);
+       data = trbt_lookuparray32(tree, 3, key4);
+       printf("key4 dataptr:%p == %d\n",data,data?*data:-1);
+       trbt_traversearray32(tree, 3, traverse, NULL);
+
+       printf("\ndeleting key1\n");
+       talloc_free(trbt_lookuparray32(tree, 3, key1));
+       data = trbt_lookuparray32(tree, 3, key1);
+       printf("key1 dataptr:%p == %d\n",data,data?*data:-1);
+       data = trbt_lookuparray32(tree, 3, key2);
+       printf("key2 dataptr:%p == %d\n",data,data?*data:-1);
+       data = trbt_lookuparray32(tree, 3, key3);
+       printf("key3 dataptr:%p == %d\n",data,data?*data:-1);
+       data = trbt_lookuparray32(tree, 3, key4);
+       printf("key4 dataptr:%p == %d\n",data,data?*data:-1);
+       trbt_traversearray32(tree, 3, traverse, NULL);
+
+       talloc_free(tree);
+       talloc_free(memctx);
+
+       /* third test: random inserts and deletes, verified by checksum */
+       printf("\nrun random insert and delete for 60 seconds\n");
+       memctx   = talloc_new(NULL);
+       tree = trbt_create(memctx, 0);
+       i=0;
+       start_timer();
+       checksum = 0;
+       /* add and delete nodes from a 3 level tree for 60 seconds.
+          each time a node is added or deleted, traverse the tree and
+          compute a checksum over the data stored in the tree and compare this
+          with a checksum we keep which contains what the checksum should be
+        */
+       while(end_timer() < 60.0){
+               char *str;
+
+               i++;
+               key[0]=random()%10;
+               key[1]=random()%10;
+               key[2]=random()%10;
+               if (random()%2) {
+                       if (trbt_lookuparray32(tree, 3, key) == NULL) {
+                               /* this node does not yet exist, add it to the
+                                  tree and update the checksum
+                                */
+                               str=talloc_asprintf(memctx, "%d.%d.%d", key[0],key[1],key[2]);
+                               trbt_insertarray32_callback(tree, 3, key, random_add, str);
+                               checksum += key[0]*100+key[1]*10+key[2];
+                       }
+               } else {
+                       if ((str=trbt_lookuparray32(tree, 3, key)) != NULL) {
+                               /* this node does exist in  the tree, delete
+                                  it and update the checksum accordingly
+                                */
+                               talloc_free(str);
+                               checksum -= key[0]*100+key[1]*10+key[2];
+                       }
+               }
+               /* traverse all nodes in the tree and calculate the checksum
+                  it better match the one we keep track of in
+                  'checksum'
+               */
+               calc_checksum = 0;
+               trbt_traversearray32(tree, 3, traverse_checksum, NULL);
+               if(checksum != calc_checksum) {
+                       printf("Wrong checksum  %d!=%d\n",checksum, calc_checksum);
+                       exit(10);
+               }
+
+               if(i%1000==999) { printf("."); fflush(stdout); }  /* flush only when a dot was printed */
+       }
+       printf("\niterations passed:%d\n", i);
+       trbt_traversearray32(tree, 3, random_traverse, NULL);
+       printf("\n");
+       printf("first node: %s\n", (char *)trbt_findfirstarray32(tree, 3));
+
+       traverse_count = 0;
+       trbt_traversearray32(tree, 3, count_traverse, &traverse_count);
+       printf("\n");
+       printf("number of entries in traverse %d\n", traverse_count);
+
+       traverse_count = 0;
+       trbt_traversearray32(tree, 3, count_traverse_abort, &traverse_count);
+       printf("\n");
+       printf("number of entries in aborted traverse %d\n", traverse_count);
+       if (traverse_count != 1) {
+               printf("Failed to abort the traverse. Should have been aborted after 1 element but did iterate over %d elements\n", traverse_count);
+               exit(10);
+       }
+       printf("\ndeleting all entries\n");
+       for(i=0;i<10;i++){
+       for(j=0;j<10;j++){
+       for(k=0;k<10;k++){
+               key[0]=i;
+               key[1]=j;
+               key[2]=k;
+               talloc_free(trbt_lookuparray32(tree, 3, key));
+       }
+       }
+       }
+       trbt_traversearray32(tree, 3, random_traverse, NULL);
+       printf("\n");
+       talloc_report_full(tree, stdout);
+
+       return 0;
+}
diff --git a/ctdb/tests/takeover/README b/ctdb/tests/takeover/README
new file mode 100644 (file)
index 0000000..764f389
--- /dev/null
@@ -0,0 +1,5 @@
+Unit tests for the CTDB IP allocation algorithm(s).
+
+Test case filenames look like <algorithm>.NNN.sh, where <algorithm>
+indicates the IP allocation algorithm to use.  These use the
+ctdb_takeover_test test program.
diff --git a/ctdb/tests/takeover/det.001.sh b/ctdb/tests/takeover/det.001.sh
new file mode 100755 (executable)
index 0000000..2387f12
--- /dev/null
@@ -0,0 +1,36 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, 1 healthy"
+
+required_result <<EOF
+DATE TIME [PID]: Deterministic IPs enabled. Resetting all ip allocations
+DATE TIME [PID]: Unassign IP: 192.168.21.254 from 0
+DATE TIME [PID]: Unassign IP: 192.168.21.253 from 1
+DATE TIME [PID]: Unassign IP: 192.168.20.254 from 0
+DATE TIME [PID]: Unassign IP: 192.168.20.253 from 1
+DATE TIME [PID]: Unassign IP: 192.168.20.251 from 0
+DATE TIME [PID]: Unassign IP: 192.168.20.250 from 1
+192.168.21.254 2
+192.168.21.253 2
+192.168.21.252 2
+192.168.20.254 2
+192.168.20.253 2
+192.168.20.252 2
+192.168.20.251 2
+192.168.20.250 2
+192.168.20.249 2
+EOF
+
+simple_test 2,2,0 <<EOF
+192.168.20.249 0
+192.168.20.250 1
+192.168.20.251 2
+192.168.20.252 0
+192.168.20.253 1
+192.168.20.254 2
+192.168.21.252 0
+192.168.21.253 1
+192.168.21.254 2
+EOF
diff --git a/ctdb/tests/takeover/det.002.sh b/ctdb/tests/takeover/det.002.sh
new file mode 100755 (executable)
index 0000000..21fbaec
--- /dev/null
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, 2 healthy"
+
+required_result <<EOF
+DATE TIME [PID]: Deterministic IPs enabled. Resetting all ip allocations
+DATE TIME [PID]: Unassign IP: 192.168.21.253 from 1
+DATE TIME [PID]: Unassign IP: 192.168.20.253 from 1
+DATE TIME [PID]: Unassign IP: 192.168.20.250 from 1
+192.168.21.254 0
+192.168.21.253 0
+192.168.21.252 2
+192.168.20.254 0
+192.168.20.253 2
+192.168.20.252 2
+192.168.20.251 0
+192.168.20.250 0
+192.168.20.249 2
+EOF
+
+simple_test 0,2,0 <<EOF
+192.168.20.249 0
+192.168.20.250 1
+192.168.20.251 2
+192.168.20.252 0
+192.168.20.253 1
+192.168.20.254 2
+192.168.21.252 0
+192.168.21.253 1
+192.168.21.254 2
+EOF
diff --git a/ctdb/tests/takeover/det.003.sh b/ctdb/tests/takeover/det.003.sh
new file mode 100755 (executable)
index 0000000..3666047
--- /dev/null
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, 1 -> all healthy"
+
+required_result <<EOF
+DATE TIME [PID]: Deterministic IPs enabled. Resetting all ip allocations
+192.168.21.254 0
+192.168.21.253 1
+192.168.21.252 2
+192.168.20.254 0
+192.168.20.253 1
+192.168.20.252 2
+192.168.20.251 0
+192.168.20.250 1
+192.168.20.249 2
+EOF
+
+simple_test 0,0,0 <<EOF
+192.168.20.249 1
+192.168.20.250 1
+192.168.20.251 1
+192.168.20.252 1
+192.168.20.253 1
+192.168.20.254 1
+192.168.21.252 1
+192.168.21.253 1
+192.168.21.254 1
+EOF
diff --git a/ctdb/tests/takeover/lcp2.001.sh b/ctdb/tests/takeover/lcp2.001.sh
new file mode 100755 (executable)
index 0000000..8772318
--- /dev/null
@@ -0,0 +1,31 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, 3 -> 1 healthy"
+
+export CTDB_TEST_LOGLEVEL=0
+
+required_result <<EOF
+192.168.21.254 2
+192.168.21.253 2
+192.168.21.252 2
+192.168.20.254 2
+192.168.20.253 2
+192.168.20.252 2
+192.168.20.251 2
+192.168.20.250 2
+192.168.20.249 2
+EOF
+
+simple_test 2,2,0 <<EOF
+192.168.20.249 0
+192.168.20.250 1
+192.168.20.251 2
+192.168.20.252 0
+192.168.20.253 1
+192.168.20.254 2
+192.168.21.252 0
+192.168.21.253 1
+192.168.21.254 2
+EOF
diff --git a/ctdb/tests/takeover/lcp2.002.sh b/ctdb/tests/takeover/lcp2.002.sh
new file mode 100755 (executable)
index 0000000..f3f6f0a
--- /dev/null
@@ -0,0 +1,31 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, 3 -> 2 healthy"
+
+export CTDB_TEST_LOGLEVEL=0
+
+required_result <<EOF
+192.168.21.254 2
+192.168.21.253 2
+192.168.21.252 0
+192.168.20.254 2
+192.168.20.253 2
+192.168.20.252 0
+192.168.20.251 2
+192.168.20.250 0
+192.168.20.249 0
+EOF
+
+simple_test 0,2,0 <<EOF
+192.168.20.249 0
+192.168.20.250 1
+192.168.20.251 2
+192.168.20.252 0
+192.168.20.253 1
+192.168.20.254 2
+192.168.21.252 0
+192.168.21.253 1
+192.168.21.254 2
+EOF
diff --git a/ctdb/tests/takeover/lcp2.003.sh b/ctdb/tests/takeover/lcp2.003.sh
new file mode 100755 (executable)
index 0000000..f6cfe57
--- /dev/null
@@ -0,0 +1,31 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, 1 -> all healthy"
+
+export CTDB_TEST_LOGLEVEL=0
+
+required_result <<EOF
+192.168.21.254 2
+192.168.21.253 0
+192.168.21.252 1
+192.168.20.254 2
+192.168.20.253 0
+192.168.20.252 1
+192.168.20.251 2
+192.168.20.250 0
+192.168.20.249 1
+EOF
+
+simple_test 0,0,0 <<EOF
+192.168.20.249 1
+192.168.20.250 1
+192.168.20.251 1
+192.168.20.252 1
+192.168.20.253 1
+192.168.20.254 1
+192.168.21.252 1
+192.168.21.253 1
+192.168.21.254 1
+EOF
diff --git a/ctdb/tests/takeover/lcp2.004.sh b/ctdb/tests/takeover/lcp2.004.sh
new file mode 100755 (executable)
index 0000000..c067184
--- /dev/null
@@ -0,0 +1,37 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, 1 -> all healthy, info logging"
+
+export CTDB_TEST_LOGLEVEL=3
+
+required_result <<EOF
+DATE TIME [PID]: 1 [-121363] -> 192.168.20.253 -> 0 [+0]
+DATE TIME [PID]: 1 [-105738] -> 192.168.20.251 -> 2 [+0]
+DATE TIME [PID]: 1 [-88649] -> 192.168.21.253 -> 0 [+14161]
+DATE TIME [PID]: 1 [-75448] -> 192.168.20.254 -> 2 [+15625]
+DATE TIME [PID]: 1 [-59823] -> 192.168.20.250 -> 0 [+29786]
+DATE TIME [PID]: 1 [-44198] -> 192.168.21.254 -> 2 [+28322]
+192.168.21.254 2
+192.168.21.253 0
+192.168.21.252 1
+192.168.20.254 2
+192.168.20.253 0
+192.168.20.252 1
+192.168.20.251 2
+192.168.20.250 0
+192.168.20.249 1
+EOF
+
+simple_test 0,0,0 <<EOF
+192.168.20.249 1
+192.168.20.250 1
+192.168.20.251 1
+192.168.20.252 1
+192.168.20.253 1
+192.168.20.254 1
+192.168.21.252 1
+192.168.21.253 1
+192.168.21.254 1
+EOF
diff --git a/ctdb/tests/takeover/lcp2.005.sh b/ctdb/tests/takeover/lcp2.005.sh
new file mode 100755 (executable)
index 0000000..113e52f
--- /dev/null
@@ -0,0 +1,163 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, 1 -> all healthy, debug logging"
+
+export CTDB_TEST_LOGLEVEL=4
+
+required_result <<EOF
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  CONSIDERING MOVES (UNASSIGNED)
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  CONSIDERING MOVES FROM 1 [539166]
+DATE TIME [PID]:  1 [-116718] -> 192.168.21.254 -> 0 [+0]
+DATE TIME [PID]:  1 [-116718] -> 192.168.21.254 -> 2 [+0]
+DATE TIME [PID]:  1 [-116971] -> 192.168.21.253 -> 0 [+0]
+DATE TIME [PID]:  1 [-116971] -> 192.168.21.253 -> 2 [+0]
+DATE TIME [PID]:  1 [-116971] -> 192.168.21.252 -> 0 [+0]
+DATE TIME [PID]:  1 [-116971] -> 192.168.21.252 -> 2 [+0]
+DATE TIME [PID]:  1 [-121110] -> 192.168.20.254 -> 0 [+0]
+DATE TIME [PID]:  1 [-121110] -> 192.168.20.254 -> 2 [+0]
+DATE TIME [PID]:  1 [-121363] -> 192.168.20.253 -> 0 [+0]
+DATE TIME [PID]:  1 [-121363] -> 192.168.20.253 -> 2 [+0]
+DATE TIME [PID]:  1 [-121363] -> 192.168.20.252 -> 0 [+0]
+DATE TIME [PID]:  1 [-121363] -> 192.168.20.252 -> 2 [+0]
+DATE TIME [PID]:  1 [-121363] -> 192.168.20.251 -> 0 [+0]
+DATE TIME [PID]:  1 [-121363] -> 192.168.20.251 -> 2 [+0]
+DATE TIME [PID]:  1 [-121363] -> 192.168.20.250 -> 0 [+0]
+DATE TIME [PID]:  1 [-121363] -> 192.168.20.250 -> 2 [+0]
+DATE TIME [PID]:  1 [-121110] -> 192.168.20.249 -> 0 [+0]
+DATE TIME [PID]:  1 [-121110] -> 192.168.20.249 -> 2 [+0]
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]: 1 [-121363] -> 192.168.20.253 -> 0 [+0]
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  CONSIDERING MOVES FROM 1 [418056]
+DATE TIME [PID]:  1 [-102557] -> 192.168.21.254 -> 0 [+14161]
+DATE TIME [PID]:  1 [-102557] -> 192.168.21.254 -> 2 [+0]
+DATE TIME [PID]:  1 [-102810] -> 192.168.21.253 -> 0 [+14161]
+DATE TIME [PID]:  1 [-102810] -> 192.168.21.253 -> 2 [+0]
+DATE TIME [PID]:  1 [-102810] -> 192.168.21.252 -> 0 [+14161]
+DATE TIME [PID]:  1 [-102810] -> 192.168.21.252 -> 2 [+0]
+DATE TIME [PID]:  1 [-105234] -> 192.168.20.254 -> 0 [+15876]
+DATE TIME [PID]:  1 [-105234] -> 192.168.20.254 -> 2 [+0]
+DATE TIME [PID]:  1 [-105234] -> 192.168.20.252 -> 0 [+16129]
+DATE TIME [PID]:  1 [-105234] -> 192.168.20.252 -> 2 [+0]
+DATE TIME [PID]:  1 [-105738] -> 192.168.20.251 -> 0 [+15625]
+DATE TIME [PID]:  1 [-105738] -> 192.168.20.251 -> 2 [+0]
+DATE TIME [PID]:  1 [-105738] -> 192.168.20.250 -> 0 [+15625]
+DATE TIME [PID]:  1 [-105738] -> 192.168.20.250 -> 2 [+0]
+DATE TIME [PID]:  1 [-105485] -> 192.168.20.249 -> 0 [+15625]
+DATE TIME [PID]:  1 [-105485] -> 192.168.20.249 -> 2 [+0]
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]: 1 [-105738] -> 192.168.20.251 -> 2 [+0]
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  CONSIDERING MOVES FROM 1 [312571]
+DATE TIME [PID]:  1 [-88396] -> 192.168.21.254 -> 0 [+14161]
+DATE TIME [PID]:  1 [-88396] -> 192.168.21.254 -> 2 [+14161]
+DATE TIME [PID]:  1 [-88649] -> 192.168.21.253 -> 0 [+14161]
+DATE TIME [PID]:  1 [-88649] -> 192.168.21.253 -> 2 [+14161]
+DATE TIME [PID]:  1 [-88649] -> 192.168.21.252 -> 0 [+14161]
+DATE TIME [PID]:  1 [-88649] -> 192.168.21.252 -> 2 [+14161]
+DATE TIME [PID]:  1 [-89609] -> 192.168.20.254 -> 0 [+15876]
+DATE TIME [PID]:  1 [-89609] -> 192.168.20.254 -> 2 [+15625]
+DATE TIME [PID]:  1 [-89609] -> 192.168.20.252 -> 0 [+16129]
+DATE TIME [PID]:  1 [-89609] -> 192.168.20.252 -> 2 [+15625]
+DATE TIME [PID]:  1 [-89609] -> 192.168.20.250 -> 0 [+15625]
+DATE TIME [PID]:  1 [-89609] -> 192.168.20.250 -> 2 [+16129]
+DATE TIME [PID]:  1 [-89609] -> 192.168.20.249 -> 0 [+15625]
+DATE TIME [PID]:  1 [-89609] -> 192.168.20.249 -> 2 [+15876]
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]: 1 [-88649] -> 192.168.21.253 -> 0 [+14161]
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  CONSIDERING MOVES FROM 1 [222962]
+DATE TIME [PID]:  1 [-72520] -> 192.168.21.254 -> 0 [+30037]
+DATE TIME [PID]:  1 [-72520] -> 192.168.21.254 -> 2 [+14161]
+DATE TIME [PID]:  1 [-72520] -> 192.168.21.252 -> 0 [+30290]
+DATE TIME [PID]:  1 [-72520] -> 192.168.21.252 -> 2 [+14161]
+DATE TIME [PID]:  1 [-75448] -> 192.168.20.254 -> 0 [+30037]
+DATE TIME [PID]:  1 [-75448] -> 192.168.20.254 -> 2 [+15625]
+DATE TIME [PID]:  1 [-75448] -> 192.168.20.252 -> 0 [+30290]
+DATE TIME [PID]:  1 [-75448] -> 192.168.20.252 -> 2 [+15625]
+DATE TIME [PID]:  1 [-75448] -> 192.168.20.250 -> 0 [+29786]
+DATE TIME [PID]:  1 [-75448] -> 192.168.20.250 -> 2 [+16129]
+DATE TIME [PID]:  1 [-75448] -> 192.168.20.249 -> 0 [+29786]
+DATE TIME [PID]:  1 [-75448] -> 192.168.20.249 -> 2 [+15876]
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]: 1 [-75448] -> 192.168.20.254 -> 2 [+15625]
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  CONSIDERING MOVES FROM 1 [147514]
+DATE TIME [PID]:  1 [-58359] -> 192.168.21.254 -> 0 [+30037]
+DATE TIME [PID]:  1 [-58359] -> 192.168.21.254 -> 2 [+28322]
+DATE TIME [PID]:  1 [-58359] -> 192.168.21.252 -> 0 [+30290]
+DATE TIME [PID]:  1 [-58359] -> 192.168.21.252 -> 2 [+28322]
+DATE TIME [PID]:  1 [-59572] -> 192.168.20.252 -> 0 [+30290]
+DATE TIME [PID]:  1 [-59572] -> 192.168.20.252 -> 2 [+31501]
+DATE TIME [PID]:  1 [-59823] -> 192.168.20.250 -> 0 [+29786]
+DATE TIME [PID]:  1 [-59823] -> 192.168.20.250 -> 2 [+31754]
+DATE TIME [PID]:  1 [-59823] -> 192.168.20.249 -> 0 [+29786]
+DATE TIME [PID]:  1 [-59823] -> 192.168.20.249 -> 2 [+31501]
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]: 1 [-59823] -> 192.168.20.250 -> 0 [+29786]
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  CONSIDERING MOVES FROM 1 [87691]
+DATE TIME [PID]:  1 [-44198] -> 192.168.21.254 -> 0 [+44198]
+DATE TIME [PID]:  1 [-44198] -> 192.168.21.254 -> 2 [+28322]
+DATE TIME [PID]:  1 [-44198] -> 192.168.21.252 -> 0 [+44451]
+DATE TIME [PID]:  1 [-44198] -> 192.168.21.252 -> 2 [+28322]
+DATE TIME [PID]:  1 [-43947] -> 192.168.20.252 -> 0 [+45915]
+DATE TIME [PID]:  1 [-43947] -> 192.168.20.252 -> 2 [+31501]
+DATE TIME [PID]:  1 [-43947] -> 192.168.20.249 -> 0 [+45662]
+DATE TIME [PID]:  1 [-43947] -> 192.168.20.249 -> 2 [+31501]
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]: 1 [-44198] -> 192.168.21.254 -> 2 [+28322]
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  CONSIDERING MOVES FROM 0 [43947]
+DATE TIME [PID]:  0 [-28322] -> 192.168.21.253 -> 0 [+28322]
+DATE TIME [PID]:  0 [-28322] -> 192.168.21.253 -> 2 [+44198]
+DATE TIME [PID]:  0 [-29786] -> 192.168.20.253 -> 0 [+29786]
+DATE TIME [PID]:  0 [-29786] -> 192.168.20.253 -> 2 [+45662]
+DATE TIME [PID]:  0 [-29786] -> 192.168.20.250 -> 0 [+29786]
+DATE TIME [PID]:  0 [-29786] -> 192.168.20.250 -> 2 [+45915]
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  CONSIDERING MOVES FROM 2 [43947]
+DATE TIME [PID]:  2 [-28322] -> 192.168.21.254 -> 0 [+44198]
+DATE TIME [PID]:  2 [-28322] -> 192.168.21.254 -> 2 [+28322]
+DATE TIME [PID]:  2 [-29786] -> 192.168.20.254 -> 0 [+45662]
+DATE TIME [PID]:  2 [-29786] -> 192.168.20.254 -> 2 [+29786]
+DATE TIME [PID]:  2 [-29786] -> 192.168.20.251 -> 0 [+45915]
+DATE TIME [PID]:  2 [-29786] -> 192.168.20.251 -> 2 [+29786]
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  CONSIDERING MOVES FROM 1 [43744]
+DATE TIME [PID]:  1 [-28322] -> 192.168.21.252 -> 0 [+44451]
+DATE TIME [PID]:  1 [-28322] -> 192.168.21.252 -> 2 [+44198]
+DATE TIME [PID]:  1 [-29786] -> 192.168.20.252 -> 0 [+45915]
+DATE TIME [PID]:  1 [-29786] -> 192.168.20.252 -> 2 [+45662]
+DATE TIME [PID]:  1 [-29786] -> 192.168.20.249 -> 0 [+45662]
+DATE TIME [PID]:  1 [-29786] -> 192.168.20.249 -> 2 [+45662]
+DATE TIME [PID]:  ----------------------------------------
+192.168.21.254 2
+192.168.21.253 0
+192.168.21.252 1
+192.168.20.254 2
+192.168.20.253 0
+192.168.20.252 1
+192.168.20.251 2
+192.168.20.250 0
+192.168.20.249 1
+EOF
+
+simple_test 0,0,0 <<EOF
+192.168.20.249 1
+192.168.20.250 1
+192.168.20.251 1
+192.168.20.252 1
+192.168.20.253 1
+192.168.20.254 1
+192.168.21.252 1
+192.168.21.253 1
+192.168.21.254 1
+EOF
diff --git a/ctdb/tests/takeover/lcp2.006.sh b/ctdb/tests/takeover/lcp2.006.sh
new file mode 100755 (executable)
index 0000000..13bb40f
--- /dev/null
@@ -0,0 +1,31 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, 0 -> 1 healthy"
+
+export CTDB_TEST_LOGLEVEL=0
+
+required_result <<EOF
+192.168.21.254 1
+192.168.21.253 1
+192.168.21.252 1
+192.168.20.254 1
+192.168.20.253 1
+192.168.20.252 1
+192.168.20.251 1
+192.168.20.250 1
+192.168.20.249 1
+EOF
+
+simple_test 2,0,2 <<EOF
+192.168.20.249 -1
+192.168.20.250 -1
+192.168.20.251 -1
+192.168.20.252 -1
+192.168.20.253 -1
+192.168.20.254 -1
+192.168.21.252 -1
+192.168.21.253 -1
+192.168.21.254 -1
+EOF
diff --git a/ctdb/tests/takeover/lcp2.007.sh b/ctdb/tests/takeover/lcp2.007.sh
new file mode 100755 (executable)
index 0000000..76fa06e
--- /dev/null
@@ -0,0 +1,31 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, 0 -> 2 healthy"
+
+export CTDB_TEST_LOGLEVEL=0
+
+required_result <<EOF
+192.168.21.254 1
+192.168.21.253 2
+192.168.21.252 1
+192.168.20.254 1
+192.168.20.253 2
+192.168.20.252 1
+192.168.20.251 1
+192.168.20.250 2
+192.168.20.249 2
+EOF
+
+simple_test 2,0,0 <<EOF
+192.168.20.249 -1
+192.168.20.250 -1
+192.168.20.251 -1
+192.168.20.252 -1
+192.168.20.253 -1
+192.168.20.254 -1
+192.168.21.252 -1
+192.168.21.253 -1
+192.168.21.254 -1
+EOF
diff --git a/ctdb/tests/takeover/lcp2.008.sh b/ctdb/tests/takeover/lcp2.008.sh
new file mode 100755 (executable)
index 0000000..f5c0af3
--- /dev/null
@@ -0,0 +1,31 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, 0 -> all healthy"
+
+export CTDB_TEST_LOGLEVEL=0
+
+required_result <<EOF
+192.168.21.254 0
+192.168.21.253 1
+192.168.21.252 2
+192.168.20.254 0
+192.168.20.253 1
+192.168.20.252 2
+192.168.20.251 0
+192.168.20.250 1
+192.168.20.249 2
+EOF
+
+simple_test 0,0,0 <<EOF
+192.168.20.249 -1
+192.168.20.250 -1
+192.168.20.251 -1
+192.168.20.252 -1
+192.168.20.253 -1
+192.168.20.254 -1
+192.168.21.252 -1
+192.168.21.253 -1
+192.168.21.254 -1
+EOF
diff --git a/ctdb/tests/takeover/lcp2.009.sh b/ctdb/tests/takeover/lcp2.009.sh
new file mode 100755 (executable)
index 0000000..e862c92
--- /dev/null
@@ -0,0 +1,31 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, 3 healthy -> all disconnected"
+
+export CTDB_TEST_LOGLEVEL=0
+
+required_result <<EOF
+192.168.21.254 -1
+192.168.21.253 -1
+192.168.21.252 -1
+192.168.20.254 -1
+192.168.20.253 -1
+192.168.20.252 -1
+192.168.20.251 -1
+192.168.20.250 -1
+192.168.20.249 -1
+EOF
+
+simple_test 1,1,1 <<EOF
+192.168.20.249 0
+192.168.20.250 1
+192.168.20.251 2
+192.168.20.252 0
+192.168.20.253 1
+192.168.20.254 2
+192.168.21.252 0
+192.168.21.253 1
+192.168.21.254 2
+EOF
diff --git a/ctdb/tests/takeover/lcp2.010.sh b/ctdb/tests/takeover/lcp2.010.sh
new file mode 100755 (executable)
index 0000000..20b1c98
--- /dev/null
@@ -0,0 +1,32 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "2 disjoint groups of nodes/addresses, a node becomes healthy"
+
+# This illustrates a bug in LCP2 when the only candidate for a
+# source node is chosen to be the "most imbalanced" node.  This means
+# that nodes in the smaller group aren't necessarily (depends on sort
+# order and addresses used) considered as candidates.  If the larger
+# group has 6 addresses then the "necessarily" goes away and the
+# smaller group won't be rebalanced.
+
+export CTDB_TEST_LOGLEVEL=0
+
+required_result <<EOF
+192.168.209.102 3
+192.168.209.101 2
+192.168.140.4 1
+192.168.140.3 1
+192.168.140.2 0
+192.168.140.1 0
+EOF
+
+simple_test 0,0,0,0 <<EOF
+192.168.140.1          0       0,1
+192.168.140.2          0       0,1
+192.168.140.3          1       0,1
+192.168.140.4          1       0,1
+192.168.209.101                2       2,3
+192.168.209.102                2       2,3
+EOF
diff --git a/ctdb/tests/takeover/lcp2.011.sh b/ctdb/tests/takeover/lcp2.011.sh
new file mode 100755 (executable)
index 0000000..f752aa3
--- /dev/null
@@ -0,0 +1,45 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "2 disjoint groups of nodes/addresses, continue a stopped node"
+
+# Another LCP2 1.0 bug
+
+export CTDB_TEST_LOGLEVEL=0
+
+required_result <<EOF
+10.11.19.46 3
+10.11.19.45 3
+10.11.19.44 1
+10.11.18.46 1
+10.11.18.45 3
+10.11.18.44 1
+10.11.17.46 3
+10.11.17.45 3
+10.11.17.44 1
+10.11.16.46 1
+10.11.16.45 3
+10.11.16.44 1
+9.11.136.46 2
+9.11.136.45 0
+9.11.136.44 2
+EOF
+
+simple_test 0,0,0,0 <<EOF
+9.11.136.44 2 0,2
+9.11.136.45 2 0,2
+9.11.136.46 2 0,2
+10.11.16.44 1 1,3
+10.11.16.45 3 1,3
+10.11.16.46 1 1,3
+10.11.17.44 1 1,3
+10.11.17.45 3 1,3
+10.11.17.46 3 1,3
+10.11.18.44 1 1,3
+10.11.18.45 3 1,3
+10.11.18.46 1 1,3
+10.11.19.44 1 1,3
+10.11.19.45 3 1,3
+10.11.19.46 3 1,3
+EOF
diff --git a/ctdb/tests/takeover/lcp2.012.sh b/ctdb/tests/takeover/lcp2.012.sh
new file mode 100755 (executable)
index 0000000..8f5c537
--- /dev/null
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "Node with NODE_FLAGS_NOIPTAKEOVER doesn't gain IPs"
+
+export CTDB_TEST_LOGLEVEL=0
+
+required_result <<EOF
+192.168.21.254 1
+192.168.21.253 2
+192.168.21.252 1
+192.168.20.254 2
+192.168.20.253 2
+192.168.20.252 1
+192.168.20.251 2
+192.168.20.250 1
+192.168.20.249 1
+EOF
+
+export CTDB_SET_NoIPTakeover="1,0,0"
+
+simple_test 0,0,0 <<EOF
+192.168.20.249 1
+192.168.20.250 1
+192.168.20.251 1
+192.168.20.252 1
+192.168.20.253 1
+192.168.20.254 1
+192.168.21.252 1
+192.168.21.253 1
+192.168.21.254 1
+EOF
diff --git a/ctdb/tests/takeover/lcp2.013.sh b/ctdb/tests/takeover/lcp2.013.sh
new file mode 100755 (executable)
index 0000000..fb9d724
--- /dev/null
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "Node with NODE_FLAGS_NOIPTAKEOVER doesn't lose IPs"
+
+export CTDB_TEST_LOGLEVEL=0
+
+required_result <<EOF
+192.168.21.254 2
+192.168.21.253 1
+192.168.21.252 0
+192.168.20.254 2
+192.168.20.253 1
+192.168.20.252 0
+192.168.20.251 2
+192.168.20.250 1
+192.168.20.249 0
+EOF
+
+export CTDB_SET_NoIPTakeover="1,0,0"
+
+simple_test 0,0,0 <<EOF
+192.168.20.249 0
+192.168.20.250 1
+192.168.20.251 2
+192.168.20.252 0
+192.168.20.253 1
+192.168.20.254 2
+192.168.21.252 0
+192.168.21.253 1
+192.168.21.254 2
+EOF
diff --git a/ctdb/tests/takeover/lcp2.014.sh b/ctdb/tests/takeover/lcp2.014.sh
new file mode 100755 (executable)
index 0000000..36eda92
--- /dev/null
@@ -0,0 +1,31 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, no IPs assigned, all unhealthy"
+
+export CTDB_TEST_LOGLEVEL=0
+
+required_result <<EOF
+192.168.21.254 0
+192.168.21.253 1
+192.168.21.252 2
+192.168.20.254 0
+192.168.20.253 1
+192.168.20.252 2
+192.168.20.251 0
+192.168.20.250 1
+192.168.20.249 2
+EOF
+
+simple_test 2,2,2 <<EOF
+192.168.21.254 -1
+192.168.21.253 -1
+192.168.21.252 -1
+192.168.20.254 -1
+192.168.20.253 -1
+192.168.20.252 -1
+192.168.20.251 -1
+192.168.20.250 -1
+192.168.20.249 -1
+EOF
diff --git a/ctdb/tests/takeover/lcp2.015.sh b/ctdb/tests/takeover/lcp2.015.sh
new file mode 100755 (executable)
index 0000000..a2569e0
--- /dev/null
@@ -0,0 +1,31 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, all IPs assigned, all unhealthy"
+
+export CTDB_TEST_LOGLEVEL=0
+
+required_result <<EOF
+192.168.21.254 2
+192.168.21.253 2
+192.168.21.252 2
+192.168.20.254 1
+192.168.20.253 1
+192.168.20.252 1
+192.168.20.251 0
+192.168.20.250 0
+192.168.20.249 0
+EOF
+
+simple_test 2,2,2 <<EOF
+192.168.21.254 2
+192.168.21.253 2
+192.168.21.252 2
+192.168.20.254 1
+192.168.20.253 1
+192.168.20.252 1
+192.168.20.251 0
+192.168.20.250 0
+192.168.20.249 0
+EOF
diff --git a/ctdb/tests/takeover/lcp2.016.sh b/ctdb/tests/takeover/lcp2.016.sh
new file mode 100755 (executable)
index 0000000..2e2df1b
--- /dev/null
@@ -0,0 +1,31 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, all IPs assigned, 2->3 unhealthy"
+
+export CTDB_TEST_LOGLEVEL=0
+
+required_result <<EOF
+192.168.21.254 1
+192.168.21.253 0
+192.168.21.252 2
+192.168.20.254 1
+192.168.20.253 0
+192.168.20.252 2
+192.168.20.251 1
+192.168.20.250 0
+192.168.20.249 2
+EOF
+
+simple_test 2,2,2 <<EOF
+192.168.21.254 2
+192.168.21.253 2
+192.168.21.252 2
+192.168.20.254 2
+192.168.20.253 2
+192.168.20.252 2
+192.168.20.251 2
+192.168.20.250 2
+192.168.20.249 2
+EOF
diff --git a/ctdb/tests/takeover/lcp2.017.sh b/ctdb/tests/takeover/lcp2.017.sh
new file mode 100755 (executable)
index 0000000..07b22fb
--- /dev/null
@@ -0,0 +1,32 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, no IPs assigned, all unhealthy, NoIPHostOnAllDisabled"
+
+export CTDB_TEST_LOGLEVEL=0
+export CTDB_SET_NoIPHostOnAllDisabled=1
+
+required_result <<EOF
+192.168.21.254 -1
+192.168.21.253 -1
+192.168.21.252 -1
+192.168.20.254 -1
+192.168.20.253 -1
+192.168.20.252 -1
+192.168.20.251 -1
+192.168.20.250 -1
+192.168.20.249 -1
+EOF
+
+simple_test 2,2,2 <<EOF
+192.168.21.254 -1
+192.168.21.253 -1
+192.168.21.252 -1
+192.168.20.254 -1
+192.168.20.253 -1
+192.168.20.252 -1
+192.168.20.251 -1
+192.168.20.250 -1
+192.168.20.249 -1
+EOF
diff --git a/ctdb/tests/takeover/lcp2.018.sh b/ctdb/tests/takeover/lcp2.018.sh
new file mode 100755 (executable)
index 0000000..4a797f7
--- /dev/null
@@ -0,0 +1,32 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, all IPs assigned, all unhealthy, NoIPHostOnAllDisabled"
+
+export CTDB_TEST_LOGLEVEL=0
+export CTDB_SET_NoIPHostOnAllDisabled=1
+
+required_result <<EOF
+192.168.21.254 -1
+192.168.21.253 -1
+192.168.21.252 -1
+192.168.20.254 -1
+192.168.20.253 -1
+192.168.20.252 -1
+192.168.20.251 -1
+192.168.20.250 -1
+192.168.20.249 -1
+EOF
+
+simple_test 2,2,2 <<EOF
+192.168.21.254 2
+192.168.21.253 2
+192.168.21.252 2
+192.168.20.254 1
+192.168.20.253 1
+192.168.20.252 1
+192.168.20.251 0
+192.168.20.250 0
+192.168.20.249 0
+EOF
diff --git a/ctdb/tests/takeover/lcp2.019.sh b/ctdb/tests/takeover/lcp2.019.sh
new file mode 100755 (executable)
index 0000000..0d8937c
--- /dev/null
@@ -0,0 +1,32 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, all IPs assigned, 2->3 unhealthy, NoIPHostOnAllDisabled"
+
+export CTDB_TEST_LOGLEVEL=0
+export CTDB_SET_NoIPHostOnAllDisabled=1
+
+required_result <<EOF
+192.168.21.254 -1
+192.168.21.253 -1
+192.168.21.252 -1
+192.168.20.254 -1
+192.168.20.253 -1
+192.168.20.252 -1
+192.168.20.251 -1
+192.168.20.250 -1
+192.168.20.249 -1
+EOF
+
+simple_test 2,2,2 <<EOF
+192.168.21.254 2
+192.168.21.253 2
+192.168.21.252 2
+192.168.20.254 2
+192.168.20.253 2
+192.168.20.252 2
+192.168.20.251 2
+192.168.20.250 2
+192.168.20.249 2
+EOF
diff --git a/ctdb/tests/takeover/lcp2.020.sh b/ctdb/tests/takeover/lcp2.020.sh
new file mode 100755 (executable)
index 0000000..e3fe3c4
--- /dev/null
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, all IPs assigned, 2->3 unhealthy, NoIPHostOnAllDisabled on 2"
+
+export CTDB_TEST_LOGLEVEL=0
+
+required_result <<EOF
+192.168.21.254 2
+192.168.21.253 2
+192.168.21.252 2
+192.168.20.254 2
+192.168.20.253 2
+192.168.20.252 2
+192.168.20.251 2
+192.168.20.250 2
+192.168.20.249 2
+EOF
+
+export CTDB_SET_NoIPHostOnAllDisabled=1,1,0
+
+simple_test 2,2,2 <<EOF
+192.168.21.254 2
+192.168.21.253 2
+192.168.21.252 2
+192.168.20.254 2
+192.168.20.253 2
+192.168.20.252 2
+192.168.20.251 2
+192.168.20.250 2
+192.168.20.249 2
+EOF
diff --git a/ctdb/tests/takeover/lcp2.021.sh b/ctdb/tests/takeover/lcp2.021.sh
new file mode 100755 (executable)
index 0000000..7dcddb1
--- /dev/null
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, no IPs assigned, 3->2 unhealthy, NoIPHostOnAllDisabled on 2 others"
+
+export CTDB_TEST_LOGLEVEL=0
+
+required_result <<EOF
+192.168.21.254 0
+192.168.21.253 0
+192.168.21.252 0
+192.168.20.254 0
+192.168.20.253 0
+192.168.20.252 0
+192.168.20.251 0
+192.168.20.250 0
+192.168.20.249 0
+EOF
+
+export CTDB_SET_NoIPHostOnAllDisabled=0,1,1
+
+simple_test 2,2,2 <<EOF
+192.168.21.254 2
+192.168.21.253 2
+192.168.21.252 2
+192.168.20.254 2
+192.168.20.253 2
+192.168.20.252 2
+192.168.20.251 2
+192.168.20.250 2
+192.168.20.249 2
+EOF
diff --git a/ctdb/tests/takeover/lcp2.022.sh b/ctdb/tests/takeover/lcp2.022.sh
new file mode 100755 (executable)
index 0000000..7eb4d8a
--- /dev/null
@@ -0,0 +1,35 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, no IPs assigned, 3->2 unhealthy, NoIPTakeover on 2 others"
+
+export CTDB_TEST_LOGLEVEL=0
+
+# We expect 1/2 the IPs to move, but the rest to stay (as opposed to
+# NoIPHostOnAllDisabled)
+required_result <<EOF
+192.168.21.254 2
+192.168.21.253 0
+192.168.21.252 2
+192.168.20.254 0
+192.168.20.253 0
+192.168.20.252 2
+192.168.20.251 0
+192.168.20.250 2
+192.168.20.249 2
+EOF
+
+export CTDB_SET_NoIPTakeover=0,1,1
+
+simple_test 2,2,2 <<EOF
+192.168.21.254 2
+192.168.21.253 2
+192.168.21.252 2
+192.168.20.254 2
+192.168.20.253 2
+192.168.20.252 2
+192.168.20.251 2
+192.168.20.250 2
+192.168.20.249 2
+EOF
diff --git a/ctdb/tests/takeover/lcp2.023.sh b/ctdb/tests/takeover/lcp2.023.sh
new file mode 100755 (executable)
index 0000000..9bffc58
--- /dev/null
@@ -0,0 +1,77 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, all IPs assigned, 1->3 unhealthy"
+
+export CTDB_TEST_LOGLEVEL=4
+
+required_result <<EOF
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  CONSIDERING MOVES (UNASSIGNED)
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  CONSIDERING MOVES FROM 2 [147968]
+DATE TIME [PID]:  2 [-58359] -> 192.168.21.254 -> 1 [+0]
+DATE TIME [PID]:  2 [-58359] -> 192.168.21.252 -> 1 [+0]
+DATE TIME [PID]:  2 [-59572] -> 192.168.20.253 -> 1 [+0]
+DATE TIME [PID]:  2 [-59823] -> 192.168.20.251 -> 1 [+0]
+DATE TIME [PID]:  2 [-59823] -> 192.168.20.249 -> 1 [+0]
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]: 2 [-59823] -> 192.168.20.251 -> 1 [+0]
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  CONSIDERING MOVES FROM 0 [89609]
+DATE TIME [PID]:  0 [-42483] -> 192.168.21.253 -> 1 [+14161]
+DATE TIME [PID]:  0 [-45662] -> 192.168.20.254 -> 1 [+15625]
+DATE TIME [PID]:  0 [-45662] -> 192.168.20.252 -> 1 [+15625]
+DATE TIME [PID]:  0 [-45411] -> 192.168.20.250 -> 1 [+16129]
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]: 0 [-45662] -> 192.168.20.254 -> 1 [+15625]
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  CONSIDERING MOVES FROM 2 [88145]
+DATE TIME [PID]:  2 [-44198] -> 192.168.21.254 -> 1 [+28322]
+DATE TIME [PID]:  2 [-44198] -> 192.168.21.252 -> 1 [+28322]
+DATE TIME [PID]:  2 [-43947] -> 192.168.20.253 -> 1 [+31501]
+DATE TIME [PID]:  2 [-43947] -> 192.168.20.249 -> 1 [+31501]
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]: 2 [-44198] -> 192.168.21.254 -> 1 [+28322]
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  CONSIDERING MOVES FROM 0 [44198]
+DATE TIME [PID]:  0 [-28322] -> 192.168.21.253 -> 1 [+44198]
+DATE TIME [PID]:  0 [-29786] -> 192.168.20.252 -> 1 [+45662]
+DATE TIME [PID]:  0 [-29786] -> 192.168.20.250 -> 1 [+45915]
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  CONSIDERING MOVES FROM 2 [44198]
+DATE TIME [PID]:  2 [-28322] -> 192.168.21.252 -> 1 [+44198]
+DATE TIME [PID]:  2 [-29786] -> 192.168.20.253 -> 1 [+45662]
+DATE TIME [PID]:  2 [-29786] -> 192.168.20.249 -> 1 [+45662]
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  ----------------------------------------
+DATE TIME [PID]:  CONSIDERING MOVES FROM 1 [43947]
+DATE TIME [PID]:  1 [-28322] -> 192.168.21.254 -> 1 [+28322]
+DATE TIME [PID]:  1 [-29786] -> 192.168.20.254 -> 1 [+29786]
+DATE TIME [PID]:  1 [-29786] -> 192.168.20.251 -> 1 [+29786]
+DATE TIME [PID]:  ----------------------------------------
+192.168.21.254 1
+192.168.21.253 0
+192.168.21.252 2
+192.168.20.254 1
+192.168.20.253 2
+192.168.20.252 0
+192.168.20.251 1
+192.168.20.250 0
+192.168.20.249 2
+EOF
+
+simple_test 2,2,2 <<EOF
+192.168.21.254 2
+192.168.21.253 0
+192.168.21.252 2
+192.168.20.254 0
+192.168.20.253 2
+192.168.20.252 0
+192.168.20.251 2
+192.168.20.250 0
+192.168.20.249 2
+EOF
diff --git a/ctdb/tests/takeover/lcp2.024.sh b/ctdb/tests/takeover/lcp2.024.sh
new file mode 100755 (executable)
index 0000000..0509552
--- /dev/null
@@ -0,0 +1,42 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, no IPs assigned, all healthy, all in STARTUP runstate"
+
+export CTDB_TEST_LOGLEVEL=2
+
+required_result <<EOF
+DATE TIME [PID]: Failed to find node to cover ip 192.168.21.254
+DATE TIME [PID]: Failed to find node to cover ip 192.168.21.253
+DATE TIME [PID]: Failed to find node to cover ip 192.168.21.252
+DATE TIME [PID]: Failed to find node to cover ip 192.168.20.254
+DATE TIME [PID]: Failed to find node to cover ip 192.168.20.253
+DATE TIME [PID]: Failed to find node to cover ip 192.168.20.252
+DATE TIME [PID]: Failed to find node to cover ip 192.168.20.251
+DATE TIME [PID]: Failed to find node to cover ip 192.168.20.250
+DATE TIME [PID]: Failed to find node to cover ip 192.168.20.249
+192.168.21.254 -1
+192.168.21.253 -1
+192.168.21.252 -1
+192.168.20.254 -1
+192.168.20.253 -1
+192.168.20.252 -1
+192.168.20.251 -1
+192.168.20.250 -1
+192.168.20.249 -1
+EOF
+
+export CTDB_TEST_RUNSTATE=4,4,4
+
+simple_test 0,0,0 <<EOF
+192.168.21.254 -1
+192.168.21.253 -1
+192.168.21.252 -1
+192.168.20.254 -1
+192.168.20.253 -1
+192.168.20.252 -1
+192.168.20.251 -1
+192.168.20.250 -1
+192.168.20.249 -1
+EOF
diff --git a/ctdb/tests/takeover/lcp2.025.sh b/ctdb/tests/takeover/lcp2.025.sh
new file mode 100755 (executable)
index 0000000..44b8583
--- /dev/null
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, no IPs assigned, all healthy, 1 in STARTUP runstate"
+
+export CTDB_TEST_LOGLEVEL=2
+
+required_result <<EOF
+192.168.21.254 1
+192.168.21.253 2
+192.168.21.252 1
+192.168.20.254 1
+192.168.20.253 2
+192.168.20.252 1
+192.168.20.251 1
+192.168.20.250 2
+192.168.20.249 2
+EOF
+
+export CTDB_TEST_RUNSTATE=4,5,5
+
+simple_test 0,0,0 <<EOF
+192.168.21.254 -1
+192.168.21.253 -1
+192.168.21.252 -1
+192.168.20.254 -1
+192.168.20.253 -1
+192.168.20.252 -1
+192.168.20.251 -1
+192.168.20.250 -1
+192.168.20.249 -1
+EOF
diff --git a/ctdb/tests/takeover/lcp2.026.sh b/ctdb/tests/takeover/lcp2.026.sh
new file mode 100755 (executable)
index 0000000..4c22ba5
--- /dev/null
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, no IPs assigned, all unhealthy, 1 in STARTUP runstate"
+
+export CTDB_TEST_LOGLEVEL=2
+
+required_result <<EOF
+192.168.21.254 1
+192.168.21.253 2
+192.168.21.252 1
+192.168.20.254 1
+192.168.20.253 2
+192.168.20.252 1
+192.168.20.251 1
+192.168.20.250 2
+192.168.20.249 2
+EOF
+
+export CTDB_TEST_RUNSTATE=4,5,5
+
+simple_test 2,2,2 <<EOF
+192.168.21.254 -1
+192.168.21.253 -1
+192.168.21.252 -1
+192.168.20.254 -1
+192.168.20.253 -1
+192.168.20.252 -1
+192.168.20.251 -1
+192.168.20.250 -1
+192.168.20.249 -1
+EOF
diff --git a/ctdb/tests/takeover/lcp2.027.sh b/ctdb/tests/takeover/lcp2.027.sh
new file mode 100755 (executable)
index 0000000..20e0f28
--- /dev/null
@@ -0,0 +1,45 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "4 nodes, all IPs assigned, 3->4 unhealthy"
+
+export CTDB_TEST_LOGLEVEL=0
+
+required_result <<EOF
+130.216.30.181 0
+130.216.30.180 1
+130.216.30.179 3
+130.216.30.178 3
+130.216.30.177 0
+130.216.30.176 1
+130.216.30.175 0
+130.216.30.174 1
+130.216.30.173 0
+130.216.30.172 3
+130.216.30.171 1
+130.216.30.170 3
+10.19.99.253 0
+10.19.99.252 1
+10.19.99.251 0
+10.19.99.250 3
+EOF
+
+simple_test 0,0,2,0 <<EOF
+130.216.30.170 3
+130.216.30.171 2
+130.216.30.172 3
+130.216.30.173 2
+130.216.30.174 1
+130.216.30.175 0
+130.216.30.176 1
+130.216.30.177 0
+130.216.30.178 3
+130.216.30.179 2
+130.216.30.180 1
+130.216.30.181 0
+10.19.99.250 3
+10.19.99.251 2
+10.19.99.252 1
+10.19.99.253 0
+EOF
diff --git a/ctdb/tests/takeover/lcp2.028.sh b/ctdb/tests/takeover/lcp2.028.sh
new file mode 100755 (executable)
index 0000000..60d22d9
--- /dev/null
@@ -0,0 +1,45 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "4 nodes, all healthy/assigned, stays unbalanced"
+
+export CTDB_TEST_LOGLEVEL=3
+
+required_result <<EOF
+130.216.30.181 0
+130.216.30.180 1
+130.216.30.179 2
+130.216.30.178 3
+130.216.30.177 0
+130.216.30.176 1
+130.216.30.175 0
+130.216.30.174 1
+130.216.30.173 0
+130.216.30.172 3
+130.216.30.171 1
+130.216.30.170 3
+10.19.99.253 0
+10.19.99.252 1
+10.19.99.251 0
+10.19.99.250 3
+EOF
+
+simple_test 0,0,0,0 <<EOF
+130.216.30.181 0
+130.216.30.180 1
+130.216.30.179 2
+130.216.30.178 3
+130.216.30.177 0
+130.216.30.176 1
+130.216.30.175 0
+130.216.30.174 1
+130.216.30.173 0
+130.216.30.172 3
+130.216.30.171 1
+130.216.30.170 3
+10.19.99.253 0
+10.19.99.252 1
+10.19.99.251 0
+10.19.99.250 3
+EOF
diff --git a/ctdb/tests/takeover/lcp2.029.sh b/ctdb/tests/takeover/lcp2.029.sh
new file mode 100755 (executable)
index 0000000..d3c817f
--- /dev/null
@@ -0,0 +1,111 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "4 nodes, some IPs unassigned on target nodes"
+
+export CTDB_TEST_LOGLEVEL=3
+
+required_result <<EOF
+DATE TIME [PID]:  10.19.99.251 -> 2 [+9216]
+DATE TIME [PID]:  130.216.30.173 -> 2 [+24345]
+DATE TIME [PID]:  130.216.30.171 -> 2 [+39970]
+130.216.30.181 0
+130.216.30.180 1
+130.216.30.179 2
+130.216.30.178 3
+130.216.30.177 0
+130.216.30.176 1
+130.216.30.175 0
+130.216.30.174 1
+130.216.30.173 2
+130.216.30.172 3
+130.216.30.171 2
+130.216.30.170 3
+10.19.99.253 0
+10.19.99.252 1
+10.19.99.251 2
+10.19.99.250 3
+EOF
+
+# In this example there were 4 releases from node 2 in a previous iteration
+#
+#   Release of IP 130.216.30.179/27 on interface ethX1  node:3
+#   Release of IP 130.216.30.173/27 on interface ethX1  node:0
+#   Release of IP 130.216.30.171/27 on interface ethX1  node:1
+#   Release of IP 10.19.99.251/22 on interface ethX2  node:0
+#
+# However, one release failed so no takeovers were done.  This means
+# that the target node for each IP still thinks that the IPs are held
+# by node 2.  The release of 130.216.30.179 was so late that node 2
+# still thought that it held that address.
+
+simple_test 0,0,0,0 multi <<EOF
+130.216.30.181 0
+130.216.30.180 1
+130.216.30.179 3
+130.216.30.178 3
+130.216.30.177 0
+130.216.30.176 1
+130.216.30.175 0
+130.216.30.174 1
+130.216.30.173 2
+130.216.30.172 3
+130.216.30.171 1
+130.216.30.170 3
+10.19.99.253 0
+10.19.99.252 1
+10.19.99.251 2
+10.19.99.250 3
+
+130.216.30.181 0
+130.216.30.180 1
+130.216.30.179 3
+130.216.30.178 3
+130.216.30.177 0
+130.216.30.176 1
+130.216.30.175 0
+130.216.30.174 1
+130.216.30.173 0
+130.216.30.172 3
+130.216.30.171 2
+130.216.30.170 3
+10.19.99.253 0
+10.19.99.252 1
+10.19.99.251 0
+10.19.99.250 3
+
+130.216.30.181 0
+130.216.30.180 1
+130.216.30.179 2
+130.216.30.178 3
+130.216.30.177 0
+130.216.30.176 1
+130.216.30.175 0
+130.216.30.174 1
+130.216.30.173 0
+130.216.30.172 3
+130.216.30.171 1
+130.216.30.170 3
+10.19.99.253 0
+10.19.99.252 1
+10.19.99.251 0
+10.19.99.250 3
+
+130.216.30.181 0
+130.216.30.180 1
+130.216.30.179 2
+130.216.30.178 3
+130.216.30.177 0
+130.216.30.176 1
+130.216.30.175 0
+130.216.30.174 1
+130.216.30.173 0
+130.216.30.172 3
+130.216.30.171 1
+130.216.30.170 3
+10.19.99.253 0
+10.19.99.252 1
+10.19.99.251 0
+10.19.99.250 3
+EOF
diff --git a/ctdb/tests/takeover/lcp2.030.sh b/ctdb/tests/takeover/lcp2.030.sh
new file mode 100755 (executable)
index 0000000..739757b
--- /dev/null
@@ -0,0 +1,1813 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "900 IPs, 5 nodes, 0 -> 5 healthy"
+
+export CTDB_TEST_LOGLEVEL=0
+
+required_result <<EOF
+192.168.10.90 0
+192.168.10.89 1
+192.168.10.88 2
+192.168.10.87 3
+192.168.10.86 4
+192.168.10.85 0
+192.168.10.84 1
+192.168.10.83 2
+192.168.10.82 3
+192.168.10.81 4
+192.168.10.80 0
+192.168.10.79 0
+192.168.10.78 1
+192.168.10.77 2
+192.168.10.76 3
+192.168.10.75 4
+192.168.10.74 1
+192.168.10.73 2
+192.168.10.72 3
+192.168.10.71 3
+192.168.10.70 4
+192.168.10.69 0
+192.168.10.68 1
+192.168.10.67 2
+192.168.10.66 4
+192.168.10.65 0
+192.168.10.64 1
+192.168.10.63 0
+192.168.10.62 1
+192.168.10.61 2
+192.168.10.60 3
+192.168.10.59 4
+192.168.10.58 2
+192.168.10.57 3
+192.168.10.56 0
+192.168.10.55 0
+192.168.10.54 1
+192.168.10.53 2
+192.168.10.52 3
+192.168.10.51 4
+192.168.10.50 1
+192.168.10.49 4
+192.168.10.48 2
+192.168.10.47 0
+192.168.10.46 1
+192.168.10.45 2
+192.168.10.44 3
+192.168.10.43 4
+192.168.10.42 2
+192.168.10.41 3
+192.168.10.40 1
+192.168.10.39 3
+192.168.10.38 4
+192.168.10.37 0
+192.168.10.36 1
+192.168.10.35 2
+192.168.10.34 4
+192.168.10.33 0
+192.168.10.32 3
+192.168.10.31 0
+192.168.10.30 1
+192.168.10.29 2
+192.168.10.28 3
+192.168.10.27 4
+192.168.10.26 3
+192.168.10.25 2
+192.168.10.24 0
+192.168.10.23 3
+192.168.10.22 4
+192.168.10.21 0
+192.168.10.20 1
+192.168.10.19 2
+192.168.10.18 4
+192.168.10.17 1
+192.168.10.16 4
+192.168.10.15 0
+192.168.10.14 1
+192.168.10.13 2
+192.168.10.12 3
+192.168.10.11 4
+192.168.10.10 2
+192.168.10.9 3
+192.168.10.8 4
+192.168.10.7 0
+192.168.10.6 1
+192.168.10.5 2
+192.168.10.4 3
+192.168.10.3 4
+192.168.10.2 0
+192.168.10.1 1
+192.168.9.90 0
+192.168.9.89 1
+192.168.9.88 2
+192.168.9.87 3
+192.168.9.86 4
+192.168.9.85 0
+192.168.9.84 1
+192.168.9.83 2
+192.168.9.82 3
+192.168.9.81 4
+192.168.9.80 0
+192.168.9.79 0
+192.168.9.78 1
+192.168.9.77 2
+192.168.9.76 3
+192.168.9.75 4
+192.168.9.74 1
+192.168.9.73 2
+192.168.9.72 3
+192.168.9.71 3
+192.168.9.70 4
+192.168.9.69 0
+192.168.9.68 1
+192.168.9.67 2
+192.168.9.66 4
+192.168.9.65 0
+192.168.9.64 1
+192.168.9.63 0
+192.168.9.62 1
+192.168.9.61 2
+192.168.9.60 3
+192.168.9.59 4
+192.168.9.58 2
+192.168.9.57 3
+192.168.9.56 4
+192.168.9.55 0
+192.168.9.54 1
+192.168.9.53 2
+192.168.9.52 3
+192.168.9.51 4
+192.168.9.50 0
+192.168.9.49 1
+192.168.9.48 2
+192.168.9.47 0
+192.168.9.46 1
+192.168.9.45 2
+192.168.9.44 3
+192.168.9.43 4
+192.168.9.42 2
+192.168.9.41 4
+192.168.9.40 3
+192.168.9.39 0
+192.168.9.38 1
+192.168.9.37 2
+192.168.9.36 3
+192.168.9.35 4
+192.168.9.34 0
+192.168.9.33 1
+192.168.9.32 4
+192.168.9.31 0
+192.168.9.30 1
+192.168.9.29 2
+192.168.9.28 3
+192.168.9.27 4
+192.168.9.26 2
+192.168.9.25 3
+192.168.9.24 0
+192.168.9.23 3
+192.168.9.22 4
+192.168.9.21 0
+192.168.9.20 1
+192.168.9.19 2
+192.168.9.18 4
+192.168.9.17 1
+192.168.9.16 3
+192.168.9.15 0
+192.168.9.14 1
+192.168.9.13 2
+192.168.9.12 3
+192.168.9.11 4
+192.168.9.10 2
+192.168.9.9 4
+192.168.9.8 3
+192.168.9.7 0
+192.168.9.6 1
+192.168.9.5 2
+192.168.9.4 3
+192.168.9.3 4
+192.168.9.2 0
+192.168.9.1 1
+192.168.8.90 0
+192.168.8.89 1
+192.168.8.88 2
+192.168.8.87 3
+192.168.8.86 4
+192.168.8.85 0
+192.168.8.84 1
+192.168.8.83 2
+192.168.8.82 3
+192.168.8.81 4
+192.168.8.80 0
+192.168.8.79 0
+192.168.8.78 1
+192.168.8.77 2
+192.168.8.76 3
+192.168.8.75 4
+192.168.8.74 1
+192.168.8.73 2
+192.168.8.72 3
+192.168.8.71 3
+192.168.8.70 4
+192.168.8.69 0
+192.168.8.68 1
+192.168.8.67 2
+192.168.8.66 4
+192.168.8.65 3
+192.168.8.64 0
+192.168.8.63 0
+192.168.8.62 1
+192.168.8.61 2
+192.168.8.60 3
+192.168.8.59 4
+192.168.8.58 1
+192.168.8.57 2
+192.168.8.56 3
+192.168.8.55 0
+192.168.8.54 1
+192.168.8.53 2
+192.168.8.52 3
+192.168.8.51 4
+192.168.8.50 0
+192.168.8.49 4
+192.168.8.48 1
+192.168.8.47 0
+192.168.8.46 1
+192.168.8.45 2
+192.168.8.44 3
+192.168.8.43 4
+192.168.8.42 2
+192.168.8.41 1
+192.168.8.40 4
+192.168.8.39 0
+192.168.8.38 1
+192.168.8.37 2
+192.168.8.36 3
+192.168.8.35 4
+192.168.8.34 3
+192.168.8.33 0
+192.168.8.32 2
+192.168.8.31 0
+192.168.8.30 1
+192.168.8.29 2
+192.168.8.28 3
+192.168.8.27 4
+192.168.8.26 2
+192.168.8.25 1
+192.168.8.24 3
+192.168.8.23 3
+192.168.8.22 4
+192.168.8.21 0
+192.168.8.20 1
+192.168.8.19 2
+192.168.8.18 4
+192.168.8.17 0
+192.168.8.16 4
+192.168.8.15 0
+192.168.8.14 1
+192.168.8.13 2
+192.168.8.12 3
+192.168.8.11 4
+192.168.8.10 1
+192.168.8.9 2
+192.168.8.8 4
+192.168.8.7 0
+192.168.8.6 1
+192.168.8.5 2
+192.168.8.4 3
+192.168.8.3 4
+192.168.8.2 3
+192.168.8.1 0
+192.168.7.90 0
+192.168.7.89 1
+192.168.7.88 2
+192.168.7.87 3
+192.168.7.86 4
+192.168.7.85 0
+192.168.7.84 1
+192.168.7.83 2
+192.168.7.82 3
+192.168.7.81 4
+192.168.7.80 1
+192.168.7.79 0
+192.168.7.78 1
+192.168.7.77 2
+192.168.7.76 3
+192.168.7.75 4
+192.168.7.74 2
+192.168.7.73 3
+192.168.7.72 0
+192.168.7.71 3
+192.168.7.70 4
+192.168.7.69 0
+192.168.7.68 1
+192.168.7.67 2
+192.168.7.66 4
+192.168.7.65 1
+192.168.7.64 3
+192.168.7.63 0
+192.168.7.62 1
+192.168.7.61 2
+192.168.7.60 3
+192.168.7.59 4
+192.168.7.58 2
+192.168.7.57 0
+192.168.7.56 1
+192.168.7.55 0
+192.168.7.54 1
+192.168.7.53 2
+192.168.7.52 3
+192.168.7.51 4
+192.168.7.50 3
+192.168.7.49 4
+192.168.7.48 2
+192.168.7.47 0
+192.168.7.46 1
+192.168.7.45 2
+192.168.7.44 3
+192.168.7.43 4
+192.168.7.42 2
+192.168.7.41 0
+192.168.7.40 1
+192.168.7.39 4
+192.168.7.38 0
+192.168.7.37 1
+192.168.7.36 2
+192.168.7.35 3
+192.168.7.34 4
+192.168.7.33 3
+192.168.7.32 0
+192.168.7.31 0
+192.168.7.30 1
+192.168.7.29 2
+192.168.7.28 3
+192.168.7.27 4
+192.168.7.26 2
+192.168.7.25 0
+192.168.7.24 1
+192.168.7.23 3
+192.168.7.22 4
+192.168.7.21 0
+192.168.7.20 1
+192.168.7.19 2
+192.168.7.18 4
+192.168.7.17 3
+192.168.7.16 4
+192.168.7.15 0
+192.168.7.14 1
+192.168.7.13 2
+192.168.7.12 3
+192.168.7.11 4
+192.168.7.10 3
+192.168.7.9 2
+192.168.7.8 0
+192.168.7.7 2
+192.168.7.6 4
+192.168.7.5 0
+192.168.7.4 1
+192.168.7.3 3
+192.168.7.2 4
+192.168.7.1 1
+192.168.6.90 0
+192.168.6.89 1
+192.168.6.88 2
+192.168.6.87 3
+192.168.6.86 4
+192.168.6.85 0
+192.168.6.84 1
+192.168.6.83 2
+192.168.6.82 4
+192.168.6.81 3
+192.168.6.80 0
+192.168.6.79 0
+192.168.6.78 1
+192.168.6.77 2
+192.168.6.76 3
+192.168.6.75 4
+192.168.6.74 2
+192.168.6.73 3
+192.168.6.72 1
+192.168.6.71 3
+192.168.6.70 4
+192.168.6.69 0
+192.168.6.68 1
+192.168.6.67 2
+192.168.6.66 4
+192.168.6.65 0
+192.168.6.64 1
+192.168.6.63 0
+192.168.6.62 1
+192.168.6.61 2
+192.168.6.60 3
+192.168.6.59 4
+192.168.6.58 2
+192.168.6.57 3
+192.168.6.56 0
+192.168.6.55 3
+192.168.6.54 4
+192.168.6.53 1
+192.168.6.52 2
+192.168.6.51 0
+192.168.6.50 4
+192.168.6.49 1
+192.168.6.48 2
+192.168.6.47 0
+192.168.6.46 1
+192.168.6.45 2
+192.168.6.44 3
+192.168.6.43 4
+192.168.6.42 2
+192.168.6.41 4
+192.168.6.40 3
+192.168.6.39 0
+192.168.6.38 1
+192.168.6.37 2
+192.168.6.36 3
+192.168.6.35 4
+192.168.6.34 0
+192.168.6.33 1
+192.168.6.32 4
+192.168.6.31 0
+192.168.6.30 1
+192.168.6.29 2
+192.168.6.28 3
+192.168.6.27 4
+192.168.6.26 2
+192.168.6.25 3
+192.168.6.24 0
+192.168.6.23 3
+192.168.6.22 4
+192.168.6.21 0
+192.168.6.20 1
+192.168.6.19 2
+192.168.6.18 4
+192.168.6.17 1
+192.168.6.16 3
+192.168.6.15 0
+192.168.6.14 1
+192.168.6.13 2
+192.168.6.12 3
+192.168.6.11 4
+192.168.6.10 2
+192.168.6.9 3
+192.168.6.8 4
+192.168.6.7 0
+192.168.6.6 1
+192.168.6.5 2
+192.168.6.4 3
+192.168.6.3 4
+192.168.6.2 0
+192.168.6.1 1
+192.168.5.90 0
+192.168.5.89 1
+192.168.5.88 2
+192.168.5.87 3
+192.168.5.86 4
+192.168.5.85 0
+192.168.5.84 1
+192.168.5.83 2
+192.168.5.82 4
+192.168.5.81 3
+192.168.5.80 0
+192.168.5.79 0
+192.168.5.78 1
+192.168.5.77 2
+192.168.5.76 3
+192.168.5.75 4
+192.168.5.74 2
+192.168.5.73 3
+192.168.5.72 1
+192.168.5.71 3
+192.168.5.70 4
+192.168.5.69 2
+192.168.5.68 0
+192.168.5.67 1
+192.168.5.66 4
+192.168.5.65 2
+192.168.5.64 0
+192.168.5.63 0
+192.168.5.62 1
+192.168.5.61 2
+192.168.5.60 3
+192.168.5.59 4
+192.168.5.58 1
+192.168.5.57 3
+192.168.5.56 2
+192.168.5.55 0
+192.168.5.54 1
+192.168.5.53 2
+192.168.5.52 3
+192.168.5.51 4
+192.168.5.50 0
+192.168.5.49 4
+192.168.5.48 1
+192.168.5.47 0
+192.168.5.46 1
+192.168.5.45 2
+192.168.5.44 3
+192.168.5.43 4
+192.168.5.42 1
+192.168.5.41 3
+192.168.5.40 2
+192.168.5.39 2
+192.168.5.38 3
+192.168.5.37 4
+192.168.5.36 0
+192.168.5.35 1
+192.168.5.34 4
+192.168.5.33 0
+192.168.5.32 4
+192.168.5.31 0
+192.168.5.30 1
+192.168.5.29 2
+192.168.5.28 3
+192.168.5.27 4
+192.168.5.26 1
+192.168.5.25 3
+192.168.5.24 2
+192.168.5.23 3
+192.168.5.22 4
+192.168.5.21 2
+192.168.5.20 0
+192.168.5.19 1
+192.168.5.18 4
+192.168.5.17 0
+192.168.5.16 3
+192.168.5.15 0
+192.168.5.14 1
+192.168.5.13 2
+192.168.5.12 3
+192.168.5.11 4
+192.168.5.10 1
+192.168.5.9 4
+192.168.5.8 3
+192.168.5.7 0
+192.168.5.6 1
+192.168.5.5 2
+192.168.5.4 3
+192.168.5.3 4
+192.168.5.2 2
+192.168.5.1 0
+192.168.4.90 0
+192.168.4.89 1
+192.168.4.88 2
+192.168.4.87 3
+192.168.4.86 4
+192.168.4.85 0
+192.168.4.84 1
+192.168.4.83 2
+192.168.4.82 3
+192.168.4.81 4
+192.168.4.80 0
+192.168.4.79 0
+192.168.4.78 1
+192.168.4.77 2
+192.168.4.76 3
+192.168.4.75 4
+192.168.4.74 1
+192.168.4.73 2
+192.168.4.72 3
+192.168.4.71 3
+192.168.4.70 4
+192.168.4.69 0
+192.168.4.68 1
+192.168.4.67 2
+192.168.4.66 4
+192.168.4.65 1
+192.168.4.64 3
+192.168.4.63 0
+192.168.4.62 1
+192.168.4.61 2
+192.168.4.60 3
+192.168.4.59 4
+192.168.4.58 0
+192.168.4.57 2
+192.168.4.56 1
+192.168.4.55 0
+192.168.4.54 1
+192.168.4.53 2
+192.168.4.52 3
+192.168.4.51 4
+192.168.4.50 3
+192.168.4.49 4
+192.168.4.48 0
+192.168.4.47 0
+192.168.4.46 1
+192.168.4.45 2
+192.168.4.44 3
+192.168.4.43 4
+192.168.4.42 2
+192.168.4.41 0
+192.168.4.40 1
+192.168.4.39 4
+192.168.4.38 0
+192.168.4.37 1
+192.168.4.36 2
+192.168.4.35 3
+192.168.4.34 4
+192.168.4.33 3
+192.168.4.32 2
+192.168.4.31 0
+192.168.4.30 1
+192.168.4.29 2
+192.168.4.28 3
+192.168.4.27 4
+192.168.4.26 0
+192.168.4.25 2
+192.168.4.24 1
+192.168.4.23 3
+192.168.4.22 4
+192.168.4.21 0
+192.168.4.20 1
+192.168.4.19 2
+192.168.4.18 4
+192.168.4.17 3
+192.168.4.16 1
+192.168.4.15 0
+192.168.4.14 1
+192.168.4.13 2
+192.168.4.12 3
+192.168.4.11 4
+192.168.4.10 3
+192.168.4.9 0
+192.168.4.8 2
+192.168.4.7 2
+192.168.4.6 3
+192.168.4.5 4
+192.168.4.4 0
+192.168.4.3 1
+192.168.4.2 4
+192.168.4.1 4
+192.168.3.90 0
+192.168.3.89 1
+192.168.3.88 2
+192.168.3.87 3
+192.168.3.86 4
+192.168.3.85 0
+192.168.3.84 1
+192.168.3.83 2
+192.168.3.82 3
+192.168.3.81 4
+192.168.3.80 0
+192.168.3.79 0
+192.168.3.78 1
+192.168.3.77 2
+192.168.3.76 3
+192.168.3.75 4
+192.168.3.74 1
+192.168.3.73 2
+192.168.3.72 3
+192.168.3.71 3
+192.168.3.70 4
+192.168.3.69 0
+192.168.3.68 1
+192.168.3.67 2
+192.168.3.66 4
+192.168.3.65 0
+192.168.3.64 3
+192.168.3.63 0
+192.168.3.62 1
+192.168.3.61 2
+192.168.3.60 3
+192.168.3.59 4
+192.168.3.58 2
+192.168.3.57 1
+192.168.3.56 3
+192.168.3.55 0
+192.168.3.54 1
+192.168.3.53 2
+192.168.3.52 3
+192.168.3.51 4
+192.168.3.50 0
+192.168.3.49 4
+192.168.3.48 2
+192.168.3.47 0
+192.168.3.46 1
+192.168.3.45 2
+192.168.3.44 3
+192.168.3.43 4
+192.168.3.42 2
+192.168.3.41 1
+192.168.3.40 0
+192.168.3.39 1
+192.168.3.38 2
+192.168.3.37 3
+192.168.3.36 4
+192.168.3.35 0
+192.168.3.34 4
+192.168.3.33 3
+192.168.3.32 4
+192.168.3.31 0
+192.168.3.30 1
+192.168.3.29 2
+192.168.3.28 3
+192.168.3.27 4
+192.168.3.26 2
+192.168.3.25 1
+192.168.3.24 0
+192.168.3.23 3
+192.168.3.22 4
+192.168.3.21 0
+192.168.3.20 1
+192.168.3.19 2
+192.168.3.18 4
+192.168.3.17 3
+192.168.3.16 1
+192.168.3.15 0
+192.168.3.14 1
+192.168.3.13 2
+192.168.3.12 3
+192.168.3.11 4
+192.168.3.10 2
+192.168.3.9 1
+192.168.3.8 0
+192.168.3.7 4
+192.168.3.6 0
+192.168.3.5 1
+192.168.3.4 2
+192.168.3.3 3
+192.168.3.2 4
+192.168.3.1 3
+192.168.2.90 0
+192.168.2.89 1
+192.168.2.88 2
+192.168.2.87 3
+192.168.2.86 4
+192.168.2.85 0
+192.168.2.84 1
+192.168.2.83 2
+192.168.2.82 3
+192.168.2.81 4
+192.168.2.80 1
+192.168.2.79 0
+192.168.2.78 1
+192.168.2.77 2
+192.168.2.76 3
+192.168.2.75 4
+192.168.2.74 2
+192.168.2.73 3
+192.168.2.72 0
+192.168.2.71 3
+192.168.2.70 4
+192.168.2.69 0
+192.168.2.68 1
+192.168.2.67 2
+192.168.2.66 4
+192.168.2.65 1
+192.168.2.64 3
+192.168.2.63 0
+192.168.2.62 1
+192.168.2.61 2
+192.168.2.60 3
+192.168.2.59 4
+192.168.2.58 0
+192.168.2.57 2
+192.168.2.56 1
+192.168.2.55 0
+192.168.2.54 1
+192.168.2.53 2
+192.168.2.52 3
+192.168.2.51 4
+192.168.2.50 3
+192.168.2.49 4
+192.168.2.48 0
+192.168.2.47 0
+192.168.2.46 1
+192.168.2.45 2
+192.168.2.44 3
+192.168.2.43 4
+192.168.2.42 2
+192.168.2.41 0
+192.168.2.40 1
+192.168.2.39 0
+192.168.2.38 1
+192.168.2.37 2
+192.168.2.36 3
+192.168.2.35 4
+192.168.2.34 3
+192.168.2.33 4
+192.168.2.32 2
+192.168.2.31 0
+192.168.2.30 1
+192.168.2.29 2
+192.168.2.28 3
+192.168.2.27 4
+192.168.2.26 2
+192.168.2.25 0
+192.168.2.24 1
+192.168.2.23 3
+192.168.2.22 4
+192.168.2.21 0
+192.168.2.20 1
+192.168.2.19 2
+192.168.2.18 4
+192.168.2.17 3
+192.168.2.16 4
+192.168.2.15 0
+192.168.2.14 1
+192.168.2.13 2
+192.168.2.12 3
+192.168.2.11 4
+192.168.2.10 0
+192.168.2.9 2
+192.168.2.8 3
+192.168.2.7 2
+192.168.2.6 4
+192.168.2.5 0
+192.168.2.4 1
+192.168.2.3 3
+192.168.2.2 4
+192.168.2.1 1
+192.168.1.90 0
+192.168.1.89 1
+192.168.1.88 2
+192.168.1.87 3
+192.168.1.86 4
+192.168.1.85 0
+192.168.1.84 1
+192.168.1.83 2
+192.168.1.82 3
+192.168.1.81 4
+192.168.1.80 0
+192.168.1.79 0
+192.168.1.78 1
+192.168.1.77 2
+192.168.1.76 3
+192.168.1.75 4
+192.168.1.74 1
+192.168.1.73 2
+192.168.1.72 3
+192.168.1.71 3
+192.168.1.70 4
+192.168.1.69 0
+192.168.1.68 1
+192.168.1.67 2
+192.168.1.66 4
+192.168.1.65 0
+192.168.1.64 1
+192.168.1.63 0
+192.168.1.62 1
+192.168.1.61 2
+192.168.1.60 3
+192.168.1.59 4
+192.168.1.58 2
+192.168.1.57 3
+192.168.1.56 1
+192.168.1.55 0
+192.168.1.54 1
+192.168.1.53 2
+192.168.1.52 3
+192.168.1.51 4
+192.168.1.50 0
+192.168.1.49 4
+192.168.1.48 2
+192.168.1.47 0
+192.168.1.46 1
+192.168.1.45 2
+192.168.1.44 3
+192.168.1.43 4
+192.168.1.42 2
+192.168.1.41 3
+192.168.1.40 0
+192.168.1.39 3
+192.168.1.38 4
+192.168.1.37 0
+192.168.1.36 1
+192.168.1.35 2
+192.168.1.34 4
+192.168.1.33 1
+192.168.1.32 3
+192.168.1.31 0
+192.168.1.30 1
+192.168.1.29 2
+192.168.1.28 3
+192.168.1.27 4
+192.168.1.26 2
+192.168.1.25 3
+192.168.1.24 0
+192.168.1.23 3
+192.168.1.22 4
+192.168.1.21 0
+192.168.1.20 1
+192.168.1.19 2
+192.168.1.18 4
+192.168.1.17 1
+192.168.1.16 4
+192.168.1.15 0
+192.168.1.14 1
+192.168.1.13 2
+192.168.1.12 3
+192.168.1.11 4
+192.168.1.10 2
+192.168.1.9 3
+192.168.1.8 0
+192.168.1.7 3
+192.168.1.6 4
+192.168.1.5 0
+192.168.1.4 1
+192.168.1.3 2
+192.168.1.2 4
+192.168.1.1 1
+EOF
+
+simple_test 0,0,0,0,0 <<EOF
+192.168.1.1 -1
+192.168.1.2 -1
+192.168.1.3 -1
+192.168.1.4 -1
+192.168.1.5 -1
+192.168.1.6 -1
+192.168.1.7 -1
+192.168.1.8 -1
+192.168.1.9 -1
+192.168.1.10 -1
+192.168.1.11 -1
+192.168.1.12 -1
+192.168.1.13 -1
+192.168.1.14 -1
+192.168.1.15 -1
+192.168.1.16 -1
+192.168.1.17 -1
+192.168.1.18 -1
+192.168.1.19 -1
+192.168.1.20 -1
+192.168.1.21 -1
+192.168.1.22 -1
+192.168.1.23 -1
+192.168.1.24 -1
+192.168.1.25 -1
+192.168.1.26 -1
+192.168.1.27 -1
+192.168.1.28 -1
+192.168.1.29 -1
+192.168.1.30 -1
+192.168.1.31 -1
+192.168.1.32 -1
+192.168.1.33 -1
+192.168.1.34 -1
+192.168.1.35 -1
+192.168.1.36 -1
+192.168.1.37 -1
+192.168.1.38 -1
+192.168.1.39 -1
+192.168.1.40 -1
+192.168.1.41 -1
+192.168.1.42 -1
+192.168.1.43 -1
+192.168.1.44 -1
+192.168.1.45 -1
+192.168.1.46 -1
+192.168.1.47 -1
+192.168.1.48 -1
+192.168.1.49 -1
+192.168.1.50 -1
+192.168.1.51 -1
+192.168.1.52 -1
+192.168.1.53 -1
+192.168.1.54 -1
+192.168.1.55 -1
+192.168.1.56 -1
+192.168.1.57 -1
+192.168.1.58 -1
+192.168.1.59 -1
+192.168.1.60 -1
+192.168.1.61 -1
+192.168.1.62 -1
+192.168.1.63 -1
+192.168.1.64 -1
+192.168.1.65 -1
+192.168.1.66 -1
+192.168.1.67 -1
+192.168.1.68 -1
+192.168.1.69 -1
+192.168.1.70 -1
+192.168.1.71 -1
+192.168.1.72 -1
+192.168.1.73 -1
+192.168.1.74 -1
+192.168.1.75 -1
+192.168.1.76 -1
+192.168.1.77 -1
+192.168.1.78 -1
+192.168.1.79 -1
+192.168.1.80 -1
+192.168.1.81 -1
+192.168.1.82 -1
+192.168.1.83 -1
+192.168.1.84 -1
+192.168.1.85 -1
+192.168.1.86 -1
+192.168.1.87 -1
+192.168.1.88 -1
+192.168.1.89 -1
+192.168.1.90 -1
+192.168.2.1 -1
+192.168.2.2 -1
+192.168.2.3 -1
+192.168.2.4 -1
+192.168.2.5 -1
+192.168.2.6 -1
+192.168.2.7 -1
+192.168.2.8 -1
+192.168.2.9 -1
+192.168.2.10 -1
+192.168.2.11 -1
+192.168.2.12 -1
+192.168.2.13 -1
+192.168.2.14 -1
+192.168.2.15 -1
+192.168.2.16 -1
+192.168.2.17 -1
+192.168.2.18 -1
+192.168.2.19 -1
+192.168.2.20 -1
+192.168.2.21 -1
+192.168.2.22 -1
+192.168.2.23 -1
+192.168.2.24 -1
+192.168.2.25 -1
+192.168.2.26 -1
+192.168.2.27 -1
+192.168.2.28 -1
+192.168.2.29 -1
+192.168.2.30 -1
+192.168.2.31 -1
+192.168.2.32 -1
+192.168.2.33 -1
+192.168.2.34 -1
+192.168.2.35 -1
+192.168.2.36 -1
+192.168.2.37 -1
+192.168.2.38 -1
+192.168.2.39 -1
+192.168.2.40 -1
+192.168.2.41 -1
+192.168.2.42 -1
+192.168.2.43 -1
+192.168.2.44 -1
+192.168.2.45 -1
+192.168.2.46 -1
+192.168.2.47 -1
+192.168.2.48 -1
+192.168.2.49 -1
+192.168.2.50 -1
+192.168.2.51 -1
+192.168.2.52 -1
+192.168.2.53 -1
+192.168.2.54 -1
+192.168.2.55 -1
+192.168.2.56 -1
+192.168.2.57 -1
+192.168.2.58 -1
+192.168.2.59 -1
+192.168.2.60 -1
+192.168.2.61 -1
+192.168.2.62 -1
+192.168.2.63 -1
+192.168.2.64 -1
+192.168.2.65 -1
+192.168.2.66 -1
+192.168.2.67 -1
+192.168.2.68 -1
+192.168.2.69 -1
+192.168.2.70 -1
+192.168.2.71 -1
+192.168.2.72 -1
+192.168.2.73 -1
+192.168.2.74 -1
+192.168.2.75 -1
+192.168.2.76 -1
+192.168.2.77 -1
+192.168.2.78 -1
+192.168.2.79 -1
+192.168.2.80 -1
+192.168.2.81 -1
+192.168.2.82 -1
+192.168.2.83 -1
+192.168.2.84 -1
+192.168.2.85 -1
+192.168.2.86 -1
+192.168.2.87 -1
+192.168.2.88 -1
+192.168.2.89 -1
+192.168.2.90 -1
+192.168.3.1 -1
+192.168.3.2 -1
+192.168.3.3 -1
+192.168.3.4 -1
+192.168.3.5 -1
+192.168.3.6 -1
+192.168.3.7 -1
+192.168.3.8 -1
+192.168.3.9 -1
+192.168.3.10 -1
+192.168.3.11 -1
+192.168.3.12 -1
+192.168.3.13 -1
+192.168.3.14 -1
+192.168.3.15 -1
+192.168.3.16 -1
+192.168.3.17 -1
+192.168.3.18 -1
+192.168.3.19 -1
+192.168.3.20 -1
+192.168.3.21 -1
+192.168.3.22 -1
+192.168.3.23 -1
+192.168.3.24 -1
+192.168.3.25 -1
+192.168.3.26 -1
+192.168.3.27 -1
+192.168.3.28 -1
+192.168.3.29 -1
+192.168.3.30 -1
+192.168.3.31 -1
+192.168.3.32 -1
+192.168.3.33 -1
+192.168.3.34 -1
+192.168.3.35 -1
+192.168.3.36 -1
+192.168.3.37 -1
+192.168.3.38 -1
+192.168.3.39 -1
+192.168.3.40 -1
+192.168.3.41 -1
+192.168.3.42 -1
+192.168.3.43 -1
+192.168.3.44 -1
+192.168.3.45 -1
+192.168.3.46 -1
+192.168.3.47 -1
+192.168.3.48 -1
+192.168.3.49 -1
+192.168.3.50 -1
+192.168.3.51 -1
+192.168.3.52 -1
+192.168.3.53 -1
+192.168.3.54 -1
+192.168.3.55 -1
+192.168.3.56 -1
+192.168.3.57 -1
+192.168.3.58 -1
+192.168.3.59 -1
+192.168.3.60 -1
+192.168.3.61 -1
+192.168.3.62 -1
+192.168.3.63 -1
+192.168.3.64 -1
+192.168.3.65 -1
+192.168.3.66 -1
+192.168.3.67 -1
+192.168.3.68 -1
+192.168.3.69 -1
+192.168.3.70 -1
+192.168.3.71 -1
+192.168.3.72 -1
+192.168.3.73 -1
+192.168.3.74 -1
+192.168.3.75 -1
+192.168.3.76 -1
+192.168.3.77 -1
+192.168.3.78 -1
+192.168.3.79 -1
+192.168.3.80 -1
+192.168.3.81 -1
+192.168.3.82 -1
+192.168.3.83 -1
+192.168.3.84 -1
+192.168.3.85 -1
+192.168.3.86 -1
+192.168.3.87 -1
+192.168.3.88 -1
+192.168.3.89 -1
+192.168.3.90 -1
+192.168.4.1 -1
+192.168.4.2 -1
+192.168.4.3 -1
+192.168.4.4 -1
+192.168.4.5 -1
+192.168.4.6 -1
+192.168.4.7 -1
+192.168.4.8 -1
+192.168.4.9 -1
+192.168.4.10 -1
+192.168.4.11 -1
+192.168.4.12 -1
+192.168.4.13 -1
+192.168.4.14 -1
+192.168.4.15 -1
+192.168.4.16 -1
+192.168.4.17 -1
+192.168.4.18 -1
+192.168.4.19 -1
+192.168.4.20 -1
+192.168.4.21 -1
+192.168.4.22 -1
+192.168.4.23 -1
+192.168.4.24 -1
+192.168.4.25 -1
+192.168.4.26 -1
+192.168.4.27 -1
+192.168.4.28 -1
+192.168.4.29 -1
+192.168.4.30 -1
+192.168.4.31 -1
+192.168.4.32 -1
+192.168.4.33 -1
+192.168.4.34 -1
+192.168.4.35 -1
+192.168.4.36 -1
+192.168.4.37 -1
+192.168.4.38 -1
+192.168.4.39 -1
+192.168.4.40 -1
+192.168.4.41 -1
+192.168.4.42 -1
+192.168.4.43 -1
+192.168.4.44 -1
+192.168.4.45 -1
+192.168.4.46 -1
+192.168.4.47 -1
+192.168.4.48 -1
+192.168.4.49 -1
+192.168.4.50 -1
+192.168.4.51 -1
+192.168.4.52 -1
+192.168.4.53 -1
+192.168.4.54 -1
+192.168.4.55 -1
+192.168.4.56 -1
+192.168.4.57 -1
+192.168.4.58 -1
+192.168.4.59 -1
+192.168.4.60 -1
+192.168.4.61 -1
+192.168.4.62 -1
+192.168.4.63 -1
+192.168.4.64 -1
+192.168.4.65 -1
+192.168.4.66 -1
+192.168.4.67 -1
+192.168.4.68 -1
+192.168.4.69 -1
+192.168.4.70 -1
+192.168.4.71 -1
+192.168.4.72 -1
+192.168.4.73 -1
+192.168.4.74 -1
+192.168.4.75 -1
+192.168.4.76 -1
+192.168.4.77 -1
+192.168.4.78 -1
+192.168.4.79 -1
+192.168.4.80 -1
+192.168.4.81 -1
+192.168.4.82 -1
+192.168.4.83 -1
+192.168.4.84 -1
+192.168.4.85 -1
+192.168.4.86 -1
+192.168.4.87 -1
+192.168.4.88 -1
+192.168.4.89 -1
+192.168.4.90 -1
+192.168.5.1 -1
+192.168.5.2 -1
+192.168.5.3 -1
+192.168.5.4 -1
+192.168.5.5 -1
+192.168.5.6 -1
+192.168.5.7 -1
+192.168.5.8 -1
+192.168.5.9 -1
+192.168.5.10 -1
+192.168.5.11 -1
+192.168.5.12 -1
+192.168.5.13 -1
+192.168.5.14 -1
+192.168.5.15 -1
+192.168.5.16 -1
+192.168.5.17 -1
+192.168.5.18 -1
+192.168.5.19 -1
+192.168.5.20 -1
+192.168.5.21 -1
+192.168.5.22 -1
+192.168.5.23 -1
+192.168.5.24 -1
+192.168.5.25 -1
+192.168.5.26 -1
+192.168.5.27 -1
+192.168.5.28 -1
+192.168.5.29 -1
+192.168.5.30 -1
+192.168.5.31 -1
+192.168.5.32 -1
+192.168.5.33 -1
+192.168.5.34 -1
+192.168.5.35 -1
+192.168.5.36 -1
+192.168.5.37 -1
+192.168.5.38 -1
+192.168.5.39 -1
+192.168.5.40 -1
+192.168.5.41 -1
+192.168.5.42 -1
+192.168.5.43 -1
+192.168.5.44 -1
+192.168.5.45 -1
+192.168.5.46 -1
+192.168.5.47 -1
+192.168.5.48 -1
+192.168.5.49 -1
+192.168.5.50 -1
+192.168.5.51 -1
+192.168.5.52 -1
+192.168.5.53 -1
+192.168.5.54 -1
+192.168.5.55 -1
+192.168.5.56 -1
+192.168.5.57 -1
+192.168.5.58 -1
+192.168.5.59 -1
+192.168.5.60 -1
+192.168.5.61 -1
+192.168.5.62 -1
+192.168.5.63 -1
+192.168.5.64 -1
+192.168.5.65 -1
+192.168.5.66 -1
+192.168.5.67 -1
+192.168.5.68 -1
+192.168.5.69 -1
+192.168.5.70 -1
+192.168.5.71 -1
+192.168.5.72 -1
+192.168.5.73 -1
+192.168.5.74 -1
+192.168.5.75 -1
+192.168.5.76 -1
+192.168.5.77 -1
+192.168.5.78 -1
+192.168.5.79 -1
+192.168.5.80 -1
+192.168.5.81 -1
+192.168.5.82 -1
+192.168.5.83 -1
+192.168.5.84 -1
+192.168.5.85 -1
+192.168.5.86 -1
+192.168.5.87 -1
+192.168.5.88 -1
+192.168.5.89 -1
+192.168.5.90 -1
+192.168.6.1 -1
+192.168.6.2 -1
+192.168.6.3 -1
+192.168.6.4 -1
+192.168.6.5 -1
+192.168.6.6 -1
+192.168.6.7 -1
+192.168.6.8 -1
+192.168.6.9 -1
+192.168.6.10 -1
+192.168.6.11 -1
+192.168.6.12 -1
+192.168.6.13 -1
+192.168.6.14 -1
+192.168.6.15 -1
+192.168.6.16 -1
+192.168.6.17 -1
+192.168.6.18 -1
+192.168.6.19 -1
+192.168.6.20 -1
+192.168.6.21 -1
+192.168.6.22 -1
+192.168.6.23 -1
+192.168.6.24 -1
+192.168.6.25 -1
+192.168.6.26 -1
+192.168.6.27 -1
+192.168.6.28 -1
+192.168.6.29 -1
+192.168.6.30 -1
+192.168.6.31 -1
+192.168.6.32 -1
+192.168.6.33 -1
+192.168.6.34 -1
+192.168.6.35 -1
+192.168.6.36 -1
+192.168.6.37 -1
+192.168.6.38 -1
+192.168.6.39 -1
+192.168.6.40 -1
+192.168.6.41 -1
+192.168.6.42 -1
+192.168.6.43 -1
+192.168.6.44 -1
+192.168.6.45 -1
+192.168.6.46 -1
+192.168.6.47 -1
+192.168.6.48 -1
+192.168.6.49 -1
+192.168.6.50 -1
+192.168.6.51 -1
+192.168.6.52 -1
+192.168.6.53 -1
+192.168.6.54 -1
+192.168.6.55 -1
+192.168.6.56 -1
+192.168.6.57 -1
+192.168.6.58 -1
+192.168.6.59 -1
+192.168.6.60 -1
+192.168.6.61 -1
+192.168.6.62 -1
+192.168.6.63 -1
+192.168.6.64 -1
+192.168.6.65 -1
+192.168.6.66 -1
+192.168.6.67 -1
+192.168.6.68 -1
+192.168.6.69 -1
+192.168.6.70 -1
+192.168.6.71 -1
+192.168.6.72 -1
+192.168.6.73 -1
+192.168.6.74 -1
+192.168.6.75 -1
+192.168.6.76 -1
+192.168.6.77 -1
+192.168.6.78 -1
+192.168.6.79 -1
+192.168.6.80 -1
+192.168.6.81 -1
+192.168.6.82 -1
+192.168.6.83 -1
+192.168.6.84 -1
+192.168.6.85 -1
+192.168.6.86 -1
+192.168.6.87 -1
+192.168.6.88 -1
+192.168.6.89 -1
+192.168.6.90 -1
+192.168.7.1 -1
+192.168.7.2 -1
+192.168.7.3 -1
+192.168.7.4 -1
+192.168.7.5 -1
+192.168.7.6 -1
+192.168.7.7 -1
+192.168.7.8 -1
+192.168.7.9 -1
+192.168.7.10 -1
+192.168.7.11 -1
+192.168.7.12 -1
+192.168.7.13 -1
+192.168.7.14 -1
+192.168.7.15 -1
+192.168.7.16 -1
+192.168.7.17 -1
+192.168.7.18 -1
+192.168.7.19 -1
+192.168.7.20 -1
+192.168.7.21 -1
+192.168.7.22 -1
+192.168.7.23 -1
+192.168.7.24 -1
+192.168.7.25 -1
+192.168.7.26 -1
+192.168.7.27 -1
+192.168.7.28 -1
+192.168.7.29 -1
+192.168.7.30 -1
+192.168.7.31 -1
+192.168.7.32 -1
+192.168.7.33 -1
+192.168.7.34 -1
+192.168.7.35 -1
+192.168.7.36 -1
+192.168.7.37 -1
+192.168.7.38 -1
+192.168.7.39 -1
+192.168.7.40 -1
+192.168.7.41 -1
+192.168.7.42 -1
+192.168.7.43 -1
+192.168.7.44 -1
+192.168.7.45 -1
+192.168.7.46 -1
+192.168.7.47 -1
+192.168.7.48 -1
+192.168.7.49 -1
+192.168.7.50 -1
+192.168.7.51 -1
+192.168.7.52 -1
+192.168.7.53 -1
+192.168.7.54 -1
+192.168.7.55 -1
+192.168.7.56 -1
+192.168.7.57 -1
+192.168.7.58 -1
+192.168.7.59 -1
+192.168.7.60 -1
+192.168.7.61 -1
+192.168.7.62 -1
+192.168.7.63 -1
+192.168.7.64 -1
+192.168.7.65 -1
+192.168.7.66 -1
+192.168.7.67 -1
+192.168.7.68 -1
+192.168.7.69 -1
+192.168.7.70 -1
+192.168.7.71 -1
+192.168.7.72 -1
+192.168.7.73 -1
+192.168.7.74 -1
+192.168.7.75 -1
+192.168.7.76 -1
+192.168.7.77 -1
+192.168.7.78 -1
+192.168.7.79 -1
+192.168.7.80 -1
+192.168.7.81 -1
+192.168.7.82 -1
+192.168.7.83 -1
+192.168.7.84 -1
+192.168.7.85 -1
+192.168.7.86 -1
+192.168.7.87 -1
+192.168.7.88 -1
+192.168.7.89 -1
+192.168.7.90 -1
+192.168.8.1 -1
+192.168.8.2 -1
+192.168.8.3 -1
+192.168.8.4 -1
+192.168.8.5 -1
+192.168.8.6 -1
+192.168.8.7 -1
+192.168.8.8 -1
+192.168.8.9 -1
+192.168.8.10 -1
+192.168.8.11 -1
+192.168.8.12 -1
+192.168.8.13 -1
+192.168.8.14 -1
+192.168.8.15 -1
+192.168.8.16 -1
+192.168.8.17 -1
+192.168.8.18 -1
+192.168.8.19 -1
+192.168.8.20 -1
+192.168.8.21 -1
+192.168.8.22 -1
+192.168.8.23 -1
+192.168.8.24 -1
+192.168.8.25 -1
+192.168.8.26 -1
+192.168.8.27 -1
+192.168.8.28 -1
+192.168.8.29 -1
+192.168.8.30 -1
+192.168.8.31 -1
+192.168.8.32 -1
+192.168.8.33 -1
+192.168.8.34 -1
+192.168.8.35 -1
+192.168.8.36 -1
+192.168.8.37 -1
+192.168.8.38 -1
+192.168.8.39 -1
+192.168.8.40 -1
+192.168.8.41 -1
+192.168.8.42 -1
+192.168.8.43 -1
+192.168.8.44 -1
+192.168.8.45 -1
+192.168.8.46 -1
+192.168.8.47 -1
+192.168.8.48 -1
+192.168.8.49 -1
+192.168.8.50 -1
+192.168.8.51 -1
+192.168.8.52 -1
+192.168.8.53 -1
+192.168.8.54 -1
+192.168.8.55 -1
+192.168.8.56 -1
+192.168.8.57 -1
+192.168.8.58 -1
+192.168.8.59 -1
+192.168.8.60 -1
+192.168.8.61 -1
+192.168.8.62 -1
+192.168.8.63 -1
+192.168.8.64 -1
+192.168.8.65 -1
+192.168.8.66 -1
+192.168.8.67 -1
+192.168.8.68 -1
+192.168.8.69 -1
+192.168.8.70 -1
+192.168.8.71 -1
+192.168.8.72 -1
+192.168.8.73 -1
+192.168.8.74 -1
+192.168.8.75 -1
+192.168.8.76 -1
+192.168.8.77 -1
+192.168.8.78 -1
+192.168.8.79 -1
+192.168.8.80 -1
+192.168.8.81 -1
+192.168.8.82 -1
+192.168.8.83 -1
+192.168.8.84 -1
+192.168.8.85 -1
+192.168.8.86 -1
+192.168.8.87 -1
+192.168.8.88 -1
+192.168.8.89 -1
+192.168.8.90 -1
+192.168.9.1 -1
+192.168.9.2 -1
+192.168.9.3 -1
+192.168.9.4 -1
+192.168.9.5 -1
+192.168.9.6 -1
+192.168.9.7 -1
+192.168.9.8 -1
+192.168.9.9 -1
+192.168.9.10 -1
+192.168.9.11 -1
+192.168.9.12 -1
+192.168.9.13 -1
+192.168.9.14 -1
+192.168.9.15 -1
+192.168.9.16 -1
+192.168.9.17 -1
+192.168.9.18 -1
+192.168.9.19 -1
+192.168.9.20 -1
+192.168.9.21 -1
+192.168.9.22 -1
+192.168.9.23 -1
+192.168.9.24 -1
+192.168.9.25 -1
+192.168.9.26 -1
+192.168.9.27 -1
+192.168.9.28 -1
+192.168.9.29 -1
+192.168.9.30 -1
+192.168.9.31 -1
+192.168.9.32 -1
+192.168.9.33 -1
+192.168.9.34 -1
+192.168.9.35 -1
+192.168.9.36 -1
+192.168.9.37 -1
+192.168.9.38 -1
+192.168.9.39 -1
+192.168.9.40 -1
+192.168.9.41 -1
+192.168.9.42 -1
+192.168.9.43 -1
+192.168.9.44 -1
+192.168.9.45 -1
+192.168.9.46 -1
+192.168.9.47 -1
+192.168.9.48 -1
+192.168.9.49 -1
+192.168.9.50 -1
+192.168.9.51 -1
+192.168.9.52 -1
+192.168.9.53 -1
+192.168.9.54 -1
+192.168.9.55 -1
+192.168.9.56 -1
+192.168.9.57 -1
+192.168.9.58 -1
+192.168.9.59 -1
+192.168.9.60 -1
+192.168.9.61 -1
+192.168.9.62 -1
+192.168.9.63 -1
+192.168.9.64 -1
+192.168.9.65 -1
+192.168.9.66 -1
+192.168.9.67 -1
+192.168.9.68 -1
+192.168.9.69 -1
+192.168.9.70 -1
+192.168.9.71 -1
+192.168.9.72 -1
+192.168.9.73 -1
+192.168.9.74 -1
+192.168.9.75 -1
+192.168.9.76 -1
+192.168.9.77 -1
+192.168.9.78 -1
+192.168.9.79 -1
+192.168.9.80 -1
+192.168.9.81 -1
+192.168.9.82 -1
+192.168.9.83 -1
+192.168.9.84 -1
+192.168.9.85 -1
+192.168.9.86 -1
+192.168.9.87 -1
+192.168.9.88 -1
+192.168.9.89 -1
+192.168.9.90 -1
+192.168.10.1 -1
+192.168.10.2 -1
+192.168.10.3 -1
+192.168.10.4 -1
+192.168.10.5 -1
+192.168.10.6 -1
+192.168.10.7 -1
+192.168.10.8 -1
+192.168.10.9 -1
+192.168.10.10 -1
+192.168.10.11 -1
+192.168.10.12 -1
+192.168.10.13 -1
+192.168.10.14 -1
+192.168.10.15 -1
+192.168.10.16 -1
+192.168.10.17 -1
+192.168.10.18 -1
+192.168.10.19 -1
+192.168.10.20 -1
+192.168.10.21 -1
+192.168.10.22 -1
+192.168.10.23 -1
+192.168.10.24 -1
+192.168.10.25 -1
+192.168.10.26 -1
+192.168.10.27 -1
+192.168.10.28 -1
+192.168.10.29 -1
+192.168.10.30 -1
+192.168.10.31 -1
+192.168.10.32 -1
+192.168.10.33 -1
+192.168.10.34 -1
+192.168.10.35 -1
+192.168.10.36 -1
+192.168.10.37 -1
+192.168.10.38 -1
+192.168.10.39 -1
+192.168.10.40 -1
+192.168.10.41 -1
+192.168.10.42 -1
+192.168.10.43 -1
+192.168.10.44 -1
+192.168.10.45 -1
+192.168.10.46 -1
+192.168.10.47 -1
+192.168.10.48 -1
+192.168.10.49 -1
+192.168.10.50 -1
+192.168.10.51 -1
+192.168.10.52 -1
+192.168.10.53 -1
+192.168.10.54 -1
+192.168.10.55 -1
+192.168.10.56 -1
+192.168.10.57 -1
+192.168.10.58 -1
+192.168.10.59 -1
+192.168.10.60 -1
+192.168.10.61 -1
+192.168.10.62 -1
+192.168.10.63 -1
+192.168.10.64 -1
+192.168.10.65 -1
+192.168.10.66 -1
+192.168.10.67 -1
+192.168.10.68 -1
+192.168.10.69 -1
+192.168.10.70 -1
+192.168.10.71 -1
+192.168.10.72 -1
+192.168.10.73 -1
+192.168.10.74 -1
+192.168.10.75 -1
+192.168.10.76 -1
+192.168.10.77 -1
+192.168.10.78 -1
+192.168.10.79 -1
+192.168.10.80 -1
+192.168.10.81 -1
+192.168.10.82 -1
+192.168.10.83 -1
+192.168.10.84 -1
+192.168.10.85 -1
+192.168.10.86 -1
+192.168.10.87 -1
+192.168.10.88 -1
+192.168.10.89 -1
+192.168.10.90 -1
+EOF
diff --git a/ctdb/tests/takeover/nondet.001.sh b/ctdb/tests/takeover/nondet.001.sh
new file mode 100755 (executable)
index 0000000..6f79c34
--- /dev/null
@@ -0,0 +1,35 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, 1 healthy"
+
+required_result <<EOF
+DATE TIME [PID]: Unassign IP: 192.168.21.253 from 1
+DATE TIME [PID]: Unassign IP: 192.168.21.252 from 0
+DATE TIME [PID]: Unassign IP: 192.168.20.253 from 1
+DATE TIME [PID]: Unassign IP: 192.168.20.252 from 0
+DATE TIME [PID]: Unassign IP: 192.168.20.250 from 1
+DATE TIME [PID]: Unassign IP: 192.168.20.249 from 0
+192.168.21.254 2
+192.168.21.253 2
+192.168.21.252 2
+192.168.20.254 2
+192.168.20.253 2
+192.168.20.252 2
+192.168.20.251 2
+192.168.20.250 2
+192.168.20.249 2
+EOF
+
+simple_test 2,2,0 <<EOF
+192.168.20.249 0
+192.168.20.250 1
+192.168.20.251 2
+192.168.20.252 0
+192.168.20.253 1
+192.168.20.254 2
+192.168.21.252 0
+192.168.21.253 1
+192.168.21.254 2
+EOF
diff --git a/ctdb/tests/takeover/nondet.002.sh b/ctdb/tests/takeover/nondet.002.sh
new file mode 100755 (executable)
index 0000000..c46f6a2
--- /dev/null
@@ -0,0 +1,32 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, 2 healthy"
+
+required_result <<EOF
+DATE TIME [PID]: Unassign IP: 192.168.21.253 from 1
+DATE TIME [PID]: Unassign IP: 192.168.20.253 from 1
+DATE TIME [PID]: Unassign IP: 192.168.20.250 from 1
+192.168.21.254 2
+192.168.21.253 0
+192.168.21.252 0
+192.168.20.254 2
+192.168.20.253 2
+192.168.20.252 0
+192.168.20.251 2
+192.168.20.250 0
+192.168.20.249 0
+EOF
+
+simple_test 0,2,0 <<EOF
+192.168.20.249 0
+192.168.20.250 1
+192.168.20.251 2
+192.168.20.252 0
+192.168.20.253 1
+192.168.20.254 2
+192.168.21.252 0
+192.168.21.253 1
+192.168.21.254 2
+EOF
diff --git a/ctdb/tests/takeover/nondet.003.sh b/ctdb/tests/takeover/nondet.003.sh
new file mode 100755 (executable)
index 0000000..2a9dfb4
--- /dev/null
@@ -0,0 +1,29 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, 1 -> all healthy"
+
+required_result <<EOF
+192.168.21.254 0
+192.168.21.253 2
+192.168.21.252 0
+192.168.20.254 2
+192.168.20.253 0
+192.168.20.252 2
+192.168.20.251 1
+192.168.20.250 1
+192.168.20.249 1
+EOF
+
+simple_test 0,0,0 <<EOF
+192.168.20.249 1
+192.168.20.250 1
+192.168.20.251 1
+192.168.20.252 1
+192.168.20.253 1
+192.168.20.254 1
+192.168.21.252 1
+192.168.21.253 1
+192.168.21.254 1
+EOF
diff --git a/ctdb/tests/takeover/scripts/local.sh b/ctdb/tests/takeover/scripts/local.sh
new file mode 100644 (file)
index 0000000..3b69d14
--- /dev/null
@@ -0,0 +1,26 @@
+# Hey Emacs, this is a -*- shell-script -*- !!!  :-)
+
+test_prog="ctdb_takeover_tests ctdb_takeover_run_core"
+
+define_test ()
+{
+    _f=$(basename "$0" ".sh")
+
+    export CTDB_IP_ALGORITHM="${_f%%.*}"
+    case "$CTDB_IP_ALGORITHM" in
+       lcp2|nondet|det) : ;;
+       *) die "Unknown algorithm for testcase \"$_f\"" ;;
+    esac
+
+    printf "%-12s - %s\n" "$_f" "$1"
+}
+
+simple_test ()
+{
+    # Do some filtering of the output to replace date/time.
+    OUT_FILTER='s@^[^\]]*\]:@DATE\ TIME\ \[PID\]:@'
+
+    _out=$($VALGRIND $test_prog "$@" 2>&1)
+
+    result_check "Algorithm: $CTDB_IP_ALGORITHM"
+}
diff --git a/ctdb/tests/takeover/simulation/README b/ctdb/tests/takeover/simulation/README
new file mode 100644 (file)
index 0000000..4a8267b
--- /dev/null
@@ -0,0 +1,6 @@
+This contains a Python simulation of CTDB's IP reallocation algorithm.
+
+It is useful for experimenting with improvements.
+
+To use this on RHEL5 you'll need python2.6 from EPEL
+<http://fedoraproject.org/wiki/EPEL>.
diff --git a/ctdb/tests/takeover/simulation/ctdb_takeover.py b/ctdb/tests/takeover/simulation/ctdb_takeover.py
new file mode 100755 (executable)
index 0000000..4b7ceef
--- /dev/null
@@ -0,0 +1,888 @@
+#!/usr/bin/env python
+
+# ctdb ip takeover code
+
+# Copyright (C) Martin Schwenke, Ronnie Sahlberg 2010, 2011
+
+# Based on original CTDB C code:
+#
+# Copyright (C) Ronnie Sahlberg  2007
+# Copyright (C) Andrew Tridgell  2007
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+
+import os
+import sys
+# Use optparse since newer argparse not available in RHEL5/EPEL.
+from optparse import OptionParser
+import copy
+import random
+import itertools
+
+# For parsing IP addresses
+import socket
+import struct
+
+# For external algorithm
+import subprocess
+import re
+
+options = None
+
+def process_args(extra_options=[]):
+    global options
+
+    parser = OptionParser(option_list=extra_options)
+
+    parser.add_option("--nd",
+                      action="store_false", dest="deterministic_public_ips",
+                      default=True,
+                      help="turn off deterministic_public_ips")
+    parser.add_option("--ni",
+                      action="store_true", dest="no_ip_failback", default=False,
+                      help="turn on no_ip_failback")
+    parser.add_option("-L", "--lcp2",
+                      action="store_true", dest="lcp2", default=False,
+                      help="use LCP2 IP rebalancing algorithm [default: %default]")
+    parser.add_option("-e", "--external",
+                      action="store_true", dest="external", default=False,
+                      help="use external test program to implement IP allocation algorithm [default: %default]")
+    parser.add_option("-b", "--balance",
+                      action="store_true", dest="balance", default=False,
+                      help="show (im)balance information after each event")
+    parser.add_option("-d", "--diff",
+                      action="store_true", dest="diff", default=False,
+                      help="show IP address movements for each event")
+    parser.add_option("-n", "--no-print",
+                      action="store_false", dest="show", default=True,
+                      help="don't show IP address layout after each event")
+    parser.add_option("-v", "--verbose",
+                      action="count", dest="verbose", default=0,
+                      help="print information and actions taken to stdout")
+    parser.add_option("-r", "--retries",
+                      action="store", type="int", dest="retries", default=5,
+                      help="number of retry loops for rebalancing non-deterministic failback [default: %default]")
+    parser.add_option("-i", "--iterations",
+                      action="store", type="int", dest="iterations",
+                      default=1000,
+                      help="number of iterations to run in test [default: %default]")
+    parser.add_option("-o", "--odds",
+                      action="store", type="int", dest="odds", default=4,
+                      help="make the chances of a failover 1 in ODDS [default: %default]")
+    parser.add_option("-A", "--aggressive",
+                      action="store_true", dest="aggressive", default=False,
+                      help="apply ODDS to try to flip each node [default: %default]")
+
+    def seed_callback(option, opt, value, parser):
+        random.seed(value)
+    parser.add_option("-s", "--seed",
+                      action="callback", type="int", callback=seed_callback,
+                      help="initial random number seed for random events")
+
+    parser.add_option("-x", "--exit",
+                      action="store_true", dest="exit", default=False,
+                      help="exit on the 1st gratuitous IP move or IP imbalance")
+    parser.add_option("-H", "--hard-imbalance-limit",
+                      action="store", type="int", dest="hard_limit", default=1,
+                      help="exceeding this limit causes termination  [default: %default]")
+    parser.add_option("-S", "--soft-imbalance-limit",
+                      action="store", type="int", dest="soft_limit", default=1,
+                      help="exceeding this limit increments a counter [default: %default]")
+
+    (options, args) = parser.parse_args()
+
+    if len(args) != 0:
+        parser.error("too many arguments")
+
+    # Could use a callback for this or change the default, but
+    # laziness is sometimes a virtue.  ;-)
+    if options.lcp2:
+        options.deterministic_public_ips = False
+
+def print_begin(t, delim='='):
+    print delim * 40
+    print "%s:" % (t)
+
+def print_end():
+    print "-" * 40
+
+def verbose_begin(t):
+    if options.verbose > 0:
+        print_begin(t)
+
+def verbose_end():
+    if options.verbose > 0:
+        print_end()
+
+def verbose_print(t):
+    if options.verbose > 0:
+        if not type(t) == list:
+            t = [t]
+        if t != []:
+            print "\n".join([str(i) for i in t])
+
+# more than this and we switch to the logging module...  :-)
+def debug_begin(t):
+    if options.verbose > 1:
+        print_begin(t, '-')
+
+def debug_end():
+    if options.verbose > 1:
+        print_end()
+
+def debug_print(t):
+    if options.verbose > 1:
+        if not type(t) == list:
+            t = [t]
+        if t != []:
+            print "\n".join([str(i) for i in t])
+
+def ip_to_list_of_ints(ip):
+    # Return the address as a list of 16 octet values.  Try IPv6 first
+    # so that only errors in IPv4 addresses are exposed to the caller.
+    try:
+        l = socket.inet_pton(socket.AF_INET6, ip)
+    except socket.error:
+        # Pad with leading 0s.  This makes IPv4 addresses comparable
+        # with IPv6 but reduces the overall effectiveness of the
+        # algorithm.  The alternative would be to treat these
+        # addresses separately while trying to keep all the IPs in
+        # overall balance.
+        l = "".join(itertools.repeat("\0", 12)) + \
+            socket.inet_pton(socket.AF_INET, ip)
+
+    return map(lambda x: struct.unpack('B', x)[0], l)
+
+def ip_distance(ip1, ip2):
+    """Calculate the distance between 2 IPs.
+
+    This is the length of the longest common prefix between the IPs.
+    It is calculated by XOR-ing the 2 IPs together and counting the
+    number of leading zeroes."""
+
+    distance = 0
+    for (o1, o2) in zip(ip_to_list_of_ints(ip1), ip_to_list_of_ints(ip2)):
+        # XOR this pair of octets
+        x = o1 ^ o2
+        # count the number of leading zeroes
+        if x == 0:
+            distance += 8
+        else:
+            # bin() gives minimal length '0bNNN' string
+            distance += (8 - (len(bin(x)) - 2))
+            break
+
+    return distance
+
+def ip_distance_2_sum(ip, ips):
+    """Calculate the IP distance for the given IP relative to IPs.
+
+    This could be made more efficient by inlining ip_distance() into
+    the loop in this function.  However, that would result in some
+    loss of clarity and also will not be necessary in a C
+    implementation."""
+
+    sum = 0
+    for i in ips:
+        sum += ip_distance(ip, i) ** 2
+
+    return sum
+
+def imbalance_metric(ips):
+    """Return the imbalance metric for a group of IPs.
+
+    This is the sum of squares of the IP distances between each pair of IPs."""
+    if len(ips) > 1:
+        (h, t) = (ips[0], ips[1:])
+        return ip_distance_2_sum(h, t) + imbalance_metric(t)
+    else:
+        return 0
+
+def mean(l):
+    return float(sum(l))/len(l)
+
+class Node(object):
+    def __init__(self, public_addresses):
+        # public_addresses is either a flat list of IP strings or a
+        # list of lists (groups) of IP strings.  Groups are not used
+        # by the allocation algorithms; calculate_imbalance() uses
+        # them when checking the simulation.  An empty list means the
+        # node has no public addresses.  Other garbage will still
+        # make this code fail... simulation world is friendly.  :-)
+        if len(public_addresses) == 0 or type(public_addresses[0]) is str:
+            self.public_addresses = set(public_addresses)
+            self.ip_groups = []
+        else:
+            # flatten the groups into a single set of addresses
+            self.public_addresses = set([i for s in public_addresses for i in s])
+            self.ip_groups = public_addresses
+
+        self.current_addresses = set()
+        self.healthy = True
+        self.imbalance = -1
+
+    def __str__(self):
+        return "%s %s%s" % \
+            ("*" if len(self.public_addresses) == 0 else \
+                 (" " if self.healthy else "#"),
+             sorted(list(self.current_addresses)),
+             " %d" % self.imbalance if options.lcp2 else "")
+
+    def can_node_serve_ip(self, ip):
+        return ip in self.public_addresses
+
+    def node_ip_coverage(self, ips=None):
+        return len([a for a in self.current_addresses if ips is None or a in ips])
+
+    def set_imbalance(self, imbalance=-1):
+        """Set the imbalance metric to the given value.  If none given
+        (i.e. -1) then calculate it from the current addresses."""
+
+        if imbalance != -1:
+            self.imbalance = imbalance
+        else:
+            self.imbalance = imbalance_metric(list(self.current_addresses))
+
+    def get_imbalance(self):
+        return self.imbalance
+
+class Cluster(object):
+    def __init__(self):
+        self.nodes = []
+        self.deterministic_public_ips = options.deterministic_public_ips
+        self.no_ip_failback = options.no_ip_failback
+        self.all_public_ips = set()
+
+        # Statistics
+        self.ip_moves = []
+        self.grat_ip_moves = []
+        self.imbalance = []
+        self.imbalance_groups = []
+        self.imbalance_count = 0
+        self.imbalance_groups_count = itertools.repeat(0)
+        self.imbalance_metric = []
+        self.events = -1
+        self.num_unhealthy = []
+
+        self.prev = None
+
+    def __str__(self):
+        return "\n".join(["%2d %s" % (i, n) \
+                              for (i, n) in enumerate(self.nodes)])
+
+    # This is naive.  It assumes that IP groups are indicated by the
+    # 1st node having IP groups.
+    def have_ip_groups(self):
+        return (len(self.nodes[0].ip_groups) > 0)
+
+    def print_statistics(self):
+        print_begin("STATISTICS")
+        print "Events:                      %6d" % self.events
+        print "Total IP moves:              %6d" % sum(self.ip_moves)
+        print "Gratuitous IP moves:         %6d" % sum(self.grat_ip_moves)
+        print "Max imbalance:               %6d" % max(self.imbalance)
+        if self.have_ip_groups():
+            print "Max group imbalance counts:    ", map(max, zip(*self.imbalance_groups))
+        print "Mean imbalance:              %f" % mean(self.imbalance)
+        if self.have_ip_groups():
+            print "Mean group imbalances counts:   ", map(mean, zip(*self.imbalance_groups))
+        print "Final imbalance:             %6d" % self.imbalance[-1]
+        if self.have_ip_groups():
+            print "Final group imbalances:         ", self.imbalance_groups[-1]
+        if options.lcp2:
+            print "Max LCP2 imbalance  :        %6d" % max(self.imbalance_metric)
+        print "Soft imbalance count:        %6d" % self.imbalance_count
+        if self.have_ip_groups():
+            print "Soft imbalance group counts:    ", self.imbalance_groups_count
+        if options.lcp2:
+            print "Final LCP2 imbalance  :      %6d" % self.imbalance_metric[-1]
+        print "Maximum unhealthy:           %6d" % max(self.num_unhealthy)
+        print_end()
+
+    def find_pnn_with_ip(self, ip):
+        for (i, n) in enumerate(self.nodes):
+            if ip in n.current_addresses:
+                return i
+        return -1
+
+    def quietly_remove_ip(self, ip):
+        # Remove address from old node.
+        old = self.find_pnn_with_ip(ip)
+        if old != -1:
+            self.nodes[old].current_addresses.remove(ip)
+
+    def add_node(self, node):
+        self.nodes.append(node)
+        self.all_public_ips |= node.public_addresses
+
+    def healthy(self, *pnns):
+        verbose_begin("HEALTHY")
+
+        for pnn in pnns:
+            self.nodes[pnn].healthy = True
+            verbose_print(pnn)
+
+        verbose_end()
+
+    def unhealthy(self, *pnns):
+
+        verbose_begin("UNHEALTHY")
+
+        for pnn in pnns:
+            self.nodes[pnn].healthy = False
+            verbose_print(pnn)
+
+        verbose_end()
+
+    def do_something_random(self):
+
+        """Make random node(s) healthy or unhealthy.
+
+        If options.aggressive is False then: If all nodes are healthy
+        or unhealthy, then invert one of them; otherwise, there's a 1
+        in options.odds chance of making another node unhealthy.
+
+        If options.aggressive is True then: For each node there is a 1
+        in options.odds chance of flipping the state of that node
+        between healthy and unhealthy."""
+
+        if not options.aggressive:
+            num_nodes = len(self.nodes)
+            healthy_pnns = [i for (i,n) in enumerate(self.nodes) if n.healthy]
+            num_healthy = len(healthy_pnns)
+
+            if num_nodes == num_healthy:
+                self.unhealthy(random.randint(0, num_nodes-1))
+            elif num_healthy == 0:
+                self.healthy(random.randint(0, num_nodes-1))
+            elif random.randint(1, options.odds) == 1:
+                self.unhealthy(random.choice(healthy_pnns))
+            else:
+                all_pnns = range(num_nodes)
+                unhealthy_pnns = sorted(list(set(all_pnns) - set(healthy_pnns)))
+                self.healthy(random.choice(unhealthy_pnns))
+        else:
+            # We need to make at least one change or we retry...
+            changed = False
+            while not changed:
+                for (pnn, n) in enumerate(self.nodes):
+                    if random.randint(1, options.odds) == 1:
+                        changed = True
+                        if n.healthy:
+                            self.unhealthy(pnn)
+                        else:
+                            self.healthy(pnn)
+
+    def random_iterations(self):
+        i = 1
+        while i <= options.iterations:
+            verbose_begin("EVENT %d" % i)
+            verbose_end()
+            self.do_something_random()
+            if self.recover() and options.exit:
+                break
+            i += 1
+
+        self.print_statistics()
+
+    def imbalance_for_ips(self, ips):
+
+        imbalance = 0
+
+        maxnode = -1
+        minnode = -1
+
+        for ip in ips:
+            for (i, n) in enumerate(self.nodes):
+
+                if not n.healthy or not n.can_node_serve_ip(ip):
+                    continue
+
+                num = n.node_ip_coverage(ips)
+
+                if maxnode == -1 or num > maxnum:
+                    maxnode = i
+                    maxnum = num
+
+                if minnode == -1 or num < minnum:
+                    minnode = i
+                    minnum = num
+
+            if maxnode == -1 or minnode == -1:
+                continue
+
+            i = maxnum - minnum
+            #if i < 2:
+            #    i = 0
+            imbalance = max([imbalance, i])
+
+        return imbalance
+
+
+    def calculate_imbalance(self):
+
+        # First, do all the assigned IPs.
+        assigned = sorted([ip
+                           for n in self.nodes
+                           for ip in n.current_addresses])
+
+        i = self.imbalance_for_ips(assigned)
+
+        ig = []
+        # FIXME?  If dealing with IP groups, assume the nodes are all
+        # the same.
+        for ips in self.nodes[0].ip_groups:
+            gi = self.imbalance_for_ips(ips)
+            ig.append(gi)
+
+        return (i, ig)
+
+
+    def diff(self):
+        """Calculate differences in IP assignments between self and prev.
+
+        Gratuitous IP moves (from a healthy node to a healthy node)
+        are prefixed by !!."""
+
+        ip_moves = 0
+        grat_ip_moves = 0
+        details = []
+
+        for (new, n) in enumerate(self.nodes):
+            for ip in n.current_addresses:
+                old = self.prev.find_pnn_with_ip(ip)
+                if old != new:
+                    ip_moves += 1
+                    if old != -1 and \
+                            self.prev.nodes[new].healthy and \
+                            self.nodes[new].healthy and \
+                            self.nodes[old].healthy and \
+                            self.prev.nodes[old].healthy:
+                        prefix = "!!"
+                        grat_ip_moves += 1
+                    else:
+                        prefix = "  "
+                    details.append("%s %s: %d -> %d" %
+                                   (prefix, ip, old, new))
+
+        return (ip_moves, grat_ip_moves, details)
+
+    def find_takeover_node(self, ip):
+
+        pnn = -1
+        min = 0
+        for (i, n) in enumerate(self.nodes):
+            if not n.healthy:
+                continue
+
+            if not n.can_node_serve_ip(ip):
+                continue
+
+            num = n.node_ip_coverage()
+
+            if (pnn == -1):
+                pnn = i
+                min = num
+            else:
+                if num < min:
+                    pnn = i
+                    min = num
+
+        if pnn == -1:
+            verbose_print("Could not find node to take over public address %s" % ip)
+            return False
+
+        self.nodes[pnn].current_addresses.add(ip)
+
+        verbose_print("%s -> %d" % (ip, pnn))
+        return True
+
+    def basic_allocate_unassigned(self):
+
+        assigned = set([ip for n in self.nodes for ip in n.current_addresses])
+        unassigned = sorted(list(self.all_public_ips - assigned))
+
+        for ip in unassigned:
+            self.find_takeover_node(ip)
+
+    def basic_failback(self, retries_l):
+
+        assigned = sorted([ip
+                           for n in self.nodes
+                           for ip in n.current_addresses])
+        for ip in assigned:
+
+            maxnode = -1
+            minnode = -1
+            for (i, n) in enumerate(self.nodes):
+                if not n.healthy:
+                    continue
+
+                if not n.can_node_serve_ip(ip):
+                    continue
+
+                num = n.node_ip_coverage()
+
+                if maxnode == -1:
+                    maxnode = i
+                    maxnum = num
+                else:
+                    if num > maxnum:
+                        maxnode = i
+                        maxnum = num
+                if minnode == -1:
+                    minnode = i
+                    minnum = num
+                else:
+                    if num < minnum:
+                        minnode = i
+                        minnum = num
+
+            if maxnode == -1:
+                print "Could not find maxnode. May not be able to serve ip", ip
+                continue
+
+            #if self.deterministic_public_ips:
+            #    continue
+
+            if maxnum > minnum + 1 and retries_l[0] < options.retries:
+                # Remove the 1st ip from maxnode
+                t = sorted(list(self.nodes[maxnode].current_addresses))
+                realloc = t[0]
+                verbose_print("%s <- %d" % (realloc, maxnode))
+                self.nodes[maxnode].current_addresses.remove(realloc)
+                # Redo the outer loop.
+                retries_l[0] += 1
+                return True
+
+        return False
+
+
+    def lcp2_allocate_unassigned(self):
+
+        # Assign as many unassigned addresses as possible.  Keep
+        # selecting the optimal assignment until we don't manage to
+        # assign anything.
+        assigned = set([ip for n in self.nodes for ip in n.current_addresses])
+        unassigned = sorted(list(self.all_public_ips - assigned))
+
+        should_loop = True
+        while len(unassigned) > 0 and should_loop:
+            should_loop = False
+
+            debug_begin(" CONSIDERING MOVES (UNASSIGNED)")
+
+            minnode = -1
+            mindsum = 0
+            minip = None
+
+            for ip in unassigned:
+                for dstnode in range(len(self.nodes)):
+                    if self.nodes[dstnode].can_node_serve_ip(ip) and \
+                            self.nodes[dstnode].healthy:
+                        dstdsum = ip_distance_2_sum(ip, self.nodes[dstnode].current_addresses)
+                        dstimbl = self.nodes[dstnode].get_imbalance() + dstdsum
+                        debug_print(" %s -> %d [+%d]" % \
+                                        (ip,
+                                         dstnode,
+                                         dstimbl - self.nodes[dstnode].get_imbalance()))
+
+                        if (minnode == -1) or (dstdsum < mindsum):
+                            minnode = dstnode
+                            minimbl = dstimbl
+                            mindsum = dstdsum
+                            minip = ip
+                            should_loop = True
+            debug_end()
+
+            if minnode != -1:
+                self.nodes[minnode].current_addresses.add(minip)
+                self.nodes[minnode].set_imbalance(self.nodes[minnode].get_imbalance() + mindsum)
+                verbose_print("%s -> %d [+%d]" % (minip, minnode, mindsum))
+                unassigned.remove(minip)
+
+        for ip in unassigned:
+            verbose_print("Could not find node to take over public address %s" % ip)
+
+    def lcp2_failback(self, targets):
+
+        # Get the node with the highest imbalance metric.
+        srcnode = -1
+        maximbl = 0
+        for (pnn, n) in enumerate(self.nodes):
+            b = n.get_imbalance()
+            if (srcnode == -1) or (b > maximbl):
+                srcnode = pnn
+                maximbl = b
+
+        # This means that all nodes had 0 or 1 addresses, so can't
+        # be imbalanced.
+        if maximbl == 0:
+            return False
+
+        # We'll need this a few times...
+        ips = self.nodes[srcnode].current_addresses
+
+        # Find an IP and destination node that best reduces imbalance.
+        optimum = None
+        debug_begin(" CONSIDERING MOVES FROM %d [%d]" % (srcnode, maximbl))
+        for ip in ips:
+            # What is this IP address costing the source node?
+            srcdsum = ip_distance_2_sum(ip, ips - set([ip]))
+            srcimbl = maximbl - srcdsum
+
+            # Consider what this IP address would cost each potential
+            # destination node.  Destination nodes are limited to
+            # those that are newly healthy, since we don't want to
+            # do gratuitous failover of IPs just to make minor
+            # balance improvements.
+            for dstnode in targets:
+                if self.nodes[dstnode].can_node_serve_ip(ip) and \
+                        self.nodes[dstnode].healthy:
+                    dstdsum = ip_distance_2_sum(ip, self.nodes[dstnode].current_addresses)
+                    dstimbl = self.nodes[dstnode].get_imbalance() + dstdsum
+                    debug_print(" %d [%d] -> %s -> %d [+%d]" % \
+                                    (srcnode,
+                                     srcimbl - self.nodes[srcnode].get_imbalance(),
+                                     ip,
+                                     dstnode,
+                                     dstimbl - self.nodes[dstnode].get_imbalance()))
+
+                    if (dstimbl < maximbl) and (dstdsum < srcdsum):
+                        if optimum is None:
+                            optimum = (ip, srcnode, srcimbl, dstnode, dstimbl)
+                        else:
+                            (x, sn, si, dn, di) = optimum
+                            if (srcimbl + dstimbl) < (si + di):
+                                optimum = (ip, srcnode, srcimbl, dstnode, dstimbl)
+        debug_end()
+
+        if optimum is not None:
+            # We found a move that makes things better...
+            (ip, srcnode, srcimbl, dstnode, dstimbl) = optimum
+            ini_srcimbl = self.nodes[srcnode].get_imbalance()
+            ini_dstimbl = self.nodes[dstnode].get_imbalance()
+
+            self.nodes[srcnode].current_addresses.remove(ip)
+            self.nodes[srcnode].set_imbalance(srcimbl)
+
+            self.nodes[dstnode].current_addresses.add(ip)
+            self.nodes[dstnode].set_imbalance(dstimbl)
+
+            verbose_print("%d [%d] -> %s -> %d [+%d]" % \
+                              (srcnode,
+                               srcimbl - ini_srcimbl,
+                               ip,
+                               dstnode,
+                               dstimbl - ini_dstimbl))
+
+            return True
+
+        return False
+
+    def ctdb_takeover_run_python(self):
+        """Assign public IPs to nodes using the in-process Python algorithm."""
+        # Don't bother with the num_healthy stuff.  It is an
+        # irrelevant detail.
+
+        # We just keep the allocated IPs in the current_addresses field
+        # of the node.  This needs to be readable, not efficient!
+
+        if self.deterministic_public_ips:
+            # Remap everything.
+            addr_list = sorted(list(self.all_public_ips))
+            for (i, ip) in enumerate(addr_list):
+                self.quietly_remove_ip(ip)
+                # Add addresses to new node.
+                pnn = i % len(self.nodes)
+                self.nodes[pnn].current_addresses.add(ip)
+                verbose_print("%s -> %d" % (ip, pnn))
+
+        # Remove public addresses from unhealthy nodes.
+        for (pnn, n) in enumerate(self.nodes):
+            if not n.healthy:
+                verbose_print(["%s <- %d" % (ip, pnn)
+                               for ip in n.current_addresses])
+                n.current_addresses = set()
+
+        # If a node can't serve an assigned address then remove it.
+        for (pnn, n) in enumerate(self.nodes):
+            verbose_print(["%s <- %d" % (ip, pnn)
+                           for ip in n.current_addresses - n.public_addresses])
+            n.current_addresses &= n.public_addresses
+
+        if options.lcp2:
+            newly_healthy = [pnn for (pnn, n) in enumerate(self.nodes)
+                             if len(n.current_addresses) == 0 and n.healthy]
+            for n in self.nodes:
+                n.set_imbalance()
+
+        # We'll only retry the balancing act up to options.retries
+        # times (for the basic non-deterministic algorithm).  This
+        # nonsense gives us a reference on the retries count in
+        # Python.  It will be easier in C.  :-)
+        # For LCP2 we reassign as many IPs from heavily "loaded" nodes
+        # to nodes that are newly healthy, looping until we fail to
+        # reassign an IP.
+        retries_l = [0]
+        should_loop = True
+        while should_loop:
+            should_loop = False
+
+            if options.lcp2:
+                self.lcp2_allocate_unassigned()
+            else:
+                self.basic_allocate_unassigned()
+
+            if self.no_ip_failback or self.deterministic_public_ips:
+                break
+
+            if options.lcp2:
+                if len(newly_healthy) == 0:
+                    break
+                should_loop = self.lcp2_failback(newly_healthy)
+            else:
+                should_loop = self.basic_failback(retries_l)
+
+    def ctdb_takeover_run_external(self):
+        """Assign public IPs by running the external ctdb_takeover_tests program."""
+        # Written while asleep...
+
+        # Convert the cluster state to something that can be fed to
+        # ctdb_takeover_tests ctdb_takeover_run_core ...
+
+        in_lines = []
+        for ip in sorted(list(self.all_public_ips)):
+            allowed = []
+            assigned = -1
+            for (i, n) in enumerate(self.nodes):
+                if n.can_node_serve_ip(ip):
+                    allowed.append("%s" % i)
+                if ip in n.current_addresses:
+                    assigned = i
+            line = "%s\t%d\t%s" % (ip, assigned, ",".join(allowed))
+            in_lines.append(line)
+
+        nodestates = ",".join(["0" if n.healthy else "1" for n in self.nodes])
+
+        if options.lcp2:
+            os.environ["CTDB_LCP2"] = "yes"
+        if options.verbose > 1:
+            os.environ["CTDB_TEST_LOGLEVEL"] = "4"
+        elif options.verbose == 1:
+            os.environ["CTDB_TEST_LOGLEVEL"] = "3"
+        else:
+            os.environ["CTDB_TEST_LOGLEVEL"] = "0"
+
+        p = subprocess.Popen("../../bin/ctdb_takeover_tests ctdb_takeover_run_core %s 2>&1" % nodestates,
+                             shell=True,
+                             stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+        # communicate() avoids a pipe deadlock and reaps the child process.
+        (out, _) = p.communicate("\n".join(in_lines))
+
+        # Flush all of the assigned IPs.
+        for n in self.nodes:
+            n.current_addresses = set()
+
+        # Uses the results to populate the current_addresses for each
+        # node.
+        for line in out.split("\n"):
+            # Some lines are debug, some are the final IP
+            # configuration.  Let's use a gross hack that assumes any
+            # line with 2 words is IP configuration.  That will do for
+            # now.
+            words = re.split(r"\s+", line)
+            if len(words) == 2:
+                # Add the IP as current for the specified node.
+                self.nodes[int(words[1])].current_addresses.add(words[0])
+            else:
+                # First 3 words are log date/time, remove them...
+                print " ".join(words[3:])
+
+        # Now fake up the LCP calculations.
+        for n in self.nodes:
+            n.set_imbalance()
+
+    def ctdb_takeover_run(self):
+        """Record one takeover event, then run the selected implementation."""
+        self.events += 1
+
+        # options.external selects the external test program.
+        run = (self.ctdb_takeover_run_external if options.external
+               else self.ctdb_takeover_run_python)
+        return run()
+
+    def recover(self):
+        """Run a takeover, record statistics and return True on a bad result."""
+        verbose_begin("TAKEOVER")
+
+        self.ctdb_takeover_run()
+
+        verbose_end()
+
+        grat_ip_moves = 0
+
+        if self.prev is not None:
+            (ip_moves, grat_ip_moves, details) = self.diff()
+            self.ip_moves.append(ip_moves)
+            self.grat_ip_moves.append(grat_ip_moves)
+
+            if options.diff:
+                print_begin("DIFF")
+                print "\n".join(details)
+                print_end()
+
+        (imbalance, imbalance_groups) = self.calculate_imbalance()
+        self.imbalance.append(imbalance)
+        self.imbalance_groups.append(imbalance_groups)
+
+        if imbalance > options.soft_limit:
+            self.imbalance_count += 1
+
+        # Bump the per-group counter for each group over the soft limit
+        # (by 1, matching imbalance_count above, not by the imbalance).
+        self.imbalance_groups_count = \
+            [c + 1 if i > options.soft_limit else c
+             for (c, i) in zip(self.imbalance_groups_count, imbalance_groups)]
+
+        imbalance_metric = max([n.get_imbalance() for n in self.nodes])
+        self.imbalance_metric.append(imbalance_metric)
+        if options.balance:
+            print_begin("IMBALANCE")
+            print "ALL IPS:", imbalance
+            if self.have_ip_groups():
+                print "IP GROUPS:", imbalance_groups
+            if options.lcp2:
+                print "LCP2 IMBALANCE:", imbalance_metric
+            print_end()
+
+        num_unhealthy = len(self.nodes) - \
+            len([n for n in self.nodes if n.healthy])
+        self.num_unhealthy.append(num_unhealthy)
+
+        if options.show:
+            print_begin("STATE")
+            print self
+            print_end()
+
+        # Clear the old snapshot first so that deepcopy() does not also
+        # copy it into the new snapshot.
+        self.prev = None
+        self.prev = copy.deepcopy(self)
+
+        # True is bad!
+        return (grat_ip_moves > 0) or \
+            (not self.have_ip_groups() and imbalance > options.hard_limit) or \
+            (self.have_ip_groups() and (max(imbalance_groups) > options.hard_limit))
diff --git a/ctdb/tests/takeover/simulation/hey_jude.py b/ctdb/tests/takeover/simulation/hey_jude.py
new file mode 100755 (executable)
index 0000000..a6b14c5
--- /dev/null
@@ -0,0 +1,24 @@
+#!/usr/bin/env python
+
+from ctdb_takeover import Cluster, Node, process_args
+
+process_args()
+
+addresses10 = ['10.4.20.%d' % n for n in range(154, 168)]
+addresses172a = ['172.20.106.%d' % n for n in range(110, 124)]
+addresses172b = ['172.20.107.%d' % n for n in range(110, 117)]
+
+c = Cluster()
+
+#for i in range(7):
+#    c.add_node(Node([addresses10, addresses172]))
+
+
+for i in range(4):
+    c.add_node(Node([addresses172a, addresses172b]))
+for i in range(3):
+    c.add_node(Node(addresses10))
+
+c.recover()
+
+c.random_iterations()
diff --git a/ctdb/tests/takeover/simulation/ip_groups1.py b/ctdb/tests/takeover/simulation/ip_groups1.py
new file mode 100755 (executable)
index 0000000..0808f46
--- /dev/null
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+
+# 2 IP groups, both on the same 5 nodes, with each group on different
+# interfaces/VLANs.  One group has many more addresses to test how
+# well an "imbalanced" configuration will balance...
+
+from ctdb_takeover import Cluster, Node, process_args
+
+process_args()
+
+addresses20 = ['192.168.20.%d' % n for n in range(1, 13)]
+addresses128 = ['192.168.128.%d' % n for n in range(1, 5)]
+
+c = Cluster()
+
+for i in range(5):
+    c.add_node(Node([addresses20, addresses128]))
+
+#for i in range(3):
+#    c.add_node(Node([addresses20]))
+
+
+c.recover()
+
+c.random_iterations()
diff --git a/ctdb/tests/takeover/simulation/ip_groups2.py b/ctdb/tests/takeover/simulation/ip_groups2.py
new file mode 100755 (executable)
index 0000000..c6c1026
--- /dev/null
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+
+# 2 groups of addresses, combined into 1 pool so the checking
+# algorithm doesn't know about the groups, across 2 nodes.
+
+from ctdb_takeover import Cluster, Node, process_args
+
+process_args()
+
+addresses20 = ['192.168.20.%d' % n for n in range(1, 13)]
+addresses21 = ['192.168.21.%d' % n for n in range(1, 5)]
+
+c = Cluster()
+
+for i in range(2):
+    c.add_node(Node(addresses20 + addresses21))
+
+c.recover()
+
+c.random_iterations()
diff --git a/ctdb/tests/takeover/simulation/ip_groups3.py b/ctdb/tests/takeover/simulation/ip_groups3.py
new file mode 100755 (executable)
index 0000000..149946d
--- /dev/null
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+
+# 4 IP groups, across 10 nodes, with each group on different
+# interfaces/VLANs.  80 addresses in total but not evenly balanced, to
+# help check some of the more extreme behaviour.
+
+from ctdb_takeover import Cluster, Node, process_args
+
+process_args()
+
+addresses1 = ['192.168.1.%d' % n for n in range(1, 41)]
+addresses2 = ['192.168.2.%d' % n for n in range(1, 21)]
+addresses3 = ['192.168.3.%d' % n for n in range(1, 11)]
+addresses4 = ['192.168.4.%d' % n for n in range(1, 11)]
+
+# Try detecting imbalance with square root of number of nodes?  Or
+# just with a parameter indicating how unbalanced you're willing to
+# accept...
+
+c = Cluster()
+
+for i in range(10):
+    c.add_node(Node([addresses1, addresses2, addresses3, addresses4]))
+
+c.recover()
+
+c.random_iterations()
diff --git a/ctdb/tests/takeover/simulation/ip_groups4.py b/ctdb/tests/takeover/simulation/ip_groups4.py
new file mode 100755 (executable)
index 0000000..fdcef7f
--- /dev/null
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+
+# 2 IP groups, across 2 nodes, with each group on different
+# interfaces.  4 addresses per group.  A nice little canonical 2 node
+# configuration.
+
+from ctdb_takeover import Cluster, Node, process_args
+
+process_args()
+
+addresses1 = ['192.168.1.%d' % n for n in range(1, 5)]
+addresses2 = ['192.168.2.%d' % n for n in range(1, 5)]
+
+# Try detecting imbalance with square root of number of nodes?  Or
+# just with a parameter indicating how unbalanced you're willing to
+# accept...
+
+c = Cluster()
+
+for i in range(2):
+    c.add_node(Node([addresses1, addresses2]))
+
+c.recover()
+
+c.random_iterations()
diff --git a/ctdb/tests/takeover/simulation/ip_groups5.py b/ctdb/tests/takeover/simulation/ip_groups5.py
new file mode 100755 (executable)
index 0000000..8c46150
--- /dev/null
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+
+# 1 IP group, to test backward compatibility of LCP2 algorithm.  16
+# addresses across 4 nodes.
+
+from ctdb_takeover import Cluster, Node, process_args
+
+process_args()
+
+addresses1 = ['192.168.1.%d' % n for n in range(1, 17)]
+
+# Try detecting imbalance with square root of number of nodes?  Or
+# just with a parameter indicating how unbalanced you're willing to
+# accept...
+
+c = Cluster()
+
+for i in range(4):
+    c.add_node(Node(addresses1))
+
+c.recover()
+
+c.random_iterations()
diff --git a/ctdb/tests/takeover/simulation/mgmt_simple.py b/ctdb/tests/takeover/simulation/mgmt_simple.py
new file mode 100755 (executable)
index 0000000..f891199
--- /dev/null
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+
+# This is an example showing a current SONAS configuration with 3
+# interface nodes and a management node.  When run with deterministic
+# IPs there are gratuitous IP reassignments.
+
+from ctdb_takeover import Cluster, Node, process_args
+
+process_args()
+
+addresses = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
+
+c = Cluster()
+
+for i in range(3):
+    c.add_node(Node(addresses))
+
+c.add_node(Node([]))
+
+c.recover()
+
+c.random_iterations()
diff --git a/ctdb/tests/takeover/simulation/node_group.py b/ctdb/tests/takeover/simulation/node_group.py
new file mode 100755 (executable)
index 0000000..bf7de58
--- /dev/null
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+
+# This demonstrates a node group configuration.
+#
+# Node groups can be defined with the syntax "-g N@IP0,IP1-IP2,IP3".
+# This says to create a group of N nodes with IPs IP0, IP1, ..., IP2,
+# IP3.  Running it with deterministic IPs causes lots of gratuitous IP
+# reassignments.  Running with --nd fixes this.
+
+import ctdb_takeover
+import sys
+from optparse import make_option
+import string
+
+ctdb_takeover.process_args([
+        make_option("-g", "--group",
+                    action="append", type="string", dest="groups",
+                    help="define a node group using N@IPs syntax"),
+        ])
+
+def expand_range(r):
+    ends = r.split("-", 1)
+    if len(ends) != 2:
+        return ends  # not an "X-Y" range: single-element list
+    letters = string.ascii_uppercase + string.ascii_lowercase
+    return list(letters[letters.index(ends[0]):letters.index(ends[1]) + 1])
+            
+def add_node_group(s):
+    (n_nodes, spec) = s.split("@", 1)  # "N@IP0,IP1-IP2" -> count, IP spec
+    ips = [ip for part in spec.split(",") if part != ""
+           for ip in expand_range(part)]
+    for _ in range(int(n_nodes)):
+        c.add_node(ctdb_takeover.Node(ips))
+
+c = ctdb_takeover.Cluster()
+
+if ctdb_takeover.options.groups is None:
+    print "Error: no node groups defined."
+    sys.exit(1)
+
+for g in ctdb_takeover.options.groups:
+    add_node_group(g)
+
+c.recover()
+
+c.random_iterations()
diff --git a/ctdb/tests/takeover/simulation/node_group_extra.py b/ctdb/tests/takeover/simulation/node_group_extra.py
new file mode 100755 (executable)
index 0000000..7e9e518
--- /dev/null
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+
+# This example demonstrates a node group configuration.  It is meant
+# to be the same as node_group_simple.py, but with a couple of nodes
+# added later, so they are listed after the management node.
+
+# When run with deterministic IPs (use "-d" to show the problem) it
+# does many gratuitous IP reassignments.
+
+from ctdb_takeover import Cluster, Node, process_args
+
+process_args()
+
+addresses1 = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'] + ['P', 'Q', 'R', 'S', 'T', 'U']
+addresses2 = ['I', 'J', 'K', 'L']
+
+c = Cluster()
+
+for i in range(4):
+    c.add_node(Node(addresses1))
+
+for i in range(3):
+    c.add_node(Node(addresses2))
+
+c.add_node(Node([]))
+c.add_node(Node(addresses1))
+c.add_node(Node(addresses2))
+
+c.recover()
+
+c.random_iterations()
diff --git a/ctdb/tests/takeover/simulation/node_group_simple.py b/ctdb/tests/takeover/simulation/node_group_simple.py
new file mode 100755 (executable)
index 0000000..3c58ef7
--- /dev/null
@@ -0,0 +1,26 @@
+#!/usr/bin/env python
+
+# This example demonstrates a simple, sensible node group
+# configuration.  When run with deterministic IPs (use "-d" to show
+# the problem) it does many gratuitous IP reassignments.
+
+from ctdb_takeover import Cluster, Node, process_args
+
+process_args()
+
+addresses1 = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
+addresses2 = ['I', 'J', 'K']
+
+c = Cluster()
+
+for i in range(4):
+    c.add_node(Node(addresses1))
+
+for i in range(3):
+    c.add_node(Node(addresses2))
+
+c.add_node(Node([]))
+
+c.recover()
+
+c.random_iterations()
diff --git a/ctdb/tests/takeover/simulation/nondet_path_01.py b/ctdb/tests/takeover/simulation/nondet_path_01.py
new file mode 100755 (executable)
index 0000000..a62847a
--- /dev/null
@@ -0,0 +1,25 @@
+#!/usr/bin/env python
+
+# This is a contrived example that makes the balancing algorithm fail
+# for nondeterministic IPs (run with "-dv --nd" to see the failure).
+
+from ctdb_takeover import Cluster, Node, process_args
+
+process_args()
+
+addresses1 = ['A', 'B', 'C', 'D']
+addresses2 = ['B', 'E', 'F']
+
+c = Cluster()
+
+for i in range(2):
+    c.add_node(Node(addresses1))
+
+c.add_node(Node(addresses2))
+
+c.recover()
+
+c.unhealthy(1)
+c.recover()
+c.healthy(1)
+c.recover()
diff --git a/ctdb/tests/test_check_tcp_ports.sh b/ctdb/tests/test_check_tcp_ports.sh
new file mode 100755 (executable)
index 0000000..e439b6d
--- /dev/null
@@ -0,0 +1,17 @@
+#!/bin/sh
+
+DIRNAME=$(dirname $0)
+
+. ${DIRNAME}/../config/functions
+
+SERVICE="test-service"
+
+PORTS="$@"
+
+if [ "x${PORTS}" = "x" ] ; then
+       PORTS=139
+fi
+
+ctdb_check_tcp_ports ${SERVICE} ${PORTS}
+
+echo "Test for service '${SERVICE}' on tcp ports ${PORTS} succeeded!"
diff --git a/ctdb/tests/tool/README b/ctdb/tests/tool/README
new file mode 100644 (file)
index 0000000..8160528
--- /dev/null
@@ -0,0 +1,17 @@
+Unit tests for the ctdb tool (i.e. tools/ctdb).
+
+Test case filenames can take 2 forms:
+
+* func.<some_function>.NNN.sh
+
+  Run <some_function> in the ctdb tool code using the
+  ctdb_tool_functest test program.  This test program uses test stubs
+  for CTDB client functions.
+
+* stubby.<command>.NNN.sh
+
+  Run the ctdb_tool_stubby test program with <command> as the 1st
+  argument - subsequent are passed to simple_test().  ctdb_tool_stubby
+  is linked against the test stubs for CTDB client functions.
+
+To add tests here you may need to add appropriate test stubs.
diff --git a/ctdb/tests/tool/func.parse_nodestring.001.sh b/ctdb/tests/tool/func.parse_nodestring.001.sh
new file mode 100755 (executable)
index 0000000..d7caf89
--- /dev/null
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "all, dd_ok, 3 healthy"
+
+required_result <<EOF
+NODES: 0 1 2
+PNN MODE: BROADCAST_ALL (4026531842)
+EOF
+
+simple_test all true <<EOF
+0       192.168.20.41   0x0
+1       192.168.20.42   0x0
+2       192.168.20.43   0x0     CURRENT RECMASTER
+EOF
diff --git a/ctdb/tests/tool/func.parse_nodestring.002.sh b/ctdb/tests/tool/func.parse_nodestring.002.sh
new file mode 100755 (executable)
index 0000000..c89e444
--- /dev/null
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "all, dd_ok, 2 ok/1 disconnected"
+
+required_result <<EOF
+NODES: 0 1 2
+PNN MODE: BROADCAST_ALL (4026531842)
+EOF
+
+simple_test all true <<EOF
+0       192.168.20.41   0x0
+1       192.168.20.42   0x1
+2       192.168.20.43   0x0     CURRENT RECMASTER
+EOF
diff --git a/ctdb/tests/tool/func.parse_nodestring.003.sh b/ctdb/tests/tool/func.parse_nodestring.003.sh
new file mode 100755 (executable)
index 0000000..3e03ac4
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "all, current disconnected"
+
+required_result 10 <<EOF
+DATE TIME [PID]: Unable to get nodemap from local node
+EOF
+
+simple_test all true <<EOF
+0       192.168.20.41   0x0
+1       192.168.20.42   0x0
+2       192.168.20.43   0x1     CURRENT RECMASTER
+EOF
diff --git a/ctdb/tests/tool/scripts/local.sh b/ctdb/tests/tool/scripts/local.sh
new file mode 100644 (file)
index 0000000..385e2ad
--- /dev/null
@@ -0,0 +1,56 @@
+# Hey Emacs, this is a -*- shell-script -*- !!!  :-)
+
+if "$TEST_VERBOSE" ; then
+    debug () { echo "$@" ; }
+else
+    debug () { : ; }
+fi
+
+define_test ()
+{
+    _f=$(basename "$0" ".sh")
+
+    case "$_f" in
+       func.*)
+           _func="${_f#func.}"
+           _func="${_func%.*}" # Strip test number
+           test_prog="ctdb_functest ${_func}"
+           ;;
+       stubby.*)
+           _cmd="${_f#stubby.}"
+           _cmd="${_cmd%.*}" # Strip test number
+           test_prog="ctdb_stubtest ${_cmd}"
+           ;;
+       *)
+           die "Unknown pattern for testcase \"$_f\""
+    esac
+
+    printf "%-28s - %s\n" "$_f" "$1"
+}
+
+setup_natgw ()
+{
+    debug "Setting up NAT gateway"
+
+    natgw_config_dir="${TEST_VAR_DIR}/natgw_config"
+    mkdir -p "$natgw_config_dir"
+
+    # These will accumulate, 1 per test... but will be cleaned up at
+    # the end.
+    export CTDB_NATGW_NODES=$(mktemp --tmpdir="$natgw_config_dir")
+
+    cat >"$CTDB_NATGW_NODES"
+}
+
+simple_test ()
+{
+    # Most of the tests when the tool fails will have a date/time/pid
+    # prefix.  Strip that because it isn't possible to match it.
+    if [ $required_rc -ne 0 ]  ; then
+       OUT_FILTER='s@^[0-9/]+\ [0-9:\.]+\ \[[\ 0-9]+\]:@DATE\ TIME\ \[PID\]:@'
+    fi
+
+    _out=$($VALGRIND $test_prog "$@" 2>&1)
+
+    result_check
+}
diff --git a/ctdb/tests/tool/stubby.getcapabilities.001.sh b/ctdb/tests/tool/stubby.getcapabilities.001.sh
new file mode 100755 (executable)
index 0000000..df4a659
--- /dev/null
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, all ok"
+
+required_result 0 <<EOF
+RECMASTER: YES
+LMASTER: YES
+LVS: NO
+NATGW: YES
+EOF
+
+simple_test <<EOF
+NODEMAP
+0       192.168.20.41   0x0     CURRENT RECMASTER
+1       192.168.20.42   0x0
+2       192.168.20.43   0x0
+
+IFACES
+:Name:LinkStatus:References:
+:eth2:1:2:
+:eth1:1:4:
+
+VNNMAP
+654321
+0
+1
+2
+EOF
diff --git a/ctdb/tests/tool/stubby.getcapabilities.002.sh b/ctdb/tests/tool/stubby.getcapabilities.002.sh
new file mode 100755 (executable)
index 0000000..9a37c4a
--- /dev/null
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, 1 disconnected"
+
+required_result 0 <<EOF
+RECMASTER: YES
+LMASTER: YES
+LVS: NO
+NATGW: YES
+EOF
+
+simple_test <<EOF
+NODEMAP
+0       192.168.20.41   0x0     CURRENT RECMASTER
+1       192.168.20.42   0x1
+2       192.168.20.43   0x0
+
+IFACES
+:Name:LinkStatus:References:
+:eth2:1:2:
+:eth1:1:4:
+
+VNNMAP
+654321
+0
+1
+2
+EOF
diff --git a/ctdb/tests/tool/stubby.getcapabilities.003.sh b/ctdb/tests/tool/stubby.getcapabilities.003.sh
new file mode 100755 (executable)
index 0000000..33b1b74
--- /dev/null
@@ -0,0 +1,27 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, current disconnected"
+
+required_result 10 <<EOF
+DATE TIME [PID]: Unable to get nodemap from local node
+EOF
+
+simple_test <<EOF
+NODEMAP
+0       192.168.20.41   0x1     CURRENT RECMASTER
+1       192.168.20.42   0x0
+2       192.168.20.43   0x0
+
+IFACES
+:Name:LinkStatus:References:
+:eth2:1:2:
+:eth1:1:4:
+
+VNNMAP
+654321
+0
+1
+2
+EOF
diff --git a/ctdb/tests/tool/stubby.lvs.001.sh b/ctdb/tests/tool/stubby.lvs.001.sh
new file mode 100755 (executable)
index 0000000..29e9ce0
--- /dev/null
@@ -0,0 +1,27 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, all ok"
+
+# This isn't very useful, since the stub for capabilities doesn't set LVS :-)
+required_result 0 <<EOF
+EOF
+
+simple_test <<EOF
+NODEMAP
+0       192.168.20.41   0x0     CURRENT RECMASTER
+1       192.168.20.42   0x0
+2       192.168.20.43   0x0
+
+IFACES
+:Name:LinkStatus:References:
+:eth2:1:2:
+:eth1:1:4:
+
+VNNMAP
+654321
+0
+1
+2
+EOF
diff --git a/ctdb/tests/tool/stubby.lvsmaster.001.sh b/ctdb/tests/tool/stubby.lvsmaster.001.sh
new file mode 100755 (executable)
index 0000000..38de280
--- /dev/null
@@ -0,0 +1,28 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, all ok"
+
+# This isn't very useful, since the stub for capabilities doesn't set LVS :-)
+required_result 255 <<EOF
+There is no LVS master
+EOF
+
+simple_test <<EOF
+NODEMAP
+0       192.168.20.41   0x0     CURRENT RECMASTER
+1       192.168.20.42   0x0
+2       192.168.20.43   0x0
+
+IFACES
+:Name:LinkStatus:References:
+:eth2:1:2:
+:eth1:1:4:
+
+VNNMAP
+654321
+0
+1
+2
+EOF
diff --git a/ctdb/tests/tool/stubby.lvsmaster.002.sh b/ctdb/tests/tool/stubby.lvsmaster.002.sh
new file mode 100755 (executable)
index 0000000..ea6e441
--- /dev/null
@@ -0,0 +1,28 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, current disconnected"
+
+# This isn't very useful, since the stub for capabilities doesn't set LVS :-)
+required_result 10 <<EOF
+DATE TIME [PID]: Unable to get nodemap from local node
+EOF
+
+simple_test -Y <<EOF
+NODEMAP
+0       192.168.20.41   0x1     CURRENT RECMASTER
+1       192.168.20.42   0x0
+2       192.168.20.43   0x0
+
+IFACES
+:Name:LinkStatus:References:
+:eth2:1:2:
+:eth1:1:4:
+
+VNNMAP
+654321
+0
+1
+2
+EOF
diff --git a/ctdb/tests/tool/stubby.natgwlist.001.sh b/ctdb/tests/tool/stubby.natgwlist.001.sh
new file mode 100755 (executable)
index 0000000..f1d2d37
--- /dev/null
@@ -0,0 +1,37 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, all in natgw group, all ok"
+
+setup_natgw <<EOF
+192.168.20.41
+192.168.20.42
+192.168.20.43
+EOF
+
+required_result 0 <<EOF
+0 192.168.20.41
+Number of nodes:3
+pnn:0 192.168.20.41    OK (THIS NODE)
+pnn:1 192.168.20.42    OK
+pnn:2 192.168.20.43    OK
+EOF
+
+simple_test <<EOF
+NODEMAP
+0       192.168.20.41   0x0     CURRENT RECMASTER
+1       192.168.20.42   0x0
+2       192.168.20.43   0x0
+
+IFACES
+:Name:LinkStatus:References:
+:eth2:1:2:
+:eth1:1:4:
+
+VNNMAP
+654321
+0
+1
+2
+EOF
diff --git a/ctdb/tests/tool/stubby.natgwlist.002.sh b/ctdb/tests/tool/stubby.natgwlist.002.sh
new file mode 100755 (executable)
index 0000000..37f1722
--- /dev/null
@@ -0,0 +1,37 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, all in natgw group, 1 unhealthy"
+
+setup_natgw <<EOF
+192.168.20.41
+192.168.20.42
+192.168.20.43
+EOF
+
+required_result 0 <<EOF
+1 192.168.20.42
+Number of nodes:3
+pnn:0 192.168.20.41    UNHEALTHY
+pnn:1 192.168.20.42    OK (THIS NODE)
+pnn:2 192.168.20.43    OK
+EOF
+
+simple_test <<EOF
+NODEMAP
+0       192.168.20.41   0x2
+1       192.168.20.42   0x0     CURRENT RECMASTER
+2       192.168.20.43   0x0
+
+VNNMAP
+654321
+0
+1
+2
+
+IFACES
+:Name:LinkStatus:References:
+:eth2:1:2:
+:eth1:1:4:
+EOF
diff --git a/ctdb/tests/tool/stubby.natgwlist.003.sh b/ctdb/tests/tool/stubby.natgwlist.003.sh
new file mode 100755 (executable)
index 0000000..19b1797
--- /dev/null
@@ -0,0 +1,35 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, 2 in natgw group, 1 unhealthy"
+
+setup_natgw <<EOF
+192.168.20.41
+192.168.20.43
+EOF
+
+required_result 0 <<EOF
+2 192.168.20.43
+Number of nodes:2
+pnn:0 192.168.20.41    UNHEALTHY
+pnn:2 192.168.20.43    OK
+EOF
+
+simple_test <<EOF
+NODEMAP
+0       192.168.20.41   0x2
+1       192.168.20.42   0x0     CURRENT RECMASTER
+2       192.168.20.43   0x0
+
+VNNMAP
+654321
+0
+1
+2
+
+IFACES
+:Name:LinkStatus:References:
+:eth2:1:2:
+:eth1:1:4:
+EOF
diff --git a/ctdb/tests/tool/stubby.natgwlist.004.sh b/ctdb/tests/tool/stubby.natgwlist.004.sh
new file mode 100755 (executable)
index 0000000..2abec5e
--- /dev/null
@@ -0,0 +1,37 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, all unhealthy, all but 1 stopped"
+
+setup_natgw <<EOF
+192.168.20.41
+192.168.20.42
+192.168.20.43
+EOF
+
+required_result 0 <<EOF
+2 192.168.20.43
+Number of nodes:3
+pnn:0 192.168.20.41    UNHEALTHY|STOPPED|INACTIVE
+pnn:1 192.168.20.42    UNHEALTHY|STOPPED|INACTIVE (THIS NODE)
+pnn:2 192.168.20.43    UNHEALTHY
+EOF
+
+simple_test <<EOF
+NODEMAP
+0       192.168.20.41   0x22
+1       192.168.20.42   0x22     CURRENT RECMASTER
+2       192.168.20.43   0x2
+
+VNNMAP
+654321
+0
+1
+2
+
+IFACES
+:Name:LinkStatus:References:
+:eth2:1:2:
+:eth1:1:4:
+EOF
diff --git a/ctdb/tests/tool/stubby.natgwlist.005.sh b/ctdb/tests/tool/stubby.natgwlist.005.sh
new file mode 100755 (executable)
index 0000000..42c7dbb
--- /dev/null
@@ -0,0 +1,37 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "3 nodes, all stopped"
+
+setup_natgw <<EOF
+192.168.20.41
+192.168.20.42
+192.168.20.43
+EOF
+
+required_result 0 <<EOF
+0 192.168.20.41
+Number of nodes:3
+pnn:0 192.168.20.41    STOPPED|INACTIVE
+pnn:1 192.168.20.42    STOPPED|INACTIVE (THIS NODE)
+pnn:2 192.168.20.43    STOPPED|INACTIVE
+EOF
+
+simple_test <<EOF
+NODEMAP
+0       192.168.20.41   0x20
+1       192.168.20.42   0x20     CURRENT RECMASTER
+2       192.168.20.43   0x20
+
+VNNMAP
+654321
+0
+1
+2
+
+IFACES
+:Name:LinkStatus:References:
+:eth2:1:2:
+:eth1:1:4:
+EOF
diff --git a/ctdb/tests/tool/stubby.nodestatus.001.sh b/ctdb/tests/tool/stubby.nodestatus.001.sh
new file mode 100755 (executable)
index 0000000..6392b8d
--- /dev/null
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "all, 3 nodes, all OK"
+
+required_result 0 <<EOF
+Number of nodes:3
+pnn:0 192.168.20.41    OK
+pnn:1 192.168.20.42    OK
+pnn:2 192.168.20.43    OK (THIS NODE)
+EOF
+
+simple_test all <<EOF
+NODEMAP
+0       192.168.20.41   0x0
+1       192.168.20.42   0x0
+2       192.168.20.43   0x0     CURRENT RECMASTER
+
+IFACES
+:Name:LinkStatus:References:
+:eth2:1:2:
+:eth1:1:4:
+
+VNNMAP
+654321
+0
+1
+2
+EOF
diff --git a/ctdb/tests/tool/stubby.nodestatus.002.sh b/ctdb/tests/tool/stubby.nodestatus.002.sh
new file mode 100755 (executable)
index 0000000..f5b1909
--- /dev/null
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "-n all, 3 nodes, all OK"
+
+required_result 0 <<EOF
+Number of nodes:3
+pnn:0 192.168.20.41    OK
+pnn:1 192.168.20.42    OK
+pnn:2 192.168.20.43    OK (THIS NODE)
+EOF
+
+simple_test all <<EOF
+NODEMAP
+0       192.168.20.41   0x0
+1       192.168.20.42   0x0
+2       192.168.20.43   0x0     CURRENT RECMASTER
+
+IFACES
+:Name:LinkStatus:References:
+:eth2:1:2:
+:eth1:1:4:
+
+VNNMAP
+654321
+0
+1
+2
+EOF
diff --git a/ctdb/tests/tool/stubby.nodestatus.003.sh b/ctdb/tests/tool/stubby.nodestatus.003.sh
new file mode 100755 (executable)
index 0000000..a3a7a42
--- /dev/null
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "all, 3 nodes, 1 disconnected"
+
+required_result 1 <<EOF
+Number of nodes:3
+pnn:0 192.168.20.41    OK
+pnn:1 192.168.20.42    DISCONNECTED|INACTIVE
+pnn:2 192.168.20.43    OK (THIS NODE)
+EOF
+
+simple_test all <<EOF
+NODEMAP
+0       192.168.20.41   0x0
+1       192.168.20.42   0x1
+2       192.168.20.43   0x0     CURRENT RECMASTER
+
+IFACES
+:Name:LinkStatus:References:
+:eth2:1:2:
+:eth1:1:4:
+
+VNNMAP
+654321
+0
+1
+2
+EOF
diff --git a/ctdb/tests/tool/stubby.nodestatus.004.sh b/ctdb/tests/tool/stubby.nodestatus.004.sh
new file mode 100755 (executable)
index 0000000..bc98905
--- /dev/null
@@ -0,0 +1,31 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "-n all, 3 nodes, 1 disconnected"
+
+# -n all asks each node for the node status and 
+#        thus reports THIS NODE for each node
+
+required_result 0 <<EOF
+pnn:0 192.168.20.41    OK (THIS NODE)
+pnn:2 192.168.20.43    OK (THIS NODE)
+EOF
+
+simple_test -n all <<EOF
+NODEMAP
+0       192.168.20.41   0x0
+1       192.168.20.42   0x1
+2       192.168.20.43   0x0     CURRENT RECMASTER
+
+IFACES
+:Name:LinkStatus:References:
+:eth2:1:2:
+:eth1:1:4:
+
+VNNMAP
+654321
+0
+1
+2
+EOF
diff --git a/ctdb/tests/tool/stubby.nodestatus.005.sh b/ctdb/tests/tool/stubby.nodestatus.005.sh
new file mode 100755 (executable)
index 0000000..cb532e7
--- /dev/null
@@ -0,0 +1,34 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "-n all all, 3 nodes, 1 disconnected"
+
+required_result 1 <<EOF
+Number of nodes:3
+pnn:0 192.168.20.41    OK (THIS NODE)
+pnn:1 192.168.20.42    DISCONNECTED|INACTIVE
+pnn:2 192.168.20.43    OK
+Number of nodes:3
+pnn:0 192.168.20.41    OK
+pnn:1 192.168.20.42    DISCONNECTED|INACTIVE
+pnn:2 192.168.20.43    OK (THIS NODE)
+EOF
+
+simple_test -n all all <<EOF
+NODEMAP
+0       192.168.20.41   0x0
+1       192.168.20.42   0x1
+2       192.168.20.43   0x0     CURRENT RECMASTER
+
+IFACES
+:Name:LinkStatus:References:
+:eth2:1:2:
+:eth1:1:4:
+
+VNNMAP
+654321
+0
+1
+2
+EOF
diff --git a/ctdb/tests/tool/stubby.status.001.sh b/ctdb/tests/tool/stubby.status.001.sh
new file mode 100755 (executable)
index 0000000..48b5bac
--- /dev/null
@@ -0,0 +1,37 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "all, 3 nodes, all ok"
+
+required_result 0 <<EOF
+Number of nodes:3
+pnn:0 192.168.20.41    OK (THIS NODE)
+pnn:1 192.168.20.42    OK
+pnn:2 192.168.20.43    OK
+Generation:654321
+Size:3
+hash:0 lmaster:0
+hash:1 lmaster:1
+hash:2 lmaster:2
+Recovery mode:NORMAL (0)
+Recovery master:0
+EOF
+
+simple_test all <<EOF
+NODEMAP
+0       192.168.20.41   0x0     CURRENT RECMASTER
+1       192.168.20.42   0x0
+2       192.168.20.43   0x0
+
+IFACES
+:Name:LinkStatus:References:
+:eth2:1:2:
+:eth1:1:4:
+
+VNNMAP
+654321
+0
+1
+2
+EOF
diff --git a/ctdb/tests/tool/stubby.status.002.sh b/ctdb/tests/tool/stubby.status.002.sh
new file mode 100755 (executable)
index 0000000..fceceb3
--- /dev/null
@@ -0,0 +1,37 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "all, 3 nodes, 1 unhealthy"
+
+required_result 0 <<EOF
+Number of nodes:3
+pnn:0 192.168.20.41    UNHEALTHY
+pnn:1 192.168.20.42    OK (THIS NODE)
+pnn:2 192.168.20.43    OK
+Generation:654321
+Size:3
+hash:0 lmaster:0
+hash:1 lmaster:1
+hash:2 lmaster:2
+Recovery mode:NORMAL (0)
+Recovery master:1
+EOF
+
+simple_test <<EOF
+NODEMAP
+0       192.168.20.41   0x2
+1       192.168.20.42   0x0     CURRENT RECMASTER
+2       192.168.20.43   0x0
+
+VNNMAP
+654321
+0
+1
+2
+
+IFACES
+:Name:LinkStatus:References:
+:eth2:1:2:
+:eth1:1:4:
+EOF
diff --git a/ctdb/tests/tool/testcases/stubby.nodestatus.005.sh b/ctdb/tests/tool/testcases/stubby.nodestatus.005.sh
new file mode 100755 (executable)
index 0000000..a18608d
--- /dev/null
@@ -0,0 +1,34 @@
+#!/bin/sh
+
+. "${TESTS_SUBDIR}/common.sh"
+
+define_test "-n all all, 3 nodes, 1 disconnected"
+
+required_result 1 <<EOF
+Number of nodes:3
+pnn:0 192.168.20.41    OK (THIS NODE)
+pnn:1 192.168.20.42    DISCONNECTED|INACTIVE
+pnn:2 192.168.20.43    OK
+Number of nodes:3
+pnn:0 192.168.20.41    OK
+pnn:1 192.168.20.42    DISCONNECTED|INACTIVE
+pnn:2 192.168.20.43    OK (THIS NODE)
+EOF
+
+simple_test -n all all <<EOF
+NODEMAP
+0       192.168.20.41   0x0
+1       192.168.20.42   0x1
+2       192.168.20.43   0x0     CURRENT RECMASTER
+
+IFACES
+:Name:LinkStatus:References:
+:eth2:1:2:
+:eth1:1:4:
+
+VNNMAP
+654321
+0
+1
+2
+EOF
diff --git a/ctdb/tools/ctdb.c b/ctdb/tools/ctdb.c
new file mode 100644 (file)
index 0000000..c5d4a4a
--- /dev/null
@@ -0,0 +1,6281 @@
+/* 
+   ctdb control tool
+
+   Copyright (C) Andrew Tridgell  2007
+   Copyright (C) Ronnie Sahlberg  2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/time.h"
+#include "system/filesys.h"
+#include "system/network.h"
+#include "system/locale.h"
+#include "popt.h"
+#include "cmdline.h"
+#include "../include/ctdb_version.h"
+#include "../include/ctdb_client.h"
+#include "../include/ctdb_private.h"
+#include "../common/rb_tree.h"
+#include "db_wrap.h"
+
+#define ERR_TIMEOUT    20      /* timed out trying to reach node */
+#define ERR_NONODE     21      /* node does not exist */
+#define ERR_DISNODE    22      /* node is disconnected */
+
+static void usage(void);
+
+/*
+ * Global settings for this tool, read by the control functions below.
+ * NOTE(review): presumably filled in from the command line - the code
+ * that sets these fields is outside this part of the file.
+ */
+static struct {
+       int timelimit;          /* seconds; base of TIMELIMIT()/LONGTIMELIMIT() */
+       uint32_t pnn;           /* target node; may also be CTDB_CURRENT_NODE,
+                                  CTDB_BROADCAST_ALL or CTDB_MULTICAST */
+       uint32_t *nodes;        /* presumably the node list produced by
+                                  parse_nodestring() - TODO confirm */
+       int machinereadable;    /* non-zero: colon-separated output */
+       int verbose;
+       int maxruntime;
+       int printemptyrecords;
+       int printdatasize;
+       int printlmaster;
+       int printhash;
+       int printrecordflags;
+} options;
+
+/*
+ * Extended timeout: ten times options.timelimit.  The expansion is
+ * parenthesized so the macro stays a single operand wherever it is used
+ * (e.g. "x / LONGTIMEOUT" or "LONGTIMEOUT % y").
+ */
+#define LONGTIMEOUT (options.timelimit*10)
+
+/* Timeout values handed to the ctdb_ctrl_*() calls in this file. */
+#define TIMELIMIT() timeval_current_ofs(options.timelimit, 0)
+#define LONGTIMELIMIT() timeval_current_ofs(LONGTIMEOUT, 0)
+
+/*
+  print the version string this tool was built with; the arguments are
+  not used
+ */
+static int control_version(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       const char *version = CTDB_VERSION_STRING;
+
+       printf("CTDB version: %s\n", version);
+
+       return 0;
+}
+
+/*
+ * Abort the process if p is NULL (or otherwise false), after logging an
+ * out-of-memory message that includes the source location of the check.
+ * Wrapped in do { } while (0) so it behaves as a single statement.
+ */
+#define CTDB_NOMEM_ABORT(p) do { if (!(p)) {                           \
+               DEBUG(DEBUG_ALERT,("ctdb fatal error: %s\n",            \
+                                  "Out of memory in " __location__ )); \
+               abort();                                                \
+       }} while (0)
+
+/*
+ * Resolve options.pnn to a single concrete node number.
+ *
+ * Exits with status 1 if options.pnn addresses several nodes
+ * (CTDB_BROADCAST_ALL or CTDB_MULTICAST).  CTDB_CURRENT_NODE is resolved
+ * via ctdb_get_pnn(); any other value is returned unchanged.
+ */
+static uint32_t getpnn(struct ctdb_context *ctdb)
+{
+       uint32_t target = options.pnn;
+
+       if (target == CTDB_BROADCAST_ALL || target == CTDB_MULTICAST) {
+               DEBUG(DEBUG_ERR,
+                     ("Cannot get PNN for node %u\n", options.pnn));
+               exit(1);
+       }
+
+       if (target != CTDB_CURRENT_NODE) {
+               return target;
+       }
+
+       return ctdb_get_pnn(ctdb);
+}
+
+/*
+ * Guard for controls that only make sense for one node: log an error and
+ * exit with status 1 when options.pnn is CTDB_BROADCAST_ALL or
+ * CTDB_MULTICAST; otherwise do nothing.
+ */
+static void assert_single_node_only(void)
+{
+       bool multiple = (options.pnn == CTDB_BROADCAST_ALL) ||
+                       (options.pnn == CTDB_MULTICAST);
+
+       if (!multiple) {
+               return;
+       }
+
+       DEBUG(DEBUG_ERR,
+             ("This control can not be applied to multiple PNNs\n"));
+       exit(1);
+}
+
+/* Render node flags as a human-readable string, e.g. "A|B", or "OK" when
+ * none of the known flags is set.
+ *
+ * The result lives in a static buffer that is overwritten by the next
+ * call.  This never returns NULL!
+ */
+static const char *pretty_print_flags(uint32_t flags)
+{
+       /* Names are emitted in table order. */
+       static const struct {
+               uint32_t flag;
+               const char *name;
+       } flag_names[] = {
+               { NODE_FLAGS_DISCONNECTED,          "DISCONNECTED" },
+               { NODE_FLAGS_PERMANENTLY_DISABLED,  "DISABLED" },
+               { NODE_FLAGS_BANNED,                "BANNED" },
+               { NODE_FLAGS_UNHEALTHY,             "UNHEALTHY" },
+               { NODE_FLAGS_DELETED,               "DELETED" },
+               { NODE_FLAGS_STOPPED,               "STOPPED" },
+               { NODE_FLAGS_INACTIVE,              "INACTIVE" },
+       };
+       static char flags_str[512]; /* Big enough to contain all flag names */
+       size_t idx;
+
+       flags_str[0] = '\0';
+
+       for (idx = 0; idx < ARRAY_SIZE(flag_names); idx++) {
+               if ((flags & flag_names[idx].flag) == 0) {
+                       continue;
+               }
+               /* Separator only between names, never in front */
+               if (flags_str[0] != '\0') {
+                       (void) strcat(flags_str, "|");
+               }
+               (void) strcat(flags_str, flag_names[idx].name);
+       }
+
+       if (flags_str[0] == '\0') {
+               (void) strcpy(flags_str, "OK");
+       }
+
+       return flags_str;
+}
+
+/*
+ * Convert one hexadecimal digit to its value (0..15).
+ *
+ * Accepts '0'-'9', 'a'-'f' and 'A'-'F'.  The input is not validated: any
+ * other character falls through to "h - '0'" and yields a meaningless
+ * value, so callers must pass hex digits only.
+ */
+static int h2i(char h)
+{
+       if (h >= 'a' && h <= 'f') return h - 'a' + 10;
+       /* upper case must be offset from 'A'; subtracting 'f' gave
+        * negative values for 'A'..'F' */
+       if (h >= 'A' && h <= 'F') return h - 'A' + 10;
+       return h - '0';
+}
+
+/*
+ * Decode a string of hexadecimal digits into a binary TDB_DATA blob
+ * allocated on mem_ctx.
+ *
+ * Two digits form one byte, so the string must have even length.
+ * Returns {NULL, 0} if the length is odd or the allocation fails.
+ * The digits themselves are not validated (see h2i()).
+ */
+static TDB_DATA hextodata(TALLOC_CTX *mem_ctx, const char *str)
+{
+       int i, len;
+       TDB_DATA key = {NULL, 0};
+
+       len = strlen(str);
+       if (len & 0x01) {
+               DEBUG(DEBUG_ERR,("Key specified with odd number of hexadecimal digits\n"));
+               return key;
+       }
+
+       key.dsize = len>>1;
+       key.dptr  = talloc_size(mem_ctx, key.dsize);
+       if (key.dptr == NULL) {
+               /* do not write through a failed allocation */
+               DEBUG(DEBUG_ERR,(__location__ " Out of memory\n"));
+               key.dsize = 0;
+               return key;
+       }
+
+       for (i=0; i < len/2; i++) {
+               key.dptr[i] = h2i(str[i*2]) << 4 | h2i(str[i*2+1]);
+       }
+       return key;
+}
+
+/* Parse a nodestring.  Parameter dd_ok controls what happens to nodes
+ * that are disconnected or deleted.  If dd_ok is true those nodes are
+ * included in the output list of nodes.  If dd_ok is false, those
+ * nodes are filtered from the "all" case and cause an error if
+ * explicitly specified.
+ *
+ * nodestring is NULL (use current_pnn), "all", or a comma-separated list
+ * of node numbers.  On success *nodes is an array allocated on mem_ctx
+ * and *pnn_mode is CTDB_CURRENT_NODE, CTDB_BROADCAST_ALL, the single node
+ * number, or CTDB_MULTICAST.
+ *
+ * Returns true on success and false on allocation failure or when the
+ * PNN of current_pnn can not be fetched.  Exits the process if the
+ * nodemap can not be read (10), a listed node does not exist
+ * (ERR_NONODE), or - with dd_ok false - a listed node is
+ * disconnected/deleted (ERR_DISNODE) or not reachable (10).
+ */
+static bool parse_nodestring(struct ctdb_context *ctdb,
+                            TALLOC_CTX *mem_ctx,
+                            const char * nodestring,
+                            uint32_t current_pnn,
+                            bool dd_ok,
+                            uint32_t **nodes,
+                            uint32_t *pnn_mode)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(mem_ctx);
+       int n;
+       uint32_t i;
+       struct ctdb_node_map *nodemap;
+       int ret;
+
+       *nodes = NULL;
+
+       ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get nodemap from local node\n"));
+               talloc_free(tmp_ctx);
+               exit(10);
+       }
+
+       if (nodestring != NULL) {
+               *nodes = talloc_array(mem_ctx, uint32_t, 0);
+               if (*nodes == NULL) {
+                       goto failed;
+               }
+
+               n = 0;
+
+               if (strcmp(nodestring, "all") == 0) {
+                       *pnn_mode = CTDB_BROADCAST_ALL;
+
+                       /* all */
+                       for (i = 0; i < nodemap->num; i++) {
+                               if ((nodemap->nodes[i].flags &
+                                    (NODE_FLAGS_DISCONNECTED |
+                                     NODE_FLAGS_DELETED)) && !dd_ok) {
+                                       continue;
+                               }
+                               *nodes = talloc_realloc(mem_ctx, *nodes,
+                                                       uint32_t, n+1);
+                               if (*nodes == NULL) {
+                                       goto failed;
+                               }
+                               (*nodes)[n] = i;
+                               n++;
+                       }
+               } else {
+                       /* x{,y...} */
+                       char *ns, *tok;
+
+                       ns = talloc_strdup(tmp_ctx, nodestring);
+                       if (ns == NULL) {
+                               /* strtok() must not be started on NULL */
+                               goto failed;
+                       }
+                       tok = strtok(ns, ",");
+                       while (tok != NULL) {
+                               i = (uint32_t)strtoul(tok, NULL, 0);
+                               if (i >= nodemap->num) {
+                                       DEBUG(DEBUG_ERR, ("Node %u does not exist\n", i));
+                                       talloc_free(tmp_ctx);
+                                       exit(ERR_NONODE);
+                               }
+                               if ((nodemap->nodes[i].flags &
+                                    (NODE_FLAGS_DISCONNECTED |
+                                     NODE_FLAGS_DELETED)) && !dd_ok) {
+                                       DEBUG(DEBUG_ERR, ("Node %u has status %s\n", i, pretty_print_flags(nodemap->nodes[i].flags)));
+                                       talloc_free(tmp_ctx);
+                                       exit(ERR_DISNODE);
+                               }
+                               /* Reachability probe.  The result must be
+                                * kept in a signed int: stored in a
+                                * uint32_t the "< 0" test could never be
+                                * true.  The probe is skipped when dd_ok
+                                * is set, because the caller then
+                                * explicitly accepts nodes that can not
+                                * be contacted.
+                                */
+                               if (!dd_ok) {
+                                       ret = ctdb_ctrl_getpnn(ctdb, TIMELIMIT(), i);
+                                       if (ret < 0) {
+                                               DEBUG(DEBUG_ERR, ("Can not access node %u. Node is not operational.\n", i));
+                                               talloc_free(tmp_ctx);
+                                               exit(10);
+                                       }
+                               }
+
+                               *nodes = talloc_realloc(mem_ctx, *nodes,
+                                                       uint32_t, n+1);
+                               if (*nodes == NULL) {
+                                       goto failed;
+                               }
+
+                               (*nodes)[n] = i;
+                               n++;
+
+                               tok = strtok(NULL, ",");
+                       }
+                       talloc_free(ns);
+
+                       if (n == 1) {
+                               *pnn_mode = (*nodes)[0];
+                       } else {
+                               *pnn_mode = CTDB_MULTICAST;
+                       }
+               }
+       } else {
+               /* default - no nodes specified */
+               *nodes = talloc_array(mem_ctx, uint32_t, 1);
+               if (*nodes == NULL) {
+                       goto failed;
+               }
+               *pnn_mode = CTDB_CURRENT_NODE;
+
+               /* check the signed result before storing it as uint32_t */
+               ret = ctdb_ctrl_getpnn(ctdb, TIMELIMIT(), current_pnn);
+               if (ret < 0) {
+                       goto failed;
+               }
+               (*nodes)[0] = (uint32_t)ret;
+       }
+
+       talloc_free(tmp_ctx);
+       return true;
+
+failed:
+       talloc_free(tmp_ctx);
+       return false;
+}
+
+/*
+ check if a database exists on node options.pnn
+
+ dbarg is either a database id written with a leading "0x" or a database
+ name.  On a match the id is stored in *dbid and the flags in *flags
+ (either pointer may be NULL) and true is returned.  Returns false if
+ nothing matches or if the db map / a db name can not be fetched.
+*/
+static bool db_exists(struct ctdb_context *ctdb, const char *dbarg, uint32_t *dbid, uint8_t *flags)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       struct ctdb_dbid_map *dbmap=NULL;
+       bool by_id = false;
+       bool found = false;
+       uint32_t id;
+       int i, ret;
+
+       ret = ctdb_ctrl_getdbmap(ctdb, TIMELIMIT(), options.pnn, tmp_ctx, &dbmap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get dbids from node %u\n", options.pnn));
+               goto fail;
+       }
+
+       if (strncmp(dbarg, "0x", 2) == 0) {
+               id = strtoul(dbarg, NULL, 0);
+               by_id = true;
+       }
+
+       for(i=0; i<dbmap->num; i++) {
+               const char *name;
+
+               if (by_id) {
+                       /* compare ids only, no name lookup needed */
+                       if (id != dbmap->dbs[i].dbid) {
+                               continue;
+                       }
+                       found = true;
+                       break;
+               }
+
+               ret = ctdb_ctrl_getdbname(ctdb, TIMELIMIT(), options.pnn, dbmap->dbs[i].dbid, tmp_ctx, &name);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, ("Unable to get dbname from dbid %u\n", dbmap->dbs[i].dbid));
+                       goto fail;
+               }
+
+               if (strcmp(name, dbarg) != 0) {
+                       continue;
+               }
+
+               id = dbmap->dbs[i].dbid;
+               found = true;
+               break;
+       }
+
+       if (!found) {
+               DEBUG(DEBUG_ERR,("No database matching '%s' found\n", dbarg));
+       } else {
+               /* i still indexes the matching entry here */
+               if (dbid) *dbid = id;
+               if (flags) *flags = dbmap->dbs[i].flags;
+       }
+
+fail:
+       talloc_free(tmp_ctx);
+       return found;
+}
+
+/*
+  see if a process exists
+
+  argv[0] has the form "pnn:pid".  Prints the outcome and returns the
+  result of ctdb_ctrl_process_exists() (0 is reported as "exists"), or -1
+  if the argument can not be parsed.
+ */
+static int control_process_exists(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       uint32_t pnn, pid;
+       int status;
+
+       if (argc < 1) {
+               usage();
+       }
+
+       if (sscanf(argv[0], "%u:%u", &pnn, &pid) != 2) {
+               DEBUG(DEBUG_ERR, ("Badly formed pnn:pid\n"));
+               return -1;
+       }
+
+       status = ctdb_ctrl_process_exists(ctdb, pnn, pid);
+       if (status != 0) {
+               printf("%u:%u does not exist\n", pnn, pid);
+       } else {
+               printf("%u:%u exists\n", pnn, pid);
+       }
+
+       return status;
+}
+
+/*
+  display statistics structure
+
+  With options.machinereadable set, one colon-separated record is printed,
+  preceded by a header line if show_header is non-zero.  Otherwise a
+  human-readable report is printed and show_header is ignored.
+ */
+static void show_statistics(struct ctdb_statistics *s, int show_header)
+{
+       /* NOTE(review): tmp_ctx is only allocated here and freed at the
+        * end; nothing in this function allocates from it. */
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       int i;
+       const char *prefix=NULL;
+       int preflen=0;
+       int tmp, days, hours, minutes, seconds;
+       /* Table of counters: printed name and byte offset of the field
+        * inside struct ctdb_statistics.  Each field is read back below
+        * as a uint32_t. */
+       const struct {
+               const char *name;
+               uint32_t offset;
+       } fields[] = {
+#define STATISTICS_FIELD(n) { #n, offsetof(struct ctdb_statistics, n) }
+               STATISTICS_FIELD(num_clients),
+               STATISTICS_FIELD(frozen),
+               STATISTICS_FIELD(recovering),
+               STATISTICS_FIELD(num_recoveries),
+               STATISTICS_FIELD(client_packets_sent),
+               STATISTICS_FIELD(client_packets_recv),
+               STATISTICS_FIELD(node_packets_sent),
+               STATISTICS_FIELD(node_packets_recv),
+               STATISTICS_FIELD(keepalive_packets_sent),
+               STATISTICS_FIELD(keepalive_packets_recv),
+               STATISTICS_FIELD(node.req_call),
+               STATISTICS_FIELD(node.reply_call),
+               STATISTICS_FIELD(node.req_dmaster),
+               STATISTICS_FIELD(node.reply_dmaster),
+               STATISTICS_FIELD(node.reply_error),
+               STATISTICS_FIELD(node.req_message),
+               STATISTICS_FIELD(node.req_control),
+               STATISTICS_FIELD(node.reply_control),
+               STATISTICS_FIELD(client.req_call),
+               STATISTICS_FIELD(client.req_message),
+               STATISTICS_FIELD(client.req_control),
+               STATISTICS_FIELD(timeouts.call),
+               STATISTICS_FIELD(timeouts.control),
+               STATISTICS_FIELD(timeouts.traverse),
+               STATISTICS_FIELD(locks.num_calls),
+               STATISTICS_FIELD(locks.num_current),
+               STATISTICS_FIELD(locks.num_pending),
+               STATISTICS_FIELD(locks.num_failed),
+               STATISTICS_FIELD(total_calls),
+               STATISTICS_FIELD(pending_calls),
+               STATISTICS_FIELD(childwrite_calls),
+               STATISTICS_FIELD(pending_childwrite_calls),
+               STATISTICS_FIELD(memory_used),
+               STATISTICS_FIELD(max_hop_count),
+               STATISTICS_FIELD(total_ro_delegations),
+               STATISTICS_FIELD(total_ro_revokes),
+       };
+       
+       /* Split the collection period into days/hours/minutes/seconds */
+       tmp = s->statistics_current_time.tv_sec - s->statistics_start_time.tv_sec;
+       seconds = tmp%60;
+       tmp    /= 60;
+       minutes = tmp%60;
+       tmp    /= 60;
+       hours   = tmp%24;
+       tmp    /= 24;
+       days    = tmp;
+
+       if (options.machinereadable){
+               if (show_header) {
+                       printf("CTDB version:");
+                       printf("Current time of statistics:");
+                       printf("Statistics collected since:");
+                       for (i=0;i<ARRAY_SIZE(fields);i++) {
+                               printf("%s:", fields[i].name);
+                       }
+                       printf("num_reclock_ctdbd_latency:");
+                       printf("min_reclock_ctdbd_latency:");
+                       printf("avg_reclock_ctdbd_latency:");
+                       printf("max_reclock_ctdbd_latency:");
+
+                       printf("num_reclock_recd_latency:");
+                       printf("min_reclock_recd_latency:");
+                       printf("avg_reclock_recd_latency:");
+                       printf("max_reclock_recd_latency:");
+
+                       printf("num_call_latency:");
+                       printf("min_call_latency:");
+                       printf("avg_call_latency:");
+                       printf("max_call_latency:");
+
+                       /* NOTE(review): the four lockwait columns are
+                        * announced here, but the value row below prints
+                        * no lockwait values, so the header has four
+                        * more columns than the data and the childwrite
+                        * values line up under the lockwait names. */
+                       printf("num_lockwait_latency:");
+                       printf("min_lockwait_latency:");
+                       printf("avg_lockwait_latency:");
+                       printf("max_lockwait_latency:");
+
+                       printf("num_childwrite_latency:");
+                       printf("min_childwrite_latency:");
+                       printf("avg_childwrite_latency:");
+                       printf("max_childwrite_latency:");
+                       printf("\n");
+               }
+               printf("%d:", CTDB_VERSION);
+               printf("%d:", (int)s->statistics_current_time.tv_sec);
+               printf("%d:", (int)s->statistics_start_time.tv_sec);
+               for (i=0;i<ARRAY_SIZE(fields);i++) {
+                       printf("%d:", *(uint32_t *)(fields[i].offset+(uint8_t *)s));
+               }
+               /* Each latency group is num:min:avg:max; the average is
+                * total/num, or 0.0 when there are no samples. */
+               printf("%d:", s->reclock.ctdbd.num);
+               printf("%.6f:", s->reclock.ctdbd.min);
+               printf("%.6f:", s->reclock.ctdbd.num?s->reclock.ctdbd.total/s->reclock.ctdbd.num:0.0);
+               printf("%.6f:", s->reclock.ctdbd.max);
+
+               printf("%d:", s->reclock.recd.num);
+               printf("%.6f:", s->reclock.recd.min);
+               printf("%.6f:", s->reclock.recd.num?s->reclock.recd.total/s->reclock.recd.num:0.0);
+               printf("%.6f:", s->reclock.recd.max);
+
+               printf("%d:", s->call_latency.num);
+               printf("%.6f:", s->call_latency.min);
+               printf("%.6f:", s->call_latency.num?s->call_latency.total/s->call_latency.num:0.0);
+               printf("%.6f:", s->call_latency.max);
+
+               printf("%d:", s->childwrite_latency.num);
+               printf("%.6f:", s->childwrite_latency.min);
+               printf("%.6f:", s->childwrite_latency.num?s->childwrite_latency.total/s->childwrite_latency.num:0.0);
+               printf("%.6f:", s->childwrite_latency.max);
+               printf("\n");
+       } else {
+               printf("CTDB version %u\n", CTDB_VERSION);
+               printf("Current time of statistics  :                %s", ctime(&s->statistics_current_time.tv_sec));
+               printf("Statistics collected since  : (%03d %02d:%02d:%02d) %s", days, hours, minutes, seconds, ctime(&s->statistics_start_time.tv_sec));
+
+               for (i=0;i<ARRAY_SIZE(fields);i++) {
+                       /* Dotted names ("group.field") are printed
+                        * indented under a group heading; the heading is
+                        * emitted once, when the group changes. */
+                       if (strchr(fields[i].name, '.')) {
+                               preflen = strcspn(fields[i].name, ".")+1;
+                               if (!prefix || strncmp(prefix, fields[i].name, preflen) != 0) {
+                                       prefix = fields[i].name;
+                                       printf(" %*.*s\n", preflen-1, preflen-1, fields[i].name);
+                               }
+                       } else {
+                               preflen = 0;
+                       }
+                       printf(" %*s%-22s%*s%10u\n", 
+                              preflen?4:0, "",
+                              fields[i].name+preflen, 
+                              preflen?0:4, "",
+                              *(uint32_t *)(fields[i].offset+(uint8_t *)s));
+               }
+               printf(" hop_count_buckets:");
+               for (i=0;i<MAX_COUNT_BUCKETS;i++) {
+                       printf(" %d", s->hop_count_bucket[i]);
+               }
+               printf("\n");
+               printf(" lock_buckets:");
+               for (i=0; i<MAX_COUNT_BUCKETS; i++) {
+                       printf(" %d", s->locks.buckets[i]);
+               }
+               printf("\n");
+               printf(" %-30s     %.6f/%.6f/%.6f sec out of %d\n", "locks_latency      MIN/AVG/MAX", s->locks.latency.min, s->locks.latency.num?s->locks.latency.total/s->locks.latency.num:0.0, s->locks.latency.max, s->locks.latency.num);
+
+               printf(" %-30s     %.6f/%.6f/%.6f sec out of %d\n", "reclock_ctdbd      MIN/AVG/MAX", s->reclock.ctdbd.min, s->reclock.ctdbd.num?s->reclock.ctdbd.total/s->reclock.ctdbd.num:0.0, s->reclock.ctdbd.max, s->reclock.ctdbd.num);
+
+               printf(" %-30s     %.6f/%.6f/%.6f sec out of %d\n", "reclock_recd       MIN/AVG/MAX", s->reclock.recd.min, s->reclock.recd.num?s->reclock.recd.total/s->reclock.recd.num:0.0, s->reclock.recd.max, s->reclock.recd.num);
+
+               printf(" %-30s     %.6f/%.6f/%.6f sec out of %d\n", "call_latency       MIN/AVG/MAX", s->call_latency.min, s->call_latency.num?s->call_latency.total/s->call_latency.num:0.0, s->call_latency.max, s->call_latency.num);
+               printf(" %-30s     %.6f/%.6f/%.6f sec out of %d\n", "childwrite_latency MIN/AVG/MAX", s->childwrite_latency.min, s->childwrite_latency.num?s->childwrite_latency.total/s->childwrite_latency.num:0.0, s->childwrite_latency.max, s->childwrite_latency.num);
+       }
+
+       talloc_free(tmp_ctx);
+}
+
+/*
+  display remote ctdb statistics combined from all nodes
+
+  Every 32-bit counter located before __last_counter in struct
+  ctdb_statistics is summed over the connected nodes; max_hop_count and
+  call_latency.max are combined as maxima instead.  Returns 0 on success
+  or the error from ctdb_ctrl_statistics() for the first node that fails.
+ */
+static int control_statistics_all(struct ctdb_context *ctdb)
+{
+       int ret;
+       uint32_t i;
+       struct ctdb_statistics statistics;
+       uint32_t *nodes;
+       uint32_t num_nodes;
+       /* number of uint32_t counters at the start of the structure */
+       const uint32_t num_ints =
+               offsetof(struct ctdb_statistics, __last_counter) / sizeof(uint32_t);
+
+       nodes = ctdb_get_connected_nodes(ctdb, TIMELIMIT(), ctdb, &num_nodes);
+       CTDB_NO_MEMORY(ctdb, nodes);
+
+       ZERO_STRUCT(statistics);
+
+       for (i=0;i<num_nodes;i++) {
+               struct ctdb_statistics s1;
+               uint32_t j;
+               uint32_t *v1 = (uint32_t *)&s1;
+               uint32_t *v2 = (uint32_t *)&statistics;
+               /* Remember the running maximum before the summation
+                * below: if max_hop_count lies inside the summed counter
+                * range it would otherwise be compared after already
+                * having been added up. */
+               uint32_t prev_max_hop_count = statistics.max_hop_count;
+
+               ret = ctdb_ctrl_statistics(ctdb, nodes[i], &s1);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, ("Unable to get statistics from node %u\n", nodes[i]));
+                       /* do not leak the node list on the error path */
+                       talloc_free(nodes);
+                       return ret;
+               }
+               for (j=0;j<num_ints;j++) {
+                       v2[j] += v1[j];
+               }
+               statistics.max_hop_count =
+                       MAX(prev_max_hop_count, s1.max_hop_count);
+               statistics.call_latency.max =
+                       MAX(statistics.call_latency.max, s1.call_latency.max);
+       }
+       talloc_free(nodes);
+       printf("Gathered statistics for %u nodes\n", num_nodes);
+       show_statistics(&statistics, 1);
+       return 0;
+}
+
+/*
+  display remote ctdb statistics
+
+  For CTDB_BROADCAST_ALL the combined statistics of all connected nodes
+  are shown, otherwise those of node options.pnn.
+ */
+static int control_statistics(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       struct ctdb_statistics node_stats;
+       int status;
+
+       if (options.pnn == CTDB_BROADCAST_ALL) {
+               return control_statistics_all(ctdb);
+       }
+
+       status = ctdb_ctrl_statistics(ctdb, options.pnn, &node_stats);
+       if (status == 0) {
+               show_statistics(&node_stats, 1);
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR, ("Unable to get statistics from node %u\n", options.pnn));
+       return status;
+}
+
+
+/*
+  reset remote ctdb statistics on node options.pnn
+
+  Returns the result of ctdb_statistics_reset(), logging an error when it
+  is non-zero.
+ */
+static int control_statistics_reset(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int status = ctdb_statistics_reset(ctdb, options.pnn);
+
+       if (status != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to reset statistics on node %u\n", options.pnn));
+       }
+
+       /* status is 0 here exactly when the reset succeeded */
+       return status;
+}
+
+
+/*
+  display remote ctdb rolling statistics
+
+  With exactly one argument N, output stops after the history entry at
+  index N-1.  Entries whose start time is 0 are skipped.  Only valid for
+  a single node.
+ */
+static int control_stats(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       struct ctdb_statistics_wire *history;
+       int last_index = -1;    /* -1: no limit */
+       int idx;
+       int status;
+
+       assert_single_node_only();
+
+       if (argc ==1) {
+               last_index = atoi(argv[0]) - 1;
+       }
+
+       status = ctdb_ctrl_getstathistory(ctdb, TIMELIMIT(), options.pnn, ctdb, &history);
+       if (status != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get rolling statistics from node %u\n", options.pnn));
+               return status;
+       }
+
+       for (idx = 0; idx < history->num; idx++) {
+               if (history->stats[idx].statistics_start_time.tv_sec != 0) {
+                       /* header only in front of the very first slot */
+                       show_statistics(&history->stats[idx], idx==0);
+                       if (idx == last_index) {
+                               break;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+
+/*
+  display remote ctdb db statistics
+
+  argv[0] names the database (name or "0x"-prefixed id, see db_exists()).
+  Returns 0 on success and -1 if the database is unknown or its
+  statistics can not be read from node options.pnn.
+ */
+static int control_dbstatistics(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       TALLOC_CTX *tmp_ctx;
+       struct ctdb_db_statistics *dbstat;
+       int i;
+       uint32_t db_id;
+       int num_hot_keys;
+       int ret;
+
+       if (argc < 1) {
+               usage();
+       }
+
+       if (!db_exists(ctdb, argv[0], &db_id, NULL)) {
+               return -1;
+       }
+
+       /* allocated only after the early return above, so that a failed
+        * lookup does not leak the context */
+       tmp_ctx = talloc_new(ctdb);
+
+       ret = ctdb_ctrl_dbstatistics(ctdb, options.pnn, db_id, tmp_ctx, &dbstat);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to read db statistics from node\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       printf("DB Statistics: %s\n", argv[0]);
+       printf(" %*s%-22s%*s%10u\n", 0, "", "ro_delegations", 4, "",
+               dbstat->db_ro_delegations);
+       /* the revokes row must show the revoke counter, not a second
+        * copy of the delegation counter */
+       printf(" %*s%-22s%*s%10u\n", 0, "", "ro_revokes", 4, "",
+               dbstat->db_ro_revokes);
+       printf(" %s\n", "locks");
+       printf(" %*s%-22s%*s%10u\n", 4, "", "total", 0, "",
+               dbstat->locks.num_calls);
+       printf(" %*s%-22s%*s%10u\n", 4, "", "failed", 0, "",
+               dbstat->locks.num_failed);
+       printf(" %*s%-22s%*s%10u\n", 4, "", "current", 0, "",
+               dbstat->locks.num_current);
+       printf(" %*s%-22s%*s%10u\n", 4, "", "pending", 0, "",
+               dbstat->locks.num_pending);
+       printf(" %s", "hop_count_buckets:");
+       for (i=0; i<MAX_COUNT_BUCKETS; i++) {
+               printf(" %d", dbstat->hop_count_bucket[i]);
+       }
+       printf("\n");
+       printf(" %s", "lock_buckets:");
+       for (i=0; i<MAX_COUNT_BUCKETS; i++) {
+               printf(" %d", dbstat->locks.buckets[i]);
+       }
+       printf("\n");
+       printf(" %-30s     %.6f/%.6f/%.6f sec out of %d\n",
+               "locks_latency      MIN/AVG/MAX",
+               dbstat->locks.latency.min,
+               (dbstat->locks.latency.num ?
+                dbstat->locks.latency.total /dbstat->locks.latency.num :
+                0.0),
+               dbstat->locks.latency.max,
+               dbstat->locks.latency.num);
+
+       /* Count the hot-key slots that are actually in use.
+        * NOTE(review): the loop below then prints the first
+        * num_hot_keys slots, i.e. it assumes the used slots come first
+        * - confirm against the daemon side. */
+       num_hot_keys = 0;
+       for (i=0; i<dbstat->num_hot_keys; i++) {
+               if (dbstat->hot_keys[i].count > 0) {
+                       num_hot_keys++;
+               }
+       }
+       dbstat->num_hot_keys = num_hot_keys;
+
+       printf(" Num Hot Keys:     %d\n", dbstat->num_hot_keys);
+       for (i = 0; i < dbstat->num_hot_keys; i++) {
+               int j;
+               printf("     Count:%d Key:", dbstat->hot_keys[i].count);
+               for (j = 0; j < dbstat->hot_keys[i].key.dsize; j++) {
+                       printf("%02x", dbstat->hot_keys[i].key.dptr[j]&0xff);
+               }
+               printf("\n");
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+/*
+  display uptime of remote node
+
+  Prints the node's current time, the ctdbd start time, the time the last
+  recovery finished and how long that recovery took; one colon-separated
+  record when options.machinereadable is set.
+ */
+static int control_uptime(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       struct ctdb_uptime *uptime = NULL;
+       double recovery_duration;
+       int elapsed;
+       int ret;
+
+       ret = ctdb_ctrl_uptime(ctdb, ctdb, TIMELIMIT(), options.pnn, &uptime);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get uptime from node %u\n", options.pnn));
+               return ret;
+       }
+
+       /* needed by both output formats */
+       recovery_duration = timeval_delta(&uptime->last_recovery_finished,
+                                         &uptime->last_recovery_started);
+
+       if (options.machinereadable){
+               printf(":Current Node Time:Ctdb Start Time:Last Recovery/Failover Time:Last Recovery/IPFailover Duration:\n");
+               printf(":%u:%u:%u:%lf\n",
+                       (unsigned int)uptime->current_time.tv_sec,
+                       (unsigned int)uptime->ctdbd_start_time.tv_sec,
+                       (unsigned int)uptime->last_recovery_finished.tv_sec,
+                       recovery_duration);
+               return 0;
+       }
+
+       printf("Current time of node          :                %s", ctime(&uptime->current_time.tv_sec));
+
+       /* time since ctdbd was started, as days hh:mm:ss */
+       elapsed = uptime->current_time.tv_sec - uptime->ctdbd_start_time.tv_sec;
+       printf("Ctdbd start time              : (%03d %02d:%02d:%02d) %s",
+              elapsed / (24*60*60),
+              (elapsed / (60*60)) % 24,
+              (elapsed / 60) % 60,
+              elapsed % 60,
+              ctime(&uptime->ctdbd_start_time.tv_sec));
+
+       /* time since the last recovery finished, same format */
+       elapsed = uptime->current_time.tv_sec - uptime->last_recovery_finished.tv_sec;
+       printf("Time of last recovery/failover: (%03d %02d:%02d:%02d) %s",
+              elapsed / (24*60*60),
+              (elapsed / (60*60)) % 24,
+              (elapsed / 60) % 60,
+              elapsed % 60,
+              ctime(&uptime->last_recovery_finished.tv_sec));
+
+       printf("Duration of last recovery/failover: %lf seconds\n",
+               recovery_duration);
+
+       return 0;
+}
+
+/*
+  show the PNN of the current node
+
+  Prints the value returned by getpnn() as "PNN:<n>" and always
+  returns 0.
+ */
+static int control_pnn(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       const uint32_t self_pnn = getpnn(ctdb);
+
+       printf("PNN:%d\n", self_pnn);
+
+       return 0;
+}
+
+
+/*
+  one usable line of the nodes file, kept in a singly linked list
+ */
+struct pnn_node {
+       /* next entry in the list, NULL at the end */
+       struct pnn_node *next;
+       /* address text exactly as it appears in the nodes file */
+       const char *addr;
+       /* node number assigned from the line's position in the file */
+       int pnn;
+};
+
+/*
+  load the nodes file into a linked list of (pnn, address) entries
+
+  The file name is taken from $CTDB_NODES, falling back to
+  $CTDB_BASE/nodes.  Trailing empty lines are ignored.  Lines whose
+  first non-blank character is '#' produce no entry but still consume
+  a PNN; empty lines produce no entry and consume no PNN.
+
+  All memory is allocated under mem_ctx.  Returns the list in file
+  order, or NULL if no file name can be determined, the file cannot be
+  loaded or it holds no usable entries.  Exits on out of memory.
+ */
+static struct pnn_node *read_nodes_file(TALLOC_CTX *mem_ctx)
+{
+       const char *nodes_list;
+       int nlines;
+       char **lines;
+       int i, pnn;
+       struct pnn_node *pnn_nodes = NULL;
+       /* where the next entry gets linked in; appending at the tail
+        * keeps the list in file order without a reversal pass
+        */
+       struct pnn_node **tail = &pnn_nodes;
+
+       /* read the nodes file */
+       nodes_list = getenv("CTDB_NODES");
+       if (nodes_list == NULL) {
+               const char *base = getenv("CTDB_BASE");
+
+               /* passing NULL for "%s" is undefined behaviour, so
+                * fail cleanly when neither variable is set
+                */
+               if (base == NULL) {
+                       DEBUG(DEBUG_ERR,("Neither CTDB_NODES nor CTDB_BASE is set\n"));
+                       return NULL;
+               }
+               nodes_list = talloc_asprintf(mem_ctx, "%s/nodes", base);
+               if (nodes_list == NULL) {
+                       DEBUG(DEBUG_ALERT,(__location__ " Out of memory\n"));
+                       exit(1);
+               }
+       }
+       lines = file_lines_load(nodes_list, &nlines, mem_ctx);
+       if (lines == NULL) {
+               return NULL;
+       }
+       while (nlines > 0 && strcmp(lines[nlines-1], "") == 0) {
+               nlines--;
+       }
+       for (i=0, pnn=0; i<nlines; i++) {
+               struct pnn_node *pnn_node;
+               char *node;
+
+               node = lines[i];
+               /* strip leading spaces */
+               while((*node == ' ') || (*node == '\t')) {
+                       node++;
+               }
+               /* a commented-out line still occupies its PNN slot */
+               if (*node == '#') {
+                       pnn++;
+                       continue;
+               }
+               if (strcmp(node, "") == 0) {
+                       continue;
+               }
+               pnn_node = talloc(mem_ctx, struct pnn_node);
+               if (pnn_node == NULL) {
+                       DEBUG(DEBUG_ALERT,(__location__ " Out of memory\n"));
+                       exit(1);
+               }
+               pnn_node->pnn = pnn++;
+               pnn_node->addr = talloc_strdup(pnn_node, node);
+               if (pnn_node->addr == NULL) {
+                       DEBUG(DEBUG_ALERT,(__location__ " Out of memory\n"));
+                       exit(1);
+               }
+               pnn_node->next = NULL;
+               *tail = pnn_node;
+               tail = &pnn_node->next;
+       }
+
+       return pnn_nodes;
+}
+
+/*
+  show the PNN of the current node
+
+  The PNN is discovered by loading the nodes file and testing each
+  listed address with ctdb_sys_have_ip() until one is found on this
+  host.  Prints "PNN:<n>" and returns 0 on a match; returns -1 if the
+  nodes file cannot be read, holds a malformed address, or no listed
+  address is found.
+ */
+static int control_xpnn(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       TALLOC_CTX *mem_ctx = talloc_new(NULL);
+       struct pnn_node *entry;
+       int result = -1;
+
+       assert_single_node_only();
+
+       entry = read_nodes_file(mem_ctx);
+       if (entry == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to read nodes file\n"));
+               goto out;
+       }
+
+       while (entry != NULL) {
+               ctdb_sock_addr addr;
+
+               if (parse_ip(entry->addr, NULL, 63999, &addr) == 0) {
+                       DEBUG(DEBUG_ERR,("Wrongly formed ip address '%s' in nodes file\n", entry->addr));
+                       goto out;
+               }
+
+               if (ctdb_sys_have_ip(&addr)) {
+                       printf("PNN:%d\n", entry->pnn);
+                       result = 0;
+                       goto out;
+               }
+
+               entry = entry->next;
+       }
+
+       printf("Failed to detect which PNN this node is\n");
+
+out:
+       /* every exit path releases the list built by read_nodes_file() */
+       talloc_free(mem_ctx);
+       return result;
+}
+
+/* Helpers for ctdb status
+ */
+/*
+  report whether a node with no flags set has at least one interface
+  whose link_state is 0
+
+  Nodes with any flag set, and nodes whose interface list cannot be
+  fetched, are reported as not partially online.
+ */
+static bool is_partially_online(struct ctdb_context *ctdb, struct ctdb_node_and_flags *node)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       bool partial = false;
+
+       if (node->flags == 0) {
+               struct ctdb_control_get_ifaces *ifaces;
+               int rc;
+
+               rc = ctdb_ctrl_get_ifaces(ctdb, TIMELIMIT(), node->pnn,
+                                         tmp_ctx, &ifaces);
+               if (rc == 0) {
+                       int idx;
+
+                       /* stop at the first interface that is down */
+                       for (idx = 0; idx < ifaces->num && !partial; idx++) {
+                               partial = (ifaces->ifaces[idx].link_state == 0);
+                       }
+               }
+       }
+
+       talloc_free(tmp_ctx);
+       return partial;
+}
+
+/* print the column header line for machine-readable node status output */
+static void control_status_header_machine(void)
+{
+       static const char header[] =
+               ":Node:IP:Disconnected:Banned:Disabled:Unhealthy:Stopped"
+               ":Inactive:PartiallyOnline:ThisNode:\n";
+
+       printf("%s", header);
+}
+
+/*
+  print one node as a colon-separated machine-readable status line:
+  pnn, address, one 0/1 column per node flag, the partially-online
+  indicator and a Y/N marker for "this node"
+
+  Returns the node's flags.
+ */
+static int control_status_1_machine(struct ctdb_context *ctdb, int mypnn,
+                                   struct ctdb_node_and_flags *node)
+{
+       const int partially_online = is_partially_online(ctdb, node) ? 1 : 0;
+       const int disconnected = !!(node->flags&NODE_FLAGS_DISCONNECTED);
+       const int banned = !!(node->flags&NODE_FLAGS_BANNED);
+       const int disabled = !!(node->flags&NODE_FLAGS_PERMANENTLY_DISABLED);
+       const int unhealthy = !!(node->flags&NODE_FLAGS_UNHEALTHY);
+       const int stopped = !!(node->flags&NODE_FLAGS_STOPPED);
+       const int inactive = !!(node->flags&NODE_FLAGS_INACTIVE);
+       const int this_node = (node->pnn == mypnn) ? 'Y' : 'N';
+
+       printf(":%d:%s:%d:%d:%d:%d:%d:%d:%d:%c:\n",
+              node->pnn,
+              ctdb_addr_to_str(&node->addr),
+              disconnected,
+              banned,
+              disabled,
+              unhealthy,
+              stopped,
+              inactive,
+              partially_online,
+              this_node);
+
+       return node->flags;
+}
+
+/*
+  print one node as a human-readable status line: pnn, address, state
+  text and a "(THIS NODE)" marker when the pnn equals mypnn
+
+  The state text is "PARTIALLYONLINE" when is_partially_online() says
+  so, otherwise whatever pretty_print_flags() yields for the flags.
+  Returns the node's flags.
+ */
+static int control_status_1_human(struct ctdb_context *ctdb, int mypnn,
+                                 struct ctdb_node_and_flags *node)
+{
+       const char *state;
+       const char *marker = "";
+
+       if (is_partially_online(ctdb, node)) {
+               state = "PARTIALLYONLINE";
+       } else {
+               state = pretty_print_flags(node->flags);
+       }
+
+       if (node->pnn == mypnn) {
+               marker = " (THIS NODE)";
+       }
+
+       printf("pnn:%d %-16s %s%s\n", node->pnn,
+              ctdb_addr_to_str(&node->addr), state, marker);
+
+       return node->flags;
+}
+
+/*
+  display remote ctdb status
+
+  In machine-readable mode only the per-node table is printed (deleted
+  nodes are skipped).  Otherwise the output is the node count, one line
+  per non-deleted node, the vnn map (generation, size, hash to lmaster
+  entries), the recovery mode and the recovery master, all as seen by
+  node options.pnn.
+
+  Returns 0 on success and -1 if any of the queries fails.
+ */
+static int control_status(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       struct ctdb_node_map *nodemap = NULL;
+       struct ctdb_vnn_map *vnnmap = NULL;
+       uint32_t recmode, recmaster, mypnn;
+       int deleted = 0;
+       int result = -1;
+       int idx;
+
+       mypnn = getpnn(ctdb);
+
+       if (ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), options.pnn, tmp_ctx, &nodemap) != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get nodemap from node %u\n", options.pnn));
+               goto out;
+       }
+
+       if (options.machinereadable) {
+               control_status_header_machine();
+               for (idx = 0; idx < nodemap->num; idx++) {
+                       if (!(nodemap->nodes[idx].flags & NODE_FLAGS_DELETED)) {
+                               (void) control_status_1_machine(ctdb, mypnn,
+                                                               &nodemap->nodes[idx]);
+                       }
+               }
+               result = 0;
+               goto out;
+       }
+
+       /* deleted nodes are counted in the total but reported separately */
+       for (idx = 0; idx < nodemap->num; idx++) {
+               deleted += (nodemap->nodes[idx].flags & NODE_FLAGS_DELETED) ? 1 : 0;
+       }
+       if (deleted != 0) {
+               printf("Number of nodes:%d (including %d deleted nodes)\n",
+                      nodemap->num, deleted);
+       } else {
+               printf("Number of nodes:%d\n", nodemap->num);
+       }
+       for (idx = 0; idx < nodemap->num; idx++) {
+               if (!(nodemap->nodes[idx].flags & NODE_FLAGS_DELETED)) {
+                       (void) control_status_1_human(ctdb, mypnn, &nodemap->nodes[idx]);
+               }
+       }
+
+       if (ctdb_ctrl_getvnnmap(ctdb, TIMELIMIT(), options.pnn, tmp_ctx, &vnnmap) != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get vnnmap from node %u\n", options.pnn));
+               goto out;
+       }
+       if (vnnmap->generation != INVALID_GENERATION) {
+               printf("Generation:%d\n",vnnmap->generation);
+       } else {
+               printf("Generation:INVALID\n");
+       }
+       printf("Size:%d\n",vnnmap->size);
+       for (idx = 0; idx < vnnmap->size; idx++) {
+               printf("hash:%d lmaster:%d\n", idx, vnnmap->map[idx]);
+       }
+
+       if (ctdb_ctrl_getrecmode(ctdb, tmp_ctx, TIMELIMIT(), options.pnn, &recmode) != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get recmode from node %u\n", options.pnn));
+               goto out;
+       }
+       printf("Recovery mode:%s (%d)\n",
+              recmode == CTDB_RECOVERY_NORMAL ? "NORMAL" : "RECOVERY",
+              recmode);
+
+       if (ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, TIMELIMIT(), options.pnn, &recmaster) != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get recmaster from node %u\n", options.pnn));
+               goto out;
+       }
+       printf("Recovery master:%d\n",recmaster);
+
+       result = 0;
+
+out:
+       /* single exit: the temporary context is released on every path */
+       talloc_free(tmp_ctx);
+       return result;
+}
+
+/*
+  display the status line of selected nodes
+
+  The optional single argument is a node string handed to
+  parse_nodestring(); without it options.pnn is used.  One status line
+  is printed per selected node, in machine-readable or human-readable
+  form.
+
+  Returns the bitwise OR of the flags of all printed nodes (so 0 means
+  no selected node has a flag set), or -1 on error.
+ */
+static int control_nodestatus(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       int i, ret;
+       struct ctdb_node_map *nodemap=NULL;
+       uint32_t * nodes;
+       uint32_t pnn_mode, mypnn;
+
+       if (argc > 1) {
+               usage();
+       }
+
+       if (!parse_nodestring(ctdb, tmp_ctx, argc == 1 ? argv[0] : NULL,
+                             options.pnn, true, &nodes, &pnn_mode)) {
+               /* tmp_ctx hangs off ctdb, so it must be released here */
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       if (options.machinereadable) {
+               control_status_header_machine();
+       } else if (pnn_mode == CTDB_BROADCAST_ALL) {
+               printf("Number of nodes:%d\n", (int) talloc_array_length(nodes));
+       }
+
+       mypnn = getpnn(ctdb);
+
+       ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), options.pnn, tmp_ctx, &nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get nodemap from node %u\n", options.pnn));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       ret = 0;
+
+       for (i = 0; i < talloc_array_length(nodes); i++) {
+               /* nodes[] entries are used as indexes into the nodemap
+                * fetched above; refuse anything outside it rather than
+                * reading past the end of the array
+                */
+               if (nodes[i] >= nodemap->num) {
+                       DEBUG(DEBUG_ERR, ("Node %u is not present in the nodemap from node %u\n",
+                                         nodes[i], options.pnn));
+                       talloc_free(tmp_ctx);
+                       return -1;
+               }
+
+               if (options.machinereadable) {
+                       ret |= control_status_1_machine(ctdb, mypnn,
+                                                       &nodemap->nodes[nodes[i]]);
+               } else {
+                       ret |= control_status_1_human(ctdb, mypnn,
+                                                     &nodemap->nodes[nodes[i]]);
+               }
+       }
+
+       talloc_free(tmp_ctx);
+       return ret;
+}
+
+/*
+  one address read from the natgw nodes file, kept in a singly linked list
+ */
+struct natgw_node {
+       /* next entry in the list, NULL at the end */
+       struct natgw_node *next;
+       /* address text exactly as it appears in the natgw nodes file */
+       const char *addr;
+};
+
+/*
+  find the first node in nodemap that has none of the given flags set
+  and advertises CTDB_CAP_NATGW
+
+  On success stores the node's pnn in *pnn and its address string (as
+  returned by ctdb_addr_to_str()) in *ip and returns 0.  Returns -1 if
+  a node's capabilities cannot be queried, and 2 if no node qualifies.
+ */
+static int find_natgw(struct ctdb_context *ctdb,
+                      struct ctdb_node_map *nodemap, uint32_t flags,
+                      uint32_t *pnn, const char **ip)
+{
+       int idx;
+
+       for (idx = 0; idx < nodemap->num; idx++) {
+               uint32_t caps;
+
+               /* skip nodes carrying any of the excluded flags */
+               if (nodemap->nodes[idx].flags & flags) {
+                       continue;
+               }
+
+               if (ctdb_ctrl_getcapabilities(ctdb, TIMELIMIT(),
+                                             nodemap->nodes[idx].pnn,
+                                             &caps) != 0) {
+                       DEBUG(DEBUG_ERR, ("Unable to get capabilities from node %u\n",
+                                         nodemap->nodes[idx].pnn));
+                       return -1;
+               }
+
+               if (caps & CTDB_CAP_NATGW) {
+                       *pnn = nodemap->nodes[idx].pnn;
+                       *ip = ctdb_addr_to_str(&nodemap->nodes[idx].addr);
+                       return 0;
+               }
+       }
+
+       return 2; /* matches ENOENT */
+}
+
+/*
+  display the list of nodes belonging to this natgw configuration
+
+  The natgw group is read from $CTDB_NATGW_NODES, falling back to
+  $CTDB_BASE/natgw_nodes.  The local node's nodemap is trimmed to the
+  connected members of that group, a NATGW master is chosen from it
+  (healthy nodes first, then progressively less healthy ones) and
+  printed, followed by the status line of every remaining node.
+
+  Returns 0 if a master was found, 2 (ENOENT) if none was, and -1 on
+  error.
+ */
+static int control_natgwlist(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       int i, ret;
+       const char *natgw_list;
+       int nlines;
+       char **lines;
+       struct natgw_node *natgw_nodes = NULL;
+       struct natgw_node *natgw_node;
+       struct ctdb_node_map *nodemap=NULL;
+       uint32_t mypnn, pnn;
+       const char *ip;
+
+       /* When we have some nodes that could be the NATGW, make a
+        * series of attempts to find the first node that doesn't have
+        * certain status flags set.  The trailing 0 terminates the list.
+        */
+       uint32_t exclude_flags[] = {
+               /* Look for a nice healthy node */
+               NODE_FLAGS_DISCONNECTED|NODE_FLAGS_STOPPED|NODE_FLAGS_DELETED|NODE_FLAGS_BANNED|NODE_FLAGS_UNHEALTHY,
+               /* If not found, an UNHEALTHY/BANNED node will do */
+               NODE_FLAGS_DISCONNECTED|NODE_FLAGS_STOPPED|NODE_FLAGS_DELETED,
+               /* If not found, a STOPPED node will do */
+               NODE_FLAGS_DISCONNECTED|NODE_FLAGS_DELETED,
+               0,
+       };
+
+       /* read the natgw nodes file into a linked list */
+       natgw_list = getenv("CTDB_NATGW_NODES");
+       if (natgw_list == NULL) {
+               const char *base = getenv("CTDB_BASE");
+
+               /* passing NULL for "%s" is undefined behaviour, so
+                * fail cleanly when neither variable is set
+                */
+               if (base == NULL) {
+                       DEBUG(DEBUG_ERR, ("Neither CTDB_NATGW_NODES nor CTDB_BASE is set\n"));
+                       talloc_free(tmp_ctx);
+                       return -1;
+               }
+               natgw_list = talloc_asprintf(tmp_ctx, "%s/natgw_nodes", base);
+               if (natgw_list == NULL) {
+                       DEBUG(DEBUG_ALERT,(__location__ " Out of memory\n"));
+                       exit(1);
+               }
+       }
+       /* everything below is allocated under tmp_ctx so that the single
+        * talloc_free() on each exit path releases it
+        */
+       lines = file_lines_load(natgw_list, &nlines, tmp_ctx);
+       if (lines == NULL) {
+               ctdb_set_error(ctdb, "Failed to load natgw node list '%s'\n", natgw_list);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+       for (i=0;i<nlines;i++) {
+               char *node;
+
+               node = lines[i];
+               /* strip leading spaces */
+               while((*node == ' ') || (*node == '\t')) {
+                       node++;
+               }
+               if (*node == '#') {
+                       continue;
+               }
+               if (strcmp(node, "") == 0) {
+                       continue;
+               }
+               natgw_node = talloc(tmp_ctx, struct natgw_node);
+               if (natgw_node == NULL) {
+                       DEBUG(DEBUG_ALERT,(__location__ " Out of memory\n"));
+                       talloc_free(tmp_ctx);
+                       return -1;
+               }
+               natgw_node->addr = talloc_strdup(natgw_node, node);
+               if (natgw_node->addr == NULL) {
+                       DEBUG(DEBUG_ALERT,(__location__ " Out of memory\n"));
+                       talloc_free(tmp_ctx);
+                       return -1;
+               }
+               natgw_node->next = natgw_nodes;
+               natgw_nodes = natgw_node;
+       }
+
+       ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get nodemap from local node.\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* Trim the nodemap so it only includes connected nodes in the
+        * current natgw group.
+        */
+       i=0;
+       while(i<nodemap->num) {
+               for(natgw_node=natgw_nodes;natgw_node;natgw_node=natgw_node->next) {
+                       if (!strcmp(natgw_node->addr, ctdb_addr_to_str(&nodemap->nodes[i].addr))) {
+                               break;
+                       }
+               }
+
+               /* this node was not in the natgw so we just remove it from
+                * the list
+                */
+               if ((natgw_node == NULL)
+               ||  (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) ) {
+                       int j;
+
+                       /* close the gap; index i is re-examined next round */
+                       for (j=i+1; j<nodemap->num; j++) {
+                               nodemap->nodes[j-1] = nodemap->nodes[j];
+                       }
+                       nodemap->num--;
+                       continue;
+               }
+
+               i++;
+       }
+
+       ret = 2; /* matches ENOENT */
+       pnn = -1;
+       ip = "0.0.0.0";
+       for (i = 0; exclude_flags[i] != 0; i++) {
+               ret = find_natgw(ctdb, nodemap,
+                                exclude_flags[i],
+                                &pnn, &ip);
+               if (ret == -1) {
+                       goto done;
+               }
+               if (ret == 0) {
+                       break;
+               }
+       }
+
+       if (options.machinereadable) {
+               printf(":Node:IP:\n");
+               printf(":%d:%s:\n", pnn, ip);
+       } else {
+               printf("%d %s\n", pnn, ip);
+       }
+
+       /* print the pruned list of nodes belonging to this natgw list */
+       mypnn = getpnn(ctdb);
+       if (options.machinereadable) {
+               control_status_header_machine();
+       } else {
+               printf("Number of nodes:%d\n", nodemap->num);
+       }
+       for(i=0;i<nodemap->num;i++){
+               if (nodemap->nodes[i].flags & NODE_FLAGS_DELETED) {
+                       continue;
+               }
+               if (options.machinereadable) {
+                       control_status_1_machine(ctdb, mypnn, &(nodemap->nodes[i]));
+               } else {
+                       control_status_1_human(ctdb, mypnn, &(nodemap->nodes[i]));
+               }
+       }
+
+done:
+       talloc_free(tmp_ctx);
+       return ret;
+}
+
+/*
+  display the status of the scripts for monitoring (or other events)
+
+  Fetches the script results of the given event type from node
+  options.pnn and prints one line per script.  A script status of
+  -ETIME is shown as TIMEDOUT, -ENOEXEC as DISABLED, 0 as OK and any
+  positive value as ERROR; other negative values are reported as
+  "CANNOT RUN" with the matching strerror() text in human-readable mode.
+
+  Returns 0 on success (including "cycle never run"), or the non-zero
+  result of ctdb_ctrl_getscriptstatus().
+ */
+static int control_one_scriptstatus(struct ctdb_context *ctdb,
+                                   enum ctdb_eventscript_call type)
+{
+       struct ctdb_scripts_wire *script_status;
+       int ret, i;
+
+       ret = ctdb_ctrl_getscriptstatus(ctdb, TIMELIMIT(), options.pnn, ctdb, type, &script_status);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get script status from node %u\n", options.pnn));
+               return ret;
+       }
+
+       if (script_status == NULL) {
+               if (!options.machinereadable) {
+                       printf("%s cycle never run\n",
+                              ctdb_eventscript_call_names[type]);
+               }
+               return 0;
+       }
+
+       if (!options.machinereadable) {
+               printf("%d scripts were executed last %s cycle\n",
+                      script_status->num_scripts,
+                      ctdb_eventscript_call_names[type]);
+       }
+       for (i=0; i<script_status->num_scripts; i++) {
+               const char *status = NULL;
+
+               switch (script_status->scripts[i].status) {
+               case -ETIME:
+                       status = "TIMEDOUT";
+                       break;
+               case -ENOEXEC:
+                       status = "DISABLED";
+                       break;
+               case 0:
+                       status = "OK";
+                       break;
+               default:
+                       if (script_status->scripts[i].status > 0)
+                               status = "ERROR";
+                       break;
+               }
+               if (options.machinereadable) {
+                       /* status stays NULL for unclassified negative
+                        * codes and NULL must not be handed to "%s".
+                        * "(null)" is what glibc printed for it, so
+                        * existing parsers see unchanged output.
+                        * The casts now match the "%lu" conversions.
+                        */
+                       printf(":%s:%s:%i:%s:%lu.%06lu:%lu.%06lu:%s:\n",
+                              ctdb_eventscript_call_names[type],
+                              script_status->scripts[i].name,
+                              script_status->scripts[i].status,
+                              status != NULL ? status : "(null)",
+                              (unsigned long)script_status->scripts[i].start.tv_sec,
+                              (unsigned long)script_status->scripts[i].start.tv_usec,
+                              (unsigned long)script_status->scripts[i].finished.tv_sec,
+                              (unsigned long)script_status->scripts[i].finished.tv_usec,
+                              script_status->scripts[i].output);
+                       continue;
+               }
+               if (status)
+                       printf("%-20s Status:%s    ",
+                              script_status->scripts[i].name, status);
+               else
+                       /* Some other error, eg from stat. */
+                       printf("%-20s Status:CANNOT RUN (%s)",
+                              script_status->scripts[i].name,
+                              strerror(-script_status->scripts[i].status));
+
+               /* a duration only makes sense if the script actually ran */
+               if (script_status->scripts[i].status >= 0) {
+                       printf("Duration:%.3lf ",
+                       timeval_delta(&script_status->scripts[i].finished,
+                             &script_status->scripts[i].start));
+               }
+               if (script_status->scripts[i].status != -ENOEXEC) {
+                       printf("%s",
+                              ctime(&script_status->scripts[i].start.tv_sec));
+                       if (script_status->scripts[i].status != 0) {
+                               printf("   OUTPUT:%s\n",
+                                      script_status->scripts[i].output);
+                       }
+               } else {
+                       printf("\n");
+               }
+       }
+
+       /* The reply was requested with the long-lived ctdb context as
+        * its parent; release it so that querying several event types
+        * does not accumulate replies.
+        * NOTE(review): assumes the reply is a talloc object, as with
+        * the tickle list freed in control_get_tickles().
+        */
+       talloc_free(script_status);
+       return 0;
+}
+
+
+/*
+  display script status for one event type or for all of them
+
+  With no argument the monitor event is shown.  Otherwise the single
+  argument must be one of the names in ctdb_eventscript_call_names[]
+  or the word "all".  Returns 0 on success, -1 on bad arguments, or the
+  first non-zero result of control_one_scriptstatus().
+ */
+static int control_scriptstatus(struct ctdb_context *ctdb,
+                               int argc, const char **argv)
+{
+       enum ctdb_eventscript_call type, min, max;
+       const char *arg;
+       int ret;
+
+       if (argc > 1) {
+               DEBUG(DEBUG_ERR, ("Unknown arguments to scriptstatus\n"));
+               return -1;
+       }
+
+       arg = (argc == 0) ? ctdb_eventscript_call_names[CTDB_EVENT_MONITOR]
+                         : argv[0];
+
+       /* look the name up; type ends at CTDB_EVENT_MAX if it is unknown */
+       type = 0;
+       while (type < CTDB_EVENT_MAX &&
+              strcmp(arg, ctdb_eventscript_call_names[type]) != 0) {
+               type++;
+       }
+
+       if (type < CTDB_EVENT_MAX) {
+               min = type;
+               max = type+1;
+       } else if (strcmp(arg, "all") == 0) {
+               min = 0;
+               max = CTDB_EVENT_MAX;
+       } else {
+               DEBUG(DEBUG_ERR, ("Unknown event type %s\n", argv[0]));
+               return -1;
+       }
+
+       if (options.machinereadable) {
+               printf(":Type:Name:Code:Status:Start:End:Error Output...:\n");
+       }
+
+       type = min;
+       while (type < max) {
+               ret = control_one_scriptstatus(ctdb, type);
+               if (ret != 0) {
+                       return ret;
+               }
+               type++;
+       }
+
+       return 0;
+}
+
+/*
+  enable an eventscript
+
+  argv[0] names the script to enable on node options.pnn.  Returns 0
+  on success or the non-zero result of ctdb_ctrl_enablescript().
+ */
+static int control_enablescript(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int rc;
+
+       if (argc < 1) {
+               usage();
+       }
+
+       rc = ctdb_ctrl_enablescript(ctdb, TIMELIMIT(), options.pnn, argv[0]);
+       if (rc == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR, ("Unable to enable script %s on node %u\n", argv[0], options.pnn));
+       return rc;
+}
+
+/*
+  disable an eventscript
+
+  argv[0] names the script to disable on node options.pnn.  Returns 0
+  on success or the non-zero result of ctdb_ctrl_disablescript().
+ */
+static int control_disablescript(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int rc;
+
+       if (argc < 1) {
+               usage();
+       }
+
+       rc = ctdb_ctrl_disablescript(ctdb, TIMELIMIT(), options.pnn, argv[0]);
+       if (rc == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR, ("Unable to disable script %s on node %u\n", argv[0], options.pnn));
+       return rc;
+}
+
+/*
+  display the pnn of the recovery master
+
+  Asks node options.pnn who the recovery master is and prints that
+  pnn on a line of its own.  Returns 0 on success, -1 on failure.
+ */
+static int control_recmaster(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       uint32_t master;
+
+       if (ctdb_ctrl_getrecmaster(ctdb, ctdb, TIMELIMIT(), options.pnn, &master) != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get recmaster from node %u\n", options.pnn));
+               return -1;
+       }
+
+       printf("%d\n",master);
+       return 0;
+}
+
+/*
+  add a tickle to a public address
+
+  argv[0] is the source and argv[1] the destination of the connection,
+  each parsed by parse_ip_port().  The connection is sent to node
+  options.pnn with CTDB_CONTROL_TCP_ADD_DELAYED_UPDATE.
+
+  Returns 0 on success, -1 on a malformed address or a failed control.
+ */
+static int control_add_tickle(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       struct ctdb_tcp_connection t;
+       TDB_DATA data;
+       int ret;
+
+       assert_single_node_only();
+
+       if (argc < 2) {
+               usage();
+       }
+
+       /* The struct is sent verbatim as the control payload, so clear
+        * it first: bytes the parser does not write must not carry
+        * stack garbage onto the wire.
+        */
+       memset(&t, 0, sizeof(t));
+
+       if (parse_ip_port(argv[0], &t.src_addr) == 0) {
+               DEBUG(DEBUG_ERR,("Wrongly formed ip address '%s'\n", argv[0]));
+               return -1;
+       }
+       if (parse_ip_port(argv[1], &t.dst_addr) == 0) {
+               DEBUG(DEBUG_ERR,("Wrongly formed ip address '%s'\n", argv[1]));
+               return -1;
+       }
+
+       data.dptr = (uint8_t *)&t;
+       data.dsize = sizeof(t);
+
+       /* tell the target node (options.pnn) about this tcp connection */
+       ret = ctdb_control(ctdb, options.pnn, 0, CTDB_CONTROL_TCP_ADD_DELAYED_UPDATE,
+                          0, data, ctdb, NULL, NULL, NULL, NULL);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to add tickle\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+
+/*
+  delete a tickle from a node
+
+  argv[0] is the source and argv[1] the destination of the connection,
+  each parsed by parse_ip_port().  The connection is sent to node
+  options.pnn with CTDB_CONTROL_TCP_REMOVE.
+
+  Returns 0 on success, -1 on a malformed address or a failed control.
+ */
+static int control_del_tickle(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       struct ctdb_tcp_connection t;
+       TDB_DATA data;
+       int ret;
+
+       assert_single_node_only();
+
+       if (argc < 2) {
+               usage();
+       }
+
+       /* The struct is sent verbatim as the control payload, so clear
+        * it first: bytes the parser does not write must not carry
+        * stack garbage onto the wire.
+        */
+       memset(&t, 0, sizeof(t));
+
+       if (parse_ip_port(argv[0], &t.src_addr) == 0) {
+               DEBUG(DEBUG_ERR,("Wrongly formed ip address '%s'\n", argv[0]));
+               return -1;
+       }
+       if (parse_ip_port(argv[1], &t.dst_addr) == 0) {
+               DEBUG(DEBUG_ERR,("Wrongly formed ip address '%s'\n", argv[1]));
+               return -1;
+       }
+
+       data.dptr = (uint8_t *)&t;
+       data.dsize = sizeof(t);
+
+       /* tell the target node (options.pnn) to drop this tcp connection */
+       ret = ctdb_control(ctdb, options.pnn, 0, CTDB_CONTROL_TCP_REMOVE,
+                          0, data, ctdb, NULL, NULL, NULL, NULL);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to remove tickle\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+
+/*
+  get a list of all tickles for this pnn
+
+  argv[0] is the public address whose tickles are listed.  An optional
+  argv[1] is a port number; when it is non-zero only connections whose
+  destination port matches are printed.
+
+  Returns 0 on success, -1 on a malformed address or a failed query.
+ */
+static int control_get_tickles(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       struct ctdb_control_tcp_tickle_list *list = NULL;
+       ctdb_sock_addr addr;
+       int i, ret;
+       unsigned port = 0;
+
+       assert_single_node_only();
+
+       if (argc < 1) {
+               usage();
+       }
+
+       if (argc == 2) {
+               port = atoi(argv[1]);
+       }
+
+       if (parse_ip(argv[0], NULL, 0, &addr) == 0) {
+               DEBUG(DEBUG_ERR,("Wrongly formed ip address '%s'\n", argv[0]));
+               return -1;
+       }
+
+       ret = ctdb_ctrl_get_tcp_tickles(ctdb, TIMELIMIT(), options.pnn, ctdb, &addr, &list);
+       /* Treat every non-zero result as failure, as the other commands
+        * do; testing only for -1 would let another error code fall
+        * through to a dereference of an unset list.
+        */
+       if (ret != 0 || list == NULL) {
+               DEBUG(DEBUG_ERR, ("Unable to list tickles\n"));
+               return -1;
+       }
+
+       /* source and destination are printed by separate printf calls
+        * because each passes a ctdb_addr_to_str() result.
+        * NOTE(review): presumably the helper reuses one buffer - confirm
+        * before merging the calls.
+        */
+       if (options.machinereadable){
+               printf(":source ip:port:destination ip:port:\n");
+               for (i=0;i<list->tickles.num;i++) {
+                       if (port && port != ntohs(list->tickles.connections[i].dst_addr.ip.sin_port)) {
+                               continue;
+                       }
+                       printf(":%s:%u", ctdb_addr_to_str(&list->tickles.connections[i].src_addr), ntohs(list->tickles.connections[i].src_addr.ip.sin_port));
+                       printf(":%s:%u:\n", ctdb_addr_to_str(&list->tickles.connections[i].dst_addr), ntohs(list->tickles.connections[i].dst_addr.ip.sin_port));
+               }
+       } else {
+               printf("Tickles for ip:%s\n", ctdb_addr_to_str(&list->addr));
+               printf("Num tickles:%u\n", list->tickles.num);
+               for (i=0;i<list->tickles.num;i++) {
+                       if (port && port != ntohs(list->tickles.connections[i].dst_addr.ip.sin_port)) {
+                               continue;
+                       }
+                       printf("SRC: %s:%u   ", ctdb_addr_to_str(&list->tickles.connections[i].src_addr), ntohs(list->tickles.connections[i].src_addr.ip.sin_port));
+                       printf("DST: %s:%u\n", ctdb_addr_to_str(&list->tickles.connections[i].dst_addr), ntohs(list->tickles.connections[i].dst_addr.ip.sin_port));
+               }
+       }
+
+       talloc_free(list);
+
+       return 0;
+}
+
+
+/*
+  move a public ip address to node pnn
+
+  Steps, in order:
+   - broadcast CTDB_SRVID_DISABLE_IP_CHECK with a value of 30 to all
+     connected nodes
+   - verify that the address is in the public ip list of node pnn
+   - send CTDB_CONTROL_RELEASE_IP to the nodes returned by
+     list_of_nodes()
+   - ask node pnn to take the address over
+   - broadcast CTDB_SRVID_RECD_UPDATE_IP with the new assignment
+
+  Returns 0 on success and non-zero on failure (-1, or the result of
+  ctdb_ctrl_getnodemap() when that call fails).
+ */
+static int move_ip(struct ctdb_context *ctdb, ctdb_sock_addr *addr, uint32_t pnn)
+{
+       struct ctdb_all_public_ips *ips;
+       struct ctdb_public_ip ip;
+       int i, ret;
+       uint32_t *nodes;
+       uint32_t disable_time;
+       TDB_DATA data;
+       struct ctdb_node_map *nodemap=NULL;
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+
+       disable_time = 30;
+       data.dptr  = (uint8_t*)&disable_time;
+       data.dsize = sizeof(disable_time);
+       ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_DISABLE_IP_CHECK, data);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to send message to disable ipcheck\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* read the public ip list from the node; it lives under tmp_ctx
+        * so that it is released together with everything else
+        */
+       ret = ctdb_ctrl_get_public_ips(ctdb, TIMELIMIT(), pnn, tmp_ctx, &ips);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get public ip list from node %u\n", pnn));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       for (i=0;i<ips->num;i++) {
+               if (ctdb_same_ip(addr, &ips->ips[i].addr)) {
+                       break;
+               }
+       }
+       if (i==ips->num) {
+               DEBUG(DEBUG_ERR, ("Node %u can not host ip address '%s'\n",
+                       pnn, ctdb_addr_to_str(addr)));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       ip.pnn  = pnn;
+       ip.addr = *addr;
+
+       /* from here on data describes the new ip assignment */
+       data.dptr  = (uint8_t *)&ip;
+       data.dsize = sizeof(ip);
+
+       ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), options.pnn, tmp_ctx, &nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get nodemap from node %u\n", options.pnn));
+               talloc_free(tmp_ctx);
+               return ret;
+       }
+
+       nodes = list_of_nodes(ctdb, nodemap, tmp_ctx, NODE_FLAGS_INACTIVE, pnn);
+       ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_RELEASE_IP,
+                                       nodes, 0,
+                                       LONGTIMELIMIT(),
+                                       false, data,
+                                       NULL, NULL,
+                                       NULL);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to release IP on nodes\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       ret = ctdb_ctrl_takeover_ip(ctdb, LONGTIMELIMIT(), pnn, &ip);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to take over IP on node %d\n", pnn));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* update the recovery daemon so it now knows to expect the new
+          node assignment for this ip.
+       */
+       ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_RECD_UPDATE_IP, data);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to send message to update the ip on the recovery master.\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+
+/*
+ * Scan all other nodes and return the pnn of the first node whose public
+ * ip list contains this ip address, or -1 if there is no such node or if
+ * the nodemap or a node's public ip list could not be read.
+ *
+ * Nodes flagged NODE_FLAGS_INACTIVE and the node selected by options.pnn
+ * are never considered.
+ */
+static int
+find_other_host_for_public_ip(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       struct ctdb_all_public_ips *ips;
+       struct ctdb_node_map *nodemap=NULL;
+       int i, j, ret;
+       int found = -1;
+
+       ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get nodemap from node %u\n", options.pnn));
+               /* always report failure as -1 so that the caller can
+                  never mistake an error code for a pnn */
+               goto done;
+       }
+
+       for(i=0;i<nodemap->num;i++){
+               if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
+                       continue;
+               }
+               if (nodemap->nodes[i].pnn == options.pnn) {
+                       continue;
+               }
+
+               /* read the public ip list from this node */
+               ret = ctdb_ctrl_get_public_ips(ctdb, TIMELIMIT(), nodemap->nodes[i].pnn, tmp_ctx, &ips);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, ("Unable to get public ip list from node %u\n", nodemap->nodes[i].pnn));
+                       /* leave through the common exit so that
+                          tmp_ctx is released on this path too */
+                       goto done;
+               }
+
+               for (j=0;j<ips->num;j++) {
+                       if (ctdb_same_ip(addr, &ips->ips[j].addr)) {
+                               found = nodemap->nodes[i].pnn;
+                               goto done;
+                       }
+               }
+               talloc_free(ips);
+       }
+
+done:
+       talloc_free(tmp_ctx);
+       return found;
+}
+
+/*
+ * Try up to 5 times, 3 seconds apart, to move the public IP to node pnn.
+ * When pnn is -1 no destination was given: a candidate node is looked up
+ * again before every attempt, and the function gives up immediately if
+ * none can be found.  Returns true once move_ip() has succeeded.
+ */
+static bool try_moveip(struct ctdb_context *ctdb, ctdb_sock_addr *addr, uint32_t pnn)
+{
+       const bool search_for_node = (pnn == -1);
+       int attempt;
+
+       for (attempt = 0; attempt < 5; attempt++) {
+               if (search_for_node) {
+                       pnn = find_other_host_for_public_ip(ctdb, addr);
+                       if (pnn == -1) {
+                               return false;
+                       }
+                       DEBUG(DEBUG_NOTICE,
+                             ("Trying to move public IP to node %u\n", pnn));
+               }
+
+               if (move_ip(ctdb, addr, pnn) == 0) {
+                       return true;
+               }
+
+               /* back off before the next attempt */
+               sleep(3);
+       }
+
+       return false;
+}
+
+
+/*
+  move/failover an ip address to a specific node
+
+  argv[0] is the ip address and argv[1] the pnn of the destination node.
+  Returns 0 when the move succeeded and -1 otherwise.
+ */
+static int control_moveip(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       ctdb_sock_addr addr;
+       uint32_t pnn;
+
+       assert_single_node_only();
+
+       if (argc < 2) {
+               usage();
+               return -1;
+       }
+
+       if (!parse_ip(argv[0], NULL, 0, &addr)) {
+               DEBUG(DEBUG_ERR,("Wrongly formed ip address '%s'\n", argv[0]));
+               return -1;
+       }
+
+       if (sscanf(argv[1], "%u", &pnn) != 1) {
+               DEBUG(DEBUG_ERR, ("Badly formed pnn\n"));
+               return -1;
+       }
+
+       if (try_moveip(ctdb, &addr, pnn)) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,("Failed to move IP to node %d.\n", pnn));
+       return -1;
+}
+
+/*
+  Broadcast a CTDB_SRVID_REBALANCE_NODE message carrying pnn to all
+  connected nodes.  Returns 0 if the message was sent, -1 otherwise.
+ */
+static int rebalance_node(struct ctdb_context *ctdb, uint32_t pnn)
+{
+       TDB_DATA msg;
+       int ret;
+
+       /* the payload is just the pnn of the rebalancing target */
+       msg.dsize = sizeof(uint32_t);
+       msg.dptr  = (uint8_t *)&pnn;
+
+       ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_REBALANCE_NODE, msg);
+       if (ret == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,
+             ("Failed to send message to force node %u to be a rebalancing target\n",
+              pnn));
+       return -1;
+}
+
+
+/*
+  mark one or more nodes as rebalancing targets
+
+  The optional argv[0] is a node string handed to parse_nodestring();
+  without it options.pnn is used.  rebalance_node() is called for every
+  resulting node.  Returns 0 if all messages were sent and -1 if the node
+  string could not be parsed or any message could not be sent.
+ */
+static int control_rebalancenode(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       uint32_t *nodes;
+       uint32_t pnn_mode;
+       int i;
+       /* start from success: previously ret was left uninitialised
+          when nothing went wrong */
+       int ret = 0;
+
+       assert_single_node_only();
+
+       if (argc > 1) {
+               usage();
+       }
+
+       /* Determine the nodes that should become rebalancing targets */
+       if (!parse_nodestring(ctdb, tmp_ctx, argc == 1 ? argv[0] : NULL,
+                             options.pnn, true, &nodes, &pnn_mode)) {
+               ret = -1;
+               goto done;
+       }
+
+       for (i = 0; i < talloc_array_length(nodes); i++) {
+               /* rebalance_node() returns 0 on success, so only a
+                  non-zero result is a failure */
+               if (rebalance_node(ctdb, nodes[i]) != 0) {
+                       ret = -1;
+               }
+       }
+
+done:
+       talloc_free(tmp_ctx);
+       return ret;
+}
+
+/*
+  Ask all active nodes to release a public ip address.
+
+  First broadcasts CTDB_SRVID_DISABLE_IP_CHECK with a value of 30, then
+  sends CTDB_CONTROL_RELEASE_IP for the address, with pnn set to -1, to
+  the nodes returned by list_of_active_nodes().
+  Returns 0 on success and non-zero on failure.
+ */
+static int rebalance_ip(struct ctdb_context *ctdb, ctdb_sock_addr *addr)
+{
+       struct ctdb_public_ip ip;
+       int ret;
+       uint32_t *nodes;
+       uint32_t disable_time;
+       TDB_DATA data;
+       struct ctdb_node_map *nodemap=NULL;
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+
+       disable_time = 30;
+       data.dptr  = (uint8_t*)&disable_time;
+       data.dsize = sizeof(disable_time);
+       ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, CTDB_SRVID_DISABLE_IP_CHECK, data);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to send message to disable ipcheck\n"));
+               /* release tmp_ctx here as well; this path used to leak it */
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* pnn -1: the address is released without naming a new host */
+       ip.pnn  = -1;
+       ip.addr = *addr;
+
+       data.dptr  = (uint8_t *)&ip;
+       data.dsize = sizeof(ip);
+
+       ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), options.pnn, tmp_ctx, &nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get nodemap from node %u\n", options.pnn));
+               talloc_free(tmp_ctx);
+               return ret;
+       }
+
+       nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+       ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_RELEASE_IP,
+                                       nodes, 0,
+                                       LONGTIMELIMIT(),
+                                       false, data,
+                                       NULL, NULL,
+                                       NULL);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to release IP on nodes\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+/*
+  release an ip from all nodes and have it re-assigned by recd
+
+  argv[0] is the ip address.  Returns 0 on success, -1 otherwise.
+ */
+static int control_rebalanceip(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       ctdb_sock_addr addr;
+
+       assert_single_node_only();
+
+       if (argc < 1) {
+               usage();
+               return -1;
+       }
+
+       if (!parse_ip(argv[0], NULL, 0, &addr)) {
+               DEBUG(DEBUG_ERR,("Wrongly formed ip address '%s'\n", argv[0]));
+               return -1;
+       }
+
+       if (rebalance_ip(ctdb, &addr) == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR,("Error when trying to reassign ip\n"));
+       return -1;
+}
+
+/*
+  Tree traversal callback: append the visited ctdb_public_ip (data) to
+  the ctdb_all_public_ips list passed as param and bump its count.
+  The caller must have sized the list for every element.  Returns 0.
+ */
+static int getips_store_callback(void *param, void *data)
+{
+       struct ctdb_all_public_ips *list = param;
+       struct ctdb_public_ip *src = (struct ctdb_public_ip *)data;
+       struct ctdb_public_ip *dst;
+       int slot;
+
+       /* claim the next free slot */
+       slot = list->num++;
+       dst = &list->ips[slot];
+
+       dst->pnn  = src->pnn;
+       dst->addr = src->addr;
+       return 0;
+}
+
+/*
+  Tree traversal callback: count one visited element in the uint32_t
+  that param points to.  data is not used.  Returns 0.
+ */
+static int getips_count_callback(void *param, void *data)
+{
+       uint32_t *counter = param;
+
+       *counter += 1;
+       return 0;
+}
+
+#define IP_KEYLEN      4
+/*
+  Build the IP_KEYLEN-word key used to index an address in the ip tree.
+
+  IPv4: the address goes into word 0 and the other words stay zero.
+  IPv6: the four 32-bit words of the address are stored in reverse order.
+  Any other family is logged and yields an all-zero key.
+
+  The key is held in a static buffer that every call overwrites.
+ */
+static uint32_t *ip_key(ctdb_sock_addr *ip)
+{
+       static uint32_t key[IP_KEYLEN];
+       int n;
+
+       memset(key, 0, sizeof(key));
+
+       if (ip->sa.sa_family == AF_INET) {
+               key[0]  = ip->ip.sin_addr.s_addr;
+       } else if (ip->sa.sa_family == AF_INET6) {
+               uint32_t *words = (uint32_t *)&(ip->ip6.sin6_addr.s6_addr);
+
+               for (n = 0; n < IP_KEYLEN; n++) {
+                       key[n] = words[IP_KEYLEN - 1 - n];
+               }
+       } else {
+               DEBUG(DEBUG_ERR, (__location__ " ERROR, unknown family passed :%u\n", ip->sa.sa_family));
+       }
+
+       return key;
+}
+
+/*
+  Insert callback for the ip tree: hand back parm unchanged and ignore
+  data.
+  NOTE(review): presumably the tree stores the returned pointer, so the
+  newest entry for a key wins -- confirm against
+  trbt_insertarray32_callback().
+ */
+static void *add_ip_callback(void *parm, void *data)
+{
+       (void)data;
+
+       return parm;
+}
+
+/*
+  Collect the public ips known to every node that is neither deleted nor
+  disconnected into one list.
+
+  Each node's list is fetched and its entries are inserted into a tree
+  keyed by ip_key(); the tree is then traversed once to count the entries
+  and once more to copy them into a freshly allocated list.  Everything,
+  including the list returned through *ips, is allocated on tmp_ctx.
+
+  Returns 0 on success and non-zero on failure; *ips is only written on
+  success.
+ */
+static int
+control_get_all_public_ips(struct ctdb_context *ctdb, TALLOC_CTX *tmp_ctx, struct ctdb_all_public_ips **ips)
+{
+       struct ctdb_all_public_ips *tmp_ips;
+       struct ctdb_node_map *nodemap=NULL;
+       trbt_tree_t *ip_tree;
+       int i, j, len, ret;
+       uint32_t count;
+
+       ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get nodemap from node %u\n", options.pnn));
+               return ret;
+       }
+
+       ip_tree = trbt_create(tmp_ctx, 0);
+       if (ip_tree == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
+               return -1;
+       }
+
+       for(i=0;i<nodemap->num;i++){
+               if (nodemap->nodes[i].flags & NODE_FLAGS_DELETED) {
+                       continue;
+               }
+               if (nodemap->nodes[i].flags & NODE_FLAGS_DISCONNECTED) {
+                       continue;
+               }
+
+               /* read the public ip list from this node */
+               ret = ctdb_ctrl_get_public_ips(ctdb, TIMELIMIT(), nodemap->nodes[i].pnn, tmp_ctx, &tmp_ips);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, ("Unable to get public ip list from node %u\n", nodemap->nodes[i].pnn));
+                       return -1;
+               }
+
+               for (j=0; j<tmp_ips->num;j++) {
+                       struct ctdb_public_ip *node_ip;
+
+                       node_ip = talloc(tmp_ctx, struct ctdb_public_ip);
+                       if (node_ip == NULL) {
+                               DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
+                               return -1;
+                       }
+                       node_ip->pnn  = tmp_ips->ips[j].pnn;
+                       node_ip->addr = tmp_ips->ips[j].addr;
+
+                       trbt_insertarray32_callback(ip_tree,
+                               IP_KEYLEN, ip_key(&tmp_ips->ips[j].addr),
+                               add_ip_callback,
+                               node_ip);
+               }
+               talloc_free(tmp_ips);
+       }
+
+       /* first traversal: count the entries so the list can be sized */
+       count = 0;
+       trbt_traversearray32(ip_tree, IP_KEYLEN, getips_count_callback, &count);
+
+       len = offsetof(struct ctdb_all_public_ips, ips) +
+               count*sizeof(struct ctdb_public_ip);
+       tmp_ips = talloc_zero_size(tmp_ctx, len);
+       if (tmp_ips == NULL) {
+               DEBUG(DEBUG_ERR, (__location__ " Out of memory\n"));
+               return -1;
+       }
+
+       /* second traversal: copy the entries; the zeroed list starts
+          with num == 0 and the store callback increments it */
+       trbt_traversearray32(ip_tree, IP_KEYLEN, getips_store_callback, tmp_ips);
+
+       *ips = tmp_ips;
+
+       return 0;
+}
+
+
+/*
+  Timed event handler that does nothing but schedule itself again one
+  second from now.  p is the ctdb context; ev, te and t are unused.
+ */
+static void ctdb_every_second(struct event_context *ev, struct timed_event *te, struct timeval t, void *p)
+{
+       struct ctdb_context *ctdb;
+       struct timeval next_tick;
+
+       ctdb = talloc_get_type(p, struct ctdb_context);
+       next_tick = timeval_current_ofs(1, 0);
+
+       event_add_timed(ctdb->ev, ctdb, next_tick, ctdb_every_second, ctdb);
+}
+
+/*
+  State shared between srvid_broadcast() and
+  srvid_broadcast_reply_handler() while waiting for replies.
+ */
+struct srvid_reply_handler_data {
+       /* set by the reply handler once the awaited reply/replies arrived */
+       bool done;
+       /* false: one successful reply is enough; true: every entry of
+          nodes must reply */
+       bool wait_for_all;
+       /* talloc array of pnns still expected to reply; the handler
+          overwrites an entry with -1 once that node has answered */
+       uint32_t *nodes;
+       /* human readable name of the request, used in log messages */
+       const char *srvid_str;
+};
+
+/*
+  Message handler for replies to a request sent by srvid_broadcast().
+
+  The payload must be a single int32_t: a PNN (>= 0) on success or a
+  negative value on error.  Malformed and negative replies are logged and
+  otherwise ignored.  For a good reply, done is set right away unless all
+  nodes must answer; in that case the replying node is ticked off in the
+  nodes array and done is set only when no entry is left outstanding.
+ */
+static void srvid_broadcast_reply_handler(struct ctdb_context *ctdb,
+                                        uint64_t srvid,
+                                        TDB_DATA data,
+                                        void *private_data)
+{
+       struct srvid_reply_handler_data *state = private_data;
+       bool outstanding = false;
+       int32_t result;
+       int n;
+
+       if (data.dsize != sizeof(result)) {
+               DEBUG(DEBUG_ERR, (__location__ " Wrong reply size\n"));
+               return;
+       }
+
+       /* result will be a PNN (i.e. >=0) on success, or negative on error */
+       result = *(int32_t *)data.dptr;
+       if (result < 0) {
+               DEBUG(DEBUG_ERR,
+                     ("%s failed with result %d\n", state->srvid_str, result));
+               return;
+       }
+
+       if (state->wait_for_all) {
+               /* Tick off the sender and look for nodes still missing */
+               for (n = 0; n < talloc_array_length(state->nodes); n++) {
+                       if (state->nodes[n] == result) {
+                               DEBUG(DEBUG_INFO,
+                                     ("%s reply received from node %u\n",
+                                      state->srvid_str, result));
+                               state->nodes[n] = -1;
+                       }
+                       if (state->nodes[n] != -1) {
+                               /* Found a node that hasn't yet replied */
+                               outstanding = true;
+                       }
+               }
+       }
+
+       state->done = !outstanding;
+}
+
+/* Broadcast the given SRVID to all connected nodes.  Wait for 1 reply
+ * or replies from all connected nodes.  arg is the data argument to
+ * pass in the srvid_request structure - pass 0 if this isn't needed.
+ *
+ * Replies are received on a message port whose id is this process's
+ * pid.  The request is re-sent whenever the expected replies have not
+ * arrived within 5 seconds, so this function only returns on success
+ * (0) or when the request cannot be sent at all (-1).
+ */
+static int srvid_broadcast(struct ctdb_context *ctdb,
+                          uint64_t srvid, uint32_t arg,
+                          const char *srvid_str, bool wait_for_all)
+{
+       int ret;
+       TDB_DATA data;
+       struct srvid_request request;
+       struct srvid_reply_handler_data reply_data;
+       struct timeval tv;
+
+       ZERO_STRUCT(request);
+
+       /* Time ticks to enable timeouts to be processed */
+       event_add_timed(ctdb->ev, ctdb,
+                               timeval_current_ofs(1, 0),
+                               ctdb_every_second, ctdb);
+
+       request.pnn = ctdb_get_pnn(ctdb);
+       request.srvid = getpid();
+       request.data = arg;
+
+       /* Register message port for reply from recovery master */
+       ctdb_client_set_message_handler(ctdb, request.srvid,
+                                       srvid_broadcast_reply_handler,
+                                       &reply_data);
+
+       data.dptr = (uint8_t *)&request;
+       data.dsize = sizeof(request);
+
+       reply_data.wait_for_all = wait_for_all;
+       reply_data.nodes = NULL;
+       reply_data.srvid_str = srvid_str;
+
+again:
+       reply_data.done = false;
+
+       if (wait_for_all) {
+               struct ctdb_node_map *nodemap;
+
+               ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(),
+                                          CTDB_CURRENT_NODE, ctdb, &nodemap);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,
+                             ("Unable to get nodemap from current node, try again\n"));
+                       sleep(1);
+                       goto again;
+               }
+
+               /* rebuild the list of expected repliers on every attempt */
+               if (reply_data.nodes != NULL) {
+                       talloc_free(reply_data.nodes);
+               }
+               reply_data.nodes = list_of_connected_nodes(ctdb, nodemap,
+                                                          NULL, true);
+
+               talloc_free(nodemap);
+       }
+
+       /* Send to all connected nodes. Only recmaster replies */
+       ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED,
+                                      srvid, data);
+       if (ret != 0) {
+               /* This can only happen if the socket is closed and
+                * there's no way to recover from that, so don't try
+                * again.
+                */
+               DEBUG(DEBUG_ERR,
+                     ("Failed to send %s request to connected nodes\n",
+                      srvid_str));
+               /* the node list has no talloc parent, so it must be
+                * released by hand on this path as well
+                */
+               talloc_free(reply_data.nodes);
+               /* NOTE(review): the message handler stays registered
+                * with a pointer to the local reply_data here - confirm
+                * that callers never run the event loop after a failure.
+                */
+               return -1;
+       }
+
+       tv = timeval_current();
+       /* This loop terminates when the reply is received or after 5 seconds */
+       while (timeval_elapsed(&tv) < 5.0 && !reply_data.done) {
+               event_loop_once(ctdb->ev);
+       }
+
+       if (!reply_data.done) {
+               DEBUG(DEBUG_NOTICE,
+                     ("Still waiting for confirmation of %s\n", srvid_str));
+               sleep(1);
+               goto again;
+       }
+
+       ctdb_client_remove_message_handler(ctdb, request.srvid, &reply_data);
+
+       talloc_free(reply_data.nodes);
+
+       return 0;
+}
+
+/*
+  Broadcast CTDB_SRVID_TAKEOVER_RUN and wait for a single reply.
+  Returns whatever srvid_broadcast() returns.
+ */
+static int ipreallocate(struct ctdb_context *ctdb)
+{
+       const bool wait_for_all = false;
+
+       return srvid_broadcast(ctdb, CTDB_SRVID_TAKEOVER_RUN, 0,
+                              "IP reallocation", wait_for_all);
+}
+
+
+/*
+  Command entry point for an ip reallocation; argc and argv are ignored.
+ */
+static int control_ipreallocate(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int ret;
+
+       ret = ipreallocate(ctdb);
+       return ret;
+}
+
+/*
+  add a public ip address to a node
+
+  argv[0] and argv[1] are handed to parse_ip_mask() to obtain address and
+  mask; argv[1] is also sent to the node as the interface name.  If the
+  node selected by options.pnn already lists the address nothing is
+  changed and 0 is returned.  Otherwise the address is added (up to 5
+  attempts, 3 seconds apart) and the node is flagged as a rebalancing
+  target.  Returns 0 on success and non-zero on failure.
+ */
+static int control_addip(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int i, ret;
+       int len, retries = 0;
+       unsigned mask;
+       ctdb_sock_addr addr;
+       struct ctdb_control_ip_iface *pub;
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       struct ctdb_all_public_ips *ips;
+
+
+       if (argc != 2) {
+               talloc_free(tmp_ctx);
+               usage();
+       }
+
+       if (!parse_ip_mask(argv[0], argv[1], &addr, &mask)) {
+               DEBUG(DEBUG_ERR, ("Badly formed ip/mask : %s\n", argv[0]));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* read the public ip list from the node */
+       ret = ctdb_ctrl_get_public_ips(ctdb, TIMELIMIT(), options.pnn, tmp_ctx, &ips);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get public ip list from node %u\n", options.pnn));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+       for (i=0;i<ips->num;i++) {
+               if (ctdb_same_ip(&addr, &ips->ips[i].addr)) {
+                       DEBUG(DEBUG_ERR,("Can not add ip to node. Node already hosts this ip\n"));
+                       /* nothing to do; release tmp_ctx, which this
+                          path used to leak */
+                       talloc_free(tmp_ctx);
+                       return 0;
+               }
+       }
+
+
+
+       /* Don't timeout. This command waits for an ip reallocation
+          which sometimes can take quite a while if there has
+          been a recent recovery
+       */
+       alarm(0);
+
+       /* the interface name is stored inline after the fixed header,
+          including its terminating NUL */
+       len = offsetof(struct ctdb_control_ip_iface, iface) + strlen(argv[1]) + 1;
+       pub = talloc_size(tmp_ctx, len);
+       CTDB_NO_MEMORY(ctdb, pub);
+
+       pub->addr  = addr;
+       pub->mask  = mask;
+       pub->len   = strlen(argv[1])+1;
+       memcpy(&pub->iface[0], argv[1], strlen(argv[1])+1);
+
+       do {
+               ret = ctdb_ctrl_add_public_ip(ctdb, TIMELIMIT(), options.pnn, pub);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, ("Unable to add public ip to node %u. Wait 3 seconds and try again.\n", options.pnn));
+                       sleep(3);
+                       retries++;
+               }
+       } while (retries < 5 && ret != 0);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to add public ip to node %u. Giving up.\n", options.pnn));
+               talloc_free(tmp_ctx);
+               return ret;
+       }
+
+       if (rebalance_node(ctdb, options.pnn) != 0) {
+               DEBUG(DEBUG_ERR,("Error when trying to rebalance node\n"));
+               /* ret is 0 at this point, so returning it would report
+                  success; report the failure and release tmp_ctx */
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+/*
+  print the interface name that ctdb_sys_find_ifname() reports for an
+  ip address
+
+  argv[0] is the ip address.  Returns 0 on success and -1 if the
+  address cannot be parsed.
+ */
+static int control_ipiface(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       ctdb_sock_addr addr;
+
+       if (argc != 1) {
+               usage();
+       }
+
+       if (parse_ip(argv[0], NULL, 0, &addr)) {
+               printf("IP on interface %s\n", ctdb_sys_find_ifname(&addr));
+               return 0;
+       }
+
+       printf("Badly formed ip : %s\n", argv[0]);
+       return -1;
+}
+
+static int control_delip(struct ctdb_context *ctdb, int argc, const char **argv);
+
+/*
+  Helper for control_delip_all(): run control_delip() against every
+  active node in nodemap whose public ip list contains addr.  When
+  skip_hosting_node is true, the node that the list names as the current
+  host of the address is left alone.
+
+  Nodes whose ip list cannot be read are logged and skipped, and the
+  result of control_delip() is ignored.  options.pnn is overwritten with
+  the pnn of each node that is processed.
+ */
+static void delip_on_listed_nodes(struct ctdb_context *ctdb, int argc, const char **argv,
+                                 ctdb_sock_addr *addr,
+                                 struct ctdb_node_map *nodemap,
+                                 TALLOC_CTX *mem_ctx,
+                                 bool skip_hosting_node)
+{
+       struct ctdb_all_public_ips *ips;
+       int ret, i, j;
+
+       for(i=0;i<nodemap->num;i++){
+               bool wanted;
+
+               if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
+                       continue;
+               }
+               ret = ctdb_ctrl_get_public_ips(ctdb, TIMELIMIT(), nodemap->nodes[i].pnn, mem_ctx, &ips);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, ("Unable to get public ip list from node %d\n", nodemap->nodes[i].pnn));
+                       continue;
+               }
+
+               for (j=0;j<ips->num;j++) {
+                       if (ctdb_same_ip(addr, &ips->ips[j].addr)) {
+                               break;
+                       }
+               }
+
+               /* the node must know the address at all ... */
+               wanted = (j != ips->num);
+               /* ... and, if requested, must not be the one hosting it */
+               if (wanted && skip_hosting_node &&
+                   ips->ips[j].pnn == nodemap->nodes[i].pnn) {
+                       wanted = false;
+               }
+
+               /* the list is no longer needed; don't let one list per
+                  node pile up until the caller frees mem_ctx */
+               talloc_free(ips);
+
+               if (!wanted) {
+                       continue;
+               }
+
+               options.pnn = nodemap->nodes[i].pnn;
+               (void) control_delip(ctdb, argc, argv);
+       }
+}
+
+/*
+  delete a public ip address from all active nodes that list it
+
+  The address is first removed from the nodes that are not hosting it
+  and only then from every node that still lists it, including the one
+  hosting it.  Failures on individual nodes are ignored.  Returns 0, or
+  the error from ctdb_ctrl_getnodemap() if the nodemap cannot be read.
+
+  Side effect: options.pnn is left set to the last node processed.
+ */
+static int control_delip_all(struct ctdb_context *ctdb, int argc, const char **argv, ctdb_sock_addr *addr)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       struct ctdb_node_map *nodemap=NULL;
+       int ret;
+
+       ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get nodemap from current node\n"));
+               /* release tmp_ctx; this path used to leak it */
+               talloc_free(tmp_ctx);
+               return ret;
+       }
+
+       /* remove it from the nodes that are not hosting the ip currently */
+       delip_on_listed_nodes(ctdb, argc, argv, addr, nodemap, tmp_ctx, true);
+
+       /* remove it from every node (also the one hosting it) */
+       delip_on_listed_nodes(ctdb, argc, argv, addr, nodemap, tmp_ctx, false);
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+       
+/*
+  delete a public ip address from a node
+
+  argv[0] is the ip address.  If options.pnn is CTDB_BROADCAST_ALL the
+  work is handed to control_delip_all().  Otherwise the address must be
+  in the public ip list of the node selected by options.pnn; if that
+  node is currently hosting it, an attempt is made to move it elsewhere
+  before it is deleted.  Returns 0 on success and non-zero on failure.
+ */
+static int control_delip(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int i, ret;
+       ctdb_sock_addr addr;
+       struct ctdb_control_ip_iface pub;
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       struct ctdb_all_public_ips *ips;
+
+       if (argc != 1) {
+               talloc_free(tmp_ctx);
+               usage();
+       }
+
+       if (parse_ip(argv[0], NULL, 0, &addr) == 0) {
+               DEBUG(DEBUG_ERR,("Wrongly formed ip address '%s'\n", argv[0]));
+               /* release tmp_ctx; this path used to leak it */
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       if (options.pnn == CTDB_BROADCAST_ALL) {
+               /* control_delip_all() uses its own context, so ours is
+                  not needed and must not be leaked */
+               talloc_free(tmp_ctx);
+               return control_delip_all(ctdb, argc, argv, &addr);
+       }
+
+       pub.addr  = addr;
+       pub.mask  = 0;
+       pub.len   = 0;
+
+       ret = ctdb_ctrl_get_public_ips(ctdb, TIMELIMIT(), options.pnn, tmp_ctx, &ips);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get public ip list from cluster\n"));
+               talloc_free(tmp_ctx);
+               return ret;
+       }
+
+       for (i=0;i<ips->num;i++) {
+               if (ctdb_same_ip(&addr, &ips->ips[i].addr)) {
+                       break;
+               }
+       }
+
+       if (i==ips->num) {
+               DEBUG(DEBUG_ERR, ("This node does not support this public address '%s'\n",
+                       ctdb_addr_to_str(&addr)));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* This is an optimisation.  If this node is hosting the IP
+        * then try to move it somewhere else without invoking a full
+        * takeover run.  We don't care if this doesn't work!
+        */
+       if (ips->ips[i].pnn == options.pnn) {
+               (void) try_moveip(ctdb, &addr, -1);
+       }
+
+       ret = ctdb_ctrl_del_public_ip(ctdb, TIMELIMIT(), options.pnn, &pub);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to del public ip from node %u\n", options.pnn));
+               talloc_free(tmp_ctx);
+               return ret;
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+/*
+  kill the tcp connections listed on stdin
+
+  Every non-empty input line must hold two whitespace separated IP:port
+  values, source first and destination second.  The whole input is
+  parsed before anything is sent; then one CTDB_CONTROL_KILL_TCP control
+  per connection is sent to the node selected by options.pnn and all of
+  them are waited for.  Returns 0 on success and -1 on a malformed line
+  or a failed control.
+
+  Input is read in chunks of at most sizeof(line)-1 characters, so a
+  longer line is seen as several lines, both by the parser and in the
+  line numbers used in error messages.
+ */
+static int kill_tcp_from_file(struct ctdb_context *ctdb,
+                             int argc, const char **argv)
+{
+       struct ctdb_control_killtcp *killtcp;
+       int max_entries, current, i;
+       struct timeval timeout;
+       char line[128], src[128], dst[128];
+       int linenum;
+       int ret = -1;
+       TDB_DATA data;
+       struct client_async_data *async_data = NULL;
+       struct ctdb_client_control_state *state;
+
+       if (argc != 0) {
+               usage();
+       }
+
+       killtcp = NULL;
+       max_entries = 0;
+       current = 0;
+       /* Stop as soon as fgets() fails, whether at end of file or on a
+        * read error; testing only feof() would spin forever after an
+        * error.  linenum advances for every line read, skipped or not.
+        */
+       for (linenum = 1; fgets(line, sizeof(line), stdin) != NULL; linenum++) {
+               /* Silently skip empty lines */
+               if (line[0] == '\n') {
+                       continue;
+               }
+
+               /* src and dst are as large as line, so an unbounded %s
+                * cannot overflow them
+                */
+               if (sscanf(line, "%s %s\n", src, dst) != 2) {
+                       DEBUG(DEBUG_ERR, ("Bad line [%d]: '%s'\n",
+                                         linenum, line));
+                       goto done;
+               }
+
+               /* grow the array in steps of 1024 entries */
+               if (current >= max_entries) {
+                       max_entries += 1024;
+                       killtcp = talloc_realloc(ctdb, killtcp,
+                                                struct ctdb_control_killtcp,
+                                                max_entries);
+                       CTDB_NO_MEMORY(ctdb, killtcp);
+               }
+
+               if (!parse_ip_port(src, &killtcp[current].src_addr)) {
+                       DEBUG(DEBUG_ERR, ("Bad IP:port on line [%d]: '%s'\n",
+                                         linenum, src));
+                       goto done;
+               }
+
+               if (!parse_ip_port(dst, &killtcp[current].dst_addr)) {
+                       DEBUG(DEBUG_ERR, ("Bad IP:port on line [%d]: '%s'\n",
+                                         linenum, dst));
+                       goto done;
+               }
+
+               current++;
+       }
+
+       async_data = talloc_zero(ctdb, struct client_async_data);
+       if (async_data == NULL) {
+               goto done;
+       }
+
+       for (i = 0; i < current; i++) {
+
+               data.dsize = sizeof(struct ctdb_control_killtcp);
+               data.dptr  = (unsigned char *)&killtcp[i];
+
+               timeout = TIMELIMIT();
+               state = ctdb_control_send(ctdb, options.pnn, 0,
+                                         CTDB_CONTROL_KILL_TCP, 0, data,
+                                         async_data, &timeout, NULL);
+
+               if (state == NULL) {
+                       DEBUG(DEBUG_ERR,
+                             ("Failed to call async killtcp control to node %u\n",
+                              options.pnn));
+                       goto done;
+               }
+
+               ctdb_client_async_add(async_data, state);
+       }
+
+       if (ctdb_client_async_wait(ctdb, async_data) != 0) {
+               DEBUG(DEBUG_ERR,("killtcp failed\n"));
+               goto done;
+       }
+
+       ret = 0;
+
+done:
+       /* the control states were allocated on async_data, so they go
+        * away together with it
+        */
+       talloc_free(async_data);
+       talloc_free(killtcp);
+       return ret;
+}
+
+
+/*
+  kill a tcp connection
+
+  With no arguments the connections are read from stdin, see
+  kill_tcp_from_file().  Otherwise argv[0] is the source IP:port and
+  argv[1] the destination IP:port of the single connection to kill on
+  the node selected by options.pnn.
+ */
+static int kill_tcp(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       struct ctdb_control_killtcp conn;
+       int ret;
+
+       assert_single_node_only();
+
+       if (argc == 0) {
+               return kill_tcp_from_file(ctdb, argc, argv);
+       }
+
+       if (argc < 2) {
+               usage();
+       }
+
+       if (!parse_ip_port(argv[0], &conn.src_addr)) {
+               DEBUG(DEBUG_ERR, ("Bad IP:port '%s'\n", argv[0]));
+               return -1;
+       }
+
+       if (!parse_ip_port(argv[1], &conn.dst_addr)) {
+               DEBUG(DEBUG_ERR, ("Bad IP:port '%s'\n", argv[1]));
+               return -1;
+       }
+
+       ret = ctdb_ctrl_killtcp(ctdb, TIMELIMIT(), options.pnn, &conn);
+       if (ret == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR, ("Unable to killtcp from node %u\n", options.pnn));
+       return ret;
+}
+
+
+/*
+  send a gratuitous arp
+
+  argv[0] is the ip address; argv[1] is passed through unchanged to
+  ctdb_ctrl_gratious_arp() (presumably the interface name - confirm).
+  The request goes to the node selected by options.pnn.
+ */
+static int control_gratious_arp(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       ctdb_sock_addr addr;
+       int ret;
+
+       assert_single_node_only();
+
+       if (argc < 2) {
+               usage();
+       }
+
+       if (!parse_ip(argv[0], NULL, 0, &addr)) {
+               DEBUG(DEBUG_ERR, ("Bad IP '%s'\n", argv[0]));
+               return -1;
+       }
+
+       ret = ctdb_ctrl_gratious_arp(ctdb, TIMELIMIT(), options.pnn, &addr, argv[1]);
+       if (ret == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR, ("Unable to send gratious_arp from node %u\n", options.pnn));
+       return ret;
+}
+
+/*
+  register a server id
+
+  argv[0..2] are pnn, type and server_id, parsed with strtoul() (base
+  auto-detected, no validation).  After a successful registration the
+  command sleeps for 999 seconds and then returns -1.
+ */
+static int regsrvid(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       struct ctdb_server_id id;
+       int ret;
+
+       if (argc < 3) {
+               usage();
+       }
+
+       id.pnn       = strtoul(argv[0], NULL, 0);
+       id.type      = strtoul(argv[1], NULL, 0);
+       id.server_id = strtoul(argv[2], NULL, 0);
+
+       ret = ctdb_ctrl_register_server_id(ctdb, TIMELIMIT(), &id);
+       if (ret == 0) {
+               DEBUG(DEBUG_ERR,("Srvid registered. Sleeping for 999 seconds\n"));
+               sleep(999);
+               return -1;
+       }
+
+       DEBUG(DEBUG_ERR, ("Unable to register server_id from node %u\n", options.pnn));
+       return ret;
+}
+
+/*
+  unregister a server id
+
+  argv[0..2] are pnn, type and server_id, parsed with strtoul() (base
+  auto-detected, no validation).
+
+  NOTE(review): -1 is returned even when the unregistration succeeded;
+  confirm whether callers rely on that before changing it.
+ */
+static int unregsrvid(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       struct ctdb_server_id id;
+       int ret;
+
+       if (argc < 3) {
+               usage();
+       }
+
+       id.pnn       = strtoul(argv[0], NULL, 0);
+       id.type      = strtoul(argv[1], NULL, 0);
+       id.server_id = strtoul(argv[2], NULL, 0);
+
+       ret = ctdb_ctrl_unregister_server_id(ctdb, TIMELIMIT(), &id);
+       if (ret == 0) {
+               return -1;
+       }
+
+       DEBUG(DEBUG_ERR, ("Unable to unregister server_id from node %u\n", options.pnn));
+       return ret;
+}
+
+/*
+  check if a server id exists
+
+  argv[0..2] are pnn, type and server_id, parsed with strtoul() (base
+  auto-detected, no validation).  The node selected by options.pnn is
+  asked and the answer is printed.  Returns 0 if the check could be
+  carried out, whatever its outcome.
+ */
+static int chksrvid(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       struct ctdb_server_id id;
+       uint32_t exists;
+       int ret;
+
+       if (argc < 3) {
+               usage();
+       }
+
+       id.pnn       = strtoul(argv[0], NULL, 0);
+       id.type      = strtoul(argv[1], NULL, 0);
+       id.server_id = strtoul(argv[2], NULL, 0);
+
+       ret = ctdb_ctrl_check_server_id(ctdb, TIMELIMIT(), options.pnn, &id, &exists);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to check server_id from node %u\n", options.pnn));
+               return ret;
+       }
+
+       if (exists == 0) {
+               printf("Server id %d:%d:%d does NOT exist\n", id.pnn, id.type, id.server_id);
+       } else {
+               printf("Server id %d:%d:%d EXISTS\n", id.pnn, id.type, id.server_id);
+       }
+       return 0;
+}
+
+/*
+  get a list of all server ids that are registered on a node
+
+  Queries node options.pnn and prints one "Server id pnn:type:id" line
+  per entry.  Returns 0 after the list was printed, otherwise the
+  non-zero result of the list control.
+ */
+static int getsrvids(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int i, ret;
+       struct ctdb_server_id_list *server_ids;
+
+       ret = ctdb_ctrl_get_server_id_list(ctdb, ctdb, TIMELIMIT(), options.pnn, &server_ids);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get server_id list from node %u\n", options.pnn));
+               return ret;
+       }
+
+       for (i=0; i<server_ids->num; i++) {
+               printf("Server id %d:%d:%d\n",
+                       server_ids->server_ids[i].pnn,
+                       server_ids->server_ids[i].type,
+                       server_ids->server_ids[i].server_id);
+       }
+
+       /* the list was fetched and printed, so report success instead of -1 */
+       return 0;
+}
+
+/*
+  check whether message handlers exist for a list of 64-bit srvids
+
+  Every command line argument is parsed with strtoull() and the whole
+  set is handed to ctdb_client_check_message_handlers() in one call.
+  One line is printed per id.
+
+  Returns 0 on success and -1 if memory could not be allocated or the
+  check itself failed.
+ */
+static int check_srvids(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       uint64_t *ids;
+       uint8_t *result;
+       int i;
+
+       if (argc < 1) {
+               talloc_free(tmp_ctx);
+               usage();
+       }
+
+       ids    = talloc_array(tmp_ctx, uint64_t, argc);
+       result = talloc_array(tmp_ctx, uint8_t, argc);
+       if (ids == NULL || result == NULL) {
+               /* do not write through a failed allocation */
+               DEBUG(DEBUG_ERR, ("Out of memory\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       for (i = 0; i < argc; i++) {
+               ids[i] = strtoull(argv[i], NULL, 0);
+       }
+
+       if (!ctdb_client_check_message_handlers(ctdb, ids, argc, result)) {
+               DEBUG(DEBUG_ERR, ("Unable to check server_id from node %u\n",
+                                 options.pnn));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       for (i=0; i < argc; i++) {
+               /* %llu needs an unsigned argument */
+               printf("Server id %d:%llu %s\n", options.pnn,
+                      (unsigned long long)ids[i],
+                      result[i] ? "exists" : "does not exist");
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+/*
+  send a tcp tickle ack
+
+  argv[0] is the source and argv[1] the destination, both given as
+  IP:port.  Returns 0 when the packet was sent, -1 on a malformed
+  address or a send failure.
+ */
+static int tickle_tcp(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       ctdb_sock_addr from_addr, to_addr;
+
+       if (argc < 2) {
+               usage();
+       }
+
+       if (!parse_ip_port(argv[0], &from_addr)) {
+               DEBUG(DEBUG_ERR, ("Bad IP:port '%s'\n", argv[0]));
+               return -1;
+       }
+
+       if (!parse_ip_port(argv[1], &to_addr)) {
+               DEBUG(DEBUG_ERR, ("Bad IP:port '%s'\n", argv[1]));
+               return -1;
+       }
+
+       if (ctdb_sys_send_tcp(&from_addr, &to_addr, 0, 0, 0) != 0) {
+               DEBUG(DEBUG_ERR, ("Error while sending tickle ack\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+
+/*
+  display public ip status
+
+  Lists the public addresses known to node options.pnn, or to all nodes
+  when options.pnn is CTDB_BROADCAST_ALL, walking the returned array
+  from its last entry to its first.  With options.verbose the active,
+  available (link up) and configured interfaces are shown as well.
+  Returns 0 on success or the result of the failing list query.
+ */
+static int control_ip(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       struct ctdb_all_public_ips *ips;
+       int i, ret;
+
+       if (options.pnn != CTDB_BROADCAST_ALL) {
+               /* single node: ask just that node for its list */
+               ret = ctdb_ctrl_get_public_ips(ctdb, TIMELIMIT(), options.pnn, tmp_ctx, &ips);
+       } else {
+               /* broadcast: merge the lists of all nodes */
+               ret = control_get_all_public_ips(ctdb, tmp_ctx, &ips);
+       }
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get public ips from node %u\n", options.pnn));
+               talloc_free(tmp_ctx);
+               return ret;
+       }
+
+       /* heading */
+       if (options.machinereadable) {
+               printf(":Public IP:Node:");
+               if (options.verbose) {
+                       printf("ActiveInterface:AvailableInterfaces:ConfiguredInterfaces:");
+               }
+               printf("\n");
+       } else if (options.pnn == CTDB_BROADCAST_ALL) {
+               printf("Public IPs on ALL nodes\n");
+       } else {
+               printf("Public IPs on node %u\n", options.pnn);
+       }
+
+       for (i = 1; i <= ips->num; i++) {
+               /* entries are reported starting from the end of the array */
+               uint32_t idx = ips->num - i;
+               struct ctdb_control_public_ip_info *info = NULL;
+               char *active = NULL;
+               char *available = NULL;
+               char *configured = NULL;
+               const char *addr_str;
+               int32_t pnn;
+
+               if (options.pnn != CTDB_BROADCAST_ALL) {
+                       pnn = options.pnn;
+               } else {
+                       pnn = ips->ips[idx].pnn;
+               }
+
+               /* a pnn of -1 means there is no node to query for details */
+               ret = -1;
+               if (pnn != -1) {
+                       ret = ctdb_ctrl_get_public_ip_info(ctdb, TIMELIMIT(), pnn, ctdb,
+                                                          &ips->ips[idx].addr, &info);
+               }
+
+               if (ret == 0) {
+                       int j;
+
+                       for (j = 0; j < info->num; j++) {
+                               char *name = info->ifaces[j].name;
+
+                               /* every interface counts as configured */
+                               if (configured != NULL) {
+                                       configured = talloc_asprintf_append(configured,
+                                                                           ",%s", name);
+                               } else {
+                                       configured = talloc_strdup(info, name);
+                               }
+
+                               if (info->active_idx == j) {
+                                       active = name;
+                               }
+
+                               /* only interfaces with link are available */
+                               if (info->ifaces[j].link_state != 0) {
+                                       if (available != NULL) {
+                                               available = talloc_asprintf_append(available,
+                                                                                  ",%s", name);
+                                       } else {
+                                               available = talloc_strdup(info, name);
+                                       }
+                               }
+                       }
+               }
+
+               addr_str = ctdb_addr_to_str(&ips->ips[idx].addr);
+
+               if (options.machinereadable) {
+                       printf(":%s:%d:", addr_str, ips->ips[idx].pnn);
+                       if (options.verbose) {
+                               printf("%s:%s:%s:",
+                                      active ? active : "",
+                                      available ? available : "",
+                                      configured ? configured : "");
+                       }
+                       printf("\n");
+               } else if (options.verbose) {
+                       printf("%s node[%d] active[%s] available[%s] configured[%s]\n",
+                              addr_str,
+                              ips->ips[idx].pnn,
+                              active ? active : "",
+                              available ? available : "",
+                              configured ? configured : "");
+               } else {
+                       printf("%s %d\n", addr_str, ips->ips[idx].pnn);
+               }
+
+               /* the interface strings hang off info and go away with it */
+               talloc_free(info);
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+/*
+  public ip info
+
+  Takes exactly one argument, the public address to look up, and prints
+  the node currently recorded for it together with every interface
+  reported for it by node options.pnn.
+
+  Returns 0 on success, -1 for a malformed address, or the result of the
+  failing info control.
+ */
+static int control_ipinfo(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int i, ret;
+       ctdb_sock_addr addr;
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       struct ctdb_control_public_ip_info *info;
+
+       if (argc != 1) {
+               talloc_free(tmp_ctx);
+               usage();
+       }
+
+       if (parse_ip(argv[0], NULL, 0, &addr) == 0) {
+               DEBUG(DEBUG_ERR,("Wrongly formed ip address '%s'\n", argv[0]));
+               /* release the scratch context on this error path too */
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* read the public ip info from this node */
+       ret = ctdb_ctrl_get_public_ip_info(ctdb, TIMELIMIT(), options.pnn,
+                                          tmp_ctx, &addr, &info);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get public ip[%s]info from node %u\n",
+                                 argv[0], options.pnn));
+               talloc_free(tmp_ctx);
+               return ret;
+       }
+
+       printf("Public IP[%s] info on node %u\n",
+              ctdb_addr_to_str(&info->ip.addr),
+              options.pnn);
+
+       printf("IP:%s\nCurrentNode:%d\nNumInterfaces:%u\n",
+              ctdb_addr_to_str(&info->ip.addr),
+              info->ip.pnn, info->num);
+
+       for (i=0; i<info->num; i++) {
+               /* force termination before printing the received name */
+               info->ifaces[i].name[CTDB_IFACE_SIZE] = '\0';
+
+               printf("Interface[%u]: Name:%s Link:%s References:%u%s\n",
+                      i+1, info->ifaces[i].name,
+                      info->ifaces[i].link_state?"up":"down",
+                      (unsigned int)info->ifaces[i].references,
+                      (i==info->active_idx)?" (active)":"");
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+/*
+  display interfaces status
+
+  Prints name, link state and reference count of every interface
+  reported by node options.pnn.  Returns 0 on success, -1 if the list
+  could not be fetched.
+ */
+static int control_ifaces(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       struct ctdb_control_get_ifaces *ifaces;
+       int idx;
+
+       /* read the interface list from this node */
+       if (ctdb_ctrl_get_ifaces(ctdb, TIMELIMIT(), options.pnn, tmp_ctx, &ifaces) != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get interfaces from node %u\n",
+                                 options.pnn));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       if (!options.machinereadable) {
+               printf("Interfaces on node %u\n", options.pnn);
+       } else {
+               printf(":Name:LinkStatus:References:\n");
+       }
+
+       for (idx = 0; idx < ifaces->num; idx++) {
+               const char *name = ifaces->ifaces[idx].name;
+               int link_up = (ifaces->ifaces[idx].link_state != 0);
+               unsigned int refs = (unsigned int)ifaces->ifaces[idx].references;
+
+               if (!options.machinereadable) {
+                       printf("name:%s link:%s references:%u\n",
+                              name, link_up ? "up" : "down", refs);
+               } else {
+                       printf(":%s:%s:%u\n",
+                              name, link_up ? "1" : "0", refs);
+               }
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+
+/*
+  set link status of an interface
+
+  argv[0] is the interface name (at most CTDB_IFACE_SIZE characters),
+  argv[1] is either "up" or "down".  Returns 0 on success, -1 for bad
+  arguments, or the result of the failing control.
+ */
+static int control_setifacelink(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       struct ctdb_control_iface_info info;
+       const char *ifname;
+       const char *state;
+       int ret;
+
+       ZERO_STRUCT(info);
+
+       if (argc != 2) {
+               usage();
+       }
+
+       ifname = argv[0];
+       state  = argv[1];
+
+       /* the length check keeps the strcpy() below within info.name */
+       if (strlen(ifname) > CTDB_IFACE_SIZE) {
+               DEBUG(DEBUG_ERR, ("interfaces name '%s' too long\n",
+                                 ifname));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+       strcpy(info.name, ifname);
+
+       if (strcmp(state, "down") == 0) {
+               info.link_state = 0;
+       } else if (strcmp(state, "up") == 0) {
+               info.link_state = 1;
+       } else {
+               DEBUG(DEBUG_ERR, ("link state invalid '%s' should be 'up' or 'down'\n",
+                                 state));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* push the requested link state to the node */
+       ret = ctdb_ctrl_set_iface_link(ctdb, TIMELIMIT(), options.pnn,
+                                      tmp_ctx, &info);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to set link state for interfaces %s node %u\n",
+                                 ifname, options.pnn));
+       }
+
+       talloc_free(tmp_ctx);
+       return ret;
+}
+
+/*
+  display pid of a ctdb daemon
+
+  Prints "Pid:<n>" for the daemon on node options.pnn.  Returns 0 on
+  success or the result of the failing control.
+ */
+static int control_getpid(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       uint32_t daemon_pid;
+       int rc = ctdb_ctrl_getpid(ctdb, TIMELIMIT(), options.pnn, &daemon_pid);
+
+       if (rc == 0) {
+               printf("Pid:%d\n", daemon_pid);
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR, ("Unable to get daemon pid from node %u\n", options.pnn));
+       return rc;
+}
+
+typedef bool update_flags_handler_t(struct ctdb_context *ctdb, void *data);
+
+/*
+  Drive a node flag into the requested state, then reallocate IPs.
+
+  ctdb     - client context used for every control
+  data     - opaque pointer passed unchanged to handler
+  handler  - callback that sends the state-changing control; returns
+             true when the control succeeded
+  flag     - NODE_FLAGS_* bit tested in the nodemap entry of options.pnn
+  desc     - name of the flagged state, used in log messages only
+  set_flag - true to wait until the flag is set, false until it is clear
+
+  Returns 0 without doing anything when the node is already in the
+  desired state, otherwise the result of ipreallocate().  Exits with
+  status 10 if the initial nodemap cannot be read.  The handler is
+  retried once per second until a fresh nodemap shows the desired state.
+ */
+static int update_flags_and_ipreallocate(struct ctdb_context *ctdb,
+                                             void *data,
+                                             update_flags_handler_t handler,
+                                             uint32_t flag,
+                                             const char *desc,
+                                             bool set_flag)
+{
+       struct ctdb_node_map *nodemap = NULL;
+       bool flag_is_set = false;
+       int ret;
+
+       /* Check if the node is already in the desired state */
+       ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get nodemap from local node\n"));
+               exit(10);
+       }
+       /* normalise to 0/1 so the comparison with set_flag is exact */
+       flag_is_set = (nodemap->nodes[options.pnn].flags & flag) != 0;
+       if (set_flag == flag_is_set) {
+               DEBUG(DEBUG_NOTICE, ("Node %d is %s %s\n", options.pnn,
+                                    (set_flag ? "already" : "not"), desc));
+               talloc_free(nodemap);
+               return 0;
+       }
+
+       do {
+               if (!handler(ctdb, data)) {
+                       DEBUG(DEBUG_WARNING,
+                             ("Failed to send control to set state %s on node %u, try again\n",
+                              desc, options.pnn));
+               }
+
+               sleep(1);
+
+               /* Drop the previous snapshot so it neither leaks nor
+                * gets mistaken for a fresh result below.
+                */
+               if (nodemap != NULL) {
+                       talloc_free(nodemap);
+                       nodemap = NULL;
+               }
+
+               /* Read the nodemap and verify the change took effect.
+                * Even if the above control/handler timed out then it
+                * could still have worked!
+                */
+               ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE,
+                                        ctdb, &nodemap);
+               if (ret != 0) {
+                       DEBUG(DEBUG_WARNING,
+                             ("Unable to get nodemap from local node, try again\n"));
+                       /* nothing trustworthy to inspect; the loop
+                        * condition sees NULL and retries
+                        */
+                       nodemap = NULL;
+                       continue;
+               }
+               flag_is_set = (nodemap->nodes[options.pnn].flags & flag) != 0;
+       } while (nodemap == NULL || (set_flag != flag_is_set));
+
+       talloc_free(nodemap);
+
+       return ipreallocate(ctdb);
+}
+
+/*
+  Administratively disable a node: ask node options.pnn to set
+  NODE_FLAGS_PERMANENTLY_DISABLED.  Returns true if the control
+  succeeded.  The data argument is unused.
+ */
+static bool update_flags_disabled(struct ctdb_context *ctdb, void *data)
+{
+       return ctdb_ctrl_modflags(ctdb, TIMELIMIT(), options.pnn,
+                                 NODE_FLAGS_PERMANENTLY_DISABLED, 0) == 0;
+}
+
+/* "disable" command: wait until the node is flagged permanently disabled */
+static int control_disable(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       const bool want_set = true;
+       int rc;
+
+       rc = update_flags_and_ipreallocate(ctdb, NULL,
+                                          update_flags_disabled,
+                                          NODE_FLAGS_PERMANENTLY_DISABLED,
+                                          "disabled",
+                                          want_set);
+       return rc;
+}
+
+/*
+  Administratively re-enable a node: ask node options.pnn to clear
+  NODE_FLAGS_PERMANENTLY_DISABLED.  Returns true if the control
+  succeeded.  The data argument is unused.
+ */
+static bool update_flags_not_disabled(struct ctdb_context *ctdb, void *data)
+{
+       return ctdb_ctrl_modflags(ctdb, TIMELIMIT(), options.pnn,
+                                 0, NODE_FLAGS_PERMANENTLY_DISABLED) == 0;
+}
+
+/* "enable" command: wait until the permanently-disabled flag is cleared */
+static int control_enable(struct ctdb_context *ctdb,  int argc, const char **argv)
+{
+       const bool want_set = false;
+       int rc;
+
+       rc = update_flags_and_ipreallocate(ctdb, NULL,
+                                          update_flags_not_disabled,
+                                          NODE_FLAGS_PERMANENTLY_DISABLED,
+                                          "disabled",
+                                          want_set);
+       return rc;
+}
+
+/*
+  Stop a node: send the stop control to node options.pnn.  Returns true
+  if the control succeeded.  The data argument is unused.
+ */
+static bool update_flags_stopped(struct ctdb_context *ctdb, void *data)
+{
+       return ctdb_ctrl_stop_node(ctdb, TIMELIMIT(), options.pnn) == 0;
+}
+
+/* "stop" command: wait until the node carries NODE_FLAGS_STOPPED */
+static int control_stop(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       const bool want_set = true;
+       int rc;
+
+       rc = update_flags_and_ipreallocate(ctdb, NULL,
+                                          update_flags_stopped,
+                                          NODE_FLAGS_STOPPED,
+                                          "stopped",
+                                          want_set);
+       return rc;
+}
+
+/*
+  Continue a stopped node: send the continue control to node
+  options.pnn.  Returns true if the control succeeded.  The data
+  argument is unused.
+ */
+static bool update_flags_not_stopped(struct ctdb_context *ctdb, void *data)
+{
+       return ctdb_ctrl_continue_node(ctdb, TIMELIMIT(), options.pnn) == 0;
+}
+
+/* "continue" command: wait until NODE_FLAGS_STOPPED is cleared */
+static int control_continue(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       const bool want_set = false;
+       int rc;
+
+       rc = update_flags_and_ipreallocate(ctdb, NULL,
+                                          update_flags_not_stopped,
+                                          NODE_FLAGS_STOPPED,
+                                          "stopped",
+                                          want_set);
+       return rc;
+}
+
+/*
+  Return the current generation number as seen by the recovery master.
+
+  Polls once per second until the recovery master reports
+  CTDB_RECOVERY_NORMAL and a vnnmap generation other than 1, then
+  returns that generation.  Exits with status 10 if any of the three
+  controls fails.
+ */
+static uint32_t get_generation(struct ctdb_context *ctdb)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       struct ctdb_vnn_map *vnnmap=NULL;
+       int ret;
+       uint32_t generation;
+
+       /* wait until the recmaster is not in recovery mode */
+       while (1) {
+               uint32_t recmode, recmaster;
+
+               /* discard the map read in the previous round */
+               if (vnnmap != NULL) {
+                       talloc_free(vnnmap);
+                       vnnmap = NULL;
+               }
+
+               /* get the recmaster */
+               ret = ctdb_ctrl_getrecmaster(ctdb, tmp_ctx, TIMELIMIT(), CTDB_CURRENT_NODE, &recmaster);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, ("Unable to get recmaster from node %u\n", options.pnn));
+                       talloc_free(tmp_ctx);
+                       exit(10);
+               }
+
+               /* get recovery mode */
+               ret = ctdb_ctrl_getrecmode(ctdb, tmp_ctx, TIMELIMIT(), recmaster, &recmode);
+               if (ret != 0) {
+                       /* the control went to the recmaster, so name that node */
+                       DEBUG(DEBUG_ERR, ("Unable to get recmode from node %u\n", recmaster));
+                       talloc_free(tmp_ctx);
+                       exit(10);
+               }
+
+               /* get the current generation number */
+               ret = ctdb_ctrl_getvnnmap(ctdb, TIMELIMIT(), recmaster, tmp_ctx, &vnnmap);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, ("Unable to get vnnmap from recmaster (%u)\n", recmaster));
+                       talloc_free(tmp_ctx);
+                       exit(10);
+               }
+
+               if ((recmode == CTDB_RECOVERY_NORMAL) && (vnnmap->generation != 1)) {
+                       /* copy the value out before the map is freed */
+                       generation = vnnmap->generation;
+                       talloc_free(tmp_ctx);
+                       return generation;
+               }
+               sleep(1);
+       }
+}
+
+/*
+  Ban or unban a node: data points to a struct ctdb_ban_time that is
+  sent to node options.pnn.  Returns true if the control succeeded.
+ */
+static bool update_state_banned(struct ctdb_context *ctdb, void *data)
+{
+       struct ctdb_ban_time *ban = (struct ctdb_ban_time *)data;
+
+       return ctdb_ctrl_set_ban(ctdb, TIMELIMIT(), options.pnn, ban) == 0;
+}
+
+/*
+  "ban" command: argv[0] is the ban time, which must not be zero.
+  Returns -1 for a zero ban time, otherwise whatever
+  update_flags_and_ipreallocate() returns.
+ */
+static int control_ban(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       struct ctdb_ban_time ban;
+
+       if (argc < 1) {
+               usage();
+       }
+
+       ban.time = strtoul(argv[0], NULL, 0);
+       ban.pnn  = options.pnn;
+
+       if (ban.time == 0) {
+               DEBUG(DEBUG_ERR, ("Invalid ban time specified - must be >0\n"));
+               return -1;
+       }
+
+       /* wait for NODE_FLAGS_BANNED to show up on the node */
+       return update_flags_and_ipreallocate(ctdb, &ban,
+                                            update_state_banned,
+                                            NODE_FLAGS_BANNED,
+                                            "banned",
+                                            true);
+}
+
+
+/*
+  Unban a node: sends a ban time of zero and waits until
+  NODE_FLAGS_BANNED is cleared.
+ */
+static int control_unban(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       struct ctdb_ban_time ban;
+
+       ban.time = 0;
+       ban.pnn  = options.pnn;
+
+       return update_flags_and_ipreallocate(ctdb, &ban,
+                                            update_state_banned,
+                                            NODE_FLAGS_BANNED,
+                                            "banned",
+                                            false);
+}
+
+/*
+  show ban information for a node
+
+  Fetches the nodemap from the local node (its contents are not
+  inspected here, only the success of the call matters) and then the
+  ban state of node options.pnn, which is printed.
+
+  Returns 0 on success, the nodemap control's result if that fails, or
+  -1 if the ban state cannot be read.
+ */
+static int control_showban(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int ret;
+       /* scratch context so the replies are released on every path */
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       struct ctdb_node_map *nodemap=NULL;
+       struct ctdb_ban_time *bantime;
+
+       /* verify the node exists */
+       ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, tmp_ctx, &nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get nodemap from local node\n"));
+               talloc_free(tmp_ctx);
+               return ret;
+       }
+
+       ret = ctdb_ctrl_get_ban(ctdb, TIMELIMIT(), options.pnn, tmp_ctx, &bantime);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Showing ban info for node %d failed.\n", options.pnn));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* a ban time of zero means the node is not banned */
+       if (bantime->time == 0) {
+               printf("Node %u is not banned\n", bantime->pnn);
+       } else {
+               printf("Node %u is banned, %d seconds remaining\n",
+                      bantime->pnn, bantime->time);
+       }
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+/*
+  shutdown a daemon
+
+  Sends the shutdown control to node options.pnn and returns its
+  result (0 on success).
+ */
+static int control_shutdown(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int rc = ctdb_ctrl_shutdown(ctdb, TIMELIMIT(), options.pnn);
+
+       if (rc != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to shutdown node %u\n", options.pnn));
+       }
+
+       return rc;
+}
+
+/*
+  trigger a recovery
+
+  Freezes node options.pnn at priority 1, switches it to
+  CTDB_RECOVERY_ACTIVE and then blocks until get_generation() reports
+  a generation different from the one seen before.  A single argument
+  "force" (case-insensitive) makes a freeze failure non-fatal.
+  Returns 0 on success or the result of the failing control.
+ */
+static int control_recover(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       uint32_t start_generation;
+       bool force = false;
+       int ret;
+
+       /* "force" option ignores freeze failure and forces recovery */
+       if (argc == 1 && strcasecmp(argv[0], "force") == 0) {
+               force = true;
+       }
+
+       /* remember where we started so the change can be detected */
+       start_generation = get_generation(ctdb);
+
+       ret = ctdb_ctrl_freeze_priority(ctdb, TIMELIMIT(), options.pnn, 1);
+       if (ret != 0 && !force) {
+               DEBUG(DEBUG_ERR, ("Unable to freeze node\n"));
+               return ret;
+       }
+       if (ret != 0) {
+               DEBUG(DEBUG_WARNING, ("Unable to freeze node but proceeding because \"force\" option given\n"));
+       }
+
+       ret = ctdb_ctrl_setrecmode(ctdb, TIMELIMIT(), options.pnn, CTDB_RECOVERY_ACTIVE);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to set recovery mode\n"));
+               return ret;
+       }
+
+       /* wait until we are in a new generation */
+       while (get_generation(ctdb) == start_generation) {
+               sleep(1);
+       }
+
+       return 0;
+}
+
+
+/*
+  display monitoring mode of a remote node
+
+  Prints the mode of node options.pnn, either as text or, with
+  options.machinereadable, as a ":mode:" header plus value line.
+  Returns 0 on success or the result of the failing control.
+ */
+static int control_getmonmode(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       uint32_t mode;
+       int rc;
+
+       rc = ctdb_ctrl_getmonmode(ctdb, TIMELIMIT(), options.pnn, &mode);
+       if (rc != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get monmode from node %u\n", options.pnn));
+               return rc;
+       }
+
+       if (options.machinereadable) {
+               printf(":mode:\n");
+               printf(":%d:\n",mode);
+               return 0;
+       }
+
+       if (mode == CTDB_MONITORING_ACTIVE) {
+               printf("Monitoring mode:%s (%d)\n", "ACTIVE", mode);
+       } else {
+               printf("Monitoring mode:%s (%d)\n", "DISABLED", mode);
+       }
+       return 0;
+}
+
+
+/*
+  display capabilities of a remote node
+
+  Reports whether node options.pnn has the RECMASTER, LMASTER, LVS and
+  NATGW capability bits, as YES/NO lines or as a machine readable 0/1
+  row.  Returns 0 on success, -1 if the control fails.
+ */
+static int control_getcapabilities(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       uint32_t caps;
+       int has_recmaster, has_lmaster, has_lvs, has_natgw;
+
+       if (ctdb_ctrl_getcapabilities(ctdb, TIMELIMIT(), options.pnn, &caps) != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get capabilities from node %u\n", options.pnn));
+               return -1;
+       }
+
+       /* reduce each capability bit to 0 or 1 once */
+       has_recmaster = (caps & CTDB_CAP_RECMASTER) != 0;
+       has_lmaster   = (caps & CTDB_CAP_LMASTER) != 0;
+       has_lvs       = (caps & CTDB_CAP_LVS) != 0;
+       has_natgw     = (caps & CTDB_CAP_NATGW) != 0;
+
+       if (options.machinereadable) {
+               printf(":RECMASTER:LMASTER:LVS:NATGW:\n");
+               printf(":%d:%d:%d:%d:\n",
+                      has_recmaster, has_lmaster, has_lvs, has_natgw);
+               return 0;
+       }
+
+       printf("RECMASTER: %s\n", has_recmaster ? "YES" : "NO");
+       printf("LMASTER: %s\n", has_lmaster ? "YES" : "NO");
+       printf("LVS: %s\n", has_lvs ? "YES" : "NO");
+       printf("NATGW: %s\n", has_natgw ? "YES" : "NO");
+       return 0;
+}
+
+/*
+  display lvs configuration
+
+  Prints "<index>:<address>" for every node in the nodemap of
+  options.pnn that is neither inactive nor permanently disabled and
+  has the CTDB_CAP_LVS capability.  If at least one such node is
+  healthy, unhealthy ones are left out; otherwise all of them are
+  listed.
+
+  Returns 0 on success and -1 if the nodemap, the memory for the
+  capability table or any node's capabilities cannot be obtained.
+ */
+static int control_lvs(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       uint32_t *capabilities;
+       struct ctdb_node_map *nodemap=NULL;
+       int i, ret;
+       int healthy_count = 0;
+
+       ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), options.pnn, tmp_ctx, &nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get nodemap from node %u\n", options.pnn));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* allocate on the scratch context so the table is released with it */
+       capabilities = talloc_array(tmp_ctx, uint32_t, nodemap->num);
+       if (capabilities == NULL) {
+               talloc_free(tmp_ctx);
+               CTDB_NO_MEMORY(ctdb, capabilities);
+       }
+
+       ret = 0;
+
+       /* collect capabilities for all connected nodes */
+       for (i=0; i<nodemap->num; i++) {
+               if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
+                       continue;
+               }
+               if (nodemap->nodes[i].flags & NODE_FLAGS_PERMANENTLY_DISABLED) {
+                       continue;
+               }
+
+               ret = ctdb_ctrl_getcapabilities(ctdb, TIMELIMIT(), i, &capabilities[i]);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, ("Unable to get capabilities from node %u\n", i));
+                       ret = -1;
+                       goto done;
+               }
+
+               if (!(capabilities[i] & CTDB_CAP_LVS)) {
+                       continue;
+               }
+
+               if (!(nodemap->nodes[i].flags & NODE_FLAGS_UNHEALTHY)) {
+                       healthy_count++;
+               }
+       }
+
+       /* Print all LVS nodes */
+       for (i=0; i<nodemap->num; i++) {
+               /* same filters as above, so only filled-in table entries
+                * are read
+                */
+               if (nodemap->nodes[i].flags & NODE_FLAGS_INACTIVE) {
+                       continue;
+               }
+               if (nodemap->nodes[i].flags & NODE_FLAGS_PERMANENTLY_DISABLED) {
+                       continue;
+               }
+               if (!(capabilities[i] & CTDB_CAP_LVS)) {
+                       continue;
+               }
+
+               /* hide unhealthy nodes only while a healthy one exists */
+               if (healthy_count != 0) {
+                       if (nodemap->nodes[i].flags & NODE_FLAGS_UNHEALTHY) {
+                               continue;
+                       }
+               }
+
+               printf("%d:%s\n", i,
+                       ctdb_addr_to_str(&nodemap->nodes[i].addr));
+       }
+
+done:
+       talloc_free(tmp_ctx);
+       return ret;
+}
+
+/*
+  display who is the lvs master
+
+  The master is the first node in the nodemap of options.pnn that is
+  neither inactive nor permanently disabled, has CTDB_CAP_LVS and, if
+  any such node is healthy, is healthy itself.  Returns 0 when a
+  master was printed and -1 otherwise.
+ */
+static int control_lvsmaster(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       struct ctdb_node_map *nodemap = NULL;
+       uint32_t *capabilities;
+       int healthy_count = 0;
+       int i, ret;
+
+       ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), options.pnn, tmp_ctx, &nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get nodemap from node %u\n", options.pnn));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       capabilities = talloc_array(tmp_ctx, uint32_t, nodemap->num);
+       if (capabilities == NULL) {
+               talloc_free(tmp_ctx);
+               CTDB_NO_MEMORY(ctdb, capabilities);
+       }
+
+       /* collect capabilities for all connected nodes */
+       for (i = 0; i < nodemap->num; i++) {
+               if (nodemap->nodes[i].flags &
+                   (NODE_FLAGS_INACTIVE | NODE_FLAGS_PERMANENTLY_DISABLED)) {
+                       continue;
+               }
+
+               ret = ctdb_ctrl_getcapabilities(ctdb, TIMELIMIT(), i, &capabilities[i]);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, ("Unable to get capabilities from node %u\n", i));
+                       ret = -1;
+                       goto done;
+               }
+
+               /* count LVS capable nodes that are healthy */
+               if ((capabilities[i] & CTDB_CAP_LVS) &&
+                   !(nodemap->nodes[i].flags & NODE_FLAGS_UNHEALTHY)) {
+                       healthy_count++;
+               }
+       }
+
+       /* assume failure until a master has been printed */
+       ret = -1;
+
+       /* find and show the lvsmaster */
+       for (i = 0; i < nodemap->num; i++) {
+               if (nodemap->nodes[i].flags &
+                   (NODE_FLAGS_INACTIVE | NODE_FLAGS_PERMANENTLY_DISABLED)) {
+                       continue;
+               }
+               if (!(capabilities[i] & CTDB_CAP_LVS)) {
+                       continue;
+               }
+               /* unhealthy nodes only qualify when nobody is healthy */
+               if (healthy_count != 0 &&
+                   (nodemap->nodes[i].flags & NODE_FLAGS_UNHEALTHY)) {
+                       continue;
+               }
+
+               if (!options.machinereadable) {
+                       printf("Node %d is LVS master\n", i);
+               } else {
+                       printf("%d\n", i);
+               }
+               ret = 0;
+               break;
+       }
+
+       if (ret != 0) {
+               printf("There is no LVS master\n");
+       }
+done:
+       talloc_free(tmp_ctx);
+       return ret;
+}
+
+/*
+  disable monitoring on a node
+
+  Sends the disable-monitoring control to node options.pnn and
+  confirms on stdout.  Returns 0 on success or the control's result.
+ */
+static int control_disable_monmode(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int rc = ctdb_ctrl_disable_monmode(ctdb, TIMELIMIT(), options.pnn);
+
+       if (rc == 0) {
+               printf("Monitoring mode:%s\n","DISABLED");
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR, ("Unable to disable monmode on node %u\n", options.pnn));
+       return rc;
+}
+
+/*
+  enable monitoring on a node
+
+  Sends the enable-monitoring control to node options.pnn and
+  confirms on stdout.  Returns 0 on success or the control's result.
+ */
+static int control_enable_monmode(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int rc = ctdb_ctrl_enable_monmode(ctdb, TIMELIMIT(), options.pnn);
+
+       if (rc == 0) {
+               printf("Monitoring mode:%s\n","ACTIVE");
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR, ("Unable to enable monmode on node %u\n", options.pnn));
+       return rc;
+}
+
+/*
+  display remote list of keys/data for a db
+
+  argv[0] names the database.  The database is attached, the vnnmap of
+  node options.pnn is fetched when lmaster output was requested, and the
+  records are dumped to stdout as selected by the options.print*
+  switches.
+
+  Returns 0 on success, -1 if the database is unknown, cannot be
+  attached or cannot be dumped, and the control's error code if the
+  vnnmap cannot be fetched.
+ */
+static int control_catdb(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       const char *db_name;
+       struct ctdb_db_context *ctdb_db;
+       int ret;
+       struct ctdb_dump_db_context c;
+       uint8_t flags;
+
+       /* NOTE(review): assumes usage() does not return - confirm */
+       if (argc < 1) {
+               usage();
+       }
+
+       db_name = argv[0];
+
+       if (!db_exists(ctdb, db_name, NULL, &flags)) {
+               return -1;
+       }
+
+       ctdb_db = ctdb_attach(ctdb, TIMELIMIT(), db_name, flags & CTDB_DB_FLAGS_PERSISTENT, 0);
+       if (ctdb_db == NULL) {
+               DEBUG(DEBUG_ERR,("Unable to attach to database '%s'\n", db_name));
+               return -1;
+       }
+
+       if (options.printlmaster) {
+               ret = ctdb_ctrl_getvnnmap(ctdb, TIMELIMIT(), options.pnn,
+                                         ctdb, &ctdb->vnn_map);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, ("Unable to get vnnmap from node %u\n",
+                                         options.pnn));
+                       /* release the attached db on the error path too */
+                       talloc_free(ctdb_db);
+                       return ret;
+               }
+       }
+
+       ZERO_STRUCT(c);
+       c.f = stdout;
+       c.printemptyrecords = (bool)options.printemptyrecords;
+       c.printdatasize = (bool)options.printdatasize;
+       c.printlmaster = (bool)options.printlmaster;
+       c.printhash = (bool)options.printhash;
+       c.printrecordflags = (bool)options.printrecordflags;
+
+       /* traverse and dump the cluster tdb */
+       ret = ctdb_dump_db(ctdb_db, &c);
+       if (ret == -1) {
+               DEBUG(DEBUG_ERR, ("Unable to dump database\n"));
+               DEBUG(DEBUG_ERR, ("Maybe try 'ctdb getdbstatus %s'"
+                                 " and 'ctdb getvar AllowUnhealthyDBRead'\n",
+                                 db_name));
+               /* release the attached db on the error path too */
+               talloc_free(ctdb_db);
+               return -1;
+       }
+       talloc_free(ctdb_db);
+
+       /* on success ctdb_dump_db's return value is reported as the record count */
+       printf("Dumped %d records\n", ret);
+       return 0;
+}
+
+/* State handed to cattdb_traverse() through its private_data pointer. */
+struct cattdb_data {
+       struct ctdb_context *ctdb;      /* context passed on to ctdb_dumpdb_record() */
+       uint32_t count;                 /* incremented once per record visited */
+};
+
+/*
+  tdb traverse callback used by control_cattdb: counts the record in the
+  struct cattdb_data passed as private_data and prints it to stdout.
+  The return value of ctdb_dumpdb_record() is passed straight back.
+ */
+static int cattdb_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private_data)
+{
+       struct cattdb_data *state = private_data;
+       struct ctdb_dump_db_context dump;
+
+       /* Start from an all-zero context, then apply the command line switches. */
+       ZERO_STRUCT(dump);
+       dump.f = stdout;
+       dump.printhash = (bool)options.printhash;
+       dump.printdatasize = (bool)options.printdatasize;
+       dump.printemptyrecords = (bool)options.printemptyrecords;
+
+       /* For cattdb, lmaster output is always off and record flags always on. */
+       dump.printrecordflags = true;
+       dump.printlmaster = false;
+
+       state->count += 1;
+
+       return ctdb_dumpdb_record(state->ctdb, key, data, &dump);
+}
+
+/*
+  cat the local tdb database using same format as catdb
+ */
+static int control_cattdb(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       struct cattdb_data state;
+       struct ctdb_db_context *db;
+       const char *name;
+       uint8_t db_flags;
+       int rc;
+
+       if (argc < 1) {
+               usage();
+       }
+
+       /* argv[0] is the name of the database to dump */
+       name = argv[0];
+       if (!db_exists(ctdb, name, NULL, &db_flags)) {
+               return -1;
+       }
+
+       db = ctdb_attach(ctdb, TIMELIMIT(), name, db_flags & CTDB_DB_FLAGS_PERSISTENT, 0);
+       if (db == NULL) {
+               DEBUG(DEBUG_ERR,("Unable to attach to database '%s'\n", name));
+               return -1;
+       }
+
+       /* walk the local tdb read-only; cattdb_traverse prints and counts each record */
+       state.ctdb  = ctdb;
+       state.count = 0;
+
+       rc = tdb_traverse_read(db->ltdb->tdb, cattdb_traverse, &state);
+       if (rc == -1) {
+               printf("Failed to cattdb data\n");
+               exit(10);
+       }
+       talloc_free(db);
+
+       printf("Dumped %d records\n", state.count);
+       return 0;
+}
+
+/*
+  display the content of a database key
+
+  argv[0] is the database name and argv[1] the key, used as a string
+  without its terminating NUL.  The record is fetched with
+  ctdb_fetch_lock() and its data printed to stdout.
+
+  Returns 0 on success and -1 if the database is unknown or cannot be
+  attached; exits with status 10 if the record cannot be fetched.
+ */
+static int control_readkey(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       const char *db_name;
+       struct ctdb_db_context *ctdb_db;
+       struct ctdb_record_handle *h;
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       TDB_DATA key, data;
+       uint8_t flags;
+
+       /* NOTE(review): assumes usage() does not return - confirm */
+       if (argc < 2) {
+               talloc_free(tmp_ctx);
+               usage();
+       }
+
+       db_name = argv[0];
+
+       if (!db_exists(ctdb, db_name, NULL, &flags)) {
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       ctdb_db = ctdb_attach(ctdb, TIMELIMIT(), db_name, flags & CTDB_DB_FLAGS_PERSISTENT, 0);
+       if (ctdb_db == NULL) {
+               DEBUG(DEBUG_ERR,("Unable to attach to database '%s'\n", db_name));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       key.dptr  = discard_const(argv[1]);
+       key.dsize = strlen((char *)key.dptr);
+
+       h = ctdb_fetch_lock(ctdb_db, tmp_ctx, key, &data);
+       if (h == NULL) {
+               printf("Failed to fetch record '%s' on node %d\n", 
+                       (const char *)key.dptr, ctdb_get_pnn(ctdb));
+               talloc_free(tmp_ctx);
+               exit(10);
+       }
+
+       /* the data is length-delimited, so bound the %s by its size */
+       printf("Data: size:%d ptr:[%.*s]\n", (int)data.dsize, (int)data.dsize, data.dptr);
+
+       talloc_free(ctdb_db);
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+/*
+  store a value under a database key
+
+  argv[0] is the database name, argv[1] the key and argv[2] the new
+  value; key and value are used as strings without their terminating
+  NUL.  The record is locked with ctdb_fetch_lock() and then
+  overwritten with ctdb_record_store().
+
+  Returns 0 on success and -1 if the database is unknown, cannot be
+  attached, or the store fails; exits with status 10 if the record
+  cannot be fetched.
+ */
+static int control_writekey(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       const char *db_name;
+       struct ctdb_db_context *ctdb_db;
+       struct ctdb_record_handle *h;
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       TDB_DATA key, data;
+       uint8_t flags;
+       int ret = 0;
+
+       /* NOTE(review): assumes usage() does not return - confirm */
+       if (argc < 3) {
+               talloc_free(tmp_ctx);
+               usage();
+       }
+
+       db_name = argv[0];
+
+       if (!db_exists(ctdb, db_name, NULL, &flags)) {
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       ctdb_db = ctdb_attach(ctdb, TIMELIMIT(), db_name, flags & CTDB_DB_FLAGS_PERSISTENT, 0);
+       if (ctdb_db == NULL) {
+               DEBUG(DEBUG_ERR,("Unable to attach to database '%s'\n", db_name));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       key.dptr  = discard_const(argv[1]);
+       key.dsize = strlen((char *)key.dptr);
+
+       h = ctdb_fetch_lock(ctdb_db, tmp_ctx, key, &data);
+       if (h == NULL) {
+               printf("Failed to fetch record '%s' on node %d\n", 
+                       (const char *)key.dptr, ctdb_get_pnn(ctdb));
+               talloc_free(tmp_ctx);
+               exit(10);
+       }
+
+       /* the fetched value is not needed; replace it with the new one */
+       data.dptr  = discard_const(argv[2]);
+       data.dsize = strlen((char *)data.dptr);
+
+       if (ctdb_record_store(h, data) != 0) {
+               printf("Failed to store record\n");
+               /* report the failure through the return value as well */
+               ret = -1;
+       }
+
+       talloc_free(h);
+       talloc_free(ctdb_db);
+       talloc_free(tmp_ctx);
+       return ret;
+}
+
+/*
+  fetch a record from a persistent database
+
+  argv[0] is the database name and argv[1] the key (used as a string
+  without its terminating NUL).  If argv[2] is given the record data is
+  written to that file (created/truncated with mode 0600), otherwise it
+  is written to stdout.  The record is read inside a transaction which
+  is dropped again without committing.
+
+  Returns 0 on success and -1 on any failure, including an empty
+  record or a failed write of the output.
+ */
+static int control_pfetch(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       const char *db_name;
+       struct ctdb_db_context *ctdb_db;
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       struct ctdb_transaction_handle *h;
+       TDB_DATA key, data;
+       size_t written;
+       int fd, ret;
+       bool persistent;
+       uint8_t flags;
+
+       /* NOTE(review): assumes usage() does not return - confirm */
+       if (argc < 2) {
+               talloc_free(tmp_ctx);
+               usage();
+       }
+
+       db_name = argv[0];
+
+       if (!db_exists(ctdb, db_name, NULL, &flags)) {
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       persistent = flags & CTDB_DB_FLAGS_PERSISTENT;
+       if (!persistent) {
+               DEBUG(DEBUG_ERR,("Database '%s' is not persistent\n", db_name));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       ctdb_db = ctdb_attach(ctdb, TIMELIMIT(), db_name, persistent, 0);
+       if (ctdb_db == NULL) {
+               DEBUG(DEBUG_ERR,("Unable to attach to database '%s'\n", db_name));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       h = ctdb_transaction_start(ctdb_db, tmp_ctx);
+       if (h == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to start transaction on database %s\n", db_name));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       key.dptr  = discard_const(argv[1]);
+       key.dsize = strlen(argv[1]);
+       ret = ctdb_transaction_fetch(h, tmp_ctx, key, &data);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to fetch record\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       if (data.dsize == 0 || data.dptr == NULL) {
+               DEBUG(DEBUG_ERR,("Record is empty\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* pick the destination: the named file if given, stdout otherwise */
+       if (argc == 3) {
+               fd = open(argv[2], O_WRONLY|O_CREAT|O_TRUNC, 0600);
+               if (fd == -1) {
+                       DEBUG(DEBUG_ERR,("Failed to open output file %s\n", argv[2]));
+                       talloc_free(tmp_ctx);
+                       return -1;
+               }
+       } else {
+               fd = STDOUT_FILENO;
+       }
+
+       /* write() may be interrupted or write less than asked; keep going
+        * until everything is out or a real error occurs */
+       ret = 0;
+       written = 0;
+       while (written < data.dsize) {
+               ssize_t n = write(fd, data.dptr + written, data.dsize - written);
+               if (n == -1) {
+                       if (errno == EINTR) {
+                               continue;
+                       }
+                       DEBUG(DEBUG_ERR,("Failed to write record data: %s\n", strerror(errno)));
+                       ret = -1;
+                       break;
+               }
+               if (n == 0) {
+                       DEBUG(DEBUG_ERR,("Failed to write record data: no progress\n"));
+                       ret = -1;
+                       break;
+               }
+               written += n;
+       }
+
+       if (argc == 3) {
+               close(fd);
+       }
+
+       /* abort the transaction */
+       talloc_free(h);
+
+       talloc_free(tmp_ctx);
+       return ret;
+}
+
+/*
+  fetch a record from a tdb-file
+
+  argv[0] is the path of the tdb file, opened read-only.  argv[1] is
+  the key: a leading "0x" means the rest is converted with hextodata(),
+  otherwise the string is used as is (without its terminating NUL).
+  If argv[2] is given the output goes to that file (created/truncated
+  with mode 0600), otherwise to stdout.
+
+  With options.verbose the whole record is written; otherwise the
+  leading sizeof(struct ctdb_ltdb_header) bytes are skipped.
+
+  Returns 0 on success and -1 on any failure.
+ */
+static int control_tfetch(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       const char *tdb_file;
+       TDB_CONTEXT *tdb = NULL;
+       TDB_DATA key, data;
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       const unsigned char *out;
+       size_t out_len, written;
+       int fd = -1;
+       int out_fd = STDOUT_FILENO;
+       int ret = -1;
+
+       /* nothing fetched yet; lets the cleanup code free() unconditionally */
+       data.dptr = NULL;
+       data.dsize = 0;
+
+       /* NOTE(review): assumes usage() does not return - confirm */
+       if (argc < 2) {
+               talloc_free(tmp_ctx);
+               usage();
+       }
+
+       tdb_file = argv[0];
+
+       tdb = tdb_open(tdb_file, 0, 0, O_RDONLY, 0);
+       if (tdb == NULL) {
+               printf("Failed to open TDB file %s\n", tdb_file);
+               goto done;
+       }
+
+       if (!strncmp(argv[1], "0x", 2)) {
+               key = hextodata(tmp_ctx, argv[1] + 2);
+               if (key.dsize == 0) {
+                       printf("Failed to convert \"%s\" into a TDB_DATA\n", argv[1]);
+                       goto done;
+               }
+       } else {
+               key.dptr  = discard_const(argv[1]);
+               key.dsize = strlen(argv[1]);
+       }
+
+       data = tdb_fetch(tdb, key);
+       if (data.dptr == NULL || data.dsize < sizeof(struct ctdb_ltdb_header)) {
+               printf("Failed to read record %s from tdb %s\n", argv[1], tdb_file);
+               goto done;
+       }
+
+       /* we hold our own copy of the record now; the file is no longer needed */
+       tdb_close(tdb);
+       tdb = NULL;
+
+       if (options.verbose) {
+               out     = data.dptr;
+               out_len = data.dsize;
+       } else {
+               out     = data.dptr + sizeof(struct ctdb_ltdb_header);
+               out_len = data.dsize - sizeof(struct ctdb_ltdb_header);
+       }
+
+       if (argc == 3) {
+               fd = open(argv[2], O_WRONLY|O_CREAT|O_TRUNC, 0600);
+               if (fd == -1) {
+                       printf("Failed to open output file %s\n", argv[2]);
+                       goto done;
+               }
+               out_fd = fd;
+       }
+
+       /* write() may be interrupted or write less than asked; keep going
+        * until everything is out or a real error occurs */
+       written = 0;
+       while (written < out_len) {
+               ssize_t n = write(out_fd, out + written, out_len - written);
+               if (n == -1) {
+                       if (errno == EINTR) {
+                               continue;
+                       }
+                       printf("Failed to write record data: %s\n", strerror(errno));
+                       goto done;
+               }
+               if (n == 0) {
+                       printf("Failed to write record data: no progress\n");
+                       goto done;
+               }
+               written += n;
+       }
+
+       ret = 0;
+done:
+       if (fd != -1) {
+               close(fd);
+       }
+       if (tdb != NULL) {
+               tdb_close(tdb);
+       }
+       /* tdb_fetch() hands back a malloc'ed buffer owned by the caller */
+       free(data.dptr);
+       talloc_free(tmp_ctx);
+       return ret;
+}
+
+/*
+  store a record and header to a tdb-file
+
+  argv[0] is the path of the tdb file, opened read-write.  argv[1] is
+  the key and argv[2] the data; for either, a leading "0x" means the
+  rest is converted with hextodata(), otherwise the string is used as
+  is (without its terminating NUL).  The data must be at least
+  sizeof(struct ctdb_ltdb_header) bytes long, i.e. it has to include
+  the record header.
+
+  Returns 0 on success and -1 on any failure.
+ */
+static int control_tstore(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       const char *tdb_file;
+       TDB_CONTEXT *tdb = NULL;
+       TDB_DATA key, data;
+       TALLOC_CTX *tmp_ctx = talloc_new(NULL);
+       int ret = -1;
+
+       /* NOTE(review): assumes usage() does not return - confirm */
+       if (argc < 3) {
+               talloc_free(tmp_ctx);
+               usage();
+       }
+
+       tdb_file = argv[0];
+
+       tdb = tdb_open(tdb_file, 0, 0, O_RDWR, 0);
+       if (tdb == NULL) {
+               printf("Failed to open TDB file %s\n", tdb_file);
+               goto done;
+       }
+
+       if (!strncmp(argv[1], "0x", 2)) {
+               key = hextodata(tmp_ctx, argv[1] + 2);
+               if (key.dsize == 0) {
+                       printf("Failed to convert \"%s\" into a TDB_DATA\n", argv[1]);
+                       goto done;
+               }
+       } else {
+               key.dptr  = discard_const(argv[1]);
+               key.dsize = strlen(argv[1]);
+       }
+
+       if (!strncmp(argv[2], "0x", 2)) {
+               data = hextodata(tmp_ctx, argv[2] + 2);
+               if (data.dsize == 0) {
+                       printf("Failed to convert \"%s\" into a TDB_DATA\n", argv[2]);
+                       goto done;
+               }
+       } else {
+               data.dptr  = discard_const(argv[2]);
+               data.dsize = strlen(argv[2]);
+       }
+
+       if (data.dsize < sizeof(struct ctdb_ltdb_header)) {
+               printf("Not enough data. You must specify the full ctdb_ltdb_header too when storing\n");
+               goto done;
+       }
+       if (tdb_store(tdb, key, data, TDB_REPLACE) != 0) {
+               printf("Failed to write record %s to tdb %s\n", argv[1], tdb_file);
+               goto done;
+       }
+
+       ret = 0;
+done:
+       /* single exit: close the file and drop the scratch context on every path */
+       if (tdb != NULL) {
+               tdb_close(tdb);
+       }
+       talloc_free(tmp_ctx);
+       return ret;
+}
+
+/*
+  write a record to a persistent database
+ */
+static int control_pstore(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       TALLOC_CTX *mem = talloc_new(ctdb);
+       struct ctdb_transaction_handle *txn;
+       struct ctdb_db_context *db;
+       const char *name;
+       struct stat sb;
+       TDB_DATA key, data;
+       int fd, ret;
+
+       if (argc < 3) {
+               talloc_free(mem);
+               usage();
+       }
+
+       /* argv[2] names a regular file holding the record data */
+       fd = open(argv[2], O_RDONLY);
+       if (fd == -1) {
+               DEBUG(DEBUG_ERR,("Failed to open file containing record data : %s  %s\n", argv[2], strerror(errno)));
+               goto fail;
+       }
+
+       if (fstat(fd, &sb) == -1) {
+               DEBUG(DEBUG_ERR,("fstat of file %s failed: %s\n", argv[2], strerror(errno)));
+               goto fail_close;
+       }
+
+       if (!S_ISREG(sb.st_mode)) {
+               DEBUG(DEBUG_ERR,("Not a regular file %s\n", argv[2]));
+               goto fail_close;
+       }
+
+       /* an empty file yields an empty record; otherwise read the file in one go */
+       data.dptr  = NULL;
+       data.dsize = sb.st_size;
+       if (data.dsize != 0) {
+               data.dptr = talloc_size(mem, data.dsize);
+               if (data.dptr == NULL) {
+                       DEBUG(DEBUG_ERR,("Failed to talloc %d of memory to store record data\n", (int)data.dsize));
+                       goto fail_close;
+               }
+               ret = read(fd, data.dptr, data.dsize);
+               if (ret != data.dsize) {
+                       DEBUG(DEBUG_ERR,("Failed to read %d bytes of record data\n", (int)data.dsize));
+                       goto fail_close;
+               }
+       }
+       close(fd);
+
+       /* argv[0] is the database, always attached as persistent */
+       name = argv[0];
+       db = ctdb_attach(ctdb, TIMELIMIT(), name, true, 0);
+       if (db == NULL) {
+               DEBUG(DEBUG_ERR,("Unable to attach to database '%s'\n", name));
+               goto fail;
+       }
+
+       txn = ctdb_transaction_start(db, mem);
+       if (txn == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to start transaction on database %s\n", name));
+               goto fail;
+       }
+
+       /* argv[1] is the key, used without its terminating NUL */
+       key.dsize = strlen(argv[1]);
+       key.dptr  = discard_const(argv[1]);
+       if (ctdb_transaction_store(txn, key, data) != 0) {
+               DEBUG(DEBUG_ERR,("Failed to store record\n"));
+               goto fail;
+       }
+
+       if (ctdb_transaction_commit(txn) != 0) {
+               DEBUG(DEBUG_ERR,("Failed to commit transaction\n"));
+               goto fail;
+       }
+
+       talloc_free(mem);
+       return 0;
+
+fail_close:
+       close(fd);
+fail:
+       talloc_free(mem);
+       return -1;
+}
+
+/*
+ * delete a record from a persistent database
+ */
+static int control_pdelete(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       TALLOC_CTX *mem = talloc_new(ctdb);
+       struct ctdb_transaction_handle *txn;
+       struct ctdb_db_context *db;
+       const char *name;
+       TDB_DATA key;
+       uint8_t db_flags;
+       bool persistent;
+       int status = -1;
+
+       if (argc < 2) {
+               talloc_free(mem);
+               usage();
+       }
+
+       /* argv[0] is the database; it has to exist and be persistent */
+       name = argv[0];
+       if (!db_exists(ctdb, name, NULL, &db_flags)) {
+               goto out;
+       }
+
+       persistent = db_flags & CTDB_DB_FLAGS_PERSISTENT;
+       if (!persistent) {
+               DEBUG(DEBUG_ERR, ("Database '%s' is not persistent\n", name));
+               goto out;
+       }
+
+       db = ctdb_attach(ctdb, TIMELIMIT(), name, persistent, 0);
+       if (db == NULL) {
+               DEBUG(DEBUG_ERR, ("Unable to attach to database '%s'\n", name));
+               goto out;
+       }
+
+       txn = ctdb_transaction_start(db, mem);
+       if (txn == NULL) {
+               DEBUG(DEBUG_ERR, ("Failed to start transaction on database %s\n", name));
+               goto out;
+       }
+
+       /* the delete is expressed as a store of tdb_null under argv[1] */
+       key.dsize = strlen(argv[1]);
+       key.dptr = discard_const(argv[1]);
+       if (ctdb_transaction_store(txn, key, tdb_null) != 0) {
+               DEBUG(DEBUG_ERR, ("Failed to delete record\n"));
+               goto out;
+       }
+
+       if (ctdb_transaction_commit(txn) != 0) {
+               DEBUG(DEBUG_ERR, ("Failed to commit transaction\n"));
+               goto out;
+       }
+
+       status = 0;
+out:
+       talloc_free(mem);
+       return status;
+}
+
+/*
+  check if a service is bound to a port or not
+
+  Takes exactly one argument, the TCP port number.  A TCP socket is
+  created and bound to that port with an otherwise zeroed IPv4 address;
+  the socket is closed again straight away.
+
+  Returns 0 if the bind succeeded, EINVAL for a bad argument list or
+  port number, and otherwise the errno value of the failing socket()
+  or bind() call.
+ */
+static int control_chktcpport(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int s, ret, v;
+       int saved_errno;
+       long port;
+       char *end = NULL;
+       struct sockaddr_in sin;
+
+       if (argc != 1) {
+               printf("Use: ctdb chktcpport <port>\n");
+               return EINVAL;
+       }
+
+       /* a non-numeric or out-of-range port would silently test the wrong port */
+       port = strtol(argv[0], &end, 10);
+       if (end == argv[0] || port < 1 || port > 65535) {
+               printf("Invalid port '%s'\n", argv[0]);
+               return EINVAL;
+       }
+
+       s = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP);
+       if (s == -1) {
+               /* printf() may change errno, so capture it first */
+               saved_errno = errno;
+               printf("Failed to open local socket\n");
+               return saved_errno;
+       }
+
+       /* best effort: only set O_NONBLOCK if the current flags could be read */
+       v = fcntl(s, F_GETFL, 0);
+       if (v != -1) {
+               fcntl(s, F_SETFL, v | O_NONBLOCK);
+       }
+
+       memset(&sin, 0, sizeof(sin));
+       sin.sin_family = AF_INET;
+       sin.sin_port   = htons((uint16_t)port);
+       ret = bind(s, (struct sockaddr *)&sin, sizeof(sin));
+       /* close() may overwrite errno, so keep bind()'s value */
+       saved_errno = errno;
+       close(s);
+       if (ret == -1) {
+               printf("Failed to bind to local socket: %d %s\n", saved_errno, strerror(saved_errno));
+               return saved_errno;
+       }
+
+       return 0;
+}
+
+
+
+/*
+  message handler registered by control_getlog: prints the received
+  log data to stdout and terminates the process with status 0
+ */
+static void log_handler(struct ctdb_context *ctdb, uint64_t srvid, 
+                            TDB_DATA data, void *private_data)
+{
+       DEBUG(DEBUG_ERR,("Log data received\n"));
+       if (data.dsize > 0) {
+               /* the payload is length-delimited and nothing here guarantees
+                * a terminating NUL, so never read beyond dsize bytes */
+               printf("%.*s", (int)data.dsize, (const char *)data.dptr);
+       }
+
+       exit(0);
+}
+
+/*
+  display a list of log messages from the in memory ringbuffer
+
+  Arguments, in any order: "recoverd" sends the request as a
+  CTDB_SRVID_GETLOG message instead of a CTDB_CONTROL_GET_LOG control;
+  any other argument sets the log level, by name if it starts with a
+  letter or '-', numerically otherwise.  The default level is
+  DEBUG_NOTICE.
+
+  The reply is handled by log_handler(), which prints it and exits the
+  process.  This function therefore only returns on error (-1) or when
+  options.timelimit seconds passed without a reply (0).
+ */
+static int control_getlog(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int ret, i;
+       bool main_daemon;
+       struct ctdb_get_log_addr log_addr;
+       TDB_DATA data;
+       struct timeval tv;
+
+       /* the struct is sent as raw bytes below; do not send stack garbage
+        * in any padding or unset member */
+       ZERO_STRUCT(log_addr);
+
+       /* Process options */
+       main_daemon = true;
+       log_addr.pnn = ctdb_get_pnn(ctdb);
+       log_addr.level = DEBUG_NOTICE;
+       for (i = 0; i < argc; i++) {
+               if (strcmp(argv[i], "recoverd") == 0) {
+                       main_daemon = false;
+               } else {
+                       /* isalpha() needs an unsigned char value */
+                       if (isalpha((unsigned char)argv[i][0]) || argv[i][0] == '-') { 
+                               log_addr.level = get_debug_by_desc(argv[i]);
+                       } else {
+                               log_addr.level = strtol(argv[i], NULL, 0);
+                       }
+               }
+       }
+
+       /* Our message port is our PID */
+       log_addr.srvid = getpid();
+
+       data.dptr = (unsigned char *)&log_addr;
+       data.dsize = sizeof(log_addr);
+
+       DEBUG(DEBUG_ERR, ("Pulling logs from node %u\n", options.pnn));
+
+       ctdb_client_set_message_handler(ctdb, log_addr.srvid, log_handler, NULL);
+       sleep(1);
+
+       DEBUG(DEBUG_ERR,("Listen for response on %d\n", (int)log_addr.srvid));
+
+       if (main_daemon) {
+               int32_t res;
+               char *errmsg = NULL;
+               TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+
+               ret = ctdb_control(ctdb, options.pnn, 0, CTDB_CONTROL_GET_LOG,
+                                  0, data, tmp_ctx, NULL, &res, NULL, &errmsg);
+               if (ret != 0 || res != 0) {
+                       /* errmsg is not necessarily set on failure */
+                       DEBUG(DEBUG_ERR,("Failed to get logs - %s\n",
+                                        errmsg ? errmsg : "unknown error"));
+                       talloc_free(tmp_ctx);
+                       return -1;
+               }
+               talloc_free(tmp_ctx);
+       } else {
+               ret = ctdb_client_send_message(ctdb, options.pnn,
+                                              CTDB_SRVID_GETLOG, data);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to send getlog request message to %u\n", options.pnn));
+                       return -1;
+               }
+       }
+
+       tv = timeval_current();
+       /* log_handler() exits the process when the reply arrives, so this
+        * loop only runs to completion if no reply shows up in time */
+       while (timeval_elapsed(&tv) < (double)options.timelimit) {
+               event_loop_once(ctdb->ev);
+       }
+
+       DEBUG(DEBUG_INFO,("Timed out waiting for log data.\n"));
+
+       return 0;
+}
+
+/*
+  clear the in memory log area
+
+  With "recoverd" as first argument a CTDB_SRVID_CLEARLOG message is
+  sent to node options.pnn; otherwise the CTDB_CONTROL_CLEAR_LOG
+  control is sent to that node.
+
+  Returns 0 on success and -1 if the control or message fails.
+ */
+static int control_clearlog(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int ret;
+
+       if (argc == 0 || strcmp(argv[0], "recoverd") != 0) {
+               /* "recoverd" not given - clear logs of the main daemon */
+               int32_t res;
+               char *errmsg = NULL;
+               TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+
+               ret = ctdb_control(ctdb, options.pnn, 0, CTDB_CONTROL_CLEAR_LOG,
+                                  0, tdb_null, tmp_ctx, NULL, &res, NULL, &errmsg);
+               if (ret != 0 || res != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to clear logs\n"));
+                       talloc_free(tmp_ctx);
+                       return -1;
+               }
+
+               talloc_free(tmp_ctx);
+       } else {
+               /* the message carries no payload; send a fully initialised
+                * empty TDB_DATA rather than one with an indeterminate dptr */
+               ret = ctdb_client_send_message(ctdb, options.pnn, CTDB_SRVID_CLEARLOG, tdb_null);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to send clearlog request message to %u\n", options.pnn));
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+/* Reload public IPs on the specified nodes */
+static int control_reloadips(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       const char *nodestring;
+       uint32_t *nodes;
+       uint32_t pnn_mode;
+
+       assert_single_node_only();
+
+       if (argc > 1) {
+               usage();
+       }
+
+       /* Work out which nodes have to reload their IPs; the optional
+        * argument is a node string, without it options.pnn decides.
+        */
+       nodestring = (argc == 1) ? argv[0] : NULL;
+       if (!parse_nodestring(ctdb, tmp_ctx, nodestring,
+                             options.pnn, true, &nodes, &pnn_mode)) {
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       for (;;) {
+               /* Takeover runs are first disabled on all connected
+                * nodes.  Every node has to acknowledge, so all of
+                * them need to be active.  The broadcast retries until
+                * maxruntime is exceeded, which is why there is no
+                * error handling here.
+                *
+                * Refusing to reload IPs while nodes are disconnected
+                * would be possible, but that decision is probably
+                * better left to the administrator.
+                */
+               srvid_broadcast(ctdb, CTDB_SRVID_DISABLE_TAKEOVER_RUNS, LONGTIMEOUT,
+                               "Disable takeover runs", true);
+
+               /* Ask the selected nodes to reload their public IPs
+                * and repeat the whole round until that succeeds.
+                * This treats every failure as transient, which might
+                * not be true...
+                */
+               if (ctdb_client_async_control(ctdb, CTDB_CONTROL_RELOAD_PUBLIC_IPS,
+                                             nodes, 0, LONGTIMELIMIT(),
+                                             false, tdb_null,
+                                             NULL, NULL, NULL) == 0) {
+                       break;
+               }
+
+               DEBUG(DEBUG_ERR,
+                     ("Unable to reload IPs on some nodes, try again.\n"));
+       }
+
+       /* Waiting for takeover runs to be re-enabled is not strictly
+        * required, but it does no harm either.
+        */
+       srvid_broadcast(ctdb, CTDB_SRVID_DISABLE_TAKEOVER_RUNS, 0,
+                       "Enable takeover runs", true);
+
+       ipreallocate(ctdb);
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+/*
+  display a list of the databases on a remote ctdb
+
+  The database map of node options.pnn is fetched and, for every
+  database in it, path, name and health are queried from the same node.
+  With options.machinereadable one ':'-separated line per database is
+  printed after a header line; otherwise a database count followed by
+  one human readable line per database.
+
+  Returns 0 on success, or the non-zero result of the first control
+  that failed.
+ */
+static int control_getdbmap(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int i, ret;
+       struct ctdb_dbid_map *dbmap=NULL;
+
+       ret = ctdb_ctrl_getdbmap(ctdb, TIMELIMIT(), options.pnn, ctdb, &dbmap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get dbids from node %u\n", options.pnn));
+               return ret;
+       }
+
+       if (options.machinereadable) {
+               printf(":ID:Name:Path:Persistent:Sticky:Unhealthy:ReadOnly:\n");
+       } else {
+               printf("Number of databases:%d\n", dbmap->num);
+       }
+
+       for (i=0; i<dbmap->num; i++) {
+               const char *path = NULL;
+               const char *name = NULL;
+               const char *health = NULL;
+               uint32_t dbid = dbmap->dbs[i].dbid;
+               bool persistent;
+               bool readonly;
+               bool sticky;
+
+               /* Check each control: on failure the output pointer cannot
+                * be relied on and must not be printed. */
+               ret = ctdb_ctrl_getdbpath(ctdb, TIMELIMIT(), options.pnn,
+                                         dbid, ctdb, &path);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, ("Unable to get path of database 0x%08x from node %u\n",
+                                         dbid, options.pnn));
+                       return ret;
+               }
+               ret = ctdb_ctrl_getdbname(ctdb, TIMELIMIT(), options.pnn,
+                                         dbid, ctdb, &name);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, ("Unable to get name of database 0x%08x from node %u\n",
+                                         dbid, options.pnn));
+                       return ret;
+               }
+               ret = ctdb_ctrl_getdbhealth(ctdb, TIMELIMIT(), options.pnn,
+                                           dbid, ctdb, &health);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, ("Unable to get health of database 0x%08x from node %u\n",
+                                         dbid, options.pnn));
+                       return ret;
+               }
+
+               persistent = dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT;
+               readonly   = dbmap->dbs[i].flags & CTDB_DB_FLAGS_READONLY;
+               sticky     = dbmap->dbs[i].flags & CTDB_DB_FLAGS_STICKY;
+
+               /* a non-NULL health string is what marks a database unhealthy */
+               if (options.machinereadable) {
+                       printf(":0x%08X:%s:%s:%d:%d:%d:%d:\n",
+                              dbid, name, path,
+                              !!(persistent), !!(sticky),
+                              !!(health), !!(readonly));
+               } else {
+                       printf("dbid:0x%08x name:%s path:%s%s%s%s%s\n",
+                              dbid, name, path,
+                              persistent?" PERSISTENT":"",
+                              sticky?" STICKY":"",
+                              readonly?" READONLY":"",
+                              health?" UNHEALTHY":"");
+               }
+       }
+
+       return 0;
+}
+
+/*
+  display the status of a database on a remote ctdb
+
+  argv[0] names the database.  Its id and flags come from db_exists();
+  path and health are queried from node options.pnn.  A NULL health
+  string is reported as "OK".
+
+  Returns 0 on success and -1 if the database is unknown or one of the
+  queries fails.
+ */
+static int control_getdbstatus(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       const char *db_name;
+       uint32_t db_id;
+       uint8_t flags;
+       const char *path = NULL;
+       const char *health = NULL;
+
+       /* NOTE(review): assumes usage() does not return - confirm */
+       if (argc < 1) {
+               usage();
+       }
+
+       db_name = argv[0];
+
+       if (!db_exists(ctdb, db_name, &db_id, &flags)) {
+               return -1;
+       }
+
+       /* Check each control: on failure the output pointer cannot be
+        * relied on and must not be printed. */
+       if (ctdb_ctrl_getdbpath(ctdb, TIMELIMIT(), options.pnn, db_id, ctdb, &path) != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get path of database '%s' from node %u\n",
+                                 db_name, options.pnn));
+               return -1;
+       }
+       if (ctdb_ctrl_getdbhealth(ctdb, TIMELIMIT(), options.pnn, db_id, ctdb, &health) != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get health of database '%s' from node %u\n",
+                                 db_name, options.pnn));
+               return -1;
+       }
+
+       printf("dbid: 0x%08x\nname: %s\npath: %s\nPERSISTENT: %s\nSTICKY: %s\nREADONLY: %s\nHEALTH: %s\n",
+              db_id, db_name, path,
+              (flags & CTDB_DB_FLAGS_PERSISTENT ? "yes" : "no"),
+              (flags & CTDB_DB_FLAGS_STICKY ? "yes" : "no"),
+              (flags & CTDB_DB_FLAGS_READONLY ? "yes" : "no"),
+              (health ? health : "OK"));
+
+       return 0;
+}
+
+/*
+  check if the local node is recmaster or not
+  returns 0 if this node is the recmaster and 1 if it is not
+  or if the recmaster could not be determined
+ */
+static int control_isnotrecmaster(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       uint32_t self, master;
+
+       assert_single_node_only();
+
+       self = getpnn(ctdb);
+
+       if (ctdb_ctrl_getrecmaster(ctdb, ctdb, TIMELIMIT(), options.pnn, &master) != 0) {
+               printf("Failed to get the recmaster\n");
+               return 1;
+       }
+
+       if (master == self) {
+               printf("this node is the recmaster\n");
+               return 0;
+       }
+
+       printf("this node is not the recmaster\n");
+       return 1;
+}
+
+/*
+  ping a node
+ */
+static int control_ping(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       /* take the start time first so the elapsed time covers the whole ping */
+       struct timeval start = timeval_current();
+       int clients = ctdb_ctrl_ping(ctdb, options.pnn);
+
+       if (clients == -1) {
+               printf("Unable to get ping response from node %u\n", options.pnn);
+               return -1;
+       }
+
+       /* any other result is reported as the node's client count */
+       printf("response from %u time=%.6f sec  (%d clients)\n", 
+              options.pnn, timeval_elapsed(&start), clients);
+       return 0;
+}
+
+
+/*
+  get a node's runstate
+ */
+static int control_runstate(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       enum ctdb_runstate runstate;
+       bool matched;
+       int i;
+
+       if (ctdb_ctrl_get_runstate(ctdb, TIMELIMIT(), options.pnn, &runstate) == -1) {
+               printf("Unable to get runstate response from node %u\n",
+                      options.pnn);
+               return -1;
+       }
+
+       /* Without arguments every run state is acceptable; with arguments
+        * the node has to be in one of the listed states.  Arguments are
+        * only examined up to and including the first match.
+        */
+       matched = (argc <= 0);
+       for (i = 0; i < argc && !matched; i++) {
+               enum ctdb_runstate wanted = runstate_from_string(argv[i]);
+
+               if (wanted == CTDB_RUNSTATE_UNKNOWN) {
+                       printf("Invalid run state (%s)\n", argv[i]);
+                       return -1;
+               }
+               matched = (wanted == runstate);
+       }
+
+       if (!matched) {
+               printf("CTDB not in required run state (got %s)\n", 
+                      runstate_to_string(runstate));
+               return -1;
+       }
+
+       printf("%s\n", runstate_to_string(runstate));
+       return 0;
+}
+
+
+/*
+  fetch the tunable named by argv[0] from the node selected by
+  options.pnn and print it as "name = value".
+  calls usage() when no name is given.
+  returns 0 on success, -1 if the tunable could not be read
+ */
+static int control_getvar(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       uint32_t current;
+       const char *tunable;
+
+       if (argc < 1) {
+               usage();
+       }
+       tunable = argv[0];
+
+       if (ctdb_ctrl_get_tunable(ctdb, TIMELIMIT(), options.pnn, tunable, &current) != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get tunable variable '%s'\n", tunable));
+               return -1;
+       }
+
+       printf("%-23s = %u\n", tunable, current);
+       return 0;
+}
+
+/*
+  set the tunable named by argv[0] to the number given in argv[1] on the
+  node selected by options.pnn.  the number is parsed with strtoul()
+  using base 0, so decimal, octal and hex notation are accepted.
+  calls usage() when fewer than two arguments are given.
+  returns 0 on success, -1 if the set call returns -1
+ */
+static int control_setvar(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       const char *tunable;
+       uint32_t new_value;
+
+       if (argc < 2) {
+               usage();
+       }
+
+       tunable = argv[0];
+       new_value = strtoul(argv[1], NULL, 0);
+
+       if (ctdb_ctrl_set_tunable(ctdb, TIMELIMIT(), options.pnn, tunable, new_value) == -1) {
+               DEBUG(DEBUG_ERR, ("Unable to set tunable variable '%s'\n", tunable));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  fetch the list of tunable names from the node selected by options.pnn
+  and print each one through control_getvar().  failures to read an
+  individual tunable are not propagated.
+  returns 0 on success, -1 if the list could not be fetched
+ */
+static int control_listvars(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       const char **names;
+       uint32_t num_names;
+       uint32_t idx;
+
+       if (ctdb_ctrl_list_tunables(ctdb, TIMELIMIT(), options.pnn, ctdb, &names, &num_names) == -1) {
+               DEBUG(DEBUG_ERR, ("Unable to list tunable variables\n"));
+               return -1;
+       }
+
+       /* each name is handed over as a one element argument vector */
+       for (idx = 0; idx < num_names; idx++) {
+               control_getvar(ctdb, 1, &names[idx]);
+       }
+
+       talloc_free(names);
+
+       return 0;
+}
+
+/*
+  print the debug level of the node selected by options.pnn, either as
+  a sentence or, with options.machinereadable, as a colon separated
+  header line followed by a ":name:level:" line.
+  returns 0 on success, otherwise the non-zero result of the query
+ */
+static int control_getdebug(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int32_t level;
+       int ret = ctdb_ctrl_get_debuglevel(ctdb, options.pnn, &level);
+
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get debuglevel response from node %u\n", options.pnn));
+               return ret;
+       }
+
+       if (!options.machinereadable) {
+               printf("Node %u is at debug level %s (%d)\n", options.pnn, get_debug_by_level(level), level);
+               return 0;
+       }
+
+       printf(":Name:Level:\n");
+       printf(":%s:%d:\n",get_debug_by_level(level),level);
+       return 0;
+}
+
+/*
+  print the reclock file reported by the node selected by options.pnn.
+  with options.machinereadable only the bare path is printed (nothing
+  at all when no reclock file is set); otherwise a labelled line is
+  printed.
+  returns 0 on success, otherwise the non-zero result of the query
+ */
+static int control_getreclock(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       const char *path;
+       int ret = ctdb_ctrl_getreclock(ctdb, TIMELIMIT(), options.pnn, ctdb, &path);
+
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get reclock file from node %u\n", options.pnn));
+               return ret;
+       }
+
+       if (options.machinereadable) {
+               if (path != NULL) {
+                       printf("%s", path);
+               }
+               return 0;
+       }
+
+       if (path != NULL) {
+               printf("Reclock file:%s\n", path);
+       } else {
+               printf("No reclock file used.\n");
+       }
+       return 0;
+}
+
+/*
+  set the reclock file of the node selected by options.pnn.
+  with no argument a NULL reclock is sent, with one argument argv[0] is
+  used as the new reclock file; any other argument count calls usage().
+  returns 0 on success, otherwise the non-zero result of the control
+ */
+static int control_setreclock(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int ret;
+       const char *reclock;
+
+       if (argc == 0) {
+               reclock = NULL;
+       } else if (argc == 1) {
+               reclock = argv[0];
+       } else {
+               usage();
+       }
+
+       ret = ctdb_ctrl_setreclock(ctdb, TIMELIMIT(), options.pnn, reclock);
+       if (ret != 0) {
+               /* report the operation that actually failed: this is the setter */
+               DEBUG(DEBUG_ERR, ("Unable to set reclock file on node %u\n", options.pnn));
+               return ret;
+       }
+       return 0;
+}
+
+/*
+  switch the natgw state of the node selected by options.pnn.
+  argv[0] must be "on" or "off"; anything else, or a missing argument,
+  calls usage().
+  returns 0 on success, otherwise the non-zero result of the control
+ */
+static int control_setnatgwstate(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       uint32_t state;
+       int rc;
+
+       if (argc == 0) {
+               usage();
+       }
+
+       if (strcmp(argv[0], "on") == 0) {
+               state = 1;
+       } else if (strcmp(argv[0], "off") == 0) {
+               state = 0;
+       } else {
+               usage();
+       }
+
+       rc = ctdb_ctrl_setnatgwstate(ctdb, TIMELIMIT(), options.pnn, state);
+       if (rc == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR, ("Unable to set the natgw state for node %u\n", options.pnn));
+       return rc;
+}
+
+/*
+  switch the lmaster role of the node selected by options.pnn.
+  argv[0] must be "on" or "off"; anything else, or a missing argument,
+  calls usage().
+  returns 0 on success, otherwise the non-zero result of the control
+ */
+static int control_setlmasterrole(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       uint32_t role;
+       int rc;
+
+       if (argc == 0) {
+               usage();
+       }
+
+       if (strcmp(argv[0], "on") == 0) {
+               role = 1;
+       } else if (strcmp(argv[0], "off") == 0) {
+               role = 0;
+       } else {
+               usage();
+       }
+
+       rc = ctdb_ctrl_setlmasterrole(ctdb, TIMELIMIT(), options.pnn, role);
+       if (rc == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR, ("Unable to set the lmaster role for node %u\n", options.pnn));
+       return rc;
+}
+
+/*
+  switch the recmaster role of the node selected by options.pnn.
+  argv[0] must be "on" or "off"; anything else, or a missing argument,
+  calls usage().
+  returns 0 on success, otherwise the non-zero result of the control
+ */
+static int control_setrecmasterrole(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       uint32_t role;
+       int rc;
+
+       if (argc == 0) {
+               usage();
+       }
+
+       if (strcmp(argv[0], "on") == 0) {
+               role = 1;
+       } else if (strcmp(argv[0], "off") == 0) {
+               role = 0;
+       } else {
+               usage();
+       }
+
+       rc = ctdb_ctrl_setrecmasterrole(ctdb, TIMELIMIT(), options.pnn, role);
+       if (rc == 0) {
+               return 0;
+       }
+
+       DEBUG(DEBUG_ERR, ("Unable to set the recmaster role for node %u\n", options.pnn));
+       return rc;
+}
+
+/*
+  set the debug level of the node selected by options.pnn.
+  argv[0] is either a level description (looked up through
+  get_debug_by_desc()) or a number; it must match an entry of the
+  debug_levels table.  without arguments the valid levels are listed.
+  returns 0 on success or when only listing the levels, -1 for an
+  invalid level, otherwise the non-zero result of the control
+ */
+static int control_setdebug(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int i, ret;
+       int32_t level;
+
+       if (argc == 0) {
+               printf("You must specify the debug level. Valid levels are:\n");
+               for (i=0; debug_levels[i].description != NULL; i++) {
+                       printf("%s (%d)\n", debug_levels[i].description, debug_levels[i].level);
+               }
+
+               return 0;
+       }
+
+       /* isalpha() is only defined for unsigned char values and EOF */
+       if (isalpha((unsigned char)argv[0][0]) || argv[0][0] == '-') {
+               level = get_debug_by_desc(argv[0]);
+       } else {
+               level = strtol(argv[0], NULL, 0);
+       }
+
+       /* only levels present in the table are accepted */
+       for (i=0; debug_levels[i].description != NULL; i++) {
+               if (level == debug_levels[i].level) {
+                       break;
+               }
+       }
+       if (debug_levels[i].description == NULL) {
+               printf("Invalid debug level, must be one of\n");
+               for (i=0; debug_levels[i].description != NULL; i++) {
+                       printf("%s (%d)\n", debug_levels[i].description, debug_levels[i].level);
+               }
+               return -1;
+       }
+
+       ret = ctdb_ctrl_set_debuglevel(ctdb, options.pnn, level);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to set debug level on node %u\n", options.pnn));
+               /* propagate the failure instead of reporting success */
+               return ret;
+       }
+       return 0;
+}
+
+
+/*
+  thaw the node selected by options.pnn.
+  with exactly one argument, argv[0] is parsed as the database priority
+  to thaw; otherwise priority 0 is sent.
+  returns 0 on success, otherwise the non-zero result of the control
+ */
+static int control_thaw(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int ret;
+       uint32_t priority;
+
+       if (argc == 1) {
+               priority = strtol(argv[0], NULL, 0);
+       } else {
+               priority = 0;
+       }
+       DEBUG(DEBUG_ERR,("Thaw by priority %u\n", priority));
+
+       ret = ctdb_ctrl_thaw_priority(ctdb, TIMELIMIT(), options.pnn, priority);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to thaw node %u\n", options.pnn));
+               /* propagate the failure instead of reporting success */
+               return ret;
+       }
+       return 0;
+}
+
+
+/*
+  attach to the database named by argv[0].  an optional second
+  argument must be the word "persistent" and requests a persistent
+  database.  any other argument combination calls usage().
+  returns 0 on success, -1 if the attach fails
+ */
+static int control_attach(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       const char *name;
+       bool want_persistent;
+
+       if (argc < 1) {
+               usage();
+       }
+       name = argv[0];
+
+       if (argc > 2) {
+               usage();
+       }
+
+       /* a second argument is only valid when it spells "persistent" */
+       want_persistent = (argc == 2);
+       if (want_persistent && strcmp(argv[1], "persistent") != 0) {
+               usage();
+       }
+
+       if (ctdb_attach(ctdb, TIMELIMIT(), name, want_persistent, 0) == NULL) {
+               DEBUG(DEBUG_ERR,("Unable to attach to database '%s'\n", name));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  set the priority of a database on the node selected by options.pnn.
+  argv[0] is the numeric database id and argv[1] the priority, both
+  parsed with strtoul() using base 0.  calls usage() when fewer than
+  two arguments are given.
+  returns 0 on success, -1 if the control fails
+ */
+static int control_setdbprio(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       struct ctdb_db_priority request;
+
+       if (argc < 2) {
+               usage();
+       }
+
+       request.db_id    = strtoul(argv[0], NULL, 0);
+       request.priority = strtoul(argv[1], NULL, 0);
+
+       if (ctdb_ctrl_set_db_priority(ctdb, TIMELIMIT(), options.pnn, &request) != 0) {
+               DEBUG(DEBUG_ERR,("Unable to set db prio\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  report the priority of the database identified by argv[0] on the
+  node selected by options.pnn.  the database is resolved through
+  db_exists(); the result is emitted through DEBUG() at DEBUG_ERR
+  level.  calls usage() when no argument is given.
+  returns 0 on success, -1 if the database is not found or the query
+  fails
+ */
+static int control_getdbprio(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       uint32_t id;
+       uint32_t prio;
+
+       if (argc < 1) {
+               usage();
+       }
+
+       if (!db_exists(ctdb, argv[0], &id, NULL)) {
+               return -1;
+       }
+
+       if (ctdb_ctrl_get_db_priority(ctdb, TIMELIMIT(), options.pnn, id, &prio) != 0) {
+               DEBUG(DEBUG_ERR,("Unable to get db prio\n"));
+               return -1;
+       }
+
+       DEBUG(DEBUG_ERR,("Priority:%u\n", prio));
+
+       return 0;
+}
+
+/*
+  set the sticky records capability for the database identified by
+  argv[0] on the node selected by options.pnn.  the database is
+  resolved through db_exists().  calls usage() when no argument is
+  given.
+  returns 0 on success, -1 if the database is not found or the control
+  fails
+ */
+static int control_setdbsticky(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       uint32_t db_id;
+       int ret;
+
+       if (argc < 1) {
+               usage();
+       }
+
+       /* no temporary talloc context is needed: nothing is allocated here */
+       if (!db_exists(ctdb, argv[0], &db_id, NULL)) {
+               return -1;
+       }
+
+       ret = ctdb_ctrl_set_db_sticky(ctdb, options.pnn, db_id);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Unable to set db to support sticky records\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  set the readonly capability for the database identified by argv[0]
+  on the node selected by options.pnn.  the database is resolved
+  through db_exists().  calls usage() when no argument is given.
+  returns 0 on success, -1 if the database is not found or the control
+  fails
+ */
+static int control_setdbreadonly(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       uint32_t db_id;
+       int ret;
+
+       if (argc < 1) {
+               usage();
+       }
+
+       /* no temporary talloc context is needed: nothing is allocated here */
+       if (!db_exists(ctdb, argv[0], &db_id, NULL)) {
+               return -1;
+       }
+
+       ret = ctdb_ctrl_set_db_readonly(ctdb, options.pnn, db_id);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Unable to set db to support readonly\n"));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  print the sequence number of the database identified by argv[0] as
+  reported by the node selected by options.pnn.  the database is
+  resolved through db_exists().  calls usage() when no argument is
+  given.
+  returns 0 on success, -1 if the database is not found or the query
+  fails
+ */
+static int control_getdbseqnum(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       uint32_t db_id;
+       uint64_t seqnum;
+       int ret;
+
+       if (argc < 1) {
+               usage();
+       }
+
+       if (!db_exists(ctdb, argv[0], &db_id, NULL)) {
+               return -1;
+       }
+
+       ret = ctdb_ctrl_getdbseqnum(ctdb, TIMELIMIT(), options.pnn, db_id, &seqnum);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get seqnum from node.\n"));
+               return -1;
+       }
+
+       /* seqnum is unsigned: print it with an unsigned conversion */
+       printf("Sequence number:%llu\n", (unsigned long long)seqnum);
+
+       return 0;
+}
+
+/*
+  run the eventscripts on the node selected by options.pnn.
+  argv[0] is sent, including its terminating NUL, as the payload of a
+  CTDB_CONTROL_RUN_EVENTSCRIPTS control.
+  returns 0 on success, -1 on invalid arguments or if either the
+  control call or its result status is non-zero
+ */
+static int control_eventscript(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       TDB_DATA data;
+       int ret;
+       int32_t res;
+       /* stays NULL if the control does not provide an error message */
+       char *errmsg = NULL;
+       TALLOC_CTX *tmp_ctx;
+
+       if (argc != 1) {
+               DEBUG(DEBUG_ERR,("Invalid arguments\n"));
+               return -1;
+       }
+
+       /* allocate only after the argument check so no path leaks it */
+       tmp_ctx = talloc_new(ctdb);
+
+       data.dptr = (unsigned char *)discard_const(argv[0]);
+       data.dsize = strlen((char *)data.dptr) + 1;
+
+       DEBUG(DEBUG_ERR, ("Running eventscripts with arguments \"%s\" on node %u\n", data.dptr, options.pnn));
+
+       ret = ctdb_control(ctdb, options.pnn, 0, CTDB_CONTROL_RUN_EVENTSCRIPTS,
+                          0, data, tmp_ctx, NULL, &res, NULL, &errmsg);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,("Failed to run eventscripts - %s\n",
+                                errmsg != NULL ? errmsg : "unknown error"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+/* version stored in, and required of, the header of a database backup file */
+#define DB_VERSION 1
+/* size in bytes of the name field of struct db_file_header */
+#define MAX_DB_NAME 64
+/*
+  header at the start of a database backup file.  control_backupdb()
+  writes this struct followed by 'size' bytes of marshalled records;
+  control_restoredb() and control_dumpdbbackup() read it back.
+  NOTE(review): the struct is written and read as raw memory, so the
+  file layout presumably depends on the ABI of the host that wrote it -
+  confirm before exchanging backups between architectures.
+ */
+struct db_file_header {
+       /* DB_VERSION of the writer; readers reject any other value */
+       unsigned long version;
+       /* time(NULL) at the moment the backup was written */
+       time_t timestamp;
+       /* the database's CTDB_DB_FLAGS_PERSISTENT bit; non-zero means persistent */
+       unsigned long persistent;
+       /* number of bytes of record data that follow the header */
+       unsigned long size;
+       /* database name; the writer rejects names of MAX_DB_NAME or more characters */
+       const char name[MAX_DB_NAME];
+};
+
+/*
+  state shared between control_backupdb() and its traverse callback
+  backup_traverse() while the records of a database are collected
+ */
+struct backup_data {
+       /* marshall buffer that grows by one record per traversed key */
+       struct ctdb_marshall_buffer *records;
+       /* number of bytes of 'records' currently in use */
+       uint32_t len;
+       /* number of records appended so far */
+       uint32_t total;
+       /* set by the callback when marshalling or growing the buffer fails */
+       bool traverse_error;
+};
+
+/*
+  traverse callback used by control_backupdb(): marshall one key/data
+  pair and append it to the buffer kept in the struct backup_data that
+  is passed as private data.
+  returns 0 to continue the traverse; on failure sets traverse_error
+  and returns -1
+ */
+static int backup_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private)
+{
+       struct backup_data *state = talloc_get_type(private, struct backup_data);
+       struct ctdb_rec_data *packed;
+       uint8_t *dest;
+
+       /* marshall the record as a talloc child of the buffer */
+       packed = ctdb_marshall_record(state->records, 0, key, NULL, data);
+       if (packed == NULL) {
+               state->traverse_error = true;
+               DEBUG(DEBUG_ERR,("Failed to marshall record\n"));
+               return -1;
+       }
+
+       /* grow the buffer so that the new record fits behind the used part */
+       state->records = talloc_realloc_size(NULL, state->records,
+                                            packed->length + state->len);
+       if (state->records == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to expand marshalling buffer\n"));
+               state->traverse_error = true;
+               return -1;
+       }
+
+       state->records->count++;
+
+       dest = state->len + (uint8_t *)state->records;
+       memcpy(dest, packed, packed->length);
+       state->len += packed->length;
+
+       /* the marshalled copy is no longer needed once appended */
+       talloc_free(packed);
+
+       state->total++;
+       return 0;
+}
+
+/*
+ * write all of buf to fd, continuing after short writes and retrying
+ * when interrupted by a signal.
+ * returns 0 on success, -1 on failure with errno describing the error
+ */
+static int backup_write_all(int fd, const void *buf, size_t len)
+{
+       const uint8_t *pos = (const uint8_t *)buf;
+
+       while (len > 0) {
+               ssize_t n = write(fd, pos, len);
+
+               if (n == -1) {
+                       if (errno == EINTR) {
+                               continue;
+                       }
+                       return -1;
+               }
+               if (n == 0) {
+                       /* no progress: fail instead of spinning forever */
+                       errno = EIO;
+                       return -1;
+               }
+               pos += n;
+               len -= (size_t)n;
+       }
+
+       return 0;
+}
+
+/*
+ * backup a database to a file 
+ *
+ * argv[0] is the database name and argv[1] the output file.  The
+ * records are collected with a read traverse inside a local tdb
+ * transaction and written to the file as a struct db_file_header
+ * followed by the marshalled records.  An unhealthy database is only
+ * backed up when the tunable AllowUnhealthyDBRead is 1.
+ * returns 0 on success, -1 on failure
+ */
+static int control_backupdb(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int ret;
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       struct db_file_header dbhdr;
+       struct ctdb_db_context *ctdb_db;
+       struct backup_data *bd;
+       int fh = -1;
+       int status = -1;
+       const char *reason = NULL;
+       uint32_t db_id;
+       uint8_t flags;
+
+       assert_single_node_only();
+
+       if (argc != 2) {
+               DEBUG(DEBUG_ERR,("Invalid arguments\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       if (!db_exists(ctdb, argv[0], &db_id, &flags)) {
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* the name must fit, NUL included, into the file header */
+       if (strlen(argv[0]) >= MAX_DB_NAME) {
+               DEBUG(DEBUG_ERR,("Too long dbname\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       ret = ctdb_ctrl_getdbhealth(ctdb, TIMELIMIT(), options.pnn,
+                                   db_id, tmp_ctx, &reason);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Unable to get dbhealth for database '%s'\n",
+                                argv[0]));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+       if (reason) {
+               uint32_t allow_unhealthy = 0;
+
+               /* best effort: on failure the default of 0 disallows the backup */
+               ctdb_ctrl_get_tunable(ctdb, TIMELIMIT(), options.pnn,
+                                     "AllowUnhealthyDBRead",
+                                     &allow_unhealthy);
+
+               if (allow_unhealthy != 1) {
+                       DEBUG(DEBUG_ERR,("database '%s' is unhealthy: %s\n",
+                                        argv[0], reason));
+
+                       DEBUG(DEBUG_ERR,("disallow backup : tunable AllowUnhealthyDBRead = %u\n",
+                                        allow_unhealthy));
+                       talloc_free(tmp_ctx);
+                       return -1;
+               }
+
+               DEBUG(DEBUG_WARNING,("WARNING database '%s' is unhealthy - see 'ctdb getdbstatus %s'\n",
+                                    argv[0], argv[0]));
+               DEBUG(DEBUG_WARNING,("WARNING! allow backup of unhealthy database: "
+                                    "tunnable AllowUnhealthyDBRead = %u\n",
+                                    allow_unhealthy));
+       }
+
+       ctdb_db = ctdb_attach(ctdb, TIMELIMIT(), argv[0], flags & CTDB_DB_FLAGS_PERSISTENT, 0);
+       if (ctdb_db == NULL) {
+               DEBUG(DEBUG_ERR,("Unable to attach to database '%s'\n", argv[0]));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+
+       ret = tdb_transaction_start(ctdb_db->ltdb->tdb);
+       if (ret == -1) {
+               DEBUG(DEBUG_ERR,("Failed to start transaction\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* from here on every error path has to cancel the transaction */
+
+       bd = talloc_zero(tmp_ctx, struct backup_data);
+       if (bd == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate backup_data\n"));
+               tdb_transaction_cancel(ctdb_db->ltdb->tdb);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       bd->records = talloc_zero(bd, struct ctdb_marshall_buffer);
+       if (bd->records == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate ctdb_marshall_buffer\n"));
+               tdb_transaction_cancel(ctdb_db->ltdb->tdb);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       bd->len = offsetof(struct ctdb_marshall_buffer, data);
+       bd->records->db_id = ctdb_db->db_id;
+       /* traverse the database collecting all records */
+       if (tdb_traverse_read(ctdb_db->ltdb->tdb, backup_traverse, bd) == -1 ||
+           bd->traverse_error) {
+               DEBUG(DEBUG_ERR,("Traverse error\n"));
+               tdb_transaction_cancel(ctdb_db->ltdb->tdb);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* nothing was modified, the transaction only pinned a consistent view */
+       tdb_transaction_cancel(ctdb_db->ltdb->tdb);
+
+
+       fh = open(argv[1], O_RDWR|O_CREAT, 0600);
+       if (fh == -1) {
+               DEBUG(DEBUG_ERR,("Failed to open file '%s'\n", argv[1]));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* clear padding as well so no uninitialised bytes reach the file */
+       memset(&dbhdr, 0, sizeof(dbhdr));
+       dbhdr.version = DB_VERSION;
+       dbhdr.timestamp = time(NULL);
+       dbhdr.persistent = flags & CTDB_DB_FLAGS_PERSISTENT;
+       dbhdr.size = bd->len;
+       strncpy(discard_const(dbhdr.name), argv[0], MAX_DB_NAME);
+
+       ret = backup_write_all(fh, &dbhdr, sizeof(dbhdr));
+       if (ret == -1) {
+               DEBUG(DEBUG_ERR,("write failed: %s\n", strerror(errno)));
+               goto done;
+       }
+       ret = backup_write_all(fh, bd->records, bd->len);
+       if (ret == -1) {
+               DEBUG(DEBUG_ERR,("write failed: %s\n", strerror(errno)));
+               goto done;
+       }
+
+       status = 0;
+done:
+       if (fh != -1) {
+               ret = close(fh);
+               if (ret == -1) {
+                       DEBUG(DEBUG_ERR,("close failed: %s\n", strerror(errno)));
+                       /* a failed close can mean the data never reached the file */
+                       status = -1;
+               }
+       }
+
+       /* only claim success when the whole backup was written */
+       if (status == 0) {
+               DEBUG(DEBUG_ERR,("Database backed up to %s\n", argv[1]));
+       }
+
+       talloc_free(tmp_ctx);
+       return status;
+}
+
+/*
+ * restore a database from a file 
+ *
+ * argv[0] is a backup file consisting of a struct db_file_header
+ * followed by marshalled records; an optional argv[1] overrides the
+ * database name stored in the header.  The active nodes are frozen,
+ * the database is wiped and refilled from the file inside a cluster
+ * wide transaction, marked healthy, and the nodes are thawed again.
+ * When one of those cluster wide steps fails, recovery mode is set to
+ * CTDB_RECOVERY_ACTIVE on the node selected by options.pnn.
+ * returns 0 on success, non-zero on failure
+ */
+static int control_restoredb(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int ret;
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       TDB_DATA outdata;
+       TDB_DATA data;
+       struct db_file_header dbhdr;
+       struct ctdb_db_context *ctdb_db;
+       struct ctdb_node_map *nodemap=NULL;
+       struct ctdb_vnn_map *vnnmap=NULL;
+       int i, fh;
+       struct ctdb_control_wipe_database w;
+       uint32_t *nodes;
+       uint32_t generation;
+       struct tm *tm;
+       char tbuf[100];
+       char namebuf[MAX_DB_NAME + 1];
+       const char *dbname;
+       ssize_t n;
+       size_t nread;
+
+       assert_single_node_only();
+
+       if (argc < 1 || argc > 2) {
+               DEBUG(DEBUG_ERR,("Invalid arguments\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       fh = open(argv[0], O_RDONLY);
+       if (fh == -1) {
+               DEBUG(DEBUG_ERR,("Failed to open file '%s'\n", argv[0]));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* a short read would leave parts of the header uninitialised */
+       n = read(fh, &dbhdr, sizeof(dbhdr));
+       if (n != (ssize_t)sizeof(dbhdr)) {
+               DEBUG(DEBUG_ERR,("Failed to read database header from file '%s'\n", argv[0]));
+               close(fh);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+       if (dbhdr.version != DB_VERSION) {
+               DEBUG(DEBUG_ERR,("Invalid version of database dump. File is version %lu but expected version was %u\n", dbhdr.version, DB_VERSION));
+               close(fh);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* the name comes from the file: make sure it is NUL terminated */
+       memcpy(namebuf, dbhdr.name, MAX_DB_NAME);
+       namebuf[MAX_DB_NAME] = '\0';
+       dbname = namebuf;
+       if (argc == 2) {
+               dbname = argv[1];
+       }
+
+       outdata.dsize = dbhdr.size;
+       outdata.dptr = talloc_size(tmp_ctx, outdata.dsize);
+       if (outdata.dptr == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate data of size '%lu'\n", dbhdr.size));
+               close(fh);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* read() may return less than requested: loop until all is read */
+       nread = 0;
+       while (nread < outdata.dsize) {
+               n = read(fh, outdata.dptr + nread, outdata.dsize - nread);
+               if (n == -1 && errno == EINTR) {
+                       continue;
+               }
+               if (n <= 0) {
+                       break;
+               }
+               nread += (size_t)n;
+       }
+       close(fh);
+       if (nread != outdata.dsize) {
+               /* never push a truncated record buffer to the cluster */
+               DEBUG(DEBUG_ERR,("Failed to read %lu bytes of database records from file '%s'\n", dbhdr.size, argv[0]));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       tm = localtime(&dbhdr.timestamp);
+       if (tm == NULL ||
+           strftime(tbuf,sizeof(tbuf)-1,"%Y/%m/%d %H:%M:%S", tm) == 0) {
+               /* the timestamp is informational only */
+               snprintf(tbuf, sizeof(tbuf), "unknown time");
+       }
+       printf("Restoring database '%s' from backup @ %s\n",
+               dbname, tbuf);
+
+
+       ctdb_db = ctdb_attach(ctdb, TIMELIMIT(), dbname, dbhdr.persistent, 0);
+       if (ctdb_db == NULL) {
+               DEBUG(DEBUG_ERR,("Unable to attach to database '%s'\n", dbname));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* allocated on tmp_ctx so that it is released with everything else */
+       ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), options.pnn, tmp_ctx, &nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get nodemap from node %u\n", options.pnn));
+               talloc_free(tmp_ctx);
+               return ret;
+       }
+
+
+       ret = ctdb_ctrl_getvnnmap(ctdb, TIMELIMIT(), options.pnn, tmp_ctx, &vnnmap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get vnnmap from node %u\n", options.pnn));
+               talloc_free(tmp_ctx);
+               return ret;
+       }
+
+       /* freeze all nodes */
+       nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+       for (i=1; i<=NUM_DB_PRIORITIES; i++) {
+               if (ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
+                                       nodes, i,
+                                       TIMELIMIT(),
+                                       false, tdb_null,
+                                       NULL, NULL,
+                                       NULL) != 0) {
+                       DEBUG(DEBUG_ERR, ("Unable to freeze nodes.\n"));
+                       ctdb_ctrl_setrecmode(ctdb, TIMELIMIT(), options.pnn, CTDB_RECOVERY_ACTIVE);
+                       talloc_free(tmp_ctx);
+                       return -1;
+               }
+       }
+
+       generation = vnnmap->generation;
+       data.dptr = (void *)&generation;
+       data.dsize = sizeof(generation);
+
+       /* start a cluster wide transaction */
+       nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
+                                       nodes, 0,
+                                       TIMELIMIT(), false, data,
+                                       NULL, NULL,
+                                       NULL) != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to start cluster wide transactions.\n"));
+               /* the nodes are frozen already: handle this like the other failures */
+               ctdb_ctrl_setrecmode(ctdb, TIMELIMIT(), options.pnn, CTDB_RECOVERY_ACTIVE);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+
+       w.db_id = ctdb_db->db_id;
+       w.transaction_id = generation;
+
+       data.dptr = (void *)&w;
+       data.dsize = sizeof(w);
+
+       /* wipe all the remote databases. */
+       nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
+                                       nodes, 0,
+                                       TIMELIMIT(), false, data,
+                                       NULL, NULL,
+                                       NULL) != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to wipe database.\n"));
+               ctdb_ctrl_setrecmode(ctdb, TIMELIMIT(), options.pnn, CTDB_RECOVERY_ACTIVE);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* push the database */
+       nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_PUSH_DB,
+                                       nodes, 0,
+                                       TIMELIMIT(), false, outdata,
+                                       NULL, NULL,
+                                       NULL) != 0) {
+               DEBUG(DEBUG_ERR, ("Failed to push database.\n"));
+               ctdb_ctrl_setrecmode(ctdb, TIMELIMIT(), options.pnn, CTDB_RECOVERY_ACTIVE);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       data.dptr = (void *)&ctdb_db->db_id;
+       data.dsize = sizeof(ctdb_db->db_id);
+
+       /* mark the database as healthy */
+       nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_DB_SET_HEALTHY,
+                                       nodes, 0,
+                                       TIMELIMIT(), false, data,
+                                       NULL, NULL,
+                                       NULL) != 0) {
+               DEBUG(DEBUG_ERR, ("Failed to mark database as healthy.\n"));
+               ctdb_ctrl_setrecmode(ctdb, TIMELIMIT(), options.pnn, CTDB_RECOVERY_ACTIVE);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       data.dptr = (void *)&generation;
+       data.dsize = sizeof(generation);
+
+       /* commit all the changes */
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
+                                       nodes, 0,
+                                       TIMELIMIT(), false, data,
+                                       NULL, NULL,
+                                       NULL) != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to commit databases.\n"));
+               ctdb_ctrl_setrecmode(ctdb, TIMELIMIT(), options.pnn, CTDB_RECOVERY_ACTIVE);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+
+       /* thaw all nodes */
+       nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_THAW,
+                                       nodes, 0,
+                                       TIMELIMIT(),
+                                       false, tdb_null,
+                                       NULL, NULL,
+                                       NULL) != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to thaw nodes.\n"));
+               ctdb_ctrl_setrecmode(ctdb, TIMELIMIT(), options.pnn, CTDB_RECOVERY_ACTIVE);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+/*
+ * dump a database backup from a file
+ *
+ * argv[0] is a backup file consisting of a struct db_file_header
+ * followed by marshalled records.  The header information and every
+ * record are printed to stdout, honouring the print* fields of
+ * options.
+ * returns 0 on success, -1 on invalid arguments or an unreadable or
+ * invalid file
+ */
+static int control_dumpdbbackup(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       TDB_DATA outdata;
+       struct db_file_header dbhdr;
+       int i, fh;
+       struct tm *tm;
+       char tbuf[100];
+       char namebuf[MAX_DB_NAME + 1];
+       struct ctdb_rec_data *rec = NULL;
+       struct ctdb_marshall_buffer *m;
+       struct ctdb_dump_db_context c;
+       ssize_t n;
+       size_t nread;
+
+       assert_single_node_only();
+
+       if (argc != 1) {
+               DEBUG(DEBUG_ERR,("Invalid arguments\n"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       fh = open(argv[0], O_RDONLY);
+       if (fh == -1) {
+               DEBUG(DEBUG_ERR,("Failed to open file '%s'\n", argv[0]));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* a short read would leave parts of the header uninitialised */
+       n = read(fh, &dbhdr, sizeof(dbhdr));
+       if (n != (ssize_t)sizeof(dbhdr)) {
+               DEBUG(DEBUG_ERR,("Failed to read database header from file '%s'\n", argv[0]));
+               close(fh);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+       if (dbhdr.version != DB_VERSION) {
+               DEBUG(DEBUG_ERR,("Invalid version of database dump. File is version %lu but expected version was %u\n", dbhdr.version, DB_VERSION));
+               close(fh);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* db_id and count are read from the buffer below: they must be there */
+       if (dbhdr.size < offsetof(struct ctdb_marshall_buffer, data)) {
+               DEBUG(DEBUG_ERR,("Invalid database dump. Record data of size '%lu' is too small\n", dbhdr.size));
+               close(fh);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       outdata.dsize = dbhdr.size;
+       outdata.dptr = talloc_size(tmp_ctx, outdata.dsize);
+       if (outdata.dptr == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to allocate data of size '%lu'\n", dbhdr.size));
+               close(fh);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* read() may return less than requested: loop until all is read */
+       nread = 0;
+       while (nread < outdata.dsize) {
+               n = read(fh, outdata.dptr + nread, outdata.dsize - nread);
+               if (n == -1 && errno == EINTR) {
+                       continue;
+               }
+               if (n <= 0) {
+                       break;
+               }
+               nread += (size_t)n;
+       }
+       close(fh);
+       if (nread != outdata.dsize) {
+               DEBUG(DEBUG_ERR,("Failed to read %lu bytes of database records from file '%s'\n", dbhdr.size, argv[0]));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+       m = (struct ctdb_marshall_buffer *)outdata.dptr;
+
+       /* the name comes from the file: make sure it is NUL terminated */
+       memcpy(namebuf, dbhdr.name, MAX_DB_NAME);
+       namebuf[MAX_DB_NAME] = '\0';
+
+       tm = localtime(&dbhdr.timestamp);
+       if (tm == NULL ||
+           strftime(tbuf,sizeof(tbuf)-1,"%Y/%m/%d %H:%M:%S", tm) == 0) {
+               /* the timestamp is informational only */
+               snprintf(tbuf, sizeof(tbuf), "unknown time");
+       }
+       /* print the id as 8 zero padded hex digits */
+       printf("Backup of database name:'%s' dbid:0x%08x from @ %s\n",
+               namebuf, m->db_id, tbuf);
+
+       ZERO_STRUCT(c);
+       c.f = stdout;
+       c.printemptyrecords = (bool)options.printemptyrecords;
+       c.printdatasize = (bool)options.printdatasize;
+       c.printlmaster = false;
+       c.printhash = (bool)options.printhash;
+       c.printrecordflags = (bool)options.printrecordflags;
+
+       for (i=0; i < m->count; i++) {
+               uint32_t reqid = 0;
+               TDB_DATA key, data;
+
+               /* we do not want the header split off, so we pass NULL */
+               rec = ctdb_marshall_loop_next(m, rec, &reqid,
+                                             NULL, &key, &data);
+
+               ctdb_dumpdb_record(ctdb, key, data, &c);
+       }
+
+       printf("Dumped %d records\n", i);
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+/*
+ * wipe the contents of a database on all active nodes
+ */
+static int control_wipedb(struct ctdb_context *ctdb, int argc,
+                         const char **argv)
+{
+       int ret;
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+       TDB_DATA data;
+       struct ctdb_db_context *ctdb_db;
+       struct ctdb_node_map *nodemap = NULL;
+       struct ctdb_vnn_map *vnnmap = NULL;
+       int i;
+       struct ctdb_control_wipe_database w;
+       uint32_t *nodes;
+       uint32_t generation;
+       uint8_t flags;
+
+       assert_single_node_only();
+
+       if (argc != 1) {
+               DEBUG(DEBUG_ERR,("Invalid arguments\n"));
+               return -1;
+       }
+
+       if (!db_exists(ctdb, argv[0], NULL, &flags)) {
+               return -1;
+       }
+
+       ctdb_db = ctdb_attach(ctdb, TIMELIMIT(), argv[0], flags & CTDB_DB_FLAGS_PERSISTENT, 0);
+       if (ctdb_db == NULL) {
+               DEBUG(DEBUG_ERR, ("Unable to attach to database '%s'\n",
+                                 argv[0]));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), options.pnn, ctdb,
+                                  &nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get nodemap from node %u\n",
+                                 options.pnn));
+               talloc_free(tmp_ctx);
+               return ret;
+       }
+
+       ret = ctdb_ctrl_getvnnmap(ctdb, TIMELIMIT(), options.pnn, tmp_ctx,
+                                 &vnnmap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get vnnmap from node %u\n",
+                                 options.pnn));
+               talloc_free(tmp_ctx);
+               return ret;
+       }
+
+       /* freeze all nodes */
+       nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+       for (i=1; i<=NUM_DB_PRIORITIES; i++) {
+               ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_FREEZE,
+                                               nodes, i,
+                                               TIMELIMIT(),
+                                               false, tdb_null,
+                                               NULL, NULL,
+                                               NULL);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, ("Unable to freeze nodes.\n"));
+                       ctdb_ctrl_setrecmode(ctdb, TIMELIMIT(), options.pnn,
+                                            CTDB_RECOVERY_ACTIVE);
+                       talloc_free(tmp_ctx);
+                       return -1;
+               }
+       }
+
+       generation = vnnmap->generation;
+       data.dptr = (void *)&generation;
+       data.dsize = sizeof(generation);
+
+       /* start a cluster wide transaction */
+       nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+       ret = ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_START,
+                                       nodes, 0,
+                                       TIMELIMIT(), false, data,
+                                       NULL, NULL,
+                                       NULL);
+       if (ret!= 0) {
+               DEBUG(DEBUG_ERR, ("Unable to start cluster wide "
+                                 "transactions.\n"));
+               return -1;
+       }
+
+       w.db_id = ctdb_db->db_id;
+       w.transaction_id = generation;
+
+       data.dptr = (void *)&w;
+       data.dsize = sizeof(w);
+
+       /* wipe all the remote databases. */
+       nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_WIPE_DATABASE,
+                                       nodes, 0,
+                                       TIMELIMIT(), false, data,
+                                       NULL, NULL,
+                                       NULL) != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to wipe database.\n"));
+               ctdb_ctrl_setrecmode(ctdb, TIMELIMIT(), options.pnn, CTDB_RECOVERY_ACTIVE);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       data.dptr = (void *)&ctdb_db->db_id;
+       data.dsize = sizeof(ctdb_db->db_id);
+
+       /* mark the database as healthy */
+       nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_DB_SET_HEALTHY,
+                                       nodes, 0,
+                                       TIMELIMIT(), false, data,
+                                       NULL, NULL,
+                                       NULL) != 0) {
+               DEBUG(DEBUG_ERR, ("Failed to mark database as healthy.\n"));
+               ctdb_ctrl_setrecmode(ctdb, TIMELIMIT(), options.pnn, CTDB_RECOVERY_ACTIVE);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       data.dptr = (void *)&generation;
+       data.dsize = sizeof(generation);
+
+       /* commit all the changes */
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_TRANSACTION_COMMIT,
+                                       nodes, 0,
+                                       TIMELIMIT(), false, data,
+                                       NULL, NULL,
+                                       NULL) != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to commit databases.\n"));
+               ctdb_ctrl_setrecmode(ctdb, TIMELIMIT(), options.pnn, CTDB_RECOVERY_ACTIVE);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       /* thaw all nodes */
+       nodes = list_of_active_nodes(ctdb, nodemap, tmp_ctx, true);
+       if (ctdb_client_async_control(ctdb, CTDB_CONTROL_THAW,
+                                       nodes, 0,
+                                       TIMELIMIT(),
+                                       false, tdb_null,
+                                       NULL, NULL,
+                                       NULL) != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to thaw nodes.\n"));
+               ctdb_ctrl_setrecmode(ctdb, TIMELIMIT(), options.pnn, CTDB_RECOVERY_ACTIVE);
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+
+       DEBUG(DEBUG_ERR, ("Database wiped.\n"));
+
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+/*
+  dump memory usage
+
+  Sends CTDB_CONTROL_DUMP_MEMORY to options.pnn and writes the reply
+  data to file descriptor 1.  Returns 0 on success and -1 if the
+  control could not be sent or reported a non-zero result.
+ */
+static int control_dumpmemory(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       TDB_DATA data;
+       int ret;
+       int32_t res;
+       /* start from NULL so the failure path never formats an
+          indeterminate pointer if ctdb_control() returns without
+          storing an error message */
+       char *errmsg = NULL;
+       TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+
+       ret = ctdb_control(ctdb, options.pnn, 0, CTDB_CONTROL_DUMP_MEMORY,
+                          0, tdb_null, tmp_ctx, &data, &res, NULL, &errmsg);
+       if (ret != 0 || res != 0) {
+               DEBUG(DEBUG_ERR,("Failed to dump memory - %s\n",
+                                errmsg != NULL ? errmsg : "unknown error"));
+               talloc_free(tmp_ctx);
+               return -1;
+       }
+       write(1, data.dptr, data.dsize);
+       talloc_free(tmp_ctx);
+       return 0;
+}
+
+/*
+  message handler used by control_rddumpmemory(): copy the received
+  payload to file descriptor 1 and terminate the process with status 0
+*/
+static void mem_dump_handler(struct ctdb_context *ctdb, uint64_t srvid,
+                            TDB_DATA data, void *private_data)
+{
+       const int out_fd = 1;
+
+       write(out_fd, data.dptr, data.dsize);
+
+       /* the reply has been printed, so the command is finished */
+       exit(0);
+}
+
+/*
+  dump memory usage on the recovery daemon
+
+  Registers mem_dump_handler() on a srvid equal to our pid, sends a
+  CTDB_SRVID_MEM_DUMP message carrying that return address to
+  options.pnn, and then runs the event loop until the handler exits
+  the process.  Returns -1 if the request could not be sent.
+ */
+static int control_rddumpmemory(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       struct srvid_request request;
+       TDB_DATA payload;
+
+       /* return address for the reply: our pnn plus a srvid taken
+          from our pid */
+       request.pnn = ctdb_get_pnn(ctdb);
+       request.srvid = getpid();
+
+       /* register the message port on which the reply will arrive */
+       ctdb_client_set_message_handler(ctdb, request.srvid, mem_dump_handler, NULL);
+
+       payload.dptr = (uint8_t *)&request;
+       payload.dsize = sizeof(request);
+
+       if (ctdb_client_send_message(ctdb, options.pnn, CTDB_SRVID_MEM_DUMP, payload) != 0) {
+               DEBUG(DEBUG_ERR,("Failed to send memdump request message to %u\n", options.pnn));
+               return -1;
+       }
+
+       /* mem_dump_handler() calls exit() once the reply has been
+          received, which is what ends this loop */
+       for (;;) {
+               event_loop_once(ctdb->ev);
+       }
+
+       return 0;
+}
+
+/*
+  send a message to a srvid
+
+  argv[0] is the srvid (parsed with base auto-detection) and argv[1]
+  is the message text, which is sent without its terminating NUL to
+  CTDB_BROADCAST_CONNECTED.  Calls usage() when fewer than two
+  arguments are given.  Returns 0 on success, -1 if sending fails.
+ */
+static int control_msgsend(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       /* message handlers in this file receive the srvid as uint64_t,
+          so parse the full 64 bit range instead of unsigned long */
+       uint64_t srvid;
+       int ret;
+       TDB_DATA data;
+
+       if (argc < 2) {
+               usage();
+       }
+
+       srvid      = strtoull(argv[0], NULL, 0);
+
+       data.dptr = (uint8_t *)discard_const(argv[1]);
+       data.dsize= strlen(argv[1]);
+
+       ret = ctdb_client_send_message(ctdb, CTDB_BROADCAST_CONNECTED, srvid, data);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR,("Failed to send message to srvid %llu\n",
+                                (unsigned long long)srvid));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+  handler for msglisten: print the raw payload of a received message
+  on stdout, framed by "Message received: " and a newline
+*/
+static void msglisten_handler(struct ctdb_context *ctdb, uint64_t srvid,
+                            TDB_DATA data, void *private_data)
+{
+       printf("Message received: ");
+       /* emit the payload bytes unchanged; writing them in one call
+          avoids comparing a signed index against data.dsize */
+       if (data.dsize > 0) {
+               fwrite(data.dptr, 1, data.dsize, stdout);
+       }
+       printf("\n");
+}
+
+/*
+  listen for messages on a messageport
+
+  The srvid listened on is our own pid; it is printed so that a sender
+  knows where to address messages.  Never returns in practice, since
+  the event loop below has no exit condition.
+ */
+static int control_msglisten(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       uint64_t port = getpid();
+
+       /* register a message port and listen for messages */
+       ctdb_client_set_message_handler(ctdb, port, msglisten_handler, NULL);
+       printf("Listening for messages on srvid:%d\n", (int)port);
+
+       for (;;) {
+               event_loop_once(ctdb->ev);
+       }
+
+       return 0;
+}
+
+/*
+  list all nodes in the cluster
+  we parse the nodes file directly
+
+  Prints one line per entry: ":<pnn>:<addr>:" in machinereadable mode,
+  otherwise just the address.  Stops with -1 at the first address that
+  parse_ip() rejects, or if the nodes file cannot be read.
+ */
+static int control_listnodes(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       TALLOC_CTX *mem_ctx = talloc_new(NULL);
+       struct pnn_node *node_list;
+       struct pnn_node *cur;
+       int status = 0;
+
+       assert_single_node_only();
+
+       node_list = read_nodes_file(mem_ctx);
+       if (node_list == NULL) {
+               DEBUG(DEBUG_ERR,("Failed to read nodes file\n"));
+               talloc_free(mem_ctx);
+               return -1;
+       }
+
+       cur = node_list;
+       while (cur != NULL) {
+               ctdb_sock_addr addr;
+
+               /* the parsed address is only used to validate the entry */
+               if (parse_ip(cur->addr, NULL, 63999, &addr) == 0) {
+                       DEBUG(DEBUG_ERR,("Wrongly formed ip address '%s' in nodes file\n", cur->addr));
+                       status = -1;
+                       break;
+               }
+
+               if (!options.machinereadable) {
+                       printf("%s\n", cur->addr);
+               } else {
+                       printf(":%d:%s:\n", cur->pnn, cur->addr);
+               }
+
+               cur = cur->next;
+       }
+
+       talloc_free(mem_ctx);
+       return status;
+}
+
+/*
+  reload the nodes file on every node in the nodemap
+
+  All other nodes are asked first and the local node last; afterwards
+  a recovery is started through control_recover().  A node that fails
+  to reload is only reported, it does not change the return value.
+  Returns non-zero only if the nodemap cannot be fetched from the
+  local node.
+ */
+static int control_reload_nodes_file(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       int i, ret;
+       int mypnn;
+       struct ctdb_node_map *nodemap=NULL;
+
+       assert_single_node_only();
+
+       mypnn = ctdb_get_pnn(ctdb);
+
+       ret = ctdb_ctrl_getnodemap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &nodemap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get nodemap from local node\n"));
+               return ret;
+       }
+
+       /* reload the nodes file on all remote nodes */
+       for (i=0;i<nodemap->num;i++) {
+               if (nodemap->nodes[i].pnn == mypnn) {
+                       continue;
+               }
+               DEBUG(DEBUG_NOTICE, ("Reloading nodes file on node %u\n", nodemap->nodes[i].pnn));
+               ret = ctdb_ctrl_reload_nodes_file(ctdb, TIMELIMIT(),
+                       nodemap->nodes[i].pnn);
+               if (ret != 0) {
+                       DEBUG(DEBUG_ERR, ("ERROR: Failed to reload nodes file on node %u. You MUST fix that node manually!\n", nodemap->nodes[i].pnn));
+               }
+       }
+
+       /* the nodemap was allocated on the ctdb context and is not
+          needed any more, so release it here */
+       talloc_free(nodemap);
+
+       /* reload the nodes file on the local node */
+       DEBUG(DEBUG_NOTICE, ("Reloading nodes file on node %u\n", mypnn));
+       ret = ctdb_ctrl_reload_nodes_file(ctdb, TIMELIMIT(), mypnn);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("ERROR: Failed to reload nodes file on node %u. You MUST fix that node manually!\n", mypnn));
+       }
+
+       /* initiate a recovery */
+       control_recover(ctdb, argc, argv);
+
+       return 0;
+}
+
+
+/*
+  table of all controls understood by the ctdb tool; main() looks up
+  the control name here and usage() prints one line per entry
+ */
+static const struct {
+       /* control name as given on the command line */
+       const char *name;
+       /* handler, called with the arguments that follow the control name */
+       int (*fn)(struct ctdb_context *, int, const char **);
+       /* if true, main() calls fn once for each node in options.nodes
+          when options.pnn is CTDB_BROADCAST_ALL or CTDB_MULTICAST */
+       bool auto_all;
+       bool without_daemon; /* can be run without daemon running ? */
+       /* description printed by usage() */
+       const char *msg;
+       /* argument synopsis printed by usage(); left out for entries
+          that do not provide one */
+       const char *args;
+} ctdb_commands[] = {
+       { "version",         control_version,           true,   true,   "show version of ctdb" },
+       { "status",          control_status,            true,   false,  "show node status" },
+       { "uptime",          control_uptime,            true,   false,  "show node uptime" },
+       { "ping",            control_ping,              true,   false,  "ping all nodes" },
+       { "runstate",        control_runstate,          true,   false,  "get/check runstate of a node", "[setup|first_recovery|startup|running]" },
+       { "getvar",          control_getvar,            true,   false,  "get a tunable variable",               "<name>"},
+       { "setvar",          control_setvar,            true,   false,  "set a tunable variable",               "<name> <value>"},
+       { "listvars",        control_listvars,          true,   false,  "list tunable variables"},
+       { "statistics",      control_statistics,        false,  false, "show statistics" },
+       { "statisticsreset", control_statistics_reset,  true,   false,  "reset statistics"},
+       { "stats",           control_stats,             false,  false,  "show rolling statistics", "[number of history records]" },
+       { "ip",              control_ip,                false,  false,  "show which public ip's that ctdb manages" },
+       { "ipinfo",          control_ipinfo,            true,   false,  "show details about a public ip that ctdb manages", "<ip>" },
+       { "ifaces",          control_ifaces,            true,   false,  "show which interfaces that ctdb manages" },
+       { "setifacelink",    control_setifacelink,      true,   false,  "set interface link status", "<iface> <status>" },
+       { "process-exists",  control_process_exists,    true,   false,  "check if a process exists on a node",  "<pid>"},
+       { "getdbmap",        control_getdbmap,          true,   false,  "show the database map" },
+       { "getdbstatus",     control_getdbstatus,       true,   false,  "show the status of a database", "<dbname|dbid>" },
+       { "catdb",           control_catdb,             true,   false,  "dump a ctdb database" ,                     "<dbname|dbid>"},
+       { "cattdb",          control_cattdb,            true,   false,  "dump a local tdb database" ,                     "<dbname|dbid>"},
+       { "getmonmode",      control_getmonmode,        true,   false,  "show monitoring mode" },
+       { "getcapabilities", control_getcapabilities,   true,   false,  "show node capabilities" },
+       { "pnn",             control_pnn,               true,   false,  "show the pnn of the currnet node" },
+       { "lvs",             control_lvs,               true,   false,  "show lvs configuration" },
+       { "lvsmaster",       control_lvsmaster,         true,   false,  "show which node is the lvs master" },
+       { "disablemonitor",      control_disable_monmode,true,  false,  "set monitoring mode to DISABLE" },
+       { "enablemonitor",      control_enable_monmode, true,   false,  "set monitoring mode to ACTIVE" },
+       { "setdebug",        control_setdebug,          true,   false,  "set debug level",                      "<EMERG|ALERT|CRIT|ERR|WARNING|NOTICE|INFO|DEBUG>" },
+       { "getdebug",        control_getdebug,          true,   false,  "get debug level" },
+       { "getlog",          control_getlog,            true,   false,  "get the log data from the in memory ringbuffer", "[<level>] [recoverd]" },
+       { "clearlog",          control_clearlog,        true,   false,  "clear the log data from the in memory ringbuffer", "[recoverd]" },
+       { "attach",          control_attach,            true,   false,  "attach to a database",                 "<dbname> [persistent]" },
+       { "dumpmemory",      control_dumpmemory,        true,   false,  "dump memory map to stdout" },
+       { "rddumpmemory",    control_rddumpmemory,      true,   false,  "dump memory map from the recovery daemon to stdout" },
+       { "getpid",          control_getpid,            true,   false,  "get ctdbd process ID" },
+       { "disable",         control_disable,           true,   false,  "disable a nodes public IP" },
+       { "enable",          control_enable,            true,   false,  "enable a nodes public IP" },
+       { "stop",            control_stop,              true,   false,  "stop a node" },
+       { "continue",        control_continue,          true,   false,  "re-start a stopped node" },
+       { "ban",             control_ban,               true,   false,  "ban a node from the cluster",          "<bantime>"},
+       { "unban",           control_unban,             true,   false,  "unban a node" },
+       { "showban",         control_showban,           true,   false,  "show ban information"},
+       { "shutdown",        control_shutdown,          true,   false,  "shutdown ctdbd" },
+       { "recover",         control_recover,           true,   false,  "force recovery" },
+       { "sync",            control_ipreallocate,      false,  false,  "wait until ctdbd has synced all state changes" },
+       { "ipreallocate",    control_ipreallocate,      false,  false,  "force the recovery daemon to perform a ip reallocation procedure" },
+       { "thaw",            control_thaw,              true,   false,  "thaw databases", "[priority:1-3]" },
+       { "isnotrecmaster",  control_isnotrecmaster,    false,  false,  "check if the local node is recmaster or not" },
+       { "killtcp",         kill_tcp,                  false,  false, "kill a tcp connection.", "[<srcip:port> <dstip:port>]" },
+       { "gratiousarp",     control_gratious_arp,      false,  false, "send a gratious arp", "<ip> <interface>" },
+       { "tickle",          tickle_tcp,                false,  false, "send a tcp tickle ack", "<srcip:port> <dstip:port>" },
+       { "gettickles",      control_get_tickles,       false,  false, "get the list of tickles registered for this ip", "<ip> [<port>]" },
+       { "addtickle",       control_add_tickle,        false,  false, "add a tickle for this ip", "<ip>:<port> <ip>:<port>" },
+
+       { "deltickle",       control_del_tickle,        false,  false, "delete a tickle from this ip", "<ip>:<port> <ip>:<port>" },
+
+       { "regsrvid",        regsrvid,                  false,  false, "register a server id", "<pnn> <type> <id>" },
+       { "unregsrvid",      unregsrvid,                false,  false, "unregister a server id", "<pnn> <type> <id>" },
+       { "chksrvid",        chksrvid,                  false,  false, "check if a server id exists", "<pnn> <type> <id>" },
+       { "getsrvids",       getsrvids,                 false,  false, "get a list of all server ids"},
+       { "check_srvids",    check_srvids,              false,  false, "check if a srvid exists", "<id>+" },
+       { "repack",          ctdb_repack,               false,  false, "repack all databases", "[max_freelist]"},
+       { "listnodes",       control_listnodes,         false,  true, "list all nodes in the cluster"},
+       { "reloadnodes",     control_reload_nodes_file, false,  false, "reload the nodes file and restart the transport on all nodes"},
+       { "moveip",          control_moveip,            false,  false, "move/failover an ip address to another node", "<ip> <node>"},
+       { "rebalanceip",     control_rebalanceip,       false,  false, "release an ip from the node and let recd rebalance it", "<ip>"},
+       { "addip",           control_addip,             true,   false, "add a ip address to a node", "<ip/mask> <iface>"},
+       { "delip",           control_delip,             false,  false, "delete an ip address from a node", "<ip>"},
+       { "eventscript",     control_eventscript,       true,   false, "run the eventscript with the given parameters on a node", "<arguments>"},
+       { "backupdb",        control_backupdb,          false,  false, "backup the database into a file.", "<dbname|dbid> <file>"},
+       { "restoredb",        control_restoredb,        false,  false, "restore the database from a file.", "<file> [dbname]"},
+       { "dumpdbbackup",    control_dumpdbbackup,      false,  true,  "dump database backup from a file.", "<file>"},
+       { "wipedb",           control_wipedb,        false,     false, "wipe the contents of a database.", "<dbname|dbid>"},
+       { "recmaster",        control_recmaster,        true,   false, "show the pnn for the recovery master."},
+       { "scriptstatus",     control_scriptstatus,     true,   false, "show the status of the monitoring scripts (or all scripts)", "[all]"},
+       { "enablescript",     control_enablescript,  true,      false, "enable an eventscript", "<script>"},
+       { "disablescript",    control_disablescript,  true,     false, "disable an eventscript", "<script>"},
+       { "natgwlist",        control_natgwlist,        true,   false, "show the nodes belonging to this natgw configuration"},
+       { "xpnn",             control_xpnn,             false,  true,  "find the pnn of the local node without talking to the daemon (unreliable)" },
+       { "getreclock",       control_getreclock,       true,   false, "Show the reclock file of a node"},
+       { "setreclock",       control_setreclock,       true,   false, "Set/clear the reclock file of a node", "[filename]"},
+       { "setnatgwstate",    control_setnatgwstate,    false,  false, "Set NATGW state to on/off", "{on|off}"},
+       { "setlmasterrole",   control_setlmasterrole,   false,  false, "Set LMASTER role to on/off", "{on|off}"},
+       { "setrecmasterrole", control_setrecmasterrole, false,  false, "Set RECMASTER role to on/off", "{on|off}"},
+       { "setdbprio",        control_setdbprio,        false,  false, "Set DB priority", "<dbname|dbid> <prio:1-3>"},
+       { "getdbprio",        control_getdbprio,        false,  false, "Get DB priority", "<dbname|dbid>"},
+       { "setdbreadonly",    control_setdbreadonly,    false,  false, "Set DB readonly capable", "<dbname|dbid>"},
+       { "setdbsticky",      control_setdbsticky,      false,  false, "Set DB sticky-records capable", "<dbname|dbid>"},
+       { "msglisten",        control_msglisten,        false,  false, "Listen on a srvid port for messages", "<msg srvid>"},
+       { "msgsend",          control_msgsend,  false,  false, "Send a message to srvid", "<srvid> <message>"},
+       { "pfetch",          control_pfetch,            false,  false,  "fetch a record from a persistent database", "<dbname|dbid> <key> [<file>]" },
+       { "pstore",          control_pstore,            false,  false,  "write a record to a persistent database", "<dbname|dbid> <key> <file containing record>" },
+       { "pdelete",         control_pdelete,           false,  false,  "delete a record from a persistent database", "<dbname|dbid> <key>" },
+       { "tfetch",          control_tfetch,            false,  true,  "fetch a record from a [c]tdb-file [-v]", "<tdb-file> <key> [<file>]" },
+       { "tstore",          control_tstore,            false,  true,  "store a record (including ltdb header)", "<tdb-file> <key> <data+header>" },
+       { "readkey",         control_readkey,           true,   false,  "read the content off a database key", "<tdb-file> <key>" },
+       { "writekey",        control_writekey,          true,   false,  "write to a database key", "<tdb-file> <key> <value>" },
+       { "checktcpport",    control_chktcpport,        false,  true,  "check if a service is bound to a specific tcp port or not", "<port>" },
+       { "rebalancenode",     control_rebalancenode,   false,  false, "mark nodes as forced IP rebalancing targets", "[<pnn-list>]"},
+       { "getdbseqnum",     control_getdbseqnum,       false,  false, "get the sequence number off a database", "<dbname|dbid>" },
+       { "nodestatus",      control_nodestatus,        true,   false,  "show and return node status", "[<pnn-list>]" },
+       { "dbstatistics",    control_dbstatistics,      false,  false, "show db statistics", "<dbname|dbid>" },
+       { "reloadips",       control_reloadips,         false,  false, "reload the public addresses file on specified nodes" , "[<pnn-list>]" },
+       { "ipiface",         control_ipiface,           false,  true,  "Find which interface an ip address is hosted on", "<ip>" },
+};
+
+/*
+  show usage message
+
+  Prints the option summary followed by one line for every entry in
+  ctdb_commands[], then exits with status 1.
+ */
+static void usage(void)
+{
+       int idx;
+
+       printf(
+"Usage: ctdb [options] <control>\n" \
+"Options:\n" \
+"   -n <node>          choose node number, or 'all' (defaults to local node)\n"
+"   -Y                 generate machinereadable output\n"
+"   -v                 generate verbose output\n"
+"   -t <timelimit>     set timelimit for control in seconds (default %u)\n", options.timelimit);
+       printf("Controls:\n");
+
+       for (idx = 0; idx < ARRAY_SIZE(ctdb_commands); idx++) {
+               /* entries without an argument synopsis print an empty column */
+               const char *synopsis = ctdb_commands[idx].args;
+
+               if (synopsis == NULL) {
+                       synopsis = "";
+               }
+               printf("  %-15s %-27s  %s\n",
+                      ctdb_commands[idx].name,
+                      synopsis,
+                      ctdb_commands[idx].msg);
+       }
+
+       exit(1);
+}
+
+
+/*
+  SIGALRM handler installed by main(): report that the maximum runtime
+  has been exceeded and terminate with ERR_TIMEOUT.
+
+  The message is emitted with write() instead of printf() because
+  stdio functions are not async-signal-safe.
+ */
+static void ctdb_alarm(int sig)
+{
+       static const char msg[] = "Maximum runtime exceeded - exiting\n";
+       ssize_t written;
+
+       /* best effort only: we are about to exit whatever happens */
+       written = write(1, msg, sizeof(msg) - 1);
+       (void)written;
+       _exit(ERR_TIMEOUT);
+}
+
+/*
+  main program
+
+  Parses the command line options, looks the control name up in
+  ctdb_commands[] and runs its handler.  Controls flagged
+  without_daemon are run with a NULL ctdb context before any
+  connection is made; all others get a client context from
+  ctdb_cmdline_client().  The return value of the handler (or the
+  OR of all return values for an auto_all control run on several
+  nodes) becomes the return value of main().
+*/
+int main(int argc, const char *argv[])
+{
+       struct ctdb_context *ctdb;
+       char *nodestring = NULL;
+       struct poptOption popt_options[] = {
+               POPT_AUTOHELP
+               POPT_CTDB_CMDLINE
+               { "timelimit", 't', POPT_ARG_INT, &options.timelimit, 0, "timelimit", "integer" },
+               { "node",      'n', POPT_ARG_STRING, &nodestring, 0, "node", "integer|all" },
+               { "machinereadable", 'Y', POPT_ARG_NONE, &options.machinereadable, 0, "enable machinereadable output", NULL },
+               { "verbose",    'v', POPT_ARG_NONE, &options.verbose, 0, "enable verbose output", NULL },
+               { "maxruntime", 'T', POPT_ARG_INT, &options.maxruntime, 0, "die if runtime exceeds this limit (in seconds)", "integer" },
+               { "print-emptyrecords", 0, POPT_ARG_NONE, &options.printemptyrecords, 0, "print the empty records when dumping databases (catdb, cattdb, dumpdbbackup)", NULL },
+               { "print-datasize", 0, POPT_ARG_NONE, &options.printdatasize, 0, "do not print record data when dumping databases, only the data size", NULL },
+               { "print-lmaster", 0, POPT_ARG_NONE, &options.printlmaster, 0, "print the record's lmaster in catdb", NULL },
+               { "print-hash", 0, POPT_ARG_NONE, &options.printhash, 0, "print the record's hash when dumping databases", NULL },
+               { "print-recordflags", 0, POPT_ARG_NONE, &options.printrecordflags, 0, "print the record flags in catdb and dumpdbbackup", NULL },
+               POPT_TABLEEND
+       };
+       int opt;
+       const char **extra_argv;
+       int extra_argc = 0;
+       int ret=-1, i;
+       poptContext pc;
+       struct event_context *ev;
+       const char *control;
+
+       setlinebuf(stdout);
+       
+       /* set some defaults */
+       options.maxruntime = 0;
+       options.timelimit = 10;
+       options.pnn = CTDB_CURRENT_NODE;
+
+       pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+       /* every option stores straight into its variable, so any value
+          other than -1 returned here is treated as a parse error */
+       while ((opt = poptGetNextOpt(pc)) != -1) {
+               switch (opt) {
+               default:
+                       DEBUG(DEBUG_ERR, ("Invalid option %s: %s\n", 
+                               poptBadOption(pc, 0), poptStrerror(opt)));
+                       exit(1);
+               }
+       }
+
+       /* setup the remaining options for the main program to use */
+       extra_argv = poptGetArgs(pc);
+       if (extra_argv) {
+               /* skip the first leftover argument - presumably the
+                  program name kept by POPT_CONTEXT_KEEP_FIRST */
+               extra_argv++;
+               while (extra_argv[extra_argc]) extra_argc++;
+       }
+
+       /* a control name is mandatory */
+       if (extra_argc < 1) {
+               usage();
+       }
+
+       /* no -T given (or given as 0): use $CTDB_TIMEOUT if it is set */
+       if (options.maxruntime == 0) {
+               const char *ctdb_timeout;
+               ctdb_timeout = getenv("CTDB_TIMEOUT");
+               if (ctdb_timeout != NULL) {
+                       options.maxruntime = strtoul(ctdb_timeout, NULL, 0);
+               } else {
+                       /* default timeout is 120 seconds */
+                       options.maxruntime = 120;
+               }
+       }
+
+       /* ctdb_alarm() ends the process once maxruntime seconds have passed */
+       signal(SIGALRM, ctdb_alarm);
+       alarm(options.maxruntime);
+
+       control = extra_argv[0];
+
+       /* Default value for CTDB_BASE - don't override */
+       setenv("CTDB_BASE", ETCDIR "/ctdb", 0);
+
+       ev = event_context_init(NULL);
+       if (!ev) {
+               DEBUG(DEBUG_ERR, ("Failed to initialize event system\n"));
+               exit(1);
+       }
+
+       /* find the table entry for the requested control */
+       for (i=0;i<ARRAY_SIZE(ctdb_commands);i++) {
+               if (strcmp(control, ctdb_commands[i].name) == 0) {
+                       break;
+               }
+       }
+
+       if (i == ARRAY_SIZE(ctdb_commands)) {
+               DEBUG(DEBUG_ERR, ("Unknown control '%s'\n", control));
+               exit(1);
+       }
+
+       /* controls that work without the daemon are run right away with
+          a NULL context; -n makes no sense for them.  Note that file
+          descriptor 2 is closed before the handler runs. */
+       if (ctdb_commands[i].without_daemon == true) {
+               if (nodestring != NULL) {
+                       DEBUG(DEBUG_ERR, ("Can't specify node(s) with \"ctdb %s\"\n", control));
+                       exit(1);
+               }
+               close(2);
+               return ctdb_commands[i].fn(NULL, extra_argc-1, extra_argv+1);
+       }
+
+       /* initialise ctdb */
+       ctdb = ctdb_cmdline_client(ev, TIMELIMIT());
+
+       if (ctdb == NULL) {
+               DEBUG(DEBUG_ERR, ("Failed to init ctdb\n"));
+               exit(1);
+       }
+
+       /* setup the node number(s) to contact */
+       if (!parse_nodestring(ctdb, ctdb, nodestring, CTDB_CURRENT_NODE, false,
+                             &options.nodes, &options.pnn)) {
+               usage();
+       }
+
+       /* replace the CTDB_CURRENT_NODE placeholder by the first entry
+          of the node list filled in above */
+       if (options.pnn == CTDB_CURRENT_NODE) {
+               options.pnn = options.nodes[0];
+       }
+
+       /* auto_all controls are run once per listed node, with
+          options.pnn set to that node; the results are OR-ed */
+       if (ctdb_commands[i].auto_all && 
+           ((options.pnn == CTDB_BROADCAST_ALL) ||
+            (options.pnn == CTDB_MULTICAST))) {
+               int j;
+
+               ret = 0;
+               for (j = 0; j < talloc_array_length(options.nodes); j++) {
+                       options.pnn = options.nodes[j];
+                       ret |= ctdb_commands[i].fn(ctdb, extra_argc-1, extra_argv+1);
+               }
+       } else {
+               ret = ctdb_commands[i].fn(ctdb, extra_argc-1, extra_argv+1);
+       }
+
+       talloc_free(ctdb);
+       talloc_free(ev);
+       (void)poptFreeContext(pc);
+
+       return ret;
+
+}
diff --git a/ctdb/tools/ctdb_diagnostics b/ctdb/tools/ctdb_diagnostics
new file mode 100755 (executable)
index 0000000..2a51e1b
--- /dev/null
@@ -0,0 +1,335 @@
+#!/bin/sh
+# a script to test the basic setup of a CTDB/Samba install 
+# tridge@samba.org September 2007
+# martin@meltin.net August 2010
+
+# Print a usage summary on stderr and exit with status 1.
+usage ()
+{
+    cat >&2 <<EOF
+Usage: ctdb_diagnostics [OPTION] ...
+  options:
+    -n <nodes>  Comma separated list of nodes to operate on
+    -c          Ignore comment lines (starting with '#') in file comparisons
+    -w          Ignore whitespace in file comparisons
+    --no-ads    Do not use commands that assume an Active Directory Server
+EOF
+    exit 1
+
+}
+
+# Global settings used throughout the script:
+#   nodes     - nodes to examine; defaults to the second ':'-separated
+#               field of each line printed by "ctdb listnodes -Y"
+#   bad_nodes - comma separated list of nodes that could not be contacted
+#   diff_opts - extra options handed to diff when comparing files
+#   no_ads    - "true" when --no-ads was given on the command line
+nodes=$(ctdb listnodes -Y | cut -d: -f2)
+bad_nodes=""
+diff_opts=
+no_ads=false
+
+# Parse the command line with getopt(1) and update the globals $nodes,
+# $diff_opts and $no_ads.  Calls usage (which exits) when getopt fails,
+# on -h/--help, on anything unrecognised, or when non-option arguments
+# are left over.
+parse_options ()
+{
+    temp=$(getopt -n "ctdb_diagnostics" -o "n:cwh" -l no-ads,help -- "$@")
+
+    [ $? != 0 ] && usage
+
+    # Replace the positional parameters with getopt's normalised list
+    eval set -- "$temp"
+
+    while true ; do
+       case "$1" in
+           # -n: turn the comma separated list into a space separated one
+           -n) nodes=$(echo "$2" | sed -e 's@,@ @g') ; shift 2 ;;
+           -c) diff_opts="${diff_opts} -I ^#.*" ; shift ;;
+           -w) diff_opts="${diff_opts} -w" ; shift ;;
+           --no-ads) no_ads=true ; shift ;;
+           --) shift ; break ;;
+           -h|--help|*) usage ;;
+       esac
+    done
+
+    # Anything remaining after "--" is an unexpected argument
+    [ $# -ne 0 ] && usage
+}
+
+parse_options "$@"
+
+# Use 5s ssh timeout if EXTRA_SSH_OPTS doesn't set a timeout.
+case "$EXTRA_SSH_OPTS" in
+    *ConnectTimeout=*) : ;;
+    *)
+       export EXTRA_SSH_OPTS="${EXTRA_SSH_OPTS} -o ConnectTimeout=5"
+esac
+
+# Filter nodes.  Remove any nodes we can't contact from $nodes and add
+# them to $bad_nodes.
+_nodes=""
+for _i in $nodes ; do
+    if onnode $_i true >/dev/null 2>&1 ; then
+       _nodes="${_nodes}${_nodes:+ }${_i}"
+    else
+       bad_nodes="${bad_nodes}${bad_nodes:+,}${_i}"
+    fi
+done
+nodes="$_nodes"
+
+# Comma separated form of $nodes
+nodes_comma=$(echo $nodes | sed -e 's@[[:space:]]@,@g')
+
+# Append extra directories to the command search path
+PATH="$PATH:/sbin:/usr/sbin:/usr/lpp/mmfs/bin"
+
+# list of config files that must exist and that we check are the same
+# on the nodes
+if [ -d /etc/sysconfig ] ; then
+    CONFIG_FILES_MUST="/etc/krb5.conf /etc/hosts /etc/ctdb/nodes /etc/sysconfig/ctdb /etc/resolv.conf /etc/nsswitch.conf /etc/sysctl.conf /etc/samba/smb.conf /etc/fstab /etc/multipath.conf /etc/pam.d/system-auth /etc/sysconfig/nfs /etc/exports /etc/vsftpd/vsftpd.conf"
+else
+    CONFIG_FILES_MUST="/etc/krb5.conf /etc/hosts /etc/ctdb/nodes /etc/default/ctdb /etc/resolv.conf /etc/nsswitch.conf /etc/sysctl.conf /etc/samba/smb.conf /etc/fstab /etc/multipath.conf /etc/pam.d/system-auth /etc/default/nfs /etc/exports /etc/vsftpd/vsftpd.conf"
+fi
+
+# list of config files that may exist and should be checked that they
+# are the same on the nodes
+CONFIG_FILES_MAY="/etc/ctdb/public_addresses /etc/ctdb/static-routes"
+
+# From here on send stderr to stdout so that error messages end up in
+# the captured report.  A bare "2>&1" without "exec" has no effect on
+# the current shell.
+exec 2>&1
+
+cat <<EOF
+--------------------------------------------------------------------
+ctdb_diagnostics starting. This script will gather information about
+your ctdb cluster. You should send the output of this script along
+with any ctdb or clustered Samba bug reports.
+--------------------------------------------------------------------
+EOF
+
+date
+
+# Record a failed check: print it straight away, bump the global
+# $NUM_ERRORS counter and append a numbered copy to the $ERRORS file so
+# it can be repeated in the final summary.
+error() {
+    msg="$1"
+    echo "ERROR: $msg"
+    NUM_ERRORS=$((NUM_ERRORS + 1))
+    echo " ERROR[$NUM_ERRORS]: $msg" >> $ERRORS
+}
+
+# Print a local file between separator lines: its name, its "ls -l"
+# entry and its contents indented by two spaces.  Errors from ls and
+# cat are included in the output.
+show_file() {
+    fname="$1"
+    echo "  ================================"
+    echo "  File: $fname"
+    echo "  $(ls -l $fname 2>&1)"
+    cat "$fname" 2>&1 | sed 's/^/  /'
+    echo "  ================================"
+}
+
+# Run the command string given as $1 on every node in $nodes_comma via
+# onnode.  Each node prints its hostname and date followed by the
+# command's output (stdout and stderr) indented by two spaces.
+# Only $1 is used; any further arguments are ignored.
+show_all() {
+    echo "running $1 on nodes $nodes_comma"
+    onnode $nodes_comma "hostname; date; $1 2>&1 | sed 's/^/  /'" 2>&1
+}
+
+# Show each given file as found on the first node in $nodes and compare
+# the copies on all remaining nodes against it.
+#   $1      printf format for the "not readable" error; it receives the
+#           file name and the node number
+#   $2...   files to check
+# Differences are reported through error() followed by a unified diff.
+show_and_compare_files () {
+
+    fmt="$1" ; shift
+
+    for f ; do
+       first=true
+
+       for n in $nodes ; do
+
+           if $first ; then
+               # The first node supplies the reference copy.  If the file
+               # is not readable there, report it and move on to the next
+               # file (not the next node).
+               onnode $n [ -r "$f" ] || {
+                   msg=$(printf "$fmt" "$f" $n)
+                   error "$msg"
+                   continue 2;
+               }
+
+               fstf=$tmpdir/`basename $f`.node$n
+               onnode $n cat $f > $fstf 2>&1
+
+               echo "  ================================"
+               echo "  File (on node $n): $f"
+               echo "  `onnode $n ls -l $f 2>&1`"
+               cat "$fstf" | sed 's/^/  /'
+               echo "  ================================"
+               first=false
+           else
+               # Fetch this node's copy and diff it against the reference,
+               # honouring the user-selected $diff_opts
+               echo "Testing for same config file $f on node $n"
+               tmpf=$tmpdir/`basename $f`.node$n
+               onnode $n cat $f > $tmpf 2>&1
+               diff $diff_opts $fstf $tmpf >/dev/null 2>&1 || {
+                   error "File $f is different on node $n"
+                   diff -u $diff_opts $fstf $tmpf
+               }
+               rm -f $tmpf
+           fi
+       done
+
+       # Discard the reference copy before handling the next file
+       rm -f $fstf
+    done
+}
+
+# Scratch directory for fetched files and for the accumulated error list
+if ! tmpdir=$(mktemp -d) ; then
+    echo "Unable to create a temporary directory"
+    exit 1
+fi
+ERRORS="${tmpdir}/diag_err"
+NUM_ERRORS=0
+
+cat <<EOF
+Diagnosis started on these nodes:
+$nodes_comma
+EOF
+
+if [ -n "$bad_nodes" ] ; then
+    cat <<EOF
+
+NOT RUNNING DIAGNOSTICS on these uncontactable nodes:
+$bad_nodes
+EOF
+
+fi
+
+cat <<EOF
+
+For reference, here is the nodes file on the current node...
+EOF
+
+show_file /etc/ctdb/nodes
+
+cat <<EOF
+--------------------------------------------------------------------
+Comparing critical config files on nodes $nodes_comma
+EOF
+
+# Mandatory files: a file missing on the first node is reported as an error
+show_and_compare_files \
+    "%s is missing on node %d" \
+    $CONFIG_FILES_MUST
+
+# Optional files: checked the same way, only the message wording differs
+show_and_compare_files \
+    "Optional file %s is not present on node %d" \
+    $CONFIG_FILES_MAY
+
+cat <<EOF
+--------------------------------------------------------------------
+Checking for clock drift
+EOF
+for i in $nodes; do
+    # Sample the local clock right before each remote query so that the
+    # time spent contacting earlier nodes is not mistaken for drift.
+    t=`date +%s`
+    t2=`onnode $i date +%s`
+    # Skip the arithmetic if the node did not return a plain number
+    case "$t2" in
+       ''|*[!0-9]*)
+           error "unable to read the time on node $i"
+           continue
+           ;;
+    esac
+    d=`expr $t2 - $t`
+    if [ $d -gt 30 -o $d -lt -30 ]; then
+       error "time on node $i differs by $d seconds"
+    fi
+done
+
+# Report kernel and package versions from every node.  The package
+# manager checks below test for the tool on the local machine only; the
+# query itself is then run on all nodes through show_all.
+cat <<EOF
+--------------------------------------------------------------------
+Showing software versions
+EOF
+show_all "uname -a"
+[ -x /bin/rpm ] && {
+    show_all "rpm -qa | egrep 'samba|ctdb|gpfs'"
+}
+[ -x /usr/bin/dpkg-query ] && {
+    show_all "/usr/bin/dpkg-query --show 'ctdb'"
+    show_all "/usr/bin/dpkg-query --show 'samba'"
+    #show_all "/usr/bin/dpkg-query --show 'gpfs'"
+}
+
+
+# ctdb runtime state followed by the tail of the relevant logs and a
+# listing of the ctdb state and configuration directories
+cat <<EOF
+--------------------------------------------------------------------
+Showing ctdb status and recent log entries
+EOF
+show_all "ctdb status; ctdb ip"
+show_all "ctdb statistics"
+show_all "ctdb uptime"
+show_all "ctdb listvars"
+show_all "ctdb getdbmap"
+
+# log.ctdb is shown once; the identical second dump was redundant
+echo "Showing log.ctdb"
+show_all "test -f /var/log/log.ctdb && tail -100 /var/log/log.ctdb"
+
+show_all "tail -200 /var/log/messages"
+show_all "tail -200 /etc/ctdb/state/vacuum.log"
+show_all "ls -lRs /var/ctdb"
+show_all "ls -lRs /etc/ctdb"
+
+
+# General system, storage, network and NFS state from every node.
+# The optional tools at the end are only queried when they exist on the
+# local machine.
+cat <<EOF
+--------------------------------------------------------------------
+Showing system and process status
+EOF
+show_all "df"
+show_all "df -i"
+show_all "mount"
+show_all "w"
+show_all "ps axfwu"
+show_all "dmesg"
+show_all "/sbin/lspci"
+show_all "dmidecode"
+show_all "cat /proc/partitions"
+show_all "cat /proc/cpuinfo"
+show_all "cat /proc/scsi/scsi"
+show_all "/sbin/ifconfig -a"
+show_all "/sbin/ip addr list"
+show_all "/sbin/route -n"
+show_all "netstat -s"
+show_all "free"
+show_all "crontab -l"
+show_all "sysctl -a"
+show_all "iptables -L -n"
+show_all "iptables -L -n -t nat"
+show_all "/usr/sbin/rpcinfo -p"
+show_all "/usr/sbin/showmount -a"
+show_all "/usr/sbin/showmount -e"
+show_all "/usr/sbin/nfsstat -v"
+[ -x /sbin/multipath ] && {
+    show_all "/sbin/multipath -ll"
+}
+[ -x /sbin/chkconfig ] && {
+    show_all "/sbin/chkconfig --list"
+}
+[ -x /usr/sbin/getenforce ] && {
+    show_all "/usr/sbin/getenforce"
+}
+[ -d /proc/net/bonding ] && {
+    for f in /proc/net/bonding/*; do
+       show_all "cat $f"
+    done
+}
+
+cat <<EOF
+--------------------------------------------------------------------
+Showing Samba status
+EOF
+show_all "smbstatus -n -B"
+if $no_ads ; then
+    echo
+    echo "Skipping \"net ads testjoin\" as requested"
+    echo
+else
+    show_all "net ads testjoin"
+fi
+show_all "net conf list"
+show_all "lsof -n | grep smbd"
+show_all "lsof -n | grep ctdbd"
+show_all "netstat -tan"
+if $no_ads ; then
+    echo
+    echo "Skipping \"net ads info\" as requested"
+    echo
+else
+    show_all "net ads info"
+fi
+show_all "date"
+show_all "smbclient -U% -L 127.0.0.1"
+WORKGROUP=`testparm -s --parameter-name=WORKGROUP 2> /dev/null`
+# show_all only looks at its first argument, so the whole command has
+# to be passed as a single string (otherwise a bare "id" is run).
+show_all "id $WORKGROUP/Administrator"
+show_all "wbinfo -p"
+show_all "wbinfo --online-status"
+show_all "smbd -b"
+
+date
+echo "Diagnostics finished with $NUM_ERRORS errors"
+
+# Repeat all recorded errors as a summary at the end of the report
+[ -r $ERRORS ] && {
+    cat $ERRORS
+    rm -f $ERRORS
+}
+
+rm -rf "$tmpdir"
+
+# The exit status is the number of errors found
+exit $NUM_ERRORS
+
diff --git a/ctdb/tools/ctdb_vacuum.c b/ctdb/tools/ctdb_vacuum.c
new file mode 100644 (file)
index 0000000..808b15c
--- /dev/null
@@ -0,0 +1,193 @@
+/* 
+   ctdb control tool - database vacuum 
+
+   Copyright (C) Andrew Tridgell  2008
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include "includes.h"
+#include "system/filesys.h"
+#include "system/network.h"
+#include "../include/ctdb_client.h"
+#include "../include/ctdb_private.h"
+#include "../common/rb_tree.h"
+#include "db_wrap.h"
+
+/* should be tunable */
+#define TIMELIMIT() timeval_current_ofs(10, 0)
+
+
+/* state handed to repack_traverse() through the traverse private pointer */
+struct vacuum_traverse_state {
+       bool error;                     /* set when a store into dest_db fails */
+       struct tdb_context *dest_db;    /* database the records are copied into */
+};
+
+/*
+  traverse callback used while repacking: insert the visited record
+  into the destination database named in the traverse state
+ */
+static int repack_traverse(struct tdb_context *tdb, TDB_DATA key, TDB_DATA data, void *private)
+{
+       struct vacuum_traverse_state *st = private;
+
+       if (tdb_store(st->dest_db, key, data, TDB_INSERT) == 0) {
+               return 0;
+       }
+
+       /* remember the failure for the caller and return non-zero */
+       st->error = true;
+       return -1;
+}
+
+/*
+  repack a tdb
+
+  Inside one transaction on 'tdb': copy every record into an internal
+  scratch database, wipe 'tdb', copy the records back and commit.
+  Returns 0 on success and -1 on any failure; failures before the
+  commit cancel the transaction.
+ */
+static int ctdb_repack_tdb(struct tdb_context *tdb)
+{
+       struct tdb_context *scratch = NULL;
+       struct vacuum_traverse_state st;
+
+       if (tdb_transaction_start(tdb) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to start transaction\n"));
+               return -1;
+       }
+
+       scratch = tdb_open("tmpdb", tdb_hash_size(tdb),
+                          TDB_INTERNAL|TDB_DISALLOW_NESTING,
+                          O_RDWR|O_CREAT, 0);
+       if (scratch == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to create tmp_db\n"));
+               goto fail;
+       }
+
+       /* first pass: tdb -> scratch */
+       st.error = false;
+       st.dest_db = scratch;
+
+       if (tdb_traverse_read(tdb, repack_traverse, &st) == -1) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to traverse copying out\n"));
+               goto fail;
+       }
+
+       if (st.error) {
+               DEBUG(DEBUG_ERR,(__location__ " Error during traversal\n"));
+               goto fail;
+       }
+
+       if (tdb_wipe_all(tdb) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to wipe database\n"));
+               goto fail;
+       }
+
+       /* second pass: scratch -> tdb */
+       st.error = false;
+       st.dest_db = tdb;
+
+       if (tdb_traverse_read(scratch, repack_traverse, &st) == -1) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to traverse copying back\n"));
+               goto fail;
+       }
+
+       if (st.error) {
+               DEBUG(DEBUG_ERR,(__location__ " Error during second traversal\n"));
+               goto fail;
+       }
+
+       tdb_close(scratch);
+
+       /* a failed commit is reported without cancelling */
+       if (tdb_transaction_commit(tdb) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to commit\n"));
+               return -1;
+       }
+
+       return 0;
+
+fail:
+       /* common error exit: cancel first, then drop the scratch db if any */
+       tdb_transaction_cancel(tdb);
+       if (scratch != NULL) {
+               tdb_close(scratch);
+       }
+       return -1;
+}
+
+
+/*
+  repack one database
+
+  Looks up the name of 'db_id' on the current node, attaches to it and
+  repacks the local tdb when its freelist holds more than
+  'repack_limit' entries.  Returns 0 on success (including when no
+  repack was needed) and -1 on failure.
+ */
+static int ctdb_repack_db(struct ctdb_context *ctdb, uint32_t db_id, 
+                         bool persistent, uint32_t repack_limit)
+{
+       struct ctdb_db_context *ctdb_db;
+       const char *name;
+       int size;
+
+       if (ctdb_ctrl_getdbname(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, db_id, ctdb, &name) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to get name of db 0x%x\n", db_id));
+               return -1;
+       }
+
+       ctdb_db = ctdb_attach(ctdb, TIMELIMIT(), name, persistent, 0);
+       if (ctdb_db == NULL) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to attach to database '%s'\n", name));
+               return -1;
+       }
+
+       size = tdb_freelist_size(ctdb_db->ltdb->tdb);
+       if (size < 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to get freelist size for '%s'\n", name));
+               return -1;
+       }
+
+       /* size is non-negative here, so the conversion cannot wrap */
+       if ((uint32_t)size <= repack_limit) {
+               return 0;
+       }
+
+       /* size is an int: print it with the matching conversion */
+       printf("Repacking %s with %d freelist entries\n", name, size);
+
+       if (ctdb_repack_tdb(ctdb_db->ltdb->tdb) != 0) {
+               DEBUG(DEBUG_ERR,(__location__ " Failed to repack '%s'\n", name));
+               return -1;
+       }
+
+       return 0;
+}
+
+
+/*
+  repack all our databases
+
+  argv[0], when present, is the freelist size above which a database is
+  repacked (default 10000).  It must be a non-negative decimal number.
+  Returns 0 on success, non-zero on failure.
+ */
+int ctdb_repack(struct ctdb_context *ctdb, int argc, const char **argv)
+{
+       struct ctdb_dbid_map *dbmap=NULL;
+       int ret, i;
+       /* a reasonable default limit to prevent us using too much memory */
+       uint32_t repack_limit = 10000; 
+
+       if (argc > 0) {
+               char *end = NULL;
+               long val = strtol(argv[0], &end, 10);
+
+               /*
+                 reject empty or non-numeric input, trailing garbage,
+                 negative numbers and values that do not fit the limit,
+                 instead of silently turning them into 0 or a huge limit
+                */
+               if (end == argv[0] || *end != '\0' || val < 0 ||
+                   (long)(uint32_t)val != val) {
+                       DEBUG(DEBUG_ERR, ("Invalid repack limit '%s'\n", argv[0]));
+                       return -1;
+               }
+               repack_limit = (uint32_t)val;
+       }
+
+       ret = ctdb_ctrl_getdbmap(ctdb, TIMELIMIT(), CTDB_CURRENT_NODE, ctdb, &dbmap);
+       if (ret != 0) {
+               DEBUG(DEBUG_ERR, ("Unable to get dbids from local node\n"));
+               return ret;
+       }
+
+       /* stop at the first database that fails to repack */
+       for (i=0;i<dbmap->num;i++) {
+               if (ctdb_repack_db(ctdb, dbmap->dbs[i].dbid, 
+                                  dbmap->dbs[i].flags & CTDB_DB_FLAGS_PERSISTENT, repack_limit) != 0) {
+                       DEBUG(DEBUG_ERR,("Failed to repack db 0x%x\n", dbmap->dbs[i].dbid));
+                       return -1;
+               }
+       }
+
+       return 0;
+}
diff --git a/ctdb/tools/ltdbtool.c b/ctdb/tools/ltdbtool.c
new file mode 100644 (file)
index 0000000..13fd7f9
--- /dev/null
@@ -0,0 +1,395 @@
+/*
+ * ctdb local tdb tool
+ *
+ * Copyright (C) Gregor Beck 2011
+ * Copyright (C) Michael Adam 2011
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <ctype.h> /* isprint */
+#include <string.h> /* strstr */
+#include <fcntl.h> /* mode_t */
+#include <sys/stat.h> /* S_IRUSR */
+#include <stdint.h> /* uint32_t */
+#include <netinet/in.h> /* struct sockaddr_in */
+#include <sys/socket.h> /* struct sockaddr */
+#include <sys/param.h>  /* MIN */
+#include <tdb.h>
+#include <unistd.h> /* getopt */
+#include <errno.h>
+
+#include "ctdb_protocol.h"
+
+/* compile-time constants */
+enum {
+       MAX_HEADER_SIZE=24,                     /* largest record header handled, in bytes */
+       OUT_MODE = S_IRUSR | S_IWUSR,           /* permissions for the output database */
+       OUT_FLAGS = O_EXCL|O_CREAT|O_RDWR,      /* open flags for the output database */
+};
+
+/*
+ * A record header viewed either as the ctdb structure or as raw 32 bit
+ * words, so that bytes beyond the structure's fields can be inspected.
+ */
+union  ltdb_header {
+       struct ctdb_ltdb_header hdr;
+       uint32_t uints[MAX_HEADER_SIZE/4];
+};
+
+/* initial header value: dmaster is set to -1, everything else is zero */
+static const union ltdb_header DEFAULT_HDR = {
+       .hdr.dmaster = -1,
+};
+
+/* Print the full help text to stdout; 'cmd' is the program name. Returns 0. */
+static int help(const char* cmd)
+{
+       fprintf(stdout, ""
+"Usage: %s [options] <command>\n"
+"\n"
+"Options:\n"
+"   -s {0|32|64}    specify how to determine the ctdb record header size\n"
+"                   for the input database:\n"
+"                   0: no ctdb header\n"
+"                   32: ctdb header size of a 32 bit system (20 bytes)\n"
+"                   64: ctdb header size of a 64 bit system (24 bytes)\n"
+"                   default: 32 or 64 depending on the system architecture\n"
+"\n"
+"   -S <num>        the number of bytes to interpret as ctdb record header\n"
+"                   for the input database (beware!)\n"
+"\n"
+"   -o {0|32|64}    specify how to determine the ctdb record header size\n"
+"                   for the output database\n"
+"                   0: no ctdb header\n"
+"                   32: ctdb header size of a 32 bit system (20 bytes)\n"
+"                   64: ctdb header size of a 64 bit system (24 bytes)\n"
+"                   default: 32 or 64 depending on the system architecture\n"
+"\n"
+"   -O <num>        the number of bytes to interpret as ctdb record header\n"
+"                   for the output database (beware!)\n"
+"\n"
+"   -e              Include empty records, defaults to off\n"
+"\n"
+"   -p              print header (for the dump command), defaults to off\n"
+"\n"
+"   -h              print this help\n"
+"\n"
+"Commands:\n"
+"  help                         print this help\n"
+"  dump <db>                    dump the db to stdout\n"
+"  convert <in_db> <out_db>     convert the db\n\n", cmd);
+       return 0;
+}
+
+/*
+ * Print a short synopsis to stderr; 'cmd' is the program name.
+ * Returns -1 so callers can "return usage(...)" to signal an error.
+ */
+static int usage(const char* cmd)
+{
+       fprintf(stderr,
+               "Usage: %s dump [-e] [-p] [-s{0|32|64}] <idb>\n"
+               "       %s convert [-e] [-s{0|32|64}] [-o{0|32|64}] <idb> <odb>\n"
+               "       %s {help|-h}\n"
+               , cmd, cmd, cmd);
+       return -1;
+}
+
+/*
+ * Traverse 'tdb', split every value into ctdb header and payload and
+ * call 'fn' with both.  Defined further down in this file.
+ */
+static int
+ltdb_traverse(TDB_CONTEXT *tdb, int (*fn)(TDB_CONTEXT*, TDB_DATA, TDB_DATA,
+                                         struct ctdb_ltdb_header*, void *),
+             void *state, int hsize, bool skip_empty);
+
+/* state for write_record() */
+struct write_record_ctx {
+       TDB_CONTEXT* tdb;       /* database to write to */
+       size_t hsize;           /* header bytes to write; 0 means no header */
+       int tdb_store_flags;    /* flags passed to tdb_store() */
+};
+
+/* ltdb_traverse() callback that stores a record into another database */
+static int
+write_record(TDB_CONTEXT* tdb, TDB_DATA key, TDB_DATA val,
+            struct ctdb_ltdb_header* hdr,
+            void* write_record_ctx);
+
+
+/* state for dump_record() */
+struct dump_record_ctx {
+       FILE* file;                             /* stream to print to */
+       void (*print_data)(FILE*, TDB_DATA);    /* formatter for key and value */
+       void (*dump_header)(struct dump_record_ctx*, struct ctdb_ltdb_header*);
+                                               /* header printer */
+};
+
+/* ltdb_traverse() callback that prints a record */
+static int dump_record(TDB_CONTEXT* tdb, TDB_DATA key, TDB_DATA val,
+                      struct ctdb_ltdb_header* hdr,
+                      void* dump_record_ctx);
+static void print_data_tdbdump(FILE* file, TDB_DATA data);
+static void dump_header_full(struct dump_record_ctx*, struct ctdb_ltdb_header*);
+/* header printer that prints nothing */
+static void dump_header_nop(struct dump_record_ctx* c,
+                           struct ctdb_ltdb_header* h)
+{}
+
+/*
+ * Dump all records of the database 'iname' to 'ofile'.
+ *
+ * hsize:       header size handed to ltdb_traverse()
+ * dump_header: print the ctdb header of each record as well
+ * empty:       also print records that consist of a header only
+ *
+ * Returns -1 if the database cannot be opened, otherwise the result of
+ * ltdb_traverse().
+ */
+static int dump_db(const char* iname, FILE* ofile, int hsize, bool dump_header,
+                  bool empty)
+{
+       struct dump_record_ctx dump_ctx;
+       TDB_CONTEXT* idb;
+       int ret;
+
+       idb = tdb_open(iname, 0, TDB_DEFAULT, O_RDONLY, 0);
+       if (idb == NULL) {
+               perror("tdbopen in");
+               return -1;
+       }
+
+       dump_ctx.file = ofile;
+       dump_ctx.print_data = &print_data_tdbdump;
+       if (dump_header) {
+               dump_ctx.dump_header = &dump_header_full;
+       } else {
+               dump_ctx.dump_header = &dump_header_nop;
+       }
+
+       ret = ltdb_traverse(idb, &dump_record, &dump_ctx, hsize, !empty);
+       tdb_close(idb);
+
+       return ret;
+}
+
+/*
+ * Copy all records of 'iname' into the new database 'oname', reading
+ * headers of 'isize' bytes and writing headers of 'osize' bytes.
+ * Header-only records are copied only when 'keep_empty' is set.
+ *
+ * Returns -1 if either database cannot be opened, otherwise the result
+ * of ltdb_traverse().
+ */
+static int conv_db(const char* iname, const char* oname, size_t isize,
+                  size_t osize, bool keep_empty)
+{
+       struct write_record_ctx ctx;
+       TDB_CONTEXT* idb;
+       TDB_CONTEXT* odb;
+       int ret;
+
+       idb = tdb_open(iname, 0, TDB_DEFAULT, O_RDONLY, 0);
+       if (idb == NULL) {
+               perror("tdbopen in");
+               return -1;
+       }
+
+       odb = tdb_open(oname, 0, TDB_DEFAULT, OUT_FLAGS, OUT_MODE);
+       if (odb == NULL) {
+               perror("tdbopen out");
+               tdb_close(idb);
+               return -1;
+       }
+
+       ctx.tdb = odb;
+       ctx.hsize = osize;
+       ctx.tdb_store_flags = TDB_REPLACE;
+
+       ret = ltdb_traverse(idb, &write_record, &ctx, isize, !keep_empty);
+
+       /* close the output first, then the input */
+       tdb_close(odb);
+       tdb_close(idb);
+
+       return ret;
+}
+
+/*
+ * Parse a header size argument.
+ *
+ * size: receives the header size in bytes, capped at MAX_HEADER_SIZE
+ * arg:  the option argument; must be a non-negative decimal number
+ * raw:  if true 'arg' is a byte count; otherwise it names an
+ *       architecture: 0 (no header), 32 (20 bytes) or 64 (24 bytes)
+ *
+ * Returns false, leaving *size untouched, if 'arg' is empty, is not a
+ * number, has trailing characters, is negative or out of range, or (in
+ * non-raw mode) is not one of the accepted values.
+ */
+static bool parse_size(size_t* size, const char* arg, bool raw) {
+       char *end = NULL;
+       long val;
+
+       errno = 0;
+       val = strtol(arg, &end, 10);
+       if (errno != 0) {
+               return false;
+       }
+       /*
+        * strtol() reports neither "no digits" nor trailing garbage through
+        * errno, and a negative value would wrap to a huge size_t below.
+        */
+       if (end == arg || *end != '\0' || val < 0) {
+               return false;
+       }
+       if (!raw) {
+               switch(val) {
+               case 0:
+                       break;
+               case 32:
+                       val = 20;
+                       break;
+               case 64:
+                       val = 24;
+                       break;
+               default:
+                       return false;
+               }
+       }
+       *size = MIN(val, MAX_HEADER_SIZE);
+       return true;
+}
+
+
+/*
+ * Entry point: parse the options, then run the "help", "dump" or
+ * "convert" command.  Returns 0 on success and a negative value on
+ * usage errors or when the command fails.
+ */
+int main(int argc, char* argv[])
+{
+       size_t isize = sizeof(struct ctdb_ltdb_header);
+       size_t osize = sizeof(struct ctdb_ltdb_header);
+       bool print_header = false;
+       bool keep_empty = false;
+       const char *prog = argv[0];
+       const char *cmd;
+       int nargs;
+       int ret;
+       int opt;
+
+       while ((opt = getopt(argc, argv, "s:o:S:O:phe")) != -1) {
+               switch (opt) {
+               case 's':
+               case 'S':
+                       /* the upper case variant takes a raw byte count */
+                       if (!parse_size(&isize, optarg, opt == 'S')) {
+                               return usage(prog);
+                       }
+                       break;
+               case 'o':
+               case 'O':
+                       if (!parse_size(&osize, optarg, opt == 'O')) {
+                               return usage(prog);
+                       }
+                       break;
+               case 'p':
+                       print_header = true;
+                       break;
+               case 'e':
+                       keep_empty = true;
+                       break;
+               case 'h':
+                       return help(prog);
+               default:
+                       return usage(prog);
+               }
+       }
+
+       /* number of non-option arguments, the command name included */
+       nargs = argc - optind;
+       if (nargs < 1) {
+               return usage(prog);
+       }
+
+       cmd = argv[optind];
+
+       if (strcmp(cmd, "help") == 0) {
+               return help(prog);
+       }
+
+       if (strcmp(cmd, "dump") == 0) {
+               if (nargs != 2) {
+                       return usage(prog);
+               }
+               ret = dump_db(argv[optind+1], stdout, isize, print_header,
+                             keep_empty);
+               return (ret < 0) ? ret : 0;
+       }
+
+       if (strcmp(cmd, "convert") == 0) {
+               if (nargs != 3) {
+                       return usage(prog);
+               }
+               ret = conv_db(argv[optind+1], argv[optind+2], isize, osize,
+                             keep_empty);
+               return (ret < 0) ? ret : 0;
+       }
+
+       return usage(prog);
+}
+
+/* state passed from ltdb_traverse() to ltdb_traverse_fn() */
+struct ltdb_traverse_ctx {
+       int (*fn)(TDB_CONTEXT*,TDB_DATA,TDB_DATA,struct ctdb_ltdb_header*,void *);
+       void* state;            /* opaque pointer handed on to fn */
+       size_t hsize;           /* number of header bytes in each value */
+       bool skip_empty;        /* skip values that consist of a header only */
+       unsigned nempty;        /* number of records skipped so far */
+};
+
+/*
+ * tdb_traverse() callback: split 'val' into ctdb header and payload and
+ * pass both on to ctx->fn.
+ *
+ * Returns -1 if the value is shorter than the header size, 0 for a
+ * skipped header-only record, otherwise whatever ctx->fn returns.
+ */
+static int
+ltdb_traverse_fn(TDB_CONTEXT* tdb, TDB_DATA key, TDB_DATA val,
+                void* ltdb_traverse_ctx)
+{
+       struct ltdb_traverse_ctx* ctx =
+               (struct ltdb_traverse_ctx*)ltdb_traverse_ctx;
+       union ltdb_header hdr = DEFAULT_HDR;
+
+       /* never copy more than the local header union can hold */
+       const size_t hsize = MIN(sizeof(hdr), ctx->hsize);
+       if (val.dsize < hsize) {
+               fprintf(stderr, "Value too short to contain a ctdb header: ");
+               print_data_tdbdump(stderr, key);
+               fprintf(stderr, " = ");
+               print_data_tdbdump(stderr, val);
+               fputc('\n', stderr);
+               return -1;
+       }
+       if (val.dsize == hsize && ctx->skip_empty) {
+               ctx->nempty++;
+               return 0;
+       }
+
+       memcpy(&hdr, val.dptr, hsize);
+
+       /* uints[5] covers header bytes 20..23; warn if they are not zero */
+       if (hdr.uints[5] != 0) {
+               fprintf(stderr, "Warning: header padding isn't zero! Wrong header size?\n");
+       }
+       /* step over the header so that val describes the payload only */
+       val.dptr += ctx->hsize;
+       val.dsize -= ctx->hsize;
+       return ctx->fn(tdb, key, val, &hdr.hdr, ctx->state);
+}
+
+/*
+ * Traverse 'tdb' and call 'fn' for every record with the ctdb header
+ * separated from the payload.
+ *
+ * hsize:      header size in bytes; a negative value selects
+ *             sizeof(struct ctdb_ltdb_header)
+ * skip_empty: do not call 'fn' for header-only records
+ *
+ * Returns a negative tdb_traverse() result unchanged; otherwise the
+ * tdb_traverse() result minus the number of skipped records.
+ */
+int ltdb_traverse(TDB_CONTEXT *tdb,
+                 int (*fn)(TDB_CONTEXT *,TDB_DATA,TDB_DATA,struct ctdb_ltdb_header*,void *),
+                 void *state, int hsize, bool skip_empty)
+{
+       struct ltdb_traverse_ctx ctx;
+       int visited;
+
+       ctx.fn = fn;
+       ctx.state = state;
+       if (hsize < 0) {
+               ctx.hsize = sizeof(struct ctdb_ltdb_header);
+       } else {
+               ctx.hsize = hsize;
+       }
+       ctx.skip_empty = skip_empty;
+       ctx.nempty = 0;
+
+       visited = tdb_traverse(tdb, &ltdb_traverse_fn, &ctx);
+       if (visited < 0) {
+               return visited;
+       }
+
+       return visited - ctx.nempty;
+}
+
+/*
+ * ltdb_traverse() callback: store one record in the output database.
+ *
+ * With a header size of 0 only the payload is stored.  Otherwise the
+ * first hsize bytes of 'hdr' are stored and the payload is appended.
+ * Returns 0 on success, -1 if a tdb operation fails.
+ */
+int write_record(TDB_CONTEXT* tdb, TDB_DATA key, TDB_DATA val,
+                struct ctdb_ltdb_header* hdr,
+                void* write_record_ctx)
+{
+       struct write_record_ctx* wctx = write_record_ctx;
+       TDB_DATA header;
+
+       if (wctx->hsize == 0) {
+               /* headerless output: the payload is the whole record */
+               if (tdb_store(wctx->tdb, key, val, wctx->tdb_store_flags) == -1) {
+                       fprintf(stderr, "tdb_store: %s\n", tdb_errorstr(wctx->tdb));
+                       return -1;
+               }
+               return 0;
+       }
+
+       header.dptr = (void*)hdr;
+       header.dsize = wctx->hsize;
+
+       if (tdb_store(wctx->tdb, key, header, wctx->tdb_store_flags) == -1) {
+               fprintf(stderr, "tdb_store: %s\n", tdb_errorstr(wctx->tdb));
+               return -1;
+       }
+
+       if (tdb_append(wctx->tdb, key, val) == -1) {
+               fprintf(stderr, "tdb_append: %s\n", tdb_errorstr(wctx->tdb));
+               return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * ltdb_traverse() callback: print one record as a brace-enclosed block
+ * holding the key, the header (via the configured header printer) and
+ * the data.  Always returns 0.
+ */
+int dump_record(TDB_CONTEXT* tdb, TDB_DATA key, TDB_DATA val,
+               struct ctdb_ltdb_header* hdr,
+               void* dump_record_ctx)
+{
+       struct dump_record_ctx* dctx = dump_record_ctx;
+       FILE* out = dctx->file;
+
+       fprintf(out, "{\nkey(%d) = ", (int)key.dsize);
+       dctx->print_data(out, key);
+       fputc('\n', out);
+
+       dctx->dump_header(dctx, hdr);
+
+       fprintf(out, "data(%d) = ", (int)val.dsize);
+       dctx->print_data(out, val);
+       fprintf(out, "\n}\n");
+
+       return 0;
+}
+
+/* header printer that prints the dmaster, rsn and flags fields */
+void dump_header_full(struct dump_record_ctx* c, struct ctdb_ltdb_header* h)
+{
+       const int dmaster = (int)h->dmaster;
+       const unsigned long long rsn = (unsigned long long)h->rsn;
+
+       fprintf(c->file, "dmaster: %d\nrsn: %llu\nflags: 0x%X\n",
+               dmaster, rsn, h->flags);
+}
+
+/*
+ * Print 'data' as a double-quoted string.  Printable bytes are written
+ * as they are; double quotes, backslashes and non-printable bytes are
+ * written as a backslash followed by two upper case hex digits.
+ */
+void print_data_tdbdump(FILE* file, TDB_DATA data) {
+       size_t i;
+
+       fputc('"', file);
+       for (i = 0; i < data.dsize; i++) {
+               const unsigned char c = data.dptr[i];
+               const bool plain = isprint(c) && c != '"' && c != '\\';
+
+               if (plain) {
+                       fputc(c, file);
+               } else {
+                       fprintf(file, "\\%02X", c);
+               }
+       }
+       fputc('"',file);
+}
+
diff --git a/ctdb/tools/onnode b/ctdb/tools/onnode
new file mode 100755 (executable)
index 0000000..0abc136
--- /dev/null
@@ -0,0 +1,419 @@
+#!/bin/bash
+
+# Run commands on CTDB nodes.
+
+# See http://ctdb.samba.org/ for more information about CTDB.
+
+# Copyright (C) Martin Schwenke  2008
+
+# Based on an earlier script by Andrew Tridgell and Ronnie Sahlberg.
+
+# Copyright (C) Andrew Tridgell  2007
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the License, or
+# (at your option) any later version.
+   
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+   
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
+
+prog=$(basename $0)
+
+# Print the usage message on stderr and exit with status 1.
+usage ()
+{
+    cat >&2 <<EOF
+Usage: onnode [OPTION] ... <NODES> <COMMAND> ...
+  options:
+    -c          Run in current working directory on specified nodes.
+    -o <prefix> Save standard output from each node to file <prefix>.<ip>
+    -p          Run command in parallel on specified nodes.
+    -q          Do not print node addresses (overrides -v).
+    -n          Allow nodes to be specified by name.
+    -f          Specify nodes file, overrides CTDB_NODES_FILE.
+    -v          Print node address even for a single node.
+    -P          Push given files to nodes instead of running commands.
+  <NODES>       "all", "any", "ok" (or "healthy"), "con" (or "connected"),
+                "rm" (or "recmaster"), "lvs" (or "lvsmaster"),
+                "natgw" (or "natgwlist"); or
+                a node number (0 base); or
+                a hostname (if -n is specified); or
+                list (comma separated) of <NODES>; or
+                range (hyphen separated) of node numbers.
+EOF
+    exit 1
+
+}
+
+invalid_nodespec ()
+{
+    echo "Invalid <nodespec>" >&2 ; echo >&2
+    usage
+}
+
+# Defaults.
+current=false
+parallel=false
+verbose=false
+quiet=false
+prefix=""
+names_ok=false
+push=false
+
+ctdb_base="${CTDB_BASE:-/etc/ctdb}"
+
+# Parse the onnode command line.
+#
+# Sets the option globals (current, parallel, verbose, quiet, prefix,
+# names_ok, push, CTDB_NODES_FILE), then sets nodespec to the first
+# remaining argument and command to all the others.  Calls usage (which
+# exits) on a getopt error, on -h/--help, or when fewer than two
+# non-option arguments remain.
+parse_options ()
+{
+    # $POSIXLY_CORRECT means that the command passed to onnode can
+    # take options and getopt won't reorder things to make them
+    # options to onnode.
+    local temp
+    # Not on the previous line - local returns 0!
+    temp=$(POSIXLY_CORRECT=1 getopt -n "$prog" -o "cf:hno:pqvP" -l help -- "$@")
+
+    [ $? != 0 ] && usage
+
+    # Replace the positional parameters with getopt's normalised output.
+    eval set -- "$temp"
+
+    while true ; do
+       case "$1" in
+           -c) current=true ; shift ;;
+           -f) CTDB_NODES_FILE="$2" ; shift 2 ;;
+           -n) names_ok=true ; shift ;;
+           -o) prefix="$2" ; shift 2 ;;
+           -p) parallel=true ; shift ;;
+           -q) quiet=true ; shift ;;
+           -v) verbose=true ; shift ;;
+           -P) push=true ; shift ;;
+           --) shift ; break ;;
+           -h|--help|*) usage ;; # Shouldn't happen, so this is reasonable.
+       esac
+    done
+
+    # Need at least <NODES> and one word of <COMMAND>.
+    [ $# -lt 2 ] && usage
+
+    nodespec="$1" ; shift
+    # The remaining words are joined into a single string.
+    command="$@"
+}
+
+echo_nth ()
+{
+    local n="$1" ; shift
+
+    shift $n
+    local node="$1"
+
+    if [ -n "$node" -a "$node" != "#DEAD" ] ; then
+       echo $node
+    else
+       echo "${prog}: \"node ${n}\" does not exist" >&2
+       exit 1
+    fi
+}
+
+# Expand the comma-separated node specification in $1 into one item per
+# line on stdout.
+#
+# Ranges "a-b" are expanded with seq; keywords are passed through
+# unchanged; anything else must be a non-negative integer unless names
+# are allowed (-n).  Invalid items end up in invalid_nodespec.
+parse_nodespec ()
+{
+    # Subshell avoids hacks to restore $IFS.
+    (
+       IFS=","
+       for i in $1 ; do
+           case "$i" in
+               # Range: text before the last "-" to text after the first "-".
+               *-*) seq "${i%-*}" "${i#*-}" 2>/dev/null || invalid_nodespec ;;
+               # Separate lines for readability.
+               all|any|ok|healthy|con|connected) echo "$i" ;;
+               rm|recmaster|lvs|lvsmaster|natgw|natgwlist) echo "$i" ;;
+               *)
+                   # The numeric test fails (quietly) for non-numbers, in
+                   # which case the item is only accepted when -n was given.
+                   [ $i -gt -1 ] 2>/dev/null || $names_ok || invalid_nodespec
+                   echo $i
+           esac
+       done
+    )
+}
+
+ctdb_status_output="" # cache
+# Print the address of every node whose line in the "ctdb -Y status"
+# output matches the requested status.
+#
+#   $1  whitespace-separated list of all node addresses
+#   $2  "healthy" or "connected"; anything else goes to invalid_nodespec
+#
+# The status output is fetched once and cached in ctdb_status_output.
+# Exits with status 1 if the ctdb command fails.
+get_nodes_with_status ()
+{
+    local all_nodes="$1"
+    local status="$2"
+
+    if [ -z "$ctdb_status_output" ] ; then
+       ctdb_status_output=$(ctdb -Y status 2>&1)
+       if [ $? -ne 0 ] ; then
+           echo "${prog}: unable to get status of CTDB nodes" >&2
+           echo "$ctdb_status_output" >&2
+           exit 1
+       fi
+       local nl="
+"
+       # Drop everything up to and including the first newline, i.e. the
+       # first line of the output (presumably a header line - confirm).
+       ctdb_status_output="${ctdb_status_output#*${nl}}"
+    fi
+
+    (
+       local i
+       # Add ":" to the field separators used by "set --" below.  The read
+       # itself runs with an empty IFS so each line arrives unmodified.
+       IFS="${IFS}:"
+       while IFS="" read i ; do
+
+           set -- $i # split line on colons
+           shift     # line starts with : so 1st field is empty
+           local pnn="$1" ; shift
+           local ip="$1" ; shift
+
+           # What is left in "$@" are the fields after pnn and ip.
+           case "$status" in
+               healthy)
+                   # If any bit is 1, don't match this address.
+                   local s
+                   for s ; do
+                       [ "$s" != "1" ] || continue 2
+                   done
+                   ;;
+               connected)
+                   # If disconnected bit is not 0, don't match this address.
+                   [ "$1" = "0" ] || continue
+                   ;;
+               *)
+                   invalid_nodespec
+           esac
+
+           # Translate the PNN into the address at that index of the list.
+           echo_nth "$pnn" $all_nodes
+       done <<<"$ctdb_status_output"
+    )
+}
+
+ctdb_props="" # cache
+# Print the address of the node reported by "ctdb <prop> -Y".
+#
+#   $1  whitespace-separated list of all node addresses
+#   $2  property name: recmaster, lvsmaster or natgwlist
+#
+# Results are cached in ctdb_props as a space-separated list of
+# ":<prop>:<node>" entries.  Exits with status 1 if the property has no
+# node or the ctdb command fails.
+get_node_with_property ()
+{
+    local all_nodes="$1"
+    local prop="$2"
+
+    local prop_node=""
+    # Look for ":<prop>:" anywhere in the cache (leading "*"), not only at
+    # the very start, so that every cached property can be found and not
+    # just the one that was cached first.
+    if [ "${ctdb_props##*:${prop}:}" = "$ctdb_props" ] ; then
+       # Not in cache.
+       prop_node=$(ctdb "$prop" -Y 2>/dev/null)
+       if [ $? -eq 0 ] ; then
+           if [ "$prop" = "natgwlist" ] ; then
+               prop_node="${prop_node%% *}" # 1st word
+               if [ "$prop_node" = "-1" ] ; then
+                   # This works around natgwlist returning 0 even
+                   # when there's no natgw.
+                   prop_node=""
+               fi
+           else
+               # We only want the first line.
+               local nl="
+"
+               prop_node="${prop_node%%${nl}*}"
+           fi
+       else
+           prop_node=""
+       fi
+
+       if [ -n "$prop_node" ] ; then
+           # Add to cache.
+           ctdb_props="${ctdb_props}${ctdb_props:+ }:${prop}:${prop_node}"
+       fi
+    else
+       # Get from cache: strip everything through ":<prop>:", then keep
+       # the text up to the next space.
+       prop_node="${ctdb_props##*:${prop}:}"
+       prop_node="${prop_node%% *}"
+    fi
+
+    if [ -n "$prop_node" ] ; then
+       echo_nth "$prop_node" $all_nodes
+    else
+       echo "${prog}: No ${prop} available" >&2
+       exit 1
+    fi
+}
+
+# Print the address of the first node that answers "ctdb pnn".
+#
+#   $1  whitespace-separated list of all node addresses
+#
+# Returns 0 after printing a node, 1 if no output line starts with "PNN:".
+get_any_available_node ()
+{
+    local all_nodes="$1"
+
+    # We do a recursive onnode to find which nodes are up and running.
+    local out=$($0 -pq all ctdb pnn 2>&1)
+    local line
+    while read line ; do 
+       # Stripping the "PNN:" prefix changes the line only if it was there.
+       local pnn="${line#PNN:}"
+       if [ "$pnn" != "$line" ] ; then
+           echo_nth "$pnn" $all_nodes
+           return 0
+       fi
+       # Else must be an error message from a down node.
+    done <<<"$out"
+    return 1
+}
+
+# Resolve the node specification in $1 to node addresses, printed on
+# stdout.
+#
+# The list of all nodes comes from $CTDB_NODES_SOCKETS if set, otherwise
+# from the nodes file ($CTDB_NODES_FILE, or "nodes" in $ctdb_base).
+# Exits with status 1 if the nodes file is unreadable or an item of the
+# specification cannot be resolved.
+get_nodes ()
+{
+    local all_nodes
+
+    if [ -n "$CTDB_NODES_SOCKETS" ] ; then 
+       all_nodes="$CTDB_NODES_SOCKETS"
+    else
+       local f="${ctdb_base}/nodes"
+       if [ -n "$CTDB_NODES_FILE" ] ; then
+           f="$CTDB_NODES_FILE"
+           if [ ! -e "$f" -a "${f#/}" = "$f" ] ; then
+               # $f is relative, try in $ctdb_base
+               f="${ctdb_base}/${f}"
+           fi
+       fi
+
+       if [ ! -r "$f" ] ; then
+           echo "${prog}: unable to open nodes file  \"${f}\"" >&2
+           exit 1
+       fi
+
+       # Strip comments and spaces; a line left empty becomes the
+       # placeholder "#DEAD" so that later lines keep their position.
+       all_nodes=$(sed -e 's@#.*@@g' -e 's@ *@@g' -e 's@^$@#DEAD@' "$f")
+    fi
+
+    local nodes=""
+    local n
+    for n in $(parse_nodespec "$1") ; do
+       [ $? != 0 ] && exit 1  # Required to catch exit in above subshell.
+       case "$n" in
+           all)
+               # Every entry, with the "#DEAD" placeholders removed.
+               echo "${all_nodes//#DEAD/}"
+               ;;
+           any)
+               get_any_available_node "$all_nodes" || exit 1
+               ;;
+           ok|healthy) 
+               get_nodes_with_status "$all_nodes" "healthy" || exit 1
+               ;;
+           con|connected) 
+               get_nodes_with_status "$all_nodes" "connected" || exit 1
+               ;;
+           rm|recmaster)
+               get_node_with_property "$all_nodes" "recmaster" || exit 1
+               ;;
+           lvs|lvsmaster)
+               get_node_with_property "$all_nodes" "lvsmaster" || exit 1
+               ;;
+           natgw|natgwlist)
+               get_node_with_property "$all_nodes" "natgwlist" || exit 1
+               ;;
+           # A node number of one to three digits.
+           [0-9]|[0-9][0-9]|[0-9][0-9][0-9])
+               echo_nth $n $all_nodes
+               ;;
+           *)
+               # Anything else is passed through as a name, if -n allows it.
+               $names_ok || invalid_nodespec
+               echo $n
+       esac
+    done
+}
+
+# Copy files to a node with rsync, keeping the same path on the remote
+# side.  This takes the place of the ssh command when -P is given.
+#
+#   $1  destination host
+#   $2  whitespace-separated list of files; a name that does not start
+#       with "/" is taken relative to $PWD on both sides
+push()
+{
+    local host="$1"
+    local files="$2"
+
+    local f
+    for f in $files ; do
+        $verbose && echo "Pushing $f"
+        # Turn a relative name into an absolute one.
+        local path="$f"
+        [ "${f#/}" != "$f" ] || path="${PWD}/${f}"
+        rsync "$path" "${host}:${path}"
+    done
+}
+
+# Stand-in for ssh used when $CTDB_NODES_SOCKETS is set: run the command
+# string $2 in a local shell with CTDB_SOCKET set to $1.  File descriptor
+# 3 is pointed at /dev/null for the command.
+fakessh ()
+{
+    CTDB_SOCKET="$1" sh -c "$2" 3>/dev/null
+}
+
+# Filter for a node's standard output (reads stdin); $n is the node.
+#   - with -o: save to "<prefix>.<node>", with "/" in the node replaced
+#     by "_";
+#   - else when verbose and parallel: prefix each line with "[<node>] ";
+#   - else: pass through unchanged.
+stdout_filter ()
+{
+    if [ -n "$prefix" ] ; then
+       cat >"${prefix}.${n//\//_}"
+    elif $verbose && $parallel ; then
+       sed -e "s@^@[$n] @"
+    else
+       cat
+    fi
+}
+
+# Filter for a node's standard error (reads stdin); $n is the node.
+# When verbose and parallel, prefix each line with "[<node>] "; otherwise
+# pass through unchanged.
+stderr_filter ()
+{
+    if $verbose && $parallel ; then
+       sed -e "s@^@[$n] @"
+    else
+       cat
+    fi
+}
+
+######################################################################
+
+parse_options "$@"
+
+ssh_opts=
+if $push ; then
+    SSH=push
+    EXTRA_SSH_OPTS=""
+else
+    $current && command="cd $PWD && $command"
+
+    if [ -n "$CTDB_NODES_SOCKETS" ] ; then
+       SSH=fakessh
+       EXTRA_SSH_OPTS=""
+    else 
+       # Could "2>/dev/null || true" but want to see errors from typos in file.
+       [ -r "${ctdb_base}/onnode.conf" ] && . "${ctdb_base}/onnode.conf"
+       [ -n "$SSH" ] || SSH=ssh
+       if [ "$SSH" = "ssh" ] ; then
+           ssh_opts="-n"
+       else
+           : # rsh? All bets are off!
+       fi
+    fi
+fi
+
+######################################################################
+
+nodes=$(get_nodes "$nodespec")
+[ $? != 0 ] && exit 1   # Required to catch exit in above subshell.
+
+if $quiet ; then
+    verbose=false
+else
+    # If $nodes contains a space or a newline then assume multiple nodes.
+    nl="
+"
+    [ "$nodes" != "${nodes%[ ${nl}]*}" ] && verbose=true
+fi
+
+pids=""
+trap 'kill -TERM $pids 2>/dev/null' INT TERM
+# There's a small race here where the kill can fail if no processes
+# have been added to $pids and the script is interrupted.  However,
+# the part of the window where it matter is very small.
+retcode=0
+for n in $nodes ; do
+    set -o pipefail 2>/dev/null
+    if $parallel ; then
+       { exec 3>&1 ; { $SSH $ssh_opts $EXTRA_SSH_OPTS $n "$command" | stdout_filter >&3 ; } 2>&1 | stderr_filter ; } &
+       pids="${pids} $!"
+    else
+       if $verbose ; then
+           echo >&2 ; echo ">> NODE: $n <<" >&2
+       fi
+
+       { exec 3>&1 ; { $SSH $ssh_opts $EXTRA_SSH_OPTS $n "$command" | stdout_filter >&3 ; } 2>&1 | stderr_filter ; }
+       [ $? = 0 ] || retcode=$?
+    fi
+done
+
+$parallel && {
+    for p in $pids; do
+       wait $p
+       [ $? = 0 ] || retcode=$?
+    done
+}
+
+exit $retcode
diff --git a/ctdb/utils/nagios/README b/ctdb/utils/nagios/README
new file mode 100644 (file)
index 0000000..99fa6dc
--- /dev/null
@@ -0,0 +1,56 @@
+check_ctdb 0.3
+
+This nagios plugin is free software, and comes with ABSOLUTELY NO WARRANTY. 
+It may be used, redistributed and/or modified under the terms of the GNU 
+General Public Licence (see http://www.fsf.org/licensing/licenses/gpl.txt).
+
+CTDB plugin
+
+Usage: check_ctdb -i <info>
+    [ -t <timeout> ] [ -w <warn_range> ] [ -c <crit_range> ]
+    [ -H <host> ] [-s] [ -l <login_name> ]
+    [ -V ] [ -h ]
+
+ -?, --usage
+   Print usage information
+ -h, --help
+   Print detailed help screen
+ -V, --version
+   Print version information
+ --extra-opts=[section][@file]
+   Read options from an ini file. See http://nagiosplugins.org/extra-opts for usage
+ -i, --info=<info>
+   Information: One of scriptstatus or ping.
+ -H, --hostname=<host>
+   Host name or IP Address.
+ -s, --sudo
+   Use sudo.
+ -l, --login=<login_name>
+   The user to log in as on the remote machine.
+ -w, --warning=THRESHOLD
+   Warning threshold. See
+   http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT
+   for the threshold format.
+ -c, --critical=THRESHOLD
+   Critical threshold. See
+   http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT
+   for the threshold format.
+ -t, --timeout=INTEGER
+   Seconds before plugin times out (default: 30)
+ -v, --verbose
+   Show details for command-line debugging (can repeat up to 3 times)
+Supported commands:
+    * scriptstatus :
+        check the ctdb scriptstatus command and return CRITICAL if one of the
+        scripts fails.
+        Perfdata count the number of scripts by state (ok, disabled, error,
+        total).
+    * ping :
+        check the ctdb ping command.
+        Perfdata count the number of nodes, the total ping time and the number
+        of clients.
+        Thresholds are checked against the number of nodes.
+
+
+Copyright (c) 2011 Nantes Metropole
+
diff --git a/ctdb/utils/nagios/check_ctdb b/ctdb/utils/nagios/check_ctdb
new file mode 100644 (file)
index 0000000..837a0a4
--- /dev/null
@@ -0,0 +1,279 @@
+#!/usr/bin/perl -w
+# Nagios plugin to monitor CTDB (Clustered Trivial Database)
+#
+# License: GPL
+# Copyright (c) 2011 Nantes Metropole
+# Author: Mathieu Parent <math.parent@gmail.com>
+# Contributor(s): -
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+use strict;
+use warnings;
+# Package globals: $result and $output are shared with the helper subs
+# below and hold the status and message passed to nagios_exit.
+use vars qw($PROGNAME $VERSION $output $values $result);
+use Nagios::Plugin;
+use File::Basename;
+
+$PROGNAME = basename($0);
+$VERSION = '0.4';
+
+# Plugin object: usage line, version, long help text and default timeout.
+my $np = Nagios::Plugin->new(
+  usage => "Usage: %s -i <info>\n"
+    . "    [ -t <timeout> ] [ -w <warn_range> ] [ -c <crit_range> ]\n"
+    . "    [ -H <host> ] [-s] [ -l <login_name> ]\n"
+    . '    [ -V ] [ -h ]',
+  version => $VERSION,
+  plugin  => $PROGNAME,
+  shortname => uc($PROGNAME),
+  blurb => 'CTDB plugin',
+  extra   => "Supported commands:\n"
+    . "    * scriptstatus :\n"
+    . "        check the ctdb scriptstatus command and return CRITICAL if one of the\n"
+    . "        scripts fails.\n"
+    . "        Perfdata count the number of scripts by state (ok, disabled, error,\n"
+    . "        total).\n"
+    . "    * ping :\n"
+    . "        check the ctdb ping command.\n"
+    . "        Perfdata count the number of nodes, the total ping time and the number\n"
+    . "        of clients.\n"
+    . "        Thresholds are checked against the number of nodes.\n"
+    . "\n\nCopyright (c) 2011 Nantes Metropole",
+  timeout => 30,
+);
+
+# Command-line options.  The placeholders in the help text match the
+# usage line: -H takes a host, -l takes a login name.
+
+# Which check to run (mandatory).
+$np->add_arg(
+  spec => 'info|i=s',
+  help => "-i, --info=<info>\n"
+    . '   Information: One of scriptstatus or ping.',
+  required => 1,
+);
+
+# Remote host; when given, commands are run through ssh.
+$np->add_arg(
+  spec => 'hostname|H=s',
+  help => "-H, --hostname=<host>\n"
+    . '   Host name or IP Address.',
+  required => 0,
+);
+
+# Run commands through sudo.
+$np->add_arg(
+  spec => 'sudo|s',
+  help => "-s, --sudo\n"
+    . '   Use sudo.',
+  required => 0,
+);
+
+# User name for the ssh login.
+$np->add_arg(
+  spec => 'login|l=s',
+  help => "-l, --login=<login_name>\n"
+    . '   The user to log in as on the remote machine.',
+  required => 0,
+);
+
+$np->add_arg(
+  spec => 'warning|w=s',
+  help => "-w, --warning=THRESHOLD\n"
+    . "   Warning threshold. See\n"
+    . "   http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT\n"
+    . '   for the threshold format.',
+  required => 0,
+);
+
+$np->add_arg(
+  spec => 'critical|c=s',
+  help => "-c, --critical=THRESHOLD\n"
+    . "   Critical threshold. See\n"
+    . "   http://nagiosplug.sourceforge.net/developer-guidelines.html#THRESHOLDFORMAT\n"
+    . '   for the threshold format.',
+  required => 0,
+);
+
+# Parse the command line and copy the option values into file-scoped
+# variables used by the rest of the script.
+$np->getopts;
+
+my $info = $np->opts->info;
+my $hostname = $np->opts->hostname;
+my $login = $np->opts->login;
+my $sudo = $np->opts->sudo;
+my $warning = $np->opts->warning;
+my $critical = $np->opts->critical;
+# Set when the matching threshold contained a "%".
+# NOTE(review): neither flag is read anywhere else in this script.
+my $percw;
+my $percc;
+
+$output = "";
+
+# Strip a "%" from each threshold; a threshold that is empty afterwards
+# is treated as not given.
+if (defined($critical))
+{
+        ($percc, $critical) = check_percantage($critical);
+        $critical = undef if ($critical eq '');
+}
+
+if (defined($warning))
+{
+        ($percw, $warning) = check_percantage($warning);
+        $warning = undef if ($warning eq '');
+}
+
+$np->set_thresholds(critical => $critical, warning => $warning);
+
+# In-memory buffer that STDERR is redirected to by safe_open_command.
+my $stderr;
+
+# Start the command given in @_ with its output readable from PIPE.
+#
+# The command list is prefixed with "sudo" when --sudo was given and with
+# "ssh [-l <login>] <host>" when --hostname was given.  STDERR is saved in
+# OLDERR and redirected to the in-memory buffer $stderr; it is put back by
+# safe_close_command(), or here if the command cannot be started.  On
+# failure $result is set to CRITICAL and a message is appended to $output.
+sub safe_open_command {
+    unshift @_, "sudo" if $sudo;
+    if ($hostname) {
+        unshift @_, $hostname;
+        unshift @_, "-l", $login if $login;
+        unshift @_, "ssh";
+    }
+    open(OLDERR, ">&", \*STDERR) or die "Can't dup STDERR: $!";
+    $stderr = "";
+    close STDERR;
+    open(STDERR, ">>", \$stderr) or die "Can't open STDERR: $!";
+    if ($np->opts->verbose) {
+      print "Executing: @_\n";
+    }
+    if (!open(PIPE, '-|', @_)) {
+        $result = CRITICAL;
+        $output .= "Cannot open command '@_': $! ($stderr). ";
+        # restore STDERR: ">&" duplicates the saved handle, whereas a
+        # plain ">" does not reopen STDERR onto the original stream.
+        open(STDERR, ">&", \*OLDERR) or die "Can't dup OLDERR: $!";
+    }
+}
+
+# Close PIPE, translate the child's wait status into $result/$output, and
+# put the original STDERR back.
+#
+#   could not execute          -> CRITICAL
+#   killed by a signal         -> CRITICAL
+#   exit value 255             -> WARNING (message only if $output is empty)
+#   any other non-zero value   -> CRITICAL
+sub safe_close_command {
+    close(PIPE);
+
+    my $status = $?;
+    my $signal = $status & 127;
+    my $exit_value = $status >> 8;
+
+    if ($status == -1) {
+        $result = CRITICAL;
+        $output .= "failed to execute: $!. ";
+    } elsif ($signal) {
+        my $core = ($status & 128) ? 'with' : 'without';
+        $result = CRITICAL;
+        $output .= sprintf("child died with signal %d, %s coredump. ",
+            $signal, $core);
+    } elsif ($exit_value == 255) {
+        # ctdb returns -1=255 if any node is disconnected
+        $result = WARNING;
+        $output .= sprintf("child exited with value %d. ", $exit_value) if $output eq "";
+    } elsif ($exit_value) {
+        $result = CRITICAL;
+        $output .= sprintf("child exited with value %d. ", $exit_value);
+    }
+    # restore STDERR
+    open(STDERR, ">&OLDERR") or die "Can't dup OLDERR: $!";
+}
+
+# main :
+
+# Dispatch on the requested check.  Each branch ends in nagios_exit with
+# the accumulated $result and $output.
+if ($info eq "scriptstatus") {
+    $result = OK;
+    safe_open_command('ctdb', '-Y', 'scriptstatus');
+    if ($result == OK) {
+        my $script_count = 0;
+        my $ok_script_count = 0;
+        my $disabled_script_count = 0;
+        my $error_script_count = 0;
+        while (<PIPE>) {
+            next if $. == 1; # Header
+            $script_count++;
+            # Removes the last character of the line unconditionally
+            # (assumes every line ends with a newline).
+            chop;
+            my ($col0, $type, $name, $code, $status, $start, $end, @error) = split(":");
+            if ($col0 ne '') {
+              # Old version, before 30 Aug 2011 and commit a779d83a6213
+              ($type, $name, $code, $status, $start, $end, @error) = ($col0, $type, $name, $code, $status, $start, $end, @error);
+            }
+            # The error text may itself contain colons; put it back together.
+            my $error = join(':', @error);
+            if ($error ne "") {
+                $output = "$output ;; " if $output;
+                $output = "$output$name ($status=$code): $error ";
+                if ($result != CRITICAL) {
+                    $result = WARNING;
+                }
+            }
+            if ($status eq "OK") {
+                $ok_script_count++;
+                next;
+            }
+            if ($status eq "DISABLED") {
+                $disabled_script_count++;
+                next;
+            }
+            # Any other status counts as an error.
+            # NOTE(review): this yields WARNING, while the plugin's help
+            # text says CRITICAL is returned when a script fails - confirm
+            # which is intended.
+            $error_script_count++;
+            $result = WARNING;
+        }
+        safe_close_command();
+        $np->add_perfdata(label => "ok", value => $ok_script_count, uom => '',
+            min => 0, max => $script_count);
+        $np->add_perfdata(label => "disabled", value => $disabled_script_count, uom => '',
+            min => 0, max => $script_count);
+        $np->add_perfdata(label => "error", value => $error_script_count, uom => '',
+            min => 0, max => $script_count, warning => '0', critical => '0');
+        $np->add_perfdata(label => "total", value => $script_count, uom => '',
+            min => 0, max => $script_count);
+        if ($result == OK) {
+            $result = $np->check_threshold(check => $error_script_count, warning => '0', critical => '0');
+        }
+     }
+    $np->nagios_exit($result, $output);
+} elsif ($info eq "ping") {
+    # Get expected nodes count
+    $result = OK;
+    safe_open_command('cat', '/etc/ctdb/nodes');
+    # Read to the end of the file; $. then holds the number of lines read.
+    1 while( <PIPE> );
+    my $max_nodes_count = $.;
+    safe_close_command();
+    # ctdb ping
+    # The result of the node count step above is discarded here.
+    $result = OK;
+    safe_open_command('ctdb', '-n', 'all', 'ping');
+    if ($result == OK) {
+        my $nodes_count = 0;
+        my $time_total = 0.0;
+        my $clients_count = 0;
+        while (<PIPE>) {
+            chop;
+            if ($_ =~ /^response from (\d+) time=([0-9.]+) sec  \((\d+) clients\)$/) {
+                my ($node_id, $time, $clients) = ($1,$2,$3);
+                $nodes_count += 1;
+                $time_total += $time;
+                $clients_count += $clients;
+            } elsif ($_ =~ /^Unable to get ping response from node (\d+)$/) {
+                # A node that did not answer is simply not counted.
+            } else {
+                $result = CRITICAL;
+                $output .= "'$_' doesn't match regexp. "
+            }
+        }
+        $output .= sprintf("%d missing nodes. ", $max_nodes_count - $nodes_count) if $nodes_count < $max_nodes_count;
+        safe_close_command();
+        $np->add_perfdata(label => "nodes", value => $nodes_count, uom => '',
+            min => 0, max => $max_nodes_count, warning => $warning, critical => $critical);
+        $np->add_perfdata(label => "ping_time", value => $time_total, uom => 's',
+            min => 0, max => undef);
+        $np->add_perfdata(label => "clients", value => $clients_count, uom => '',
+            min => 0, max => undef);
+        # The -w/-c thresholds apply to the number of responding nodes.
+        if ($result == OK) {
+            $result = $np->check_threshold(check => $nodes_count);
+        }
+    }
+    $np->nagios_exit($result, $output);
+} else {
+    $np->nagios_exit(UNKNOWN, "Unknown command: '$info'");
+}
+
+# Remove the first "%" from a threshold string.
+# Returns a two-element list: a flag that is true when a "%" was removed,
+# and the string without it.
+sub check_percantage
+{
+        my $value = $_[0];
+        my $had_percent = ($value =~ s/\%//);
+        return ($had_percent, $value);
+}
+
diff --git a/ctdb/utils/ping_pong/ping_pong.c b/ctdb/utils/ping_pong/ping_pong.c
new file mode 100644 (file)
index 0000000..16f58d8
--- /dev/null
@@ -0,0 +1,251 @@
+/*
+   A ping-pong fcntl byte range lock test
+
+   Copyright (C) Andrew Tridgell 2002
+   Copyright (C) Michael Adam 2012
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*
+  This measures the ping-pong byte range lock latency. It is
+  especially useful on a cluster of nodes sharing a common lock
+  manager as it will give some indication of the lock managers
+  performance under stress.
+
+  tridge@samba.org, February 2002
+
+*/
+
+#define _XOPEN_SOURCE 500
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <time.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+
+static struct timeval tp1,tp2;
+
+static int do_reads, do_writes, use_mmap, do_check;
+
+/* record the current time in tp1 as the start of a measurement */
+static void start_timer(void)
+{
+       gettimeofday(&tp1,NULL);
+}
+
+/* record the current time in tp2 and return the seconds elapsed since
+   the time stored in tp1 */
+static double end_timer(void)
+{
+       double started, stopped;
+
+       gettimeofday(&tp2,NULL);
+       stopped = tp2.tv_sec + (tp2.tv_usec*1.0e-6);
+       started = tp1.tv_sec + (tp1.tv_usec*1.0e-6);
+       return stopped - started;
+}
+
+/* take a write lock on len bytes at offset in an open file, waiting
+   for it if necessary (F_SETLKW); returns the result of fcntl() */
+static int lock_range(int fd, int offset, int len)
+{
+       struct flock request;
+
+       request.l_whence = SEEK_SET;
+       request.l_start = offset;
+       request.l_len = len;
+       request.l_type = F_WRLCK;
+       request.l_pid = 0;
+
+       return fcntl(fd, F_SETLKW, &request);
+}
+
+/* check whether we could place a lock
+ *
+ * Asks fcntl(F_GETLK) about a write lock on len bytes at offset.
+ * Returns 0 if the lock could be placed, 1 if a conflicting lock is
+ * held (its details are printed), -1 if fcntl() itself failed.
+ */
+int check_lock(int fd, int offset, int len)
+{
+       struct flock lock;
+       int ret;
+
+       lock.l_type = F_WRLCK;
+       lock.l_whence = SEEK_SET;
+       lock.l_start = offset;
+       lock.l_len = len;
+       lock.l_pid = 0;
+
+       ret = fcntl(fd, F_GETLK, &lock);
+       if (ret != 0) {
+               printf("error calling fcntl F_GETLCK: %s\n", strerror(errno));
+               return -1;
+       }
+
+       /* the code treats l_type == F_UNLCK after the call as "no conflict" */
+       if (lock.l_type == F_UNLCK) {
+               /* we would be able to place the lock */
+               return 0;
+       }
+
+       /* we would not be able to place lock */
+       printf("check_lock failed: lock held: "
+              "pid='%d', type='%d', start='%d', len='%d'\n",
+              (int)lock.l_pid, (int)lock.l_type, (int)lock.l_start, (int)lock.l_len);
+       return 1;
+}
+
+/* release the lock on len bytes at offset in an open file; returns the
+   result of fcntl() */
+static int unlock_range(int fd, int offset, int len)
+{
+       struct flock request;
+
+       request.l_whence = SEEK_SET;
+       request.l_start = offset;
+       request.l_len = len;
+       request.l_type = F_UNLCK;
+       request.l_pid = 0;
+
+       return fcntl(fd, F_SETLKW, &request);
+}
+
+/* run the ping pong test on fd
+ *
+ * The file is resized to num_locks+1 bytes.  The loop then walks the
+ * byte offsets 0..num_locks-1 in a circle: lock the next byte, optionally
+ * check/read/write the current byte, unlock the current byte, advance.
+ * About once a second the lock rate is printed.  The loop never ends;
+ * the function only returns if the setup fails.
+ */
+static void ping_pong(int fd, int num_locks)
+{
+       unsigned count = 0;
+       int i=0, loops=0;
+       unsigned char *val;
+       unsigned char incr=0, last_incr=0;
+       unsigned char *p = NULL;
+       int ret;
+
+       ret = ftruncate(fd, num_locks+1);
+       if (ret == -1) {
+               printf("ftruncate failed: %s\n", strerror(errno));
+               return;
+       }
+
+       if (use_mmap) {
+               p = mmap(NULL, num_locks+1, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+               if (p == MAP_FAILED) {
+                       printf("mmap failed: %s\n", strerror(errno));
+                       return;
+               }
+       }
+
+       /* last value seen at each offset */
+       val = (unsigned char *)calloc(num_locks+1, sizeof(unsigned char));
+       if (val == NULL) {
+               printf("calloc failed\n");
+               if (p != NULL) {
+                       munmap(p, num_locks+1);
+               }
+               return;
+       }
+
+       start_timer();
+
+       lock_range(fd, 0, 1);
+       i = 0;
+
+       while (1) {
+               if (lock_range(fd, (i+1) % num_locks, 1) != 0) {
+                       printf("lock at %d failed! - %s\n",
+                              (i+1) % num_locks, strerror(errno));
+               }
+               if (do_check) {
+                       ret = check_lock(fd, i, 1);
+               }
+               if (do_reads) {
+                       unsigned char c = 0;
+                       int have_c = 1;
+
+                       if (use_mmap) {
+                               c = p[i];
+                       } else if (pread(fd, &c, 1, i) != 1) {
+                               printf("read failed at %d\n", i);
+                               have_c = 0;
+                       }
+                       /* only a byte that was really read may update the
+                          increment and the remembered value */
+                       if (have_c) {
+                               incr = c - val[i];
+                               val[i] = c;
+                       }
+               }
+               if (do_writes) {
+                       char c = val[i] + 1;
+                       if (use_mmap) {
+                               p[i] = c;
+                       } else if (pwrite(fd, &c, 1, i) != 1) {
+                               printf("write failed at %d\n", i);
+                       }
+               }
+               if (unlock_range(fd, i, 1) != 0) {
+                       printf("unlock at %d failed! - %s\n",
+                              i, strerror(errno));
+               }
+               i = (i+1) % num_locks;
+               count++;
+               /* report a changed increment once every offset has been
+                  visited at least once */
+               if (loops > num_locks && incr != last_incr) {
+                       last_incr = incr;
+                       printf("data increment = %u\n", incr);
+                       fflush(stdout);
+               }
+               if (end_timer() > 1.0) {
+                       printf("%8u locks/sec\r", 
+                              (unsigned)(2*count/end_timer()));
+                       fflush(stdout);
+                       start_timer();
+                       count=0;
+               }
+               loops++;
+       }
+}
+
+/*
+ * ping_pong [options] <file> <num_locks>
+ *
+ * Parses the -r/-w/-m/-c switches into the global flags, opens (creating
+ * if necessary) the test file and runs the test on it.  Exits with
+ * status 1 on a usage error or if the file cannot be opened.
+ */
+int main(int argc, char *argv[])
+{
+       char *fname;
+       int fd, num_locks;
+       int c;
+
+       while ((c = getopt(argc, argv, "rwmc")) != -1) {
+               switch (c){
+               case 'w':
+                       do_writes = 1;
+                       break;
+               case 'r':
+                       do_reads = 1;
+                       break;
+               case 'm':
+                       use_mmap = 1;
+                       break;
+               case 'c':
+                       do_check = 1;
+                       break;
+               default:
+                       /* getopt() returns '?' for an unknown option and
+                          leaves the offending character in optopt */
+                       fprintf(stderr, "Unknown option '%c'\n", optopt);
+                       exit(1);
+               }
+       }
+
+       argv += optind;
+       argc -= optind;
+
+       if (argc < 2) {
+               printf("ping_pong [options] <file> <num_locks>\n");
+               printf("           -r    do reads\n");
+               printf("           -w    do writes\n");
+               printf("           -m    use mmap\n");
+               printf("           -c    check locks\n");
+               exit(1);
+       }
+
+       fname = argv[0];
+       num_locks = atoi(argv[1]);
+       if (num_locks <= 0) {
+               printf("num_locks should be > 0\n");
+               exit(1);
+       }
+
+       fd = open(fname, O_CREAT|O_RDWR, 0600);
+       if (fd == -1) {
+               /* say why, instead of exiting without a word */
+               fprintf(stderr, "Failed to open %s: %s\n",
+                       fname, strerror(errno));
+               exit(1);
+       }
+
+       ping_pong(fd, num_locks);
+
+       return 0;
+}
diff --git a/ctdb/utils/pmda/Install b/ctdb/utils/pmda/Install
new file mode 100644 (file)
index 0000000..a56a635
--- /dev/null
@@ -0,0 +1,36 @@
+#! /bin/sh
+#
+# Copyright (c) 1997 Silicon Graphics, Inc.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+#
+# Install the ctdb PMDA and/or PMNS
+#
+
+. $PCP_DIR/etc/pcp.env
+. $PCP_SHARE_DIR/lib/pmdaproc.sh
+
+iam=ctdb
+pmda_interface=2
+
+# runs as daemon and only supports pipe IPC
+daemon_opt=true
+dso_opt=false
+pipe_opt=true
+socket_opt=false
+
+pmdaSetup
+pmdaInstall
+exit 0
diff --git a/ctdb/utils/pmda/README b/ctdb/utils/pmda/README
new file mode 100644 (file)
index 0000000..f8dbbbc
--- /dev/null
@@ -0,0 +1,84 @@
+CTDB PMDA
+===========
+
+This PMDA extracts metrics from the locally running ctdbd daemon for
+export to PMCD.
+
+Note:
+       This PMDA may be remade from source and hence requires IDO (or
+       more specifically a C compiler) to be installed.
+
+       Uses of make(1) may fail (without removing or clobbering files)
+       if the C compiler cannot be found.  This is most likely to
+       happen when running the PMDA ./Install script.
+
+       The only remedial action is to install the C compiler, or
+       hand-craft changes to the Makefile.
+
+Metrics
+=======
+
+The file ./help contains descriptions for all of the metrics exported
+by this PMDA.
+
+Once the PMDA has been installed, the following command will list all
+the available metrics and their explanatory "help" text:
+
+       $ pminfo -fT ctdb
+
+Installation
+============
+
+ +  # cd $PCP_PMDAS_DIR/ctdb
+
+ +  Check that there is no clash in the Performance Metrics Domain
+    defined in ./domain.h and the other PMDAs currently in use (see
+    $PCP_PMCDCONF_PATH).  If there is, edit ./domain.h to choose another
+    domain number.
+
+ +  Then simply use
+
+       # ./Install
+
+    and choose both the "collector" and "monitor" installation
+    configuration options.
+
+    The Install script configures this PMDA as a daemon that talks to
+    PMCD over a pipe (no DSO or socket variant is offered), so there is
+    no implementation or IPC method to choose -- everything else is automated.
+
+De-installation
+===============
+
+ +  Simply use
+
+       # cd $PCP_PMDAS_DIR/ctdb
+       # ./Remove
+
+Troubleshooting
+===============
+
+ +  After installing or restarting the agent, the PMCD log file
+    ($PCP_LOG_DIR/pmcd/pmcd.log) and the PMDA log file
+    ($PCP_LOG_DIR/pmcd/pmda_ctdb.log) should be checked for any warnings
+    or errors.
+
+
+Adding a New Metric
+===================
+
+This section walks through the development task of adding a new metric to the
+CTDB PMDA.
+
+  + Define the metric in the pmns file with a unique metric id. See the pmns(4)
+    man page for details.
+
+  + Add a description of the metric to the help file.
+
+  + Taking note of the previously assigned metric id, add a new entry to the
+    metrictab structure in pmda_ctdb.c. See the pmdaInit(3) man page for
+    details.
+
+  + Ensure the counter is already a member of the ctdb_statistics structure.
+    Finally, add code to pmda_ctdb_fetch_cb() to handle fetch requests for the
+    newly defined metric.
diff --git a/ctdb/utils/pmda/Remove b/ctdb/utils/pmda/Remove
new file mode 100644 (file)
index 0000000..7d1c509
--- /dev/null
@@ -0,0 +1,29 @@
+#! /bin/sh
+#
+# Copyright (c) 1997 Silicon Graphics, Inc.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+#
+# Remove the ctdb PMDA
+#
+
+# Source the PCP environment and the shared PMDA install/remove helpers.
+# pcp.env is expected to provide PCP_SHARE_DIR, which is used on the next
+# line.  The paths are quoted so that a directory name containing
+# whitespace does not split into several words.
+. "$PCP_DIR/etc/pcp.env"
+. "$PCP_SHARE_DIR/lib/pmdaproc.sh"
+
+# name of the PMDA to remove
+iam=ctdb
+
+pmdaSetup
+pmdaRemove
+exit 0
diff --git a/ctdb/utils/pmda/config.m4 b/ctdb/utils/pmda/config.m4
new file mode 100644 (file)
index 0000000..6b3fbb0
--- /dev/null
@@ -0,0 +1,32 @@
+dnl Optional Performance Co-Pilot (PCP) PMDA support; only built when the
+dnl user passes --enable-pmda to configure.
+AC_ARG_ENABLE(pmda,
+AS_HELP_STRING([--enable-pmda], [Turn on PCP pmda support (default=no)]))
+
+HAVE_PMDA=no
+
+dnl Plain quoted comparison (same form as the HAVE_PMDA test below): the
+dnl user-supplied value is never re-parsed by the shell through eval.
+if test x"$enable_pmda" = x"yes"; then
+       HAVE_PMDA=yes
+
+       AC_CHECK_HEADERS(pcp/pmapi.h pcp/impl.h pcp/pmda.h, [],
+       [AC_MSG_ERROR([Missing PCP pmda headers])],
+       [[#ifdef HAVE_PCP_PMAPI_H
+       # include <pcp/pmapi.h>
+       #endif
+       #ifdef HAVE_PCP_IMPL_H
+       # include <pcp/impl.h>
+       #endif
+       #ifdef HAVE_PCP_PMDA_H
+       # include <pcp/pmda.h>
+       #endif
+       ]])
+fi
+
+dnl Export the PMDA binary and its install target, or empty values when
+dnl PMDA support is disabled.
+if test x"$HAVE_PMDA" = x"yes"; then
+    CTDB_PMDA=bin/pmdactdb
+    CTDB_PMDA_INSTALL=install_pmda
+else
+    CTDB_PMDA=
+    CTDB_PMDA_INSTALL=
+fi
+
+AC_SUBST(CTDB_PMDA)
+AC_SUBST(CTDB_PMDA_INSTALL)
diff --git a/ctdb/utils/pmda/domain.h b/ctdb/utils/pmda/domain.h
new file mode 100644 (file)
index 0000000..0bed7fe
--- /dev/null
@@ -0,0 +1,19 @@
+/* domain.h
+ *
+ * Copyright (c) 2004-2009 Silicon Graphics, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/* Performance Metrics Domain number of the CTDB PMDA (handed to pmdaDaemon()) */
+#define CTDB 110
diff --git a/ctdb/utils/pmda/help b/ctdb/utils/pmda/help
new file mode 100644 (file)
index 0000000..0e9984e
--- /dev/null
@@ -0,0 +1,106 @@
+#
+# Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+#
+# ctdb PMDA help file in the ASCII format
+#
+# lines beginning with a # are ignored
+# lines beginning @ introduce a new entry of the form
+#  @ metric_name oneline-text
+#  help text goes
+#  here over multiple lines
+#  ...
+#
+# the metric_name is decoded against the default PMNS -- as a special case,
+# a name of the form NNN.MM (for numeric NNN and MM) is interpreted as an
+# instance domain identification, and the text describes the instance domain
+#
+# blank lines before the @ line are ignored
+#
+
+@ ctdb.num_clients number of clients connected to ctdbd
+
+@ ctdb.frozen whether any databases are frozen
+
+@ ctdb.recovering whether recovery is active
+
+@ ctdb.client_packets_sent number of packets sent to all clients
+
+@ ctdb.client_packets_recv number of packets received from all clients
+
+@ ctdb.node_packets_sent number of packets sent to other nodes
+
+@ ctdb.node_packets_recv number of packets received from other nodes
+
+@ ctdb.keepalive_packets_sent number of keepalive packets sent to other nodes
+
+@ ctdb.keepalive_packets_recv number of keepalive packets received from other nodes
+
+@ ctdb.node.req_call number of node CTDB_REQ_CALL packets handled
+
+@ ctdb.node.reply_call number of node CTDB_REPLY_CALL packets handled
+
+@ ctdb.node.req_dmaster number of node CTDB_REQ_DMASTER packets handled
+
+@ ctdb.node.reply_dmaster number of node CTDB_REPLY_DMASTER packets handled
+
+@ ctdb.node.reply_error number of node CTDB_REPLY_ERROR packets handled
+
+@ ctdb.node.req_message number of node CTDB_REQ_MESSAGE packets handled
+
+@ ctdb.node.req_control number of node CTDB_REQ_CONTROL packets handled
+
+@ ctdb.node.reply_control number of node CTDB_REPLY_CONTROL packets handled
+
+@ ctdb.client.req_call number of client CTDB_REQ_CALL packets handled
+
+@ ctdb.client.req_message number of client CTDB_REQ_MESSAGE packets handled
+
+@ ctdb.client.req_control number of client CTDB_REQ_CONTROL packets handled
+
+@ ctdb.timeouts.call (counter not implemented) number of call timeouts
+
+@ ctdb.timeouts.control number of node control message request timeouts awaiting reply
+
+@ ctdb.timeouts.traverse number of database traversal timeouts
+
+@ ctdb.total_calls total number of client ctdb request calls received
+
+@ ctdb.pending_calls total number of client ctdb request calls in progress
+
+@ ctdb.lockwait_calls number of tdb chainlock lockwait calls
+
+@ ctdb.pending_lockwait_calls number of lockwait calls waiting for a lock
+
+@ ctdb.childwrite_calls number of childwrite calls
+
+@ ctdb.pending_childwrite_calls number of childwrite calls in progress
+
+@ ctdb.memory_used total size of the ctdbd null talloc pool
+
+@ ctdb.max_hop_count maximum hops performed by a CTDB_REQ_CALL packet
+
+@ ctdb.max_reclock_ctdbd maximum recovery lock latency during setrecmode
+
+@ ctdb.max_reclock_recd maximum recovery lock latency as reported by the recovery process
+
+@ ctdb.max_call_latency maximum time spent handling a client request call
+
+@ ctdb.max_lockwait_latency maximum time spent waiting for a tdb chainlock
+
+@ ctdb.max_childwrite_latency maximum time spent performing a childwrite
+
+@ ctdb.num_recoveries number of recoveries finished
diff --git a/ctdb/utils/pmda/pmda_ctdb.c b/ctdb/utils/pmda/pmda_ctdb.c
new file mode 100644 (file)
index 0000000..e8033be
--- /dev/null
@@ -0,0 +1,595 @@
+/*
+ * CTDB Performance Metrics Domain Agent (PMDA) for Performance Co-Pilot (PCP)
+ *
+ * Copyright (c) 1995,2004 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2011 David Disseldorp
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+#include <pcp/pmapi.h>
+#include <pcp/impl.h>
+#include <pcp/pmda.h>
+#include "../../include/includes.h"
+#include "../../include/ctdb.h"
+#include "../../include/ctdb_private.h"
+#include "../../include/ctdb_protocol.h"
+#include "domain.h"
+
+/*
+ * CTDB PMDA
+ *
+ * This PMDA connects to the locally running ctdbd daemon and pulls
+ * statistics for export via PCP. The ctdbd Unix domain socket path can be
+ * specified with the CTDB_SOCKET environment variable, otherwise the default
+ * path is used.
+ */
+
+/*
+ * All metrics supported in this PMDA - one table entry for each.
+ * Each descriptor lists, in order: the PMID as PMDA_PMID(cluster, item),
+ * the value type, the instance domain, the semantics and the units.
+ * Every metric uses PM_INDOM_NULL (denoting a metric that only ever has
+ * a single value); this PMDA registers no instance domain table.
+ * pmda_ctdb_fetch_cb() dispatches on the cluster number, and fill_node(),
+ * fill_client() and fill_timeout() on the item number, so a PMID changed
+ * here must be changed there (and in the pmns file) as well.
+ */
+static pmdaMetric metrictab[] = {
+       /* num_clients */
+       { NULL, { PMDA_PMID(0,0), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
+               PMDA_PMUNITS(0,0,0,0,0,0) }, },
+       /* frozen */
+       { NULL, { PMDA_PMID(1,2), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
+               PMDA_PMUNITS(0,0,0,0,0,0) }, },
+       /* recovering */
+       { NULL, { PMDA_PMID(3,3), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
+               PMDA_PMUNITS(0,0,0,0,0,0) }, },
+       /* client_packets_sent */
+       { NULL, { PMDA_PMID(4,4), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
+       /* client_packets_recv */
+       { NULL, { PMDA_PMID(5,5), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
+       /* node_packets_sent */
+       { NULL, { PMDA_PMID(6,6), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
+       /* node_packets_recv */
+       { NULL, { PMDA_PMID(7,7), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
+       /* keepalive_packets_sent */
+       { NULL, { PMDA_PMID(8,8), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
+       /* keepalive_packets_recv */
+       { NULL, { PMDA_PMID(9,9), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
+       /* node.req_call */
+       { NULL, { PMDA_PMID(10,10), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
+       /* node.reply_call */
+       { NULL, { PMDA_PMID(10,11), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
+       /* node.req_dmaster */
+       { NULL, { PMDA_PMID(10,12), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
+       /* node.reply_dmaster */
+       { NULL, { PMDA_PMID(10,13), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
+       /* node.reply_error */
+       { NULL, { PMDA_PMID(10,14), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
+       /* node.req_message */
+       { NULL, { PMDA_PMID(10,15), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
+       /* node.req_control */
+       { NULL, { PMDA_PMID(10,16), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
+       /* node.reply_control */
+       { NULL, { PMDA_PMID(10,17), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
+       /* client.req_call */
+       { NULL, { PMDA_PMID(11,18), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
+       /* client.req_message */
+       { NULL, { PMDA_PMID(11,19), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
+       /* client.req_control */
+       { NULL, { PMDA_PMID(11,20), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
+       /* timeouts.call */
+       { NULL, { PMDA_PMID(12,21), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,0) }, },
+       /* timeouts.control */
+       { NULL, { PMDA_PMID(12,22), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,0) }, },
+       /* timeouts.traverse */
+       { NULL, { PMDA_PMID(12,23), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,0) }, },
+       /* total_calls */
+       { NULL, { PMDA_PMID(13,24), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
+       /* pending_calls */
+       { NULL, { PMDA_PMID(14,25), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
+               PMDA_PMUNITS(0,0,0,0,0,0) }, },
+       /* locks.num_calls */
+       /* NOTE(review): item 27 is reused by the next entry; the clusters differ, so the PMIDs stay distinct */
+       { NULL, { PMDA_PMID(15,27), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
+       /* locks.pending_calls */
+       { NULL, { PMDA_PMID(16,27), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
+               PMDA_PMUNITS(0,0,0,0,0,0) }, },
+       /* childwrite_calls */
+       { NULL, { PMDA_PMID(17,28), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_COUNTER,
+               PMDA_PMUNITS(0,0,1,0,0,PM_COUNT_ONE) }, },
+       /* pending_childwrite_calls */
+       { NULL, { PMDA_PMID(18,29), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
+               PMDA_PMUNITS(0,0,0,0,0,0) }, },
+       /* memory_used */
+       { NULL, { PMDA_PMID(19,30), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
+               PMDA_PMUNITS(1,0,0,PM_SPACE_BYTE,0,0) }, },
+       /* max_hop_count */
+       { NULL, { PMDA_PMID(20,31), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
+               PMDA_PMUNITS(0,0,0,0,0,0) }, },
+       /* max_reclock_ctdbd */
+       { NULL, { PMDA_PMID(21,32), PM_TYPE_DOUBLE, PM_INDOM_NULL, PM_SEM_INSTANT,
+               PMDA_PMUNITS(0,1,0,0,PM_TIME_SEC,0) }, },
+       /* max_reclock_recd */
+       { NULL, { PMDA_PMID(22,33), PM_TYPE_DOUBLE, PM_INDOM_NULL, PM_SEM_INSTANT,
+               PMDA_PMUNITS(0,1,0,0,PM_TIME_SEC,0) }, },
+       /* max_call_latency */
+       { NULL, { PMDA_PMID(23,34), PM_TYPE_DOUBLE, PM_INDOM_NULL, PM_SEM_INSTANT,
+               PMDA_PMUNITS(0,1,0,0,PM_TIME_SEC,0) }, },
+       /* locks.latency.max */
+       { NULL, { PMDA_PMID(24,35), PM_TYPE_DOUBLE, PM_INDOM_NULL, PM_SEM_INSTANT,
+               PMDA_PMUNITS(0,1,0,0,PM_TIME_SEC,0) }, },
+       /* childwrite_latency.max */
+       { NULL, { PMDA_PMID(25,36), PM_TYPE_DOUBLE, PM_INDOM_NULL, PM_SEM_INSTANT,
+               PMDA_PMUNITS(0,1,0,0,PM_TIME_SEC,0) }, },
+       /* num_recoveries */
+       { NULL, { PMDA_PMID(26,37), PM_TYPE_U32, PM_INDOM_NULL, PM_SEM_INSTANT,
+               PMDA_PMUNITS(0,0,0,0,0,0) }, },
+};
+
+/* event context created by pmda_ctdb_daemon_connect() */
+static struct event_context *ev;
+/* ctdbd client connection; NULL while disconnected, which makes the next fetch reconnect */
+static struct ctdb_context *ctdb;
+/* statistics reply of the fetch in progress; set and freed by pmda_ctdb_fetch() */
+static struct ctdb_statistics *stats;
+
+/*
+ * Queue read callback installed by pmda_ctdb_daemon_connect().
+ * An empty read (cnt == 0) is only reported on stderr; any other packet is
+ * handed to ctdb_client_read_cb() unchanged.
+ */
+static void
+pmda_ctdb_q_read_cb(uint8_t *data, size_t cnt, void *args)
+{
+       if (cnt != 0) {
+               ctdb_client_read_cb(data, cnt, args);
+               return;
+       }
+
+       /* cleanup on request timeout */
+       fprintf(stderr, "ctdbd unreachable\n");
+}
+
+
+/*
+ * Connect to the local ctdbd over its Unix domain socket.
+ *
+ * The socket path comes from the CTDB_SOCKET environment variable, falling
+ * back to CTDB_PATH. On success the file-scope ev and ctdb pointers are
+ * set, the daemon queue is installed with pmda_ctdb_q_read_cb() as its
+ * read callback, and ctdb->pnn holds the local node number.
+ *
+ * Returns 0 on success. On failure returns -1 after releasing everything
+ * set up so far, leaving both ctdb and ev NULL.
+ */
+static int
+pmda_ctdb_daemon_connect(void)
+{
+       const char *socket_name;
+       int ret;
+       struct sockaddr_un addr;
+
+       ev = event_context_init(NULL);
+       if (ev == NULL) {
+               fprintf(stderr, "Failed to init event ctx\n");
+               return -1;
+       }
+
+       ctdb = ctdb_init(ev);
+       if (ctdb == NULL) {
+               fprintf(stderr, "Failed to init ctdb\n");
+               goto err_ev;
+       }
+
+       socket_name = getenv("CTDB_SOCKET");
+       if (socket_name == NULL) {
+               socket_name = CTDB_PATH;
+       }
+
+       ret = ctdb_set_socketname(ctdb, socket_name);
+       if (ret == -1) {
+               fprintf(stderr, "ctdb_set_socketname failed - %s\n",
+                               ctdb_errstr(ctdb));
+               goto err_ctdb;
+       }
+
+       /*
+        * ctdb_socket_connect() sets a default queue callback handler that
+        * calls exit() if ctdbd is unavailable on recv, use our own wrapper to
+        * work around this
+        */
+
+       memset(&addr, 0, sizeof(addr));
+       addr.sun_family = AF_UNIX;
+       /*
+        * Copy at most sizeof - 1 bytes: together with the memset above this
+        * keeps sun_path NUL-terminated even for an over-long socket name.
+        */
+       strncpy(addr.sun_path, ctdb->daemon.name, sizeof(addr.sun_path) - 1);
+
+       ctdb->daemon.sd = socket(AF_UNIX, SOCK_STREAM, 0);
+       if (ctdb->daemon.sd == -1) {
+               fprintf(stderr, "Failed to open client socket\n");
+               goto err_ctdb;
+       }
+
+       set_nonblocking(ctdb->daemon.sd);
+       set_close_on_exec(ctdb->daemon.sd);
+
+       if (connect(ctdb->daemon.sd, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
+               fprintf(stderr, "Failed to connect to ctdb daemon via %s\n",
+                       ctdb->daemon.name);
+               goto err_sd;
+       }
+
+       ctdb->daemon.queue = ctdb_queue_setup(ctdb, ctdb, ctdb->daemon.sd,
+                                             CTDB_DS_ALIGNMENT,
+                                             pmda_ctdb_q_read_cb, ctdb,
+                                             "to-ctdbd");
+       if (ctdb->daemon.queue == NULL) {
+               fprintf(stderr, "Failed to setup queue\n");
+               goto err_sd;
+       }
+
+       ctdb->pnn = ctdb_ctrl_getpnn(ctdb, timeval_current_ofs(3, 0),
+                                    CTDB_CURRENT_NODE);
+       if (ctdb->pnn == (uint32_t)-1) {
+               fprintf(stderr, "Failed to get ctdb pnn\n");
+               goto err_sd;
+       }
+
+       return 0;
+err_sd:
+       close(ctdb->daemon.sd);
+err_ctdb:
+       talloc_free(ctdb);
+err_ev:
+       talloc_free(ev);
+       /* do not leave either file-scope pointer dangling after the frees above */
+       ev = NULL;
+       ctdb = NULL;
+       return -1;
+}
+
+/*
+ * Tear down the ctdbd connection: run the transport shutdown hook if one
+ * is set, close the daemon socket if it is open, free the ctdb and event
+ * contexts and mark the connection as gone by clearing ctdb.
+ */
+static void
+pmda_ctdb_daemon_disconnect(void)
+{
+       struct ctdb_context *conn = ctdb;
+
+       if (conn->methods != NULL) {
+               conn->methods->shutdown(conn);
+       }
+
+       if (conn->daemon.sd != -1) {
+               close(conn->daemon.sd);
+       }
+
+       talloc_free(conn);
+       talloc_free(ev);
+       ctdb = NULL;
+}
+
+/*
+ * Store the node packet counter selected by PMID item number (10-17) in
+ * atom->ul. Returns 0 on success, PM_ERR_PMID for any other item.
+ */
+static int
+fill_node(unsigned int item, pmAtomValue *atom)
+{
+       switch (item) {
+       case 10:
+               atom->ul = stats->node.req_call;
+               return 0;
+       case 11:
+               atom->ul = stats->node.reply_call;
+               return 0;
+       case 12:
+               atom->ul = stats->node.req_dmaster;
+               return 0;
+       case 13:
+               atom->ul = stats->node.reply_dmaster;
+               return 0;
+       case 14:
+               atom->ul = stats->node.reply_error;
+               return 0;
+       case 15:
+               atom->ul = stats->node.req_message;
+               return 0;
+       case 16:
+               atom->ul = stats->node.req_control;
+               return 0;
+       case 17:
+               atom->ul = stats->node.reply_control;
+               return 0;
+       }
+
+       /* item does not name a node counter */
+       return PM_ERR_PMID;
+}
+
+/*
+ * Store the client packet counter selected by PMID item number (18-20) in
+ * atom->ul. Returns 0 on success, PM_ERR_PMID for any other item.
+ */
+static int
+fill_client(unsigned int item, pmAtomValue *atom)
+{
+       switch (item) {
+       case 18:
+               atom->ul = stats->client.req_call;
+               return 0;
+       case 19:
+               atom->ul = stats->client.req_message;
+               return 0;
+       case 20:
+               atom->ul = stats->client.req_control;
+               return 0;
+       }
+
+       /* item does not name a client counter */
+       return PM_ERR_PMID;
+}
+
+/*
+ * Store the timeout counter selected by PMID item number (21-23) in
+ * atom->ul. Returns 0 on success, PM_ERR_PMID for any other item.
+ */
+static int
+fill_timeout(unsigned int item, pmAtomValue *atom)
+{
+       switch (item) {
+       case 21:
+               atom->ul = stats->timeouts.call;
+               return 0;
+       case 22:
+               atom->ul = stats->timeouts.control;
+               return 0;
+       case 23:
+               atom->ul = stats->timeouts.traverse;
+               return 0;
+       }
+
+       /* item does not name a timeout counter */
+       return PM_ERR_PMID;
+}
+
+/*
+ * callback provided to pmdaFetch
+ *
+ * Copies one value out of the statistics snapshot in the file-scope stats
+ * pointer into *atom. The metric is chosen by the cluster number of the
+ * PMID in mdesc; clusters 10, 11 and 12 are sub-dispatched on the item
+ * number by fill_node(), fill_client() and fill_timeout(). The union
+ * member written (ul or d) must match the type declared for the metric in
+ * metrictab.
+ *
+ * Returns 0 on success, PM_ERR_INST if an instance is requested,
+ * PM_ERR_VALUE if no statistics are loaded, PM_ERR_PMID for an unknown
+ * cluster or item.
+ */
+static int
+pmda_ctdb_fetch_cb(pmdaMetric *mdesc, unsigned int inst, pmAtomValue *atom)
+{
+       __pmID_int *id = (__pmID_int *)&(mdesc->m_desc.pmid);
+
+       /* no metric of this PMDA has instances */
+       if (inst != PM_IN_NULL) {
+               return PM_ERR_INST;
+       }
+
+       if (stats == NULL) {
+               fprintf(stderr, "stats not available\n");
+               return PM_ERR_VALUE;
+       }
+
+       switch (id->cluster) {
+       case 0:
+               atom->ul = stats->num_clients;
+               break;
+       case 1:
+               atom->ul = stats->frozen;
+               break;
+       case 3:
+               atom->ul = stats->recovering;
+               break;
+       case 4:
+               atom->ul = stats->client_packets_sent;
+               break;
+       case 5:
+               atom->ul = stats->client_packets_recv;
+               break;
+       case 6:
+               atom->ul = stats->node_packets_sent;
+               break;
+       case 7:
+               atom->ul = stats->node_packets_recv;
+               break;
+       case 8:
+               atom->ul = stats->keepalive_packets_sent;
+               break;
+       case 9:
+               atom->ul = stats->keepalive_packets_recv;
+               break;
+       case 10:
+               return fill_node(id->item, atom);
+       case 11:
+               return fill_client(id->item, atom);
+       case 12:
+               return fill_timeout(id->item, atom);
+       case 13:
+               atom->ul = stats->total_calls;
+               break;
+       case 14:
+               atom->ul = stats->pending_calls;
+               break;
+       case 15:
+               atom->ul = stats->locks.num_calls;
+               break;
+       case 16:
+               atom->ul = stats->locks.num_pending;
+               break;
+       case 17:
+               atom->ul = stats->childwrite_calls;
+               break;
+       case 18:
+               atom->ul = stats->pending_childwrite_calls;
+               break;
+       case 19:
+               atom->ul = stats->memory_used;
+               break;
+       case 20:
+               atom->ul = stats->max_hop_count;
+               break;
+       case 21:
+               atom->d = stats->reclock.ctdbd.max;
+               break;
+       case 22:
+               atom->d = stats->reclock.recd.max;
+               break;
+       case 23:
+               atom->d = stats->call_latency.max;
+               break;
+       case 24:
+               atom->d = stats->locks.latency.max;
+               break;
+       case 25:
+               atom->d = stats->childwrite_latency.max;
+               break;
+       case 26:
+               /*
+                * num_recoveries is declared PM_TYPE_U32 in metrictab, so the
+                * value has to go into the ul member, not the double.
+                */
+               atom->ul = stats->num_recoveries;
+               break;
+       default:
+               return PM_ERR_PMID;
+       }
+
+       return 0;
+}
+
+/*
+ * This routine is called once for each pmFetch(3) operation, so is a
+ * good place to do once-per-fetch functions, such as value caching or
+ * instance domain evaluation.
+ *
+ * Here it (re)connects to ctdbd if there is no connection, requests a
+ * CTDB_CONTROL_STATISTICS snapshot with a one second timeout, exposes it
+ * through the file-scope stats pointer while pmdaFetch() runs the
+ * per-metric callback, and releases it again before returning.
+ *
+ * Returns the pmdaFetch() result, or PM_ERR_VALUE if ctdbd cannot be
+ * reached, the control fails, or the reply has an unexpected size.
+ */
+static int
+pmda_ctdb_fetch(int numpmid, pmID pmidlist[], pmResult **resp, pmdaExt *pmda)
+{
+       int ret;
+       TDB_DATA data;
+       int32_t res;
+       struct timeval ctdb_timeout;
+
+       if (ctdb == NULL) {
+               fprintf(stderr, "attempting reconnect to ctdbd\n");
+               ret = pmda_ctdb_daemon_connect();
+               if (ret < 0) {
+                       fprintf(stderr, "reconnect failed\n");
+                       return PM_ERR_VALUE;
+               }
+       }
+
+       ctdb_timeout = timeval_current_ofs(1, 0);
+       ret = ctdb_control(ctdb, ctdb->pnn, 0,
+                          CTDB_CONTROL_STATISTICS, 0, tdb_null,
+                          ctdb, &data, &res, &ctdb_timeout, NULL);
+
+       if (ret != 0 || res != 0) {
+               /*
+                * Drop the connection; the actual reconnect is attempted at
+                * the start of the next fetch, when ctdb is found to be NULL.
+                */
+               fprintf(stderr, "ctdb control for statistics failed, reconnecting\n");
+               pmda_ctdb_daemon_disconnect();
+               ret = PM_ERR_VALUE;
+               goto err_out;
+       }
+
+       stats = (struct ctdb_statistics *)data.dptr;
+
+       if (data.dsize != sizeof(struct ctdb_statistics)) {
+               fprintf(stderr, "incorrect statistics size %zu - not %zu\n",
+                       data.dsize, sizeof(struct ctdb_statistics));
+               ret = PM_ERR_VALUE;
+               goto err_stats;
+       }
+
+       ret = pmdaFetch(numpmid, pmidlist, resp, pmda);
+
+err_stats:
+       talloc_free(stats);
+       /*
+        * Clear the pointer so the "stats == NULL" guard in
+        * pmda_ctdb_fetch_cb() never sees freed memory.
+        */
+       stats = NULL;
+err_out:
+       return ret;
+}
+
+/*
+ * Initialise the agent: install the fetch entry point and the per-metric
+ * fetch callback, then register metrictab (with no instance domains).
+ * Nothing is done when the dispatch structure already carries an error
+ * status.
+ */
+void
+pmda_ctdb_init(pmdaInterface *dp)
+{
+       if (dp->status == 0) {
+               dp->version.two.fetch = pmda_ctdb_fetch;
+               pmdaSetFetchCallBack(dp, pmda_ctdb_fetch_cb);
+
+               pmdaInit(dp, NULL, 0, metrictab,
+                        (sizeof(metrictab) / sizeof(metrictab[0])));
+       }
+}
+
+/*
+ * Return the path of the PMDA help file, "<PCP_PMDAS_DIR>/ctdb/help".
+ * The path is built on first use and kept in a static buffer.
+ */
+static char *
+helpfile(void)
+{
+       static char path[MAXPATHLEN];
+
+       if (path[0] != '\0') {
+               return path;
+       }
+
+       snprintf(path, sizeof(path), "%s/ctdb/help",
+                pmGetConfig("PCP_PMDAS_DIR"));
+       return path;
+}
+
+/*
+ * Print the command line synopsis and option list to stderr, then exit
+ * with status 1.
+ */
+static void
+usage(void)
+{
+       static const char options[] =
+         "Options:\n"
+         "  -d domain        use domain (numeric) for metrics domain of PMDA\n"
+         "  -l logfile       write log into logfile rather than using default log name\n"
+         "\nExactly one of the following options may appear:\n"
+         "  -i port          expect PMCD to connect on given inet port (number or name)\n"
+         "  -p               expect PMCD to supply stdin/stdout (pipe)\n"
+         "  -u socket        expect PMCD to connect on given unix domain socket\n";
+
+       fprintf(stderr, "Usage: %s [options]\n\n", pmProgname);
+       fputs(options, stderr);
+       exit(1);
+}
+
+/*
+ * Set up the agent if running as a daemon.
+ *
+ * Registers the PMDA as a PMDA_INTERFACE_2 daemon in the CTDB domain,
+ * parses the standard PMDA options (printing usage and exiting on any
+ * error), then opens the log, initialises the agent, connects to PMCD and
+ * enters the PMDA main loop.
+ */
+int
+main(int argc, char **argv)
+{
+       pmdaInterface dispatch;
+       char log_file[] = "pmda_ctdb.log";
+       int err = 0;
+       int opt;
+
+       __pmSetProgname(argv[0]);
+
+       pmdaDaemon(&dispatch, PMDA_INTERFACE_2, pmProgname, CTDB,
+                  log_file, helpfile());
+
+       /* any option that pmdaGetOpt() hands back is not one this PMDA accepts */
+       opt = pmdaGetOpt(argc, argv, "d:i:l:pu:?", &dispatch, &err);
+       if (opt != EOF) {
+               err++;
+       }
+
+       if (err != 0) {
+               usage();
+       }
+
+       pmdaOpenLog(&dispatch);
+       pmda_ctdb_init(&dispatch);
+       pmdaConnect(&dispatch);
+       pmdaMain(&dispatch);
+
+       exit(0);
+}
+
diff --git a/ctdb/utils/pmda/pmns b/ctdb/utils/pmda/pmns
new file mode 100644 (file)
index 0000000..dc7e3ac
--- /dev/null
@@ -0,0 +1,73 @@
+/*
+ * Metrics for CTDB PMDA
+ *
+ * Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2011 David Disseldorp
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ */
+
+/*
+ * PMIDs are written DOMAIN:cluster:item and must match the
+ * PMDA_PMID(cluster, item) entries of metrictab in pmda_ctdb.c, which
+ * dispatches fetches on the cluster number (and, for the node, client and
+ * timeouts groups, on the item number).
+ */
+ctdb {
+       num_clients             CTDB:0:0
+       frozen                  CTDB:1:2
+       recovering              CTDB:3:3
+       client_packets_sent     CTDB:4:4
+       client_packets_recv     CTDB:5:5
+       node_packets_sent       CTDB:6:6
+       node_packets_recv       CTDB:7:7
+       keepalive_packets_sent  CTDB:8:8
+       keepalive_packets_recv  CTDB:9:9
+       node
+       client
+       timeouts
+       total_calls             CTDB:13:24
+       pending_calls           CTDB:14:25
+       lockwait_calls          CTDB:15:27
+       pending_lockwait_calls  CTDB:16:27
+       childwrite_calls        CTDB:17:28
+       pending_childwrite_calls CTDB:18:29
+       memory_used             CTDB:19:30
+       max_hop_count           CTDB:20:31
+       max_reclock_ctdbd       CTDB:21:32
+       max_reclock_recd        CTDB:22:33
+       max_call_latency        CTDB:23:34
+       max_lockwait_latency    CTDB:24:35
+       max_childwrite_latency  CTDB:25:36
+       num_recoveries          CTDB:26:37
+}
+
+ctdb.node {
+       req_call        CTDB:10:10
+       reply_call      CTDB:10:11
+       req_dmaster     CTDB:10:12
+       reply_dmaster   CTDB:10:13
+       reply_error     CTDB:10:14
+       req_message     CTDB:10:15
+       req_control     CTDB:10:16
+       reply_control   CTDB:10:17
+}
+
+ctdb.client {
+       req_call        CTDB:11:18
+       req_message     CTDB:11:19
+       req_control     CTDB:11:20
+}
+
+ctdb.timeouts {
+       call            CTDB:12:21
+       control         CTDB:12:22
+       traverse        CTDB:12:23
+}
+
diff --git a/ctdb/utils/pmda/root b/ctdb/utils/pmda/root
new file mode 100644 (file)
index 0000000..ff036ed
--- /dev/null
@@ -0,0 +1,10 @@
+/*
+ * fake "root" for validating the local PMNS subtree
+ */
+
+#include <stdpmid>
+
+root { ctdb }
+
+#include "pmns"
+
diff --git a/ctdb/utils/scsi_io/scsi_io.c b/ctdb/utils/scsi_io/scsi_io.c
new file mode 100644 (file)
index 0000000..1a4fe0e
--- /dev/null
@@ -0,0 +1,1152 @@
+/* a tool to open a scsi device and issue some useful commands
+   such as INQUIRY and helpers to call various PERSISTENT RESERVATION
+   functions
+
+   Copyright   ronnie sahlberg 2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+/* very incomplete and needs to be enhanced with nice command line options
+   to drive it.
+   we need access to an array that supports the PERSISTENT RESERVATION cdb's
+   before we can proceed
+*/
+/* scsi bugs:
+   INQUIRY takes a 2 byte allocation_length parameter but it appears that
+   it only looks at the low byte. If you specify 0x00ff all is well
+   but if you specify 0x0100 it gets confused and returns garbage data
+   for (e.g.) SupportedVPDPages. Same goes for UnitSerialNumber and probably
+   all other inq pages as well.
+
+*/
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <scsi/sg.h>
+#include "popt.h"
+
+
+#define SCSI_TIMEOUT 5000 /* ms */
+
+static char *command = NULL;
+static char *device  = NULL;
+static char *key     = NULL;
+static char *rmkey     = NULL;
+static int scope = -1;
+static int type  = -1;
+
+/* Human readable names for the 4 bit SCSI sense key, indexed by the key
+   value (byte 2, low nibble, of fixed format sense data). */
+const char *sensetable[16]={
+       "no sense",
+       "recovered error",
+       "not ready",
+       "medium error",
+       "hardware error",
+       "illegal request",
+       "unit attention",
+       "data protect",
+       "blank check",
+       "vendor specific",
+       "copy aborted",
+       "aborted command",
+       "unknown",
+       "volume overflow",
+       "miscompare",
+       "unknown"
+};
+
+int scsi_io(int fd, unsigned char *cdb, unsigned char cdb_size, int xfer_dir, unsigned char *data, unsigned int *data_size, unsigned char *sense, unsigned int *sense_len)
+{
+       sg_io_hdr_t io_hdr;
+
+       memset(&io_hdr, 0, sizeof(sg_io_hdr_t));
+       io_hdr.interface_id = 'S';
+
+       /* CDB */
+       io_hdr.cmdp = cdb;
+       io_hdr.cmd_len = cdb_size;
+
+       /* Where to store the sense_data, if there was an error */
+       io_hdr.sbp = sense;
+       io_hdr.mx_sb_len = *sense_len;
+       *sense_len=0;
+
+       /* Transfer direction, either in or out. Linux does not yet
+          support bidirectional SCSI transfers ?
+        */
+       io_hdr.dxfer_direction = xfer_dir;
+
+       /* Where to store the DATA IN/OUT from the device and how big the
+          buffer is
+        */
+       io_hdr.dxferp = data;
+       io_hdr.dxfer_len = *data_size;
+
+       /* SCSI timeout in ms */
+       io_hdr.timeout = SCSI_TIMEOUT;
+
+
+       if(ioctl(fd, SG_IO, &io_hdr) < 0){
+               perror("SG_IO ioctl failed");
+               return -1;
+       }
+
+       /* now for the error processing */
+       if((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK){
+               if(io_hdr.sb_len_wr > 0){
+                       *sense_len=io_hdr.sb_len_wr;
+                       return 0;
+               }
+       }
+       if(io_hdr.masked_status){
+               printf("status=0x%x\n", io_hdr.status);
+               printf("masked_status=0x%x\n", io_hdr.masked_status);
+               return -2;
+       }
+       if(io_hdr.host_status){
+               printf("host_status=0x%x\n", io_hdr.host_status);
+               return -3;
+       }
+       if(io_hdr.driver_status){
+               printf("driver_status=0x%x\n", io_hdr.driver_status);
+               return -4;
+       }
+
+#if 0
+{int i;
+printf("CDB:\n");
+for(i=0;i<cdb_size;i++){printf("0x%02x ",cdb[i]);if((i%8)==7)printf("\n");}
+printf("\n");
+}
+{int i;
+printf("DATA:\n");
+for(i=0;i<96;i++){printf("0x%02x ",data[i]);if((i%8)==7)printf("\n");}
+printf("\n");
+}
+#endif
+
+       return 0;
+}
+
+/* One entry of a value -> description lookup table. The tables in this
+   file end with an entry whose string is NULL. */
+typedef struct _value_string_t {
+       int     value;
+       const char      *string;
+} value_string_t;
+
+
+
+/* Names for the peripheral device type field, looked up by scsi_inquiry()
+   with the low 5 bits of INQUIRY byte 0 */
+value_string_t peripheral_device_types[] = {
+       {0, "SBC : Direct Access Block device"},
+       {1, "SSC : Sequential Access Device"},
+       {5, "MMC : Multimedia Device"},
+       {17,"OSD : Object Based Storage"},
+       {0,NULL}
+};
+
+/* Names for the version field, looked up by scsi_inquiry() with
+   INQUIRY byte 2 */
+value_string_t scsi_versions[] = {
+       {0, "No conformance to any standard claimed"},
+       {3, "SPC"},
+       {4, "SPC-2"},
+       {5, "SPC-3"},
+       {0,NULL}
+};
+
+/* Names for the VPD page codes printed by
+   scsi_inquiry_supported_vpd_pages() */
+value_string_t vpd_pages[] = {
+       {0x00, "Supported VPD Pages"},
+       {0x80, "Unit Serial number"},
+       {0x83, "Device Identification"},
+       {0,NULL}
+};
+
+/* Look up v in a NULL-string terminated table and return its description.
+   Returns the empty string when vs is NULL or v is not in the table. */
+const char *val_to_str(value_string_t *vs, int v)
+{
+       value_string_t *entry;
+
+       if(vs==NULL){
+               return "";
+       }
+
+       for(entry=vs; entry->string!=NULL; entry++){
+               if(entry->value==v){
+                       return entry->string;
+               }
+       }
+
+       return "";
+}
+
+/* Print the sense data returned by a device.
+ *
+ * Fixed format sense data (response code 0x70 "current" or 0x71
+ * "deferred", with or without the VALID bit set) is decoded field by
+ * field, but only as far as sense_len actually covers. A raw hex dump of
+ * all sense_len bytes always follows.
+ */
+void print_sense_data(unsigned char *sense, int sense_len)
+{
+       int i;
+       unsigned char asc, ascq, response_code;
+
+       printf("Device returned sense information\n");
+
+       /* bit 7 of byte 0 is the VALID bit, not part of the response code */
+       response_code = (sense_len>0) ? (sense[0]&0x7f) : 0;
+
+       if((response_code==0x70 || response_code==0x71) && sense_len>=3){
+               printf("filemark:%d eom:%d ili:%d  sense-key:0x%02x (%s)\n",
+                       !!(sense[2]&0x80),
+                       !!(sense[2]&0x40),
+                       !!(sense[2]&0x20),
+                       sense[2]&0x0f,
+                       sensetable[sense[2]&0x0f]);
+
+               /* the remaining fields are only present if the device
+                  returned enough bytes; never read past sense_len */
+               if(sense_len>=12){
+                       printf("command specific info: 0x%02x 0x%02x 0x%02x 0x%02x\n",
+                               sense[8],sense[9],sense[10],sense[11]);
+               }
+
+               if(sense_len>=14){
+                       asc=sense[12];
+                       printf("additional sense code:0x%02x\n", asc);
+
+                       ascq=sense[13];
+                       printf("additional sense code qualifier:0x%02x\n", ascq);
+
+                       if(sense_len>=15){
+                               printf("field replacable unit code:0x%02x\n", sense[14]);
+                       }
+
+                       if((asc==0x20)&&(ascq==0x00))
+                               printf("INVALID COMMAND OPERATION CODE\n");
+               }
+       }
+
+       printf("Sense data:\n");
+       for(i=0;i<sense_len;i++){
+               printf("0x%02x ", sense[i]);
+               if((i%8)==7)printf("\n");
+       }
+       printf("\n");
+}
+
+/* Send a standard INQUIRY (opcode 0x12, EVPD=0) and print the decoded
+ * response: peripheral qualifier and type, RMB, version, capability flags
+ * and the vendor / product / revision strings.
+ *
+ * Returns 0 on success, -1 if the command failed or returned sense data.
+ */
+int scsi_inquiry(int fd)
+{
+       unsigned char cdb[]={0x12,0,0,0,0,0};
+
+       unsigned int data_size=96;
+       unsigned char data[data_size];
+
+       unsigned int sense_len=32;
+       unsigned char sense[sense_len];
+
+       int res, i;
+
+       /* A device may return fewer bytes than requested; start from a
+          zeroed buffer so the decode below never prints stack garbage */
+       memset(data, 0, sizeof(data));
+
+       /* allocation length */
+       cdb[3]=(data_size>>8)&0xff;
+       cdb[4]=data_size&0xff;
+
+
+       printf("Standard INQUIRY Data:\n");
+
+       res=scsi_io(fd, cdb, sizeof(cdb), SG_DXFER_FROM_DEV, data, &data_size, sense, &sense_len);
+       if(res){
+               printf("SCSI_IO failed\n");
+               return -1;
+       }
+       if(sense_len){
+               print_sense_data(sense, sense_len);
+               return -1;
+       }
+
+       /* Peripheral Qualifier */
+       printf("Peripheral Qualifier:%c%c%cb\n",
+               '0'+!!(data[0]&0x80),
+               '0'+!!(data[0]&0x40),
+               '0'+!!(data[0]&0x20));
+
+       /* Peripheral Device Type */
+       printf("Peripheral Device Type: 0x%02x (%s)\n",
+               data[0]&0x1f,
+               val_to_str(peripheral_device_types, data[0]&0x1f));
+
+       /* RMB */
+       printf("RMB: %s device\n", data[1]&0x80?"REMOVABLE":"NON-REMOVABLE");
+
+       /* SCSI Version */
+       printf("SCSI Version: 0x%02x (%s)\n",
+               data[2],
+               val_to_str(scsi_versions, data[2]));
+
+       /* NormACA, HiSUP, Response Data Format */
+       printf("NormACA:%d HiSup:%d ResponseDataFormat:%d\n",
+               !!(data[3]&0x20),
+               !!(data[3]&0x10),
+               data[3]&0x0f);
+
+       switch(data[3]&0x0f){
+       /*SPC-2/SPC-3/SPC-4*/
+       case 2:
+       /*SPC (not strictly correct but we print it like 2 anyway)*/
+       case 1:
+               /* SCCS ... */
+               printf("SCCS:%d ACC:%d TPGS:%c%cb 3PC:%d PROTECT:%d\n",
+                       !!(data[5]&0x80),
+                       !!(data[5]&0x40),
+                       '0'+!!(data[5]&0x20),
+                       '0'+!!(data[5]&0x10),
+                       !!(data[5]&0x08),
+                       !!(data[5]&0x01));
+
+               /* Encserv ... */
+               printf("Encserv:%d VS:%d MultiP:%d ADDR16:%d\n",
+                       !!(data[6]&0x40),
+                       !!(data[6]&0x20),
+                       !!(data[6]&0x10),
+                       !!(data[6]&0x01));
+
+               /* WBUS16 ... */
+               printf("WBUS16:%d SYNC:%d CmdQue:%d VS:%d\n",
+                       !!(data[7]&0x20),
+                       !!(data[7]&0x10),
+                       !!(data[7]&0x02),
+                       !!(data[7]&0x01));
+
+               /* T10 vendor Identification: 8 characters, not NUL terminated */
+               printf("Vendor:");
+               for(i=0;i<8;i++){
+                       printf("%c",data[8+i]);
+               }
+               printf("\n");
+
+               /* Product Identification: 16 characters */
+               printf("Product:");
+               for(i=0;i<16;i++){
+                       printf("%c",data[16+i]);
+               }
+               printf("\n");
+
+               /* Product Revision Level: 4 characters */
+               printf("Product Revision:");
+               for(i=0;i<4;i++){
+                       printf("%c",data[32+i]);
+               }
+               printf("\n");
+
+               break;
+       }
+
+       return 0;
+}
+
+/* Send INQUIRY with EVPD=1 for page 0x00 and print the list of VPD pages
+ * the device claims to support.
+ *
+ * Returns 0 on success, -1 if the command failed or returned sense data.
+ */
+int scsi_inquiry_supported_vpd_pages(int fd)
+{
+       unsigned char cdb[]={0x12,0x01,0,0,0,0};
+
+       unsigned int data_size=0xff;
+       unsigned char data[data_size];
+
+       unsigned int sense_len=32;
+       unsigned char sense[sense_len];
+
+       int res;
+       unsigned int end, i;
+
+       /* zero the buffer so a short response decodes as "no pages" */
+       memset(data, 0, sizeof(data));
+
+       cdb[3]=(data_size>>8)&0xff;
+       cdb[4]=data_size&0xff;
+
+
+       printf("INQUIRY Supported VPD Pages:\n");
+
+       res=scsi_io(fd, cdb, sizeof(cdb), SG_DXFER_FROM_DEV, data, &data_size, sense, &sense_len);
+       if(res){
+               printf("SCSI_IO failed\n");
+               return -1;
+       }
+       if(sense_len){
+               print_sense_data(sense, sense_len);
+               return -1;
+       }
+
+       /* Page Length: the list starts after the 4 byte header. The length
+          comes from the device and can be up to 255, which would run past
+          the end of the buffer, so clamp it. */
+       end=(unsigned int)data[3]+4;
+       if(end>sizeof(data)){
+               end=sizeof(data);
+       }
+
+       /* Pages */
+       for(i=4;i<end;i++){
+               printf("Page:%02xh (%s)\n",
+                       data[i],
+                       val_to_str(vpd_pages, data[i]));
+       }
+
+       return 0;
+}
+
+/* Send INQUIRY with EVPD=1 for page 0x80 and print the unit serial number.
+ *
+ * Returns 0 on success, -1 if the command failed or returned sense data.
+ */
+int scsi_inquiry_unit_serial_number(int fd)
+{
+       unsigned char cdb[]={0x12,0x01,0x80,0,0,0};
+
+       unsigned int data_size=0x00ff;
+       unsigned char data[data_size];
+
+       unsigned int sense_len=32;
+       unsigned char sense[sense_len];
+
+       int res;
+       unsigned int end, i;
+
+       /* zero the buffer so a short response decodes as an empty serial */
+       memset(data, 0, sizeof(data));
+
+       cdb[3]=(data_size>>8)&0xff;
+       cdb[4]=data_size&0xff;
+
+
+       printf("INQUIRY Unit Serial Number:\n");
+
+       res=scsi_io(fd, cdb, sizeof(cdb), SG_DXFER_FROM_DEV, data, &data_size, sense, &sense_len);
+       if(res){
+               printf("SCSI_IO failed\n");
+               return -1;
+       }
+       if(sense_len){
+               print_sense_data(sense, sense_len);
+               return -1;
+       }
+
+       /* Page Length: device supplied, up to 255, so clamp it to what the
+          buffer can hold after the 4 byte header */
+       end=(unsigned int)data[3]+4;
+       if(end>sizeof(data)){
+               end=sizeof(data);
+       }
+
+       /* Unit Serial Number */
+       printf("Unit Serial Number:");
+       for(i=4;i<end;i++){
+               printf("%c",data[i]&0xff);
+       }
+       printf("\n");
+
+       return 0;
+}
+
+/* PERSISTENT RESERVE IN, service action 0 (READ KEYS): print the
+ * PRGeneration counter and every registered 8 byte reservation key that
+ * fits in the response buffer.
+ *
+ * Returns 0 on success, -1 if the command failed or returned sense data.
+ */
+int scsi_persistent_reserve_in_read_keys(int fd)
+{
+       unsigned char cdb[]={0x5e,0,0,0,0,0,0,0,0,0};
+
+       unsigned int data_size=0x00ff;
+       unsigned char data[data_size];
+
+       unsigned int sense_len=32;
+       unsigned char sense[sense_len];
+       unsigned char service_action=0;
+       int res;
+       unsigned long prgeneration, additional_length, keys_len, off;
+
+       /* zero the buffer so a short response decodes as "no keys" */
+       memset(data, 0, sizeof(data));
+
+       cdb[1]=service_action;
+       cdb[7]=(data_size>>8)&0xff;
+       cdb[8]=data_size&0xff;
+
+
+       printf("PRESISTENT RESERVE IN: READ KEYS\n");
+
+       res=scsi_io(fd, cdb, sizeof(cdb), SG_DXFER_FROM_DEV, data, &data_size, sense, &sense_len);
+       if(res){
+               printf("SCSI_IO failed\n");
+               return -1;
+       }
+       if(sense_len){
+               print_sense_data(sense, sense_len);
+               return -1;
+       }
+
+       /* PRGeneration */
+       prgeneration=data[0];
+       prgeneration<<=8;prgeneration|=data[1];
+       prgeneration<<=8;prgeneration|=data[2];
+       prgeneration<<=8;prgeneration|=data[3];
+       printf("PRGeneration:%lu\n", prgeneration);
+
+       /* Additional Length */
+       additional_length=data[4];
+       additional_length<<=8;additional_length|=data[5];
+       additional_length<<=8;additional_length|=data[6];
+       additional_length<<=8;additional_length|=data[7];
+       printf("Additional Length:%lu\n", additional_length);
+
+       /* Additional Length describes the complete key list on the device,
+          which can be far larger than our buffer. Only walk the part that
+          was actually transferred. */
+       keys_len=additional_length;
+       if(keys_len>sizeof(data)-8){
+               keys_len=sizeof(data)-8;
+               printf("Key list truncated to the first %lu bytes\n", keys_len);
+       }
+
+       /* print the registered keys */
+       for(off=0;off+8<=keys_len;off+=8){
+               printf("Key:%02x%02x%02x%02x%02x%02x%02x%02x\n",
+                       data[off+8],
+                       data[off+9],
+                       data[off+10],
+                       data[off+11],
+                       data[off+12],
+                       data[off+13],
+                       data[off+14],
+                       data[off+15]);
+       }
+
+       return 0;
+}
+
+/* PERSISTENT RESERVE IN, service action 1 (READ RESERVATION): print the
+ * PRGeneration counter, the Additional Length and, when exactly one
+ * 16 byte reservation descriptor is reported, its key, scope and type.
+ *
+ * Returns 0 on success, -1 if the command failed or returned sense data.
+ */
+int scsi_persistent_reserve_in_read_reservation(int fd)
+{
+       unsigned char cdb[]={0x5e,0,0,0,0,0,0,0,0,0};
+       unsigned char service_action=1;
+
+       unsigned int data_size=0x00ff;
+       unsigned char data[data_size];
+
+       unsigned int sense_len=32;
+       unsigned char sense[sense_len];
+
+       unsigned long prgeneration=0, additional_length=0;
+       int res, n;
+
+       cdb[1]=service_action;
+       cdb[7]=(data_size>>8)&0xff;
+       cdb[8]=data_size&0xff;
+
+       printf("PRESISTENT RESERVE IN: READ RESERVATION\n");
+
+       res=scsi_io(fd, cdb, sizeof(cdb), SG_DXFER_FROM_DEV, data, &data_size, sense, &sense_len);
+       if(res!=0){
+               printf("SCSI_IO failed\n");
+               return -1;
+       }
+       if(sense_len!=0){
+               print_sense_data(sense, sense_len);
+               return -1;
+       }
+
+       /* both header fields are 32 bit big-endian integers:
+          PRGeneration in bytes 0-3, Additional Length in bytes 4-7 */
+       for(n=0;n<4;n++){
+               prgeneration=(prgeneration<<8)|data[n];
+               additional_length=(additional_length<<8)|data[4+n];
+       }
+       printf("PRGeneration:%lu\n", prgeneration);
+       printf("Additional Length:%lu\n", additional_length);
+
+       /* nothing more to decode unless one reservation descriptor follows */
+       if(additional_length!=16){
+               return 0;
+       }
+
+       printf("Key:%02x%02x%02x%02x%02x%02x%02x%02x\n",
+               data[8], data[9], data[10], data[11],
+               data[12], data[13], data[14], data[15]);
+       printf("Scope:%xh Type:%xh\n",data[21]>>4,data[21]&0x0f);
+
+       return 0;
+}
+
+/* PERSISTENT RESERVE IN, service action 2 (REPORT CAPABILITIES): print the
+ * capability flags and the persistent reservation type mask.
+ *
+ * Returns 0 on success, -1 if the command failed or returned sense data.
+ */
+int scsi_persistent_reserve_in_report_capabilities(int fd)
+{
+       unsigned char cdb[]={0x5e,0,0,0,0,0,0,0,0,0};
+
+       unsigned int data_size=0x00ff;
+       unsigned char data[data_size];
+
+       unsigned int sense_len=32;
+       unsigned char sense[sense_len];
+       unsigned char service_action=2;
+       int res;
+       unsigned short length, type_mask;
+
+       /* zero the buffer so a short response decodes as all flags clear */
+       memset(data, 0, sizeof(data));
+
+       cdb[1]=service_action;
+       cdb[7]=(data_size>>8)&0xff;
+       cdb[8]=data_size&0xff;
+
+
+       printf("PRESISTENT RESERVE IN: REPORT CAPABILITIES\n");
+
+       res=scsi_io(fd, cdb, sizeof(cdb), SG_DXFER_FROM_DEV, data, &data_size, sense, &sense_len);
+       if(res){
+               printf("SCSI_IO failed\n");
+               return -1;
+       }
+       if(sense_len){
+               print_sense_data(sense, sense_len);
+               return -1;
+       }
+
+       /* Length */
+       length=data[0];
+       length<<=8;length|=data[1];
+       printf("Length:%d\n", length);
+
+       /* CRH ... */
+       printf("CRH:%d SIP_C:%d ATP_C:%d PTPL_C:%d\n",
+               !!(data[2]&0x10),
+               !!(data[2]&0x08),
+               !!(data[2]&0x04),
+               !!(data[2]&0x01));
+
+       /* TMV ... */
+       printf("TMV:%d ALLOW_COMMANDS:%c%c%cb PTPL_A:%d\n",
+               !!(data[3]&0x80),
+               '0'+(!!(data[3]&0x40)),
+               '0'+(!!(data[3]&0x20)),
+               '0'+(!!(data[3]&0x10)),
+               !!(data[3]&0x01));
+
+       /* Persistent Reservation Type Mask (bytes 4-5).
+          Byte 4 carries WR_EX_AR, EX_AC_RO, WR_EX_RO, EX_AC and WR_EX;
+          EX_AC_AR is bit 0 of byte 5, not of byte 4. */
+       type_mask=data[4];
+       type_mask<<=8;type_mask|=data[5];
+       printf("Presistent Reservation Type Mask:0x%04x\n", type_mask);
+       printf("WR_EX_AR:%d EX_AC_RO:%d WR_EX_RO:%d EX_AC:%d WR_EX:%d EX_AC_AR:%d\n",
+               !!(data[4]&0x80),
+               !!(data[4]&0x40),
+               !!(data[4]&0x20),
+               !!(data[4]&0x08),
+               !!(data[4]&0x02),
+               !!(data[5]&0x01));
+
+       return 0;
+}
+
+/* PERSISTENT RESERVE IN, service action 3 (READ FULL STATUS).
+ * Only the 8 byte header (PRGeneration and Additional Length) is printed;
+ * the descriptors that follow it are not decoded.
+ *
+ * Returns 0 on success, -1 if the command failed or returned sense data.
+ */
+int scsi_persistent_reserve_in_read_full_status(int fd)
+{
+       unsigned char cdb[]={0x5e,0,0,0,0,0,0,0,0,0};
+       unsigned char service_action=3;
+
+       unsigned int data_size=0x00ff;
+       unsigned char data[data_size];
+
+       unsigned int sense_len=32;
+       unsigned char sense[sense_len];
+
+       unsigned long prgeneration=0, additional_length=0;
+       int res, n;
+
+       cdb[1]=service_action;
+       cdb[7]=(data_size>>8)&0xff;
+       cdb[8]=data_size&0xff;
+
+       printf("PRESISTENT RESERVE IN: READ FULL STATUS\n");
+
+       res=scsi_io(fd, cdb, sizeof(cdb), SG_DXFER_FROM_DEV, data, &data_size, sense, &sense_len);
+       if(res!=0){
+               printf("SCSI_IO failed\n");
+               return -1;
+       }
+       if(sense_len!=0){
+               print_sense_data(sense, sense_len);
+               return -1;
+       }
+
+       /* both header fields are 32 bit big-endian integers:
+          PRGeneration in bytes 0-3, Additional Length in bytes 4-7 */
+       for(n=0;n<4;n++){
+               prgeneration=(prgeneration<<8)|data[n];
+               additional_length=(additional_length<<8)|data[4+n];
+       }
+       printf("PRGeneration:%lu\n", prgeneration);
+       printf("Additional Length:%lu\n", additional_length);
+
+/*XXX*/
+
+       return 0;
+}
+
+/* PERSISTENT RESERVE OUT, service action 3 (CLEAR).
+ *
+ * Uses the global --scope, --type and --key options; the process exits
+ * with status 10 if one of them is missing or the key is not hexadecimal.
+ *
+ * Returns 0 on success, -1 if the command failed or returned sense data.
+ */
+int scsi_persistent_reserve_out_clear(int fd)
+{
+       static const char usage_line[] =
+               "scsi_io --device=<DEVICE> --command=clear --scope=<SCOPE> --type=<TYPE> --key=<KEY>\n";
+       unsigned char cdb[]={0x5f,0,0,0,0,0,0,0,0,0};
+
+       unsigned int data_size=24;
+       unsigned char data[data_size];
+
+       unsigned int sense_len=32;
+       unsigned char sense[sense_len];
+       unsigned char service_action=3;
+       int res;
+       const char *missing=NULL;
+
+       unsigned long long k=0;
+
+       if (scope==-1) {
+               missing="scope";
+       } else if (type==-1) {
+               missing="type";
+       } else if (!key) {
+               missing="key";
+       }
+       if (missing) {
+               printf("Must specify %s\n", missing);
+               printf("%s", usage_line);
+               /* _exit() does not flush stdio buffers */
+               fflush(stdout);
+               _exit(10);
+       }
+
+       /* a key that does not parse would otherwise leave k uninitialised */
+       if (sscanf(key, "%llx", &k) != 1) {
+               printf("Invalid key '%s', expected a hexadecimal number\n", key);
+               printf("%s", usage_line);
+               fflush(stdout);
+               _exit(10);
+       }
+
+       cdb[1]=service_action;
+       cdb[2]=(scope<<4)|type;
+       cdb[7]=(data_size>>8)&0xff;
+       cdb[8]=data_size&0xff;
+
+       /* the Service Action Key (bytes 8-15) stays all zero */
+       memset(data, 0, data_size);
+
+       /* Reservation Key, big-endian */
+       data[0]=(k>>56)&0xff;
+       data[1]=(k>>48)&0xff;
+       data[2]=(k>>40)&0xff;
+       data[3]=(k>>32)&0xff;
+       data[4]=(k>>24)&0xff;
+       data[5]=(k>>16)&0xff;
+       data[6]=(k>> 8)&0xff;
+       data[7]=(k    )&0xff;
+
+       /* Spec_ip_ti=0 all_tg_pt=1 aptpl=0 */
+       data[20]=0x04;
+
+       printf("PRESISTENT RESERVE IN: CLEAR\n");
+
+       res=scsi_io(fd, cdb, sizeof(cdb), SG_DXFER_TO_DEV, data, &data_size, sense, &sense_len);
+       if(res){
+               printf("SCSI_IO failed\n");
+               return -1;
+       }
+       if(sense_len){
+               print_sense_data(sense, sense_len);
+               return -1;
+       }
+
+       return 0;
+}
+
+/* PERSISTENT RESERVE OUT, service action 1 (RESERVE).
+ *
+ * Uses the global --scope, --type and --key options; the process exits
+ * with status 10 if one of them is missing or the key is not hexadecimal.
+ *
+ * Returns 0 on success, -1 if the command failed or returned sense data.
+ */
+int scsi_persistent_reserve_out_reserve(int fd)
+{
+       static const char usage_line[] =
+               "scsi_io --device=<DEVICE> --command=reserve --scope=<SCOPE> --type=<TYPE> --key=<KEY>\n";
+       unsigned char cdb[]={0x5f,0,0,0,0,0,0,0,0,0};
+
+       unsigned int data_size=24;
+       unsigned char data[data_size];
+
+       unsigned int sense_len=32;
+       unsigned char sense[sense_len];
+       unsigned char service_action=1;
+       int res;
+       const char *missing=NULL;
+       unsigned long long k=0;
+
+       if (scope==-1) {
+               missing="scope";
+       } else if (type==-1) {
+               missing="type";
+       } else if (!key) {
+               missing="key";
+       }
+       if (missing) {
+               printf("Must specify %s\n", missing);
+               printf("%s", usage_line);
+               /* _exit() does not flush stdio buffers */
+               fflush(stdout);
+               _exit(10);
+       }
+
+       /* a key that does not parse would otherwise leave k uninitialised */
+       if (sscanf(key, "%llx", &k) != 1) {
+               printf("Invalid key '%s', expected a hexadecimal number\n", key);
+               printf("%s", usage_line);
+               fflush(stdout);
+               _exit(10);
+       }
+
+       cdb[1]=service_action;
+       cdb[2]=(scope<<4)|type;
+       cdb[7]=(data_size>>8)&0xff;
+       cdb[8]=data_size&0xff;
+
+       /* the Service Action Key (bytes 8-15) stays all zero */
+       memset(data, 0, data_size);
+
+       /* Reservation Key, big-endian */
+       data[0]=(k>>56)&0xff;
+       data[1]=(k>>48)&0xff;
+       data[2]=(k>>40)&0xff;
+       data[3]=(k>>32)&0xff;
+       data[4]=(k>>24)&0xff;
+       data[5]=(k>>16)&0xff;
+       data[6]=(k>> 8)&0xff;
+       data[7]=(k    )&0xff;
+
+       /* Spec_ip_ti=0 all_tg_pt=1 aptpl=0 */
+       data[20]=0x04;
+
+       printf("PRESISTENT RESERVE IN: RESERVE\n");
+
+       res=scsi_io(fd, cdb, sizeof(cdb), SG_DXFER_TO_DEV, data, &data_size, sense, &sense_len);
+       if(res){
+               printf("SCSI_IO failed\n");
+               return -1;
+       }
+       if(sense_len){
+               print_sense_data(sense, sense_len);
+               return -1;
+       }
+
+       return 0;
+}
+
+/* PERSISTENT RESERVE OUT, service action 4 (PREEMPT): using our own
+ * registered key (--key), remove the registration identified by --rmkey.
+ *
+ * Uses the global --scope, --type, --key and --rmkey options; the process
+ * exits with status 10 if one of them is missing or a key is not
+ * hexadecimal.
+ *
+ * Returns 0 on success, -1 if the command failed or returned sense data.
+ */
+int scsi_persistent_reserve_out_preempt(int fd)
+{
+       static const char usage_line[] =
+               "scsi_io --device=<DEVICE> --command=preempt --scope=<SCOPE> --type=<TYPE> --key=<KEY> --rmkey=<KEY>\n";
+       unsigned char cdb[]={0x5f,0,0,0,0,0,0,0,0,0};
+
+       unsigned int data_size=24;
+       unsigned char data[data_size];
+
+       unsigned int sense_len=32;
+       unsigned char sense[sense_len];
+       unsigned char service_action=4;
+       int res;
+       const char *missing=NULL;
+       unsigned long long k=0, rk=0;
+
+       if (scope==-1) {
+               missing="scope";
+       } else if (type==-1) {
+               missing="type";
+       } else if (!key) {
+               missing="key";
+       } else if (!rmkey) {
+               missing="rmkey";
+       }
+       if (missing) {
+               printf("Must specify %s\n", missing);
+               printf("%s", usage_line);
+               /* _exit() does not flush stdio buffers */
+               fflush(stdout);
+               _exit(10);
+       }
+
+       /* a key that does not parse would otherwise be used uninitialised */
+       if (sscanf(key, "%llx", &k) != 1) {
+               printf("Invalid key '%s', expected a hexadecimal number\n", key);
+               printf("%s", usage_line);
+               fflush(stdout);
+               _exit(10);
+       }
+       if (sscanf(rmkey, "%llx", &rk) != 1) {
+               printf("Invalid rmkey '%s', expected a hexadecimal number\n", rmkey);
+               printf("%s", usage_line);
+               fflush(stdout);
+               _exit(10);
+       }
+
+       cdb[1]=service_action;
+       cdb[2]=(scope<<4)|type;
+       cdb[7]=(data_size>>8)&0xff;
+       cdb[8]=data_size&0xff;
+
+       memset(data, 0, data_size);
+
+       /* Reservation Key: our own registration, big-endian */
+       data[0]=(k>>56)&0xff;
+       data[1]=(k>>48)&0xff;
+       data[2]=(k>>40)&0xff;
+       data[3]=(k>>32)&0xff;
+       data[4]=(k>>24)&0xff;
+       data[5]=(k>>16)&0xff;
+       data[6]=(k>> 8)&0xff;
+       data[7]=(k    )&0xff;
+
+       /* Service Action Key: the registration to remove, big-endian */
+       data[8] =(rk>>56)&0xff;
+       data[9] =(rk>>48)&0xff;
+       data[10]=(rk>>40)&0xff;
+       data[11]=(rk>>32)&0xff;
+       data[12]=(rk>>24)&0xff;
+       data[13]=(rk>>16)&0xff;
+       data[14]=(rk>> 8)&0xff;
+       data[15]=(rk    )&0xff;
+
+       /* Spec_ip_ti=0 all_tg_pt=1 aptpl=0 */
+       data[20]=0x04;
+
+       /* this banner used to say RESERVE (copy and paste from the reserve
+          command) */
+       printf("PRESISTENT RESERVE IN: PREEMPT\n");
+
+       res=scsi_io(fd, cdb, sizeof(cdb), SG_DXFER_TO_DEV, data, &data_size, sense, &sense_len);
+       if(res){
+               printf("SCSI_IO failed\n");
+               return -1;
+       }
+       if(sense_len){
+               print_sense_data(sense, sense_len);
+               return -1;
+       }
+
+       return 0;
+}
+
+/* PERSISTENT RESERVE OUT, service action 6 (REGISTER AND IGNORE EXISTING
+ * KEY): register --key as the Service Action Key, with the Reservation
+ * Key left at zero.
+ *
+ * Uses the global --scope, --type and --key options; the process exits
+ * with status 10 if one of them is missing or the key is not hexadecimal.
+ *
+ * Returns 0 on success, -1 if the command failed or returned sense data.
+ */
+int scsi_persistent_reserve_out_register_and_ignore_existing_key(int fd)
+{
+       static const char usage_line[] =
+               "scsi_io --device=<DEVICE> --command=registerkey --scope=<SCOPE> --type=<TYPE> --key=<KEY>\n";
+       unsigned char cdb[]={0x5f,0,0,0,0,0,0,0,0,0};
+
+       unsigned int data_size=24;
+       unsigned char data[data_size];
+
+       unsigned int sense_len=32;
+       unsigned char sense[sense_len];
+       unsigned char service_action=6;
+       int res;
+       const char *missing=NULL;
+       unsigned long long k=0;
+
+       if (scope==-1) {
+               missing="scope";
+       } else if (type==-1) {
+               missing="type";
+       } else if (!key) {
+               missing="key";
+       }
+       if (missing) {
+               printf("Must specify %s\n", missing);
+               printf("%s", usage_line);
+               /* _exit() does not flush stdio buffers */
+               fflush(stdout);
+               _exit(10);
+       }
+
+       /* a key that does not parse would otherwise leave k uninitialised */
+       if (sscanf(key, "%llx", &k) != 1) {
+               printf("Invalid key '%s', expected a hexadecimal number\n", key);
+               printf("%s", usage_line);
+               fflush(stdout);
+               _exit(10);
+       }
+
+       cdb[1]=service_action;
+       cdb[2]=(scope<<4)|type;
+       cdb[7]=(data_size>>8)&0xff;
+       cdb[8]=data_size&0xff;
+
+       /* the Reservation Key (bytes 0-7) stays all zero */
+       memset(data, 0, data_size);
+
+       /* Service Action Key: the key being registered, big-endian */
+       data[8] =(k>>56)&0xff;
+       data[9] =(k>>48)&0xff;
+       data[10]=(k>>40)&0xff;
+       data[11]=(k>>32)&0xff;
+       data[12]=(k>>24)&0xff;
+       data[13]=(k>>16)&0xff;
+       data[14]=(k>> 8)&0xff;
+       data[15]=(k    )&0xff;
+
+       /* Spec_ip_ti=0 all_tg_pt=1 aptpl=0 */
+       data[20]=0x04;
+
+       printf("PRESISTENT RESERVE IN: REGISTER AND IGNORE EXISTING KEY\n");
+
+       res=scsi_io(fd, cdb, sizeof(cdb), SG_DXFER_TO_DEV, data, &data_size, sense, &sense_len);
+       if(res){
+               printf("SCSI_IO failed\n");
+               return -1;
+       }
+       if(sense_len){
+               print_sense_data(sense, sense_len);
+               return -1;
+       }
+
+       return 0;
+}
+
+/* Unregister a key: PERSISTENT RESERVE OUT, service action 6, sent with
+ * --key as the Reservation Key and an all-zero Service Action Key.
+ *
+ * Uses the global --scope, --type and --key options; the process exits
+ * with status 10 if one of them is missing or the key is not hexadecimal.
+ *
+ * Returns 0 on success, -1 if the command failed or returned sense data.
+ */
+int scsi_persistent_reserve_out_unregister_key(int fd)
+{
+       static const char usage_line[] =
+               "scsi_io --device=<DEVICE> --command=unregisterkey --scope=<SCOPE> --type=<TYPE> --key=<KEY>\n";
+       unsigned char cdb[]={0x5f,0,0,0,0,0,0,0,0,0};
+
+       unsigned int data_size=24;
+       unsigned char data[data_size];
+
+       unsigned int sense_len=32;
+       unsigned char sense[sense_len];
+       unsigned char service_action=6;
+       int res;
+       const char *missing=NULL;
+       unsigned long long k=0;
+
+       if (scope==-1) {
+               missing="scope";
+       } else if (type==-1) {
+               missing="type";
+       } else if (!key) {
+               missing="key";
+       }
+       if (missing) {
+               printf("Must specify %s\n", missing);
+               printf("%s", usage_line);
+               /* _exit() does not flush stdio buffers */
+               fflush(stdout);
+               _exit(10);
+       }
+
+       /* a key that does not parse would otherwise leave k uninitialised */
+       if (sscanf(key, "%llx", &k) != 1) {
+               printf("Invalid key '%s', expected a hexadecimal number\n", key);
+               printf("%s", usage_line);
+               fflush(stdout);
+               _exit(10);
+       }
+
+       cdb[1]=service_action;
+       cdb[2]=(scope<<4)|type;
+       cdb[7]=(data_size>>8)&0xff;
+       cdb[8]=data_size&0xff;
+
+       /* the Service Action Key (bytes 8-15) stays all zero */
+       memset(data, 0, data_size);
+
+       /* Reservation Key, big-endian */
+       data[0]=(k>>56)&0xff;
+       data[1]=(k>>48)&0xff;
+       data[2]=(k>>40)&0xff;
+       data[3]=(k>>32)&0xff;
+       data[4]=(k>>24)&0xff;
+       data[5]=(k>>16)&0xff;
+       data[6]=(k>> 8)&0xff;
+       data[7]=(k    )&0xff;
+
+       /* Spec_ip_ti=0 all_tg_pt=1 aptpl=0 */
+       data[20]=0x04;
+
+       printf("PRESISTENT RESERVE IN: UNREGISTER KEY\n");
+
+       res=scsi_io(fd, cdb, sizeof(cdb), SG_DXFER_TO_DEV, data, &data_size, sense, &sense_len);
+       if(res){
+               printf("SCSI_IO failed\n");
+               return -1;
+       }
+       if(sense_len){
+               print_sense_data(sense, sense_len);
+               return -1;
+       }
+
+       return 0;
+}
+
+
+
+
+/* Open dev read/write and verify that it is driven by an sg driver of
+ * version 3.0 or later (needed for the SG_IO interface used here).
+ *
+ * Returns the open file descriptor, or -1 after printing the reason.
+ */
+int open_scsi_device(const char *dev)
+{
+       int fd, vers;
+
+       if((fd=open(dev, O_RDWR))<0){
+               printf("ERROR could not open device %s: %s\n", dev, strerror(errno));
+               return -1;
+       }
+       if ((ioctl(fd, SG_GET_VERSION_NUM, &vers) < 0) || (vers < 30000)) {
+               /* name the device that was tested instead of a literal "/dev" */
+               printf("%s is not an sg device, or old sg driver\n", dev);
+               close(fd);
+               return -1;
+       }
+
+       return fd;
+}
+
+/* Signature shared by all command handlers: they take the open device
+   descriptor and return an int status */
+typedef int (*scsi_func_t)(int fd);
+/* One --command entry: the name given on the command line, the handler
+   that implements it and a one line description printed by usage() */
+typedef struct _cmds_t {
+       const char *cmd;
+       scsi_func_t func;
+       const char *comment;
+} cmds_t;
+/* Table of supported commands, searched by name in main() */
+cmds_t cmds[] = {
+       {"inq",         scsi_inquiry,   "Standard INQUIRY output"},
+       {"vpd",         scsi_inquiry_supported_vpd_pages,       "Supported VPD Pages"},
+       {"usn",         scsi_inquiry_unit_serial_number,        "Unit serial number"},
+       {"readkeys",    scsi_persistent_reserve_in_read_keys,   "Read SCSI Reservation Keys"},
+       {"readrsvr",    scsi_persistent_reserve_in_read_reservation, "Read SCSI Reservation Data"},
+       {"reportcap",   scsi_persistent_reserve_in_report_capabilities, "Report reservation Capabilities"},
+       {"registerkey", scsi_persistent_reserve_out_register_and_ignore_existing_key,   "Register and ignore existing key"},
+       {"unregisterkey", scsi_persistent_reserve_out_unregister_key, "Unregister a key"},
+       {"clear",       scsi_persistent_reserve_out_clear, "Clear all reservations and registrations"},
+       {"reserve",     scsi_persistent_reserve_out_reserve, "Reserve"},
+       {"preempt",     scsi_persistent_reserve_out_preempt, "Preempt (remove someone elses registration)"},
+};
+
+/* Print the command line synopsis followed by one line per entry of the
+   cmds table */
+void usage(void)
+{
+       const cmds_t *c;
+       const cmds_t *end = cmds + sizeof(cmds)/sizeof(cmds[0]);
+
+       printf("Usage:  scsi_io --command <command> --device <device>\n");
+       printf("Commands:\n");
+       for (c=cmds; c!=end; c++) {
+               printf("        %s      %s\n", c->cmd, c->comment);
+       }
+}
+
+
+int main(int argc, const char *argv[])
+{
+       int i, fd;
+       int opt;
+       scsi_func_t func=NULL;
+       struct poptOption popt_options[] = {
+               POPT_AUTOHELP
+               { "scope", 's', POPT_ARG_INT, &scope, 0, "scope", "integer" },
+               { "type", 't', POPT_ARG_INT, &type, 0, "type", "integer" },
+               { "key",      'k', POPT_ARG_STRING, &key, 0, "key", "key" },
+               { "rmkey",      'r', POPT_ARG_STRING, &rmkey, 0, "rmkey", "rmkey" },
+               { "command",      'c', POPT_ARG_STRING, &command, 0, "command", "command" },
+               { "device",      'd', POPT_ARG_STRING, &device, 0, "device", "device" },
+//             { "machinereadable", 'Y', POPT_ARG_NONE, &options.machinereadable, 0, "enable machinereadable output", NULL },
+               POPT_TABLEEND
+       };
+       poptContext pc;
+
+       pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+       while ((opt = poptGetNextOpt(pc)) != -1) {
+               switch (opt) {
+               default:
+                       fprintf(stderr, "Invalid option %s: %s\n", 
+                               poptBadOption(pc, 0), poptStrerror(opt));
+                       _exit(1);
+               }
+       }
+
+       if (!command) {
+               printf("Must specify the command\n");
+               usage();
+               _exit(10);
+       }
+
+       if (!device) {
+               printf("Must specify the device\n");
+               usage();
+               _exit(10);
+       }
+
+       fd=open_scsi_device(device);
+       if(fd<0){
+               printf("Could not open SCSI device %s\n",device);
+               usage();
+               _exit(10);
+       }
+
+       for (i=0;i<sizeof(cmds)/sizeof(cmds[0]);i++){
+               if(!strcmp(cmds[i].cmd, command)) {
+                       func = cmds[i].func;
+                       break;
+               }               
+       }
+       if (!func) {
+               printf("Unrecognized command : %s\n", command);
+               usage();
+               _exit(10);
+       }
+
+       func(fd);
+
+#if 0
+       scsi_persistent_reserve_in_read_full_status(fd);
+       scsi_persistent_reserve_out_register_and_ignore_existing_key(fd);
+       scsi_persistent_reserve_in_read_keys(fd);
+
+       scsi_persistent_reserve_out_reserve(fd);
+       scsi_persistent_reserve_in_read_reservation(fd);
+
+       scsi_persistent_reserve_out_clear(fd);
+       scsi_persistent_reserve_in_read_reservation(fd);
+
+       scsi_persistent_reserve_out_unregister_key(fd);
+       scsi_persistent_reserve_in_read_keys(fd);
+#endif
+       return 0;
+}
diff --git a/ctdb/utils/smnotify/smnotify.c b/ctdb/utils/smnotify/smnotify.c
new file mode 100644 (file)
index 0000000..d7fd546
--- /dev/null
@@ -0,0 +1,150 @@
+/* 
+   simple smnotify tool
+
+   Copyright (C) Ronnie Sahlberg 2007
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+   
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+   
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include "smnotify.h"
+#include "popt.h"
+
+static char *client       = NULL;
+static const char *ip     = NULL;
+static char *server = NULL;
+static int stateval       = 0;
+static int clientport     = 0;
+static int sendport       = 0;
+
+/*
+  Print a one-line usage summary to stderr and terminate the process.
+
+  This is only called after a mandatory option was found to be missing,
+  so the exit status is non-zero to let calling scripts detect the error.
+*/
+static void useage(void)
+{
+       fprintf(stderr,
+               "Usage: smnotify --client=<hostname/ip> --ip=<ip> "
+               "--server=<hostname/ip> --stateval=<integer> "
+               "[--clientport=<integer>] [--sendport=<integer>]\n");
+       exit(1);
+}
+
+/*
+  Create a UDP socket bound to a specific local address.
+
+  addr: local IPv4 address in the textual form accepted by inet_aton()
+  port: local port in host byte order (converted with htons() here)
+
+  Returns the bound socket descriptor. On any failure a message is
+  printed and the process exits with status 10.
+*/
+static int create_socket(const char *addr, int port)
+{
+       int s;
+       struct sockaddr_in sock_in;
+
+       s = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP);
+       if (s == -1) {
+               printf("Failed to open local socket\n");
+               exit(10);
+       }
+
+       memset(&sock_in, 0, sizeof(sock_in));
+       sock_in.sin_family = AF_INET;
+       sock_in.sin_port   = htons(port);
+
+       /* inet_aton() returns 0 for a malformed address. Without this
+          check the zeroed sin_addr would silently bind to INADDR_ANY
+          and the packet would leave from the wrong source address. */
+       if (inet_aton(addr, &sock_in.sin_addr) == 0) {
+               printf("Invalid local ip address %s\n", addr);
+               close(s);
+               exit(10);
+       }
+
+       if (bind(s, (struct sockaddr *)&sock_in, sizeof(sock_in)) == -1) {
+               printf("Failed to bind to local socket\n");
+               close(s);
+               exit(10);
+       }
+
+       return s;
+}
+
+/*
+  Send one SM_NOTIFY RPC (program 100024, version 1) over UDP.
+
+  The datagram is sent from the local address given by --ip/--sendport
+  to the host given by --client/--clientport, carrying --server as the
+  monitor name and --stateval as the state. No reply is awaited.
+
+  Returns 0 once the notification has been handed to the RPC layer.
+  Unparsable options exit with status 1; an invalid client address or
+  a failure to create the RPC client exits with status 10. Missing
+  mandatory options are reported and then handled by useage().
+*/
+int main(int argc, const char *argv[])
+{
+       struct poptOption popt_options[] = {
+               POPT_AUTOHELP
+               { "client", 'c', POPT_ARG_STRING, &client, 0, "remote client to send the notify to", "hostname/ip" },
+               { "clientport", 0, POPT_ARG_INT, &clientport, 0, "clientport", "integer" },
+               { "ip", 'i', POPT_ARG_STRING, &ip, 0, "local ip address to send the notification from", "ip" },
+               { "sendport", 0, POPT_ARG_INT, &sendport, 0, "port to send the notify from", "integer" },
+               { "server", 's', POPT_ARG_STRING, &server, 0, "servername to use in the notification", "hostname/ip" },
+               { "stateval", 0, POPT_ARG_INT, &stateval, 0, "stateval", "integer" },
+               POPT_TABLEEND
+       };
+       int opt;
+       poptContext pc;
+       CLIENT *clnt;
+       int s;
+       struct sockaddr_in sock_cl;
+       struct timeval w;
+       struct status st;
+
+       pc = poptGetContext(argv[0], argc, argv, popt_options, POPT_CONTEXT_KEEP_FIRST);
+
+       /* Every table entry has val 0, so popt stores the values itself
+          and anything other than -1 returned here is a parse error. */
+       while ((opt = poptGetNextOpt(pc)) != -1) {
+               switch (opt) {
+               default:
+                       fprintf(stderr, "Invalid option %s: %s\n", 
+                               poptBadOption(pc, 0), poptStrerror(opt));
+                       exit(1);
+               }
+       }
+
+       if (client == NULL) {
+               printf("ERROR: client not specified\n");
+               useage();
+       }
+
+       if (ip == NULL) {
+               printf("ERROR: ip not specified\n");
+               useage();
+       }
+
+       if (server == NULL) {
+               printf("ERROR: server not specified\n");
+               useage();
+       }
+
+       /* 0 is the "not given" default, so it cannot be sent as a state */
+       if (stateval == 0) {
+               printf("ERROR: stateval not specified\n");
+               useage();
+       }
+
+
+       /* Since we want to control from which address these packets are
+          sent we must create the socket ourselves and use low-level rpc
+          calls.
+       */
+       s = create_socket(ip, sendport);
+
+       /* only wait for at most 3 seconds before giving up */
+       alarm(3);
+
+       /* Setup a sockaddr_in for the client we want to notify */
+       memset(&sock_cl, 0, sizeof(sock_cl));
+       sock_cl.sin_family = AF_INET;
+       sock_cl.sin_port   = htons(clientport);
+
+       /* inet_aton() only understands numeric addresses and returns 0
+          otherwise. Without this check a hostname or typo would leave
+          sin_addr zeroed and the notify would go to 0.0.0.0. */
+       if (inet_aton(client, &sock_cl.sin_addr) == 0) {
+               printf("ERROR: invalid client address %s\n", client);
+               exit(10);
+       }
+
+       w.tv_sec = 1;
+       w.tv_usec= 0;
+
+       clnt = clntudp_create(&sock_cl, 100024, 1, w, &s);
+       if (clnt == NULL) {
+               printf("ERROR: failed to connect to client\n");
+               exit(10);
+       }
+
+       /* we dont want to wait for any reply */
+       w.tv_sec = 0;
+       w.tv_usec = 0;
+       clnt_control(clnt, CLSET_TIMEOUT, (char *)&w);
+
+       /* The result is deliberately ignored: with a zero timeout the
+          call returns without waiting for the peer to answer. */
+       st.mon_name=server;
+       st.state=stateval;
+       sm_notify_1(&st, clnt);
+
+       /* Release everything we acquired. The option strings are still
+          referenced by st until here, so the popt context goes last. */
+       clnt_destroy(clnt);
+       close(s);
+       poptFreeContext(pc);
+
+       return 0;
+}
diff --git a/ctdb/utils/smnotify/smnotify.x b/ctdb/utils/smnotify/smnotify.x
new file mode 100644 (file)
index 0000000..94239f8
--- /dev/null
@@ -0,0 +1,21 @@
+#ifdef RPC_HDR
+%#ifdef _AIX
+%#include <rpc/rpc.h>
+%#endif /* _AIX */
+#endif /* RPC_HDR */
+
+const SM_MAXSTRLEN = 1024;
+
+/* Argument of SM_NOTIFY: a name string of at most SM_MAXSTRLEN bytes
+   plus an integer state value. */
+struct status {
+       string mon_name<SM_MAXSTRLEN>;
+       int state;
+};
+
+
+/* RPC program 100024, version 1. It exposes a single procedure,
+   SM_NOTIFY (procedure number 6), which takes a struct status and
+   returns nothing. */
+program SMNOTIFY {
+       version SMVERSION {
+               void SM_NOTIFY(struct status) = 6;
+       } = 1;  
+} = 100024;
+
+
diff --git a/ctdb/web/bar1.jpg b/ctdb/web/bar1.jpg
new file mode 100644 (file)
index 0000000..7c6acf3
Binary files /dev/null and b/ctdb/web/bar1.jpg differ
diff --git a/ctdb/web/building.html b/ctdb/web/building.html
new file mode 100644 (file)
index 0000000..7475078
--- /dev/null
@@ -0,0 +1,42 @@
+<!--#set var="TITLE" value="Building CTDB" -->
+<!--#include virtual="header.html" -->
+
+<H2 align="center">Building CTDB and Samba</h2>
+
+<h2>CTDB</h2>
+To build a copy of CTDB code from a git tree you should do this:
+<pre>
+   cd ctdb
+   ./autogen.sh
+   ./configure
+   make
+   make install
+</pre>
+
+To build a copy of CTDB code from a tarball you should do this:
+<pre>
+   tar xf ctdb-x.y.tar.gz
+   cd ctdb-x.y
+   ./configure
+   make
+   make install
+</pre>
+You need to install ctdb on all nodes of your cluster.
+
+
+<h2>Samba3</h2>
+
+To build a copy of Samba3 with clustering and ctdb support you should do this:
+<pre>
+    cd samba_3_0_ctdb/source
+    ./autogen.sh
+    ./configure --with-ctdb=/usr/src/ctdb --with-cluster-support --enable-pie=no
+    make proto
+    make
+</pre>
+
+Once compiled, you should install Samba on all cluster nodes.<br><br>
+
+The /usr/src/ctdb path should be replaced with the path to the ctdb sources that you downloaded above.
+
+<!--#include virtual="footer.html" -->
diff --git a/ctdb/web/clamd.html b/ctdb/web/clamd.html
new file mode 100644 (file)
index 0000000..4edb4cf
--- /dev/null
@@ -0,0 +1,78 @@
+<!--#set var="TITLE" value="CTDB and ClamAV Daemon" -->
+<!--#include virtual="header.html" -->
+
+<h1>Setting up ClamAV with CTDB</h1>
+
+<h2>Prereqs</h2>
+Configure CTDB as above and set it up to use public ipaddresses.<br>
+Verify that the CTDB cluster works.
+
+<h2>Configuration</h2>
+
+Configure clamd on each node on the cluster.<br><br>
+For details how to configure clamd check its documentation.
+
+<h2>/etc/sysconfig/ctdb</h2>
+
+Add the following lines to the /etc/sysconfig/ctdb configuration file.
+<pre>
+  CTDB_MANAGES_CLAMD=yes
+  CTDB_CLAMD_SOCKET="/path/to/clamd.sock"
+</pre>
+
+Disable clamd in chkconfig so that it does not start by default. Instead CTDB will start/stop clamd as required.
+<pre>
+  chkconfig clamd off
+</pre>
+
+<h2>Events script</h2>
+
+The CTDB distribution already comes with an events script for clamd in the file /etc/ctdb/events.d/31.clamd<br><br>
+There should not be any need to edit this file.
+What you need is to set it as executable, with command like this:
+<pre>
+  chmod +x /etc/ctdb/events.d/31.clamd
+</pre>
+To check whether ctdb is monitoring and handling clamd, check the output of this command:
+<pre>
+  ctdb scriptstatus
+</pre>
+
+<h2>Restart your cluster</h2>
+Next time your cluster restarts, CTDB will start managing the clamd service.<br><br>
+If the cluster is already in production you may not want to restart the entire cluster since this would disrupt services.<br>
+
+Instead you can just disable/enable the nodes one by one. Once a node becomes enabled again it will start the clamd service.<br><br>
+
+Follow the procedure below for each node, one node at a time :
+
+<h3>1 Disable the node</h3>
+Use the ctdb command to disable the node :
+<pre>
+  ctdb -n NODE disable
+</pre>
+
+<h3>2 Wait until the cluster has recovered</h3>
+
+Use the ctdb tool to monitor until the cluster has recovered, i.e. Recovery mode is NORMAL. This should happen within seconds of when you disabled the node.
+<pre>
+  ctdb status
+</pre>
+
+<h3>3 Enable the node again</h3>
+
+Re-enable the node, which will start the newly configured clamd service.
+<pre>
+  ctdb -n NODE enable
+</pre>
+
+<h2>See also</h2>
+
+The CLAMAV section in the ctdbd manpage.
+
+<pre>
+  man ctdbd
+</pre>
+
+<!--#include virtual="footer.html" -->
+
diff --git a/ctdb/web/configuring.html b/ctdb/web/configuring.html
new file mode 100644 (file)
index 0000000..b827290
--- /dev/null
@@ -0,0 +1,202 @@
+<!--#set var="TITLE" value="Configuring CTDB" -->
+<!--#include virtual="header.html" -->
+
+<H2 align="center">Configuring CTDB</H2>
+
+<h2>Clustering Model</h2>
+
+The setup instructions on this page are modelled on setting up a cluster of N 
+nodes that function in nearly all respects as a single multi-homed node. 
+So the cluster will export N IP interfaces, each of which is equivalent 
+(same shares) and which offers coherent CIFS file access across all 
+nodes.<p>
+
+The clustering model utilizes IP takeover techniques to ensure that
+the full set of public IP addresses assigned to services on the
+cluster will always be available to the clients even when some nodes
+have failed and become unavailable.
+
+<h2>CTDB Cluster Configuration</h2>
+
+These are the primary configuration files for CTDB.<p>
+
+When CTDB is installed, it will install template versions of these
+files which you need to edit to suit your system.  
+
+<h3>/etc/sysconfig/ctdb</h3>
+
+This file contains the startup parameters for ctdb.<p>
+
+When you installed ctdb, a template config file should have been
+installed in /etc/sysconfig/ctdb.<p>
+
+Edit this file, following the instructions in the template.<p>
+
+The most important options are:
+<ul>
+<li>CTDB_NODES
+<li>CTDB_RECOVERY_LOCK
+<li>CTDB_PUBLIC_ADDRESSES
+</ul>
+
+Please verify these parameters carefully.
+
+<h4>CTDB_RECOVERY_LOCK</h4>
+
+This parameter specifies the lock file that the CTDB daemons use to arbitrate 
+which node is acting as a recovery master.<br>
+
+This file MUST be held on shared storage so that all CTDB daemons in the cluster will access/lock the same file.<br><br>
+
+You <strong>must</strong> specify this parameter.<br>
+There is no default for this parameter.
+
+<h3>CTDB_NODES</h3>
+
+This file needs to be created and should contain a list of the private
+IP addresses that the CTDB daemons will use in your cluster. One IP
+address for each node in the cluster.<p>
+
+This should be a private non-routable subnet which is only used for
+internal cluster traffic. This file must be the same on all nodes in
+the cluster.<p>
+
+Make sure that these IP addresses are automatically started when the
+cluster node boots and that each node can ping each other node.<p>
+
+Example 4 node cluster:
+<pre>
+  CTDB_NODES=/etc/ctdb/nodes
+</pre>
+Content of /etc/ctdb/nodes:
+<pre>
+ 10.1.1.1
+ 10.1.1.2
+ 10.1.1.3
+ 10.1.1.4
+</pre>
+
+The default for this file is /etc/ctdb/nodes.
+
+
+<h3>CTDB_PUBLIC_ADDRESSES</h3>
+
+Each node in a CTDB cluster contains a list of public addresses which that 
+particular node can host.<p>
+
+While running, the CTDB cluster will assign each public address that exists in the entire cluster to one node that will host that public address.<p>
+
+These are the addresses that the SMBD daemons and other services will
+bind to and which clients will use to connect to the cluster.<p>
+
+<h3>Example 4 node cluster:</h3>
+<pre>
+  CTDB_PUBLIC_ADDRESSES=/etc/ctdb/public_addresses
+</pre>
+Content of /etc/ctdb/public_addresses:
+<pre>
+ 192.168.1.1/24 eth0
+ 192.168.1.2/24 eth0
+ 192.168.2.1/24 eth1
+ 192.168.2.2/24 eth1
+</pre>
+
+These are the IP addresses that you should configure in DNS for the
+name of the clustered samba server and are the addresses that CIFS
+clients will connect to.<p>
+
+Configure it as one DNS A record (==name) with multiple IP addresses
+and let round-robin DNS distribute the clients across the nodes of the
+cluster.<p>
+
+The CTDB cluster utilizes IP takeover techniques to ensure that as long as at least one node in the cluster is available, all the public IP addresses will always be available to clients.<p>
+
+This means that if one physical node fails, the public addresses that
+node was serving will be taken over by a different node in the cluster. This
+provides a guarantee that all ip addresses exposed to clients will
+always be reachable by clients as long as at least one node still remains available in the cluster with the capability to host that public address (i.e. the public address exists in that node's public_addresses file).
+
+Do not assign these addresses to any of the interfaces on the
+host. CTDB will add and remove these addresses automatically at
+runtime.<p>
+
+This parameter is used when CTDB operates in takeover ip mode.<p>
+
+The usual location for this file is /etc/ctdb/public_addresses.<p><p>
+
+<h3>Example 2:</h3>
+By using different public_addresses files on different nodes it is possible to 
+partition the cluster into subsets of nodes.
+
+<pre>
+Node 0 : /etc/ctdb/public_addresses
+10.1.1.1/24 eth0
+10.1.2.1/24 eth1
+</pre>
+
+<pre>
+Node 1 : /etc/ctdb/public_addresses
+10.1.2.1/24 eth1
+10.1.3.1/24 eth2
+</pre>
+
+<pre>
+Node 2 : /etc/ctdb/public_addresses
+10.1.3.2/24 eth2
+</pre>
+
+In this example we have three nodes but a total of 4 public addresses.<p>
+
+10.1.2.1 can be hosted by either node 0 or node 1 and will be available to clients as long as at least one of these nodes is available. Only if both nodes 0 and 1 fail will this public address become unavailable to clients.<p>
+
+All other public addresses can only be served by one single node respectively and will therefore only be available if the respective node is also available.
+
+
+<h2>Event scripts</h2>
+
+CTDB comes with a number of application specific event scripts that
+are used to do service specific tasks when the cluster has been
+reconfigured. These scripts are stored in /etc/ctdb/events.d/<p>
+
+You do not need to modify these scripts if you just want to use
+clustered Samba or NFS but they serve as examples in case you want to
+add clustering support for other application servers we do not yet
+provide event scripts for.<p>
+
+Please see the service scripts that are installed by ctdb in
+/etc/ctdb/events.d for examples of how to configure other services to
+be aware of the HA features of CTDB.<p>
+
+Also see /etc/ctdb/events.d/README for additional documentation on how to
+create and manage event scripts.
+
+<h2>TCP port to use for CTDB</h2>
+
+CTDB defaults to use TCP port 4379 for its traffic.<p>
+
+Configuring a different port to use for CTDB traffic is done by adding
+a ctdb entry to the /etc/services file.<p>
+
+Example: to change CTDB to use port 9999, add the following line to /etc/services
+<pre>
+ ctdb  9999/tcp
+</pre>
+
+Note: all nodes in the cluster MUST use the same port or else CTDB
+will not start correctly.
+
+<h2>Name resolution</h2>
+
+You need to setup some method for your Windows and NFS clients to find
+the nodes of the cluster, and automatically balance the load between
+the nodes.<p>
+
+We recommend that you use public ip addresses using
+CTDB_PUBLIC_INTERFACE/CTDB_PUBLIC_ADDRESSES and that you setup a
+round-robin DNS entry for your cluster, listing all the public IP
+addresses that CTDB will be managing as a single DNS A record.<p>
+
+You may also wish to setup a static WINS server entry listing all of
+your cluster nodes' IP addresses.
+
+<!--#include virtual="footer.html" -->
diff --git a/ctdb/web/ctdblogo.png b/ctdb/web/ctdblogo.png
new file mode 100644 (file)
index 0000000..68304a2
Binary files /dev/null and b/ctdb/web/ctdblogo.png differ
diff --git a/ctdb/web/documentation.html b/ctdb/web/documentation.html
new file mode 100644 (file)
index 0000000..86ec332
--- /dev/null
@@ -0,0 +1,43 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
+<HTML>
+<!--#set var="TITLE" value="CTDB Documentation" -->
+<!--#include virtual="header.html" -->
+
+<h1>CTDB Documentation</h1>
+
+The following documentation should get you started with CTDB. 
+
+<ul>
+<li><a href="prerequisites.html">Prerequisites</a>
+<li><a href="download.html">Downloading CTDB</a>
+<li><a href="building.html">Building CTDB</a>
+<li><a href="configuring.html">Configuring CTDB</a>
+<li><a href="testing.html">Testing CTDB</a>
+<li><a href="samba.html">Setting up Samba with CTDB</a>
+<li><a href="ftp.html">Setting up FTP with CTDB</a>
+<li><a href="nfs.html">Setting up NFS with CTDB</a>
+<li><a href="iscsi.html">Setting up iSCSI with CTDB</a>
+<li><a href="clamd.html">Setting up CLAMD with CTDB</a>
+<li><a href="http://wiki.samba.org/index.php/CTDB_Setup">CTDB Wiki</a>
+</ul>
+
+Man pages:
+<ul>
+<li><a href="http://ctdb.samba.org/manpages/ctdb.1.html">ctdb (1)</a>
+<li><a href="http://ctdb.samba.org/manpages/ctdbd.1.html">ctdbd (1)</a>
+<li><a href="http://ctdb.samba.org/manpages/ctdbd_wrapper.1.html">ctdbd_wrapper (1)</a>
+<li><a href="http://ctdb.samba.org/manpages/ctdbd.conf.5.html">ctdbd.conf (5)</a>
+<li><a href="http://ctdb.samba.org/manpages/ctdb.7.html">ctdb (7)</a>
+<li><a href="http://ctdb.samba.org/manpages/ctdb-tunables.7.html">ctdb-tunables (7)</a>
+<li><a href="http://ctdb.samba.org/manpages/onnode.1.html">onnode (1)</a>
+<li><a href="http://ctdb.samba.org/manpages/ltdbtool.1.html">ltdbtool (1)</a>
+<li><a href="http://ctdb.samba.org/manpages/ping_pong.1.html">ping_pong (1)</a>
+</ul>
+
+Articles:
+<ul>
+<li><a href="http://samba.org/~obnox/presentations/sambaXP-2009/">Michael
+    Adam's clustered NAS articles</a>
+</ul>
+
+<!--#include virtual="footer.html" -->
diff --git a/ctdb/web/download.html b/ctdb/web/download.html
new file mode 100644 (file)
index 0000000..dce75fe
--- /dev/null
@@ -0,0 +1,50 @@
+<!--#set var="TITLE" value="Downloading CTDB" -->
+<!--#include virtual="header.html" -->
+
+<H2 align="center">Getting the code</h2>
+
+You need two source trees, one is a copy of Samba3 and the other is the
+ctdb code itself.<p>
+
+Both source trees are stored in git repositories.<p>
+
+<h2>CTDB</h2>
+You can download ctdb source code via <a href="ftp://ftp.samba.org/pub/ctdb">ftp</a>
+and <a href="http://ftp.samba.org/pub/ctdb">http</a>. <br><br>
+
+You can also get the latest development version of ctdb using git:
+<pre>
+   git clone git://git.samba.org/ctdb.git ctdb
+</pre>
+
+To update this tree when improvements are made in the upstream code do this:
+<pre>
+    cd ctdb
+    git pull
+</pre>
+
+If you don't have git and can't easily install it, then you can
+instead use the following command to fetch ctdb or update it:
+<pre>
+    rsync -avz samba.org::ftp/unpacked/ctdb .
+</pre>
+
+
+<h2>Samba3 ctdb version</h2>
+<p>
+With Samba version 3.3 all cluster-relevant changes have been merged
+to the mainstream Samba code. Please refer to the <a
+href="http://www.samba.org/">Samba website</a> for the current release
+information.
+</p>
+
+<h2>Binary Packages</h2>
+
+Note that packages are so far only available for RHEL5. Other packages
+may come later. <p>
+
+See <a href="http://ftp.samba.org/pub/ctdb/packages/">packages</a> directory for package
+downloads.
+
+
+<!--#include virtual="footer.html" -->
diff --git a/ctdb/web/footer.html b/ctdb/web/footer.html
new file mode 100644 (file)
index 0000000..a9758e8
--- /dev/null
@@ -0,0 +1,39 @@
+</td>
+</tr>
+
+  <TR ALIGN="center">
+    <TD><BR><a name="search"></a><img src="/bar1.jpg" WIDTH="493" HEIGHT="26" BORDER="0" alt="=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=">
+
+<!-- SiteSearch Google -->
+<form method="get" action="http://www.google.com/custom">
+<table border="0">
+<tr><td nowrap="nowrap" valign="top" align="left" height="32">
+<a href="http://www.google.com/"><img src="http://www.google.com/logos/Logo_25wht.gif" border="0" alt="Google" /></a>
+</td><td nowrap="nowrap">
+<input type="hidden" name="domains" value="samba.org" />
+<input type="text" name="q" size="31" maxlength="255" value="CTDB " />
+<input type="submit" name="sa" value="Search" />
+</td></tr><tr><td>&nbsp;</td>
+<td nowrap="nowrap">
+<table><tr><td>
+<input type="radio" name="sitesearch" value="" />
+<font size="-1" color="#000000">Search WWW</font>
+</td><td>
+<input type="radio" name="sitesearch" value="samba.org" checked="checked" />
+<font size="-1" color="#000000">Search samba.org</font>
+</td></tr></table>
+<input type="hidden" name="client" value="pub-1444957896811922" />
+<input type="hidden" name="forid" value="1" />
+<input type="hidden" name="ie" value="ISO-8859-1" />
+<input type="hidden" name="oe" value="ISO-8859-1" />
+<input type="hidden" name="cof"
+       value="GALT:#008000;GL:1;DIV:#336699;VLC:663399;AH:center;BGC:FFFFFF;LBGC:FFFFFF;ALC:0000FF;LC:0000FF;T:000000;GFNT:0000FF;GIMP:0000FF;LH:60;LW:470;L:http://samba.org/samba/images/samba_banner.gif;S:http://samba.org/;FORID:1;"
+       />
+<input type="hidden" name="hl" value="en" />
+</td></tr></table>
+</form>
+<!-- SiteSearch Google -->
+
+  </TD>
+  </TR>
+</TABLE>
diff --git a/ctdb/web/ftp.html b/ctdb/web/ftp.html
new file mode 100644 (file)
index 0000000..82acd1d
--- /dev/null
@@ -0,0 +1,102 @@
+<!--#set var="TITLE" value="CTDB and ftp" -->
+<!--#include virtual="header.html" -->
+
+<h1>Setting up clustered FTP</h1>
+
+<h2>Prereqs</h2>
+Configure CTDB as above and set it up to use public ipaddresses.<br>
+Verify that the CTDB cluster works.
+
+<h2>Configuration</h2>
+
+Setting up a vsftpd cluster is really easy.<br>
+Configure vsftpd on each node on the cluster.<br><br>
+Set up vsftpd to export directories from the shared cluster filesystem.
+
+<h2>/etc/sysconfig/ctdb</h2>
+
+Add the following line to the /etc/sysconfig/ctdb configuration file.
+<pre>
+  CTDB_MANAGES_VSFTPD=yes
+</pre>
+
+Disable vsftpd in chkconfig so that it does not start by default. Instead CTDB will start/stop vsftpd as required.
+<pre>
+  chkconfig vsftpd off
+</pre>
+
+<h2>PAM configuration</h2>
+PAM must be configured to allow authentication of CIFS users so that the ftp 
+daemon can authenticate the users logging in.
+
+Make sure the following line is present in /etc/pam.d/system-auth
+<pre>
+auth        sufficient    pam_winbind.so use_first_pass
+
+</pre>
+If this line is missing you must enable winbind authentication by running 
+<pre>
+authconfig  --enablewinbindauth --update
+authconfig  --enablewinbind --update
+</pre>
+
+<h2>Default shell</h2>
+To log in to the ftp server, the user must have a shell configured in smb.conf.
+
+Add the following line to the globals section of /etc/samba/smb.conf 
+<pre>
+       template shell = /bin/bash
+</pre>
+
+<h2>Home directory</h2>
+FTP users must have a home directory configured so they can log in.
+Configure samba to provide home directories for domain users. These home 
+directories should be stored on shared storage so they are available from
+all nodes in the cluster.<br>
+
+
+A simple way to create home directories is to add
+<pre>
+       template homedir = /&lt;shared storage&gt;/homedir/%D/%U
+</pre>
+to /etc/samba/smb.conf .<br>
+
+The homedirectory must exist or the user will not be able to log in with FTP.
+
+
+<h2>Events script</h2>
+
+The CTDB distribution already comes with an events script for vsftp in the file /etc/ctdb/events.d/40.vsftpd<br><br>
+There should not be any need to edit this file.
+
+
+<h2>Restart your cluster</h2>
+Next time your cluster restarts, CTDB will start managing the vsftp service.<br><br>
+If the cluster is already in production you may not want to restart the entire cluster since this would disrupt services.<br>
+
+Instead you can just disable/enable the nodes one by one. Once a node becomes enabled again it will start the vsftp service.<br><br>
+
+Follow the procedure below for each node, one node at a time :
+
+<h3>1 Disable the node</h3>
+Use the ctdb command to disable the node :
+<pre>
+  ctdb -n NODE disable
+</pre>
+
+<h3>2 Wait until the cluster has recovered</h3>
+
+Use the ctdb tool to monitor until the cluster has recovered, i.e. Recovery mode is NORMAL. This should happen within seconds of when you disabled the node.
+<pre>
+  ctdb status
+</pre>
+
+<h3>3 Enable the node again</h3>
+
+Re-enable the node again which will start the newly configured vsftp service.
+<pre>
+  ctdb -n NODE enable
+</pre>
+
+<!--#include virtual="footer.html" -->
+
diff --git a/ctdb/web/header.html b/ctdb/web/header.html
new file mode 100644 (file)
index 0000000..a356b08
--- /dev/null
@@ -0,0 +1,44 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">
+<HTML>
+<HEAD>
+<TITLE><!--#echo var="TITLE" --></TITLE>
+<meta http-equiv="Content-Type" content="text/html;charset=utf-8" >
+</HEAD>
+
+<BODY BGCOLOR="#ffffff" TEXT="#000000" VLINK="#292555" LINK="#292555"
+      ALINK="#cc0033">
+<TABLE BORDER=0 WIDTH="75%" ALIGN="CENTER">
+  <tr VALIGN="middle">
+    <td ALIGN="left">
+       <ul>
+      <li><small><a href="/">home</a></small>
+      <li><small><a href="/documentation.html">documentation</a></small>
+      <li><small><a href="/configuring.html">configuring</a></small>
+      <li><small><a href="/building.html">building</a></small>
+       </ul>
+    </td>
+    <td align="center">
+      <a href="."><img src="/ctdblogo.png" border="0" alt="CTDB"></a>
+    </td>
+    <td align="left">
+      <ul>
+      <li><small><a href="/download.html">download</a></small>
+      <li><small><a href="/testing.html">testing</a></small>
+      <li><small><a href="http://wiki.samba.org/index.php/CTDB_Setup">wiki</a></small>
+      <li><small><a href="http://bugzilla.samba.org/">bug-tracking</a></small>
+      </ul>
+    </td>
+  </tr>
+
+  <TR ALIGN="center">
+    <TD COLSPAN="3">
+    <img src="/bar1.jpg" WIDTH="493" HEIGHT="26"
+    BORDER="0"
+    alt="=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=">
+    </TD>
+  </TR>
+</TABLE>
+
+<TABLE BORDER=0 WIDTH="60%" ALIGN="CENTER">
+  <tr VALIGN="middle">
+    <td ALIGN="left">
diff --git a/ctdb/web/index.html b/ctdb/web/index.html
new file mode 100644 (file)
index 0000000..91f87e8
--- /dev/null
@@ -0,0 +1,141 @@
+<!--#set var="TITLE" value="CTDB" -->
+<!--#include virtual="header.html" -->
+
+<H2 align="center">Welcome to the CTDB web pages</H2>
+
+CTDB is a cluster implementation of the TDB database used by Samba and
+other projects to store temporary data. If an application is already
+using TDB for temporary data it is very easy to convert that
+application to be cluster aware and use CTDB instead.
+
+<p>CTDB provides the same types of functions as TDB but in a clustered
+  fashion, providing a TDB-style database that spans multiple physical
+  hosts in a cluster.
+
+<p>Features include:
+<ul>
+<li>CTDB provides a TDB that has consistent data and consistent locking across
+all nodes in a cluster.
+<li>CTDB is very fast.
+<li>In case of node failures, CTDB will automatically recover and
+  repair all TDB databases that it manages.
+<li>CTDB is the core component that provides <strong>pCIFS</strong>
+("parallel CIFS") with Samba3/4.
+<li>CTDB provides HA features such as node monitoring, node failover,
+  and IP takeover.
+<li>CTDB provides a reliable messaging transport to allow applications
+ linked with CTDB to communicate to other instances of the application
+ running on different nodes in the cluster.
+<li>CTDB has pluggable transport backends. Currently implemented backends are TCP
+ and Infiniband.
+<li>CTDB supports a system of application specific management scripts,
+  allowing applications that depend on network or filesystem resources
+  to be managed in a highly available manner on a cluster.
+</ul>
+
+<h2>Requirements</h2>
+
+CTDB relies on a clustered filesystem being available and shared on
+all nodes that participate in the CTDB cluster. This filesystem must
+be mounted and available on all nodes in the CTDB cluster.
+
+<p>On top of this cluster filesystem, CTDB then provides clustered HA
+features so that data from the clustered filesystem can be exported
+through multiple nodes in the CTDB cluster using various
+services. Currently included with CTDB are the necessary hooks for Samba, NFS
+  and ftp exports. Support for new service types can easily be added.
+
+<h2>TDB</h2>
+
+TDB is a very fast simple database that was originally developed for
+use in Samba. Today several other projects use TDB to store their data.
+
+<p>See the <a
+href="http://samba.org/ftp/unpacked/tdb/docs/README">TDB
+README file</a> for a description of how TDB is used.
+
+<h2>Documentation</h2>
+
+<a href="./documentation.html">CTDB documentation</a><br><br>
+
+Additional documentation on how to install and configure CTDB is available in the
+<a href="http://wiki.samba.org/index.php/CTDB_Setup">CTDB
+  Wiki</a>. Please read all of the documentation carefully.
+
+<h2>High Availability Features</h2>
+
+The CTDB nodes in a cluster designate one node as a recovery master
+through an election process. If the recovery master node fails a
+new election is initiated so that the cluster will always guarantee
+there will be a recovery master. The recovery master will
+continuously monitor the cluster to verify that all nodes contain a
+consistent configuration and view of the cluster and will initiate a
+recovery process when required.
+
+<p>During the recovery phase, the recovery master will automatically
+rebuild/recover all clustered TDB databases to ensure that the
+databases are consistent. Recovery typically takes between 1 and 3
+seconds. During the recovery period the databases are 'frozen', and
+all database IO operations by ctdb clients are suspended.
+
+<h3>Is CTDB a HA solution?</h3>
+
+Yes and no.<p>
+
+CTDB alone is not a HA solution, but when you combine CTDB with a clustered 
+filesystem it becomes one.<p>
+
+CTDB is primarily developed around the concept of having a shared
+cluster filesystem across all the nodes in the cluster to provide the
+features required for building a NAS cluster.<p>
+
+Thus CTDB relies on an external component (the cluster filesystem) to
+provide the mechanisms for avoiding split-brain and other core
+clustering tasks.<p>
+
+However, if you do have a clustered filesystem for all the nodes, in
+that scenario CTDB will provide a very easy to install and manage
+solution for your clustering HA needs.
+
+<h3>IP Takeover</h3>
+
+When a node in a cluster fails, CTDB will arrange that a different
+node takes over the IP address of the failed node to ensure that the
+IP addresses for the services provided are always available.
+
+<p>To speed up the process of IP takeover and ensure that clients attached to
+a failed node recover as fast as possible, CTDB will automatically
+generate gratuitous ARP packets to inform all nodes of the changed MAC
+address for that IP.  CTDB will also send "tickle ACK" packets to all
+attached clients to trigger the clients to immediately recognize that
+the TCP connection needs to be re-established and to shortcut any TCP
+retransmission timeouts that may be active in the clients.
+
+<h2>Discussion and bug reports</h2>
+
+For discussions please use
+the <a href="https://lists.samba.org/mailman/listinfo/samba-technical">samba-technical</a>
+mailing list. To submit a bug report, please use
+the <a href="http://bugzilla.samba.org/">Samba bugzilla</a> bug
+tracking system.
+
+<p>We would be very interested in hearing from and working with other
+projects that want to make their services cluster aware using CTDB.
+
+<p>CTDB discussions also happen on the #ctdb IRC channel on freenode.net
+
+
+<hr>
+<h2>Developers</h2>
+<ul>
+<li><a href="http://samba.org/~tridge/">Andrew Tridgell</a></li>
+<li><a href="http://samba.org/~sahlberg/">Ronnie Sahlberg</a></li>
+<li><a href="http://samba.org/~obnox/">Michael Adam</a></li>
+<li>Peter Somogyi</li>
+<li><a href="http://sernet.de/Samba/">Volker Lendecke</a></li>
+<li>Stefan Metzmacher</li>
+<li><a href="http://meltin.net/people/martin/">Martin Schwenke</a></li>
+<li>Amitay Isaacs</li>
+</ul>
+
+<!--#include virtual="footer.html" -->
diff --git a/ctdb/web/iscsi.html b/ctdb/web/iscsi.html
new file mode 100644 (file)
index 0000000..1385e18
--- /dev/null
@@ -0,0 +1,113 @@
+<!--#set var="TITLE" value="CTDB and iSCSI" -->
+<!--#include virtual="header.html" -->
+
+<h1>Setting up HA iSCSI with CTDB</h1>
+
+<p>
+You can use CTDB to create a HA iSCSI Target.
+</p>
+
+<p>
+Since the iSCSI Target is not
+clusterized nor integrated with CTDB in the same sense Samba is, this
+implementation will only create a HA solution for iSCSI where each public address is assigned its own iscsi target name and the LUNs that are created are only accessible through one specific target (i.e. one public address at a time).
+
+</p>
+
+<p>
+! This feature ONLY works when public addresses are used. It is not supported, nor does it work, if you use the LVS feature to present the entire cluster as one single ip address. !
+
+</p>
+
+<h2>Prereqs</h2>
+Configure CTDB as above and set it up to use public ipaddresses.<br>
+Verify that the CTDB cluster works.
+
+<h2>Install the iSCSI target software on all nodes</h2>
+On RHEL5 this package is called "scsi-target-utils" and it needs to be installed
+on all nodes in the cluster. The easiest way to install this package is by using :
+
+<pre>
+onnode all yum install scsi-target-utils -y
+</pre>
+
+Make sure that the service is not started automatically when booting, we want CTDB to start/stop this service :
+<pre>
+onnode all chkconfig tgtd off
+</pre>
+
+<h2>/etc/sysconfig/iscsi</h2>
+
+Create this file and add the following line to it :
+
+<pre>
+   CTDB_START_ISCSI_SCRIPTS=/gpfs/iscsi/
+</pre>
+
+<p>
+CTDB_START_ISCSI_SCRIPTS=&lt;directory on shared storage&gt;
+This is a directory on shared storage where the scripts to start and configure the iscsi service are held. There is one script for each public address named &lt;public address&gt;.sh .
+</p>
+
+
+<h2>/etc/sysconfig/ctdb</h2>
+
+Add the following line to /etc/sysconfig/ctdb :
+
+<pre>
+   CTDB_MANAGES_ISCSI=yes
+</pre>
+
+<p>
+CTDB_MANAGES_ISCSI=yes just tells CTDB event script for iSCSI that CTDB should start and stop the iSCSI target service as required.
+</p>
+
+
+<h2>Example: create a LUN that will be hosted on public ip address 10.1.1.1</h2>
+<p>
+Before you can export a LUN you must create it as a file in the shared filesystem. When doing so, make sure you create it as a real file and not a sparse file!<br />
+While it is much quicker to create a sparse file if you want a file with filesize 100Gb, SCSI has no concept of "disk full" so if you run out of backing space for the sparse file, the scsi initiators will be "surprised" and "unhappy".
+</p>
+<pre>
+dd if=/dev/zero of=/gpfs/iscsi/10.1.1.1.lun.1 bs=1024 count=102400
+</pre>
+<p>
+to create a 100MByte file to export as an iSCSI LUN.
+</p>
+
+<h2>Example: 10.1.1.1.sh</h2>
+<p>
+This example shellscript is used to configure the iscsi target that is hosted on the public address 10.1.1.1
+</p>
+<pre>
+#!/bin/sh
+# script to set up the iscsi target and luns hosted by public address
+# 10.1.1.1
+
+
+#create a target
+tgtadm --lld iscsi --op new --mode target --tid 1 -T iqn.2007-11.com.ctdb:iscsi.target.10.1.1.1
+
+#attach a lun
+tgtadm --lld iscsi --op new --mode logicalunit --tid 1 --lun 1 -b /gpfs/iscsi/10.1.1.1.lun.1
+
+# no security, allow everyone to access this lun
+tgtadm --lld iscsi --op bind --mode target --tid 1 -I ALL
+</pre>
+
+
+<p>
+iqn.2007-11.com.ctdb:iscsi.target.10.1.1.1 in the example above is the iscsi name that is assigned to the target. Don't use this name, pick your own name!
+</p>
+
+<p>
+See the documentation for the tgtadm command for more information on how you want to set up your environment.
+</p>
+
+<h2>Perform a ctdb recovery to start the iscsi service</h2>
+<pre>
+ctdb recover
+</pre>
+
+<!--#include virtual="footer.html" -->
+
diff --git a/ctdb/web/nfs.html b/ctdb/web/nfs.html
new file mode 100644 (file)
index 0000000..a4a6fb5
--- /dev/null
@@ -0,0 +1,96 @@
+<!--#set var="TITLE" value="CTDB and NFS" -->
+<!--#include virtual="header.html" -->
+
+<h1>Setting up clustered NFS</h1>
+
+NFS v2/v3 has been successfully tested with exporting the same
+data/network share from multiple nodes in a CTDB cluster with correct
+file locking behaviour and lock recovery.<br><br>
+
+Also see <a href="http://wiki.samba.org/index.php/CTDB_Setup#Setting_up_CTDB_for_clustered_NFS">Configuring
+NFS for CTDB clustering</a> at samba.org for additional information.
+
+<h2>Prereqs</h2>
+Configure CTDB as above and set it up to use public ipaddresses.<br>
+Verify that the CTDB cluster works.
+
+<h2>/etc/exports</h2>
+
+Export the same directory from all nodes.<br>
+Make sure to specify the fsid export option so that all nodes will present the same fsid to clients.<br>
+
+Clients can get "upset" if the fsid on a mount suddenly changes.<br>
+Example /etc/exports :
+<pre>
+  /gpfs0/data *(rw,fsid=1235)
+</pre>
+
+<h2>/etc/sysconfig/nfs</h2>
+
+This file must be edited to make statd keep its state directory on
+shared storage instead of in a local directory.<br><br>
+
+We must also make statd use a fixed port to listen on that is the same for 
+all nodes in the cluster.<br>
+
+If we don't specify a fixed port, the statd port will change during failover 
+which causes problems on some clients.<br>
+(some clients are very slow to realize when the port has changed)<br><br>
+
+This file should look something like :
+<pre>
+  NFS_HOSTNAME=ctdb
+  STATD_PORT=595
+  STATD_OUTGOING_PORT=596
+  MOUNTD_PORT=597
+  RQUOTAD_PORT=598
+  LOCKD_TCPPORT=599
+  LOCKD_UDPPORT=599
+  STATD_HOSTNAME="$NFS_HOSTNAME -H /etc/ctdb/statd-callout -p 97"
+  RPCNFSDARGS="-N 4"
+
+</pre>
+
+You need to make sure that the lock manager runs on the same port on all nodes in the cluster since some clients will have "issues" and take very long to recover if the port suddenly changes.<br>
+599 above is only an example. You can run the lock manager on any available port as long as you use the same port on all nodes.<br><br>
+
+NFS_HOSTNAME is the DNS name for the ctdb cluster, which is used when clients mount NFS shares. This name must be in DNS and resolve back into the public ip addresses of the cluster.<br>
+Always use the same name here as you use for the samba hostname.
+
+RPCNFSDARGS is used to disable support for NFSv4 which is not yet supported by CTDB.
+
+<h2>/etc/sysconfig/ctdb</h2>
+Add the following line to /etc/sysconfig/ctdb :
+
+<pre>
+  CTDB_MANAGES_NFS=yes
+</pre>
+The CTDB_MANAGES_NFS line tells the events scripts that CTDB is to manage startup and shutdown of the NFS and NFSLOCK services.<br>
+
+With this set to yes, CTDB will start/stop/restart these services as required.<br><br>
+
+
+<h2>chkconfig</h2>
+
+Since CTDB will manage and start/stop/restart the nfs and the nfslock services, you must disable them using chkconfig.
+<pre>
+  chkconfig nfs off
+  chkconfig nfslock off
+</pre>
+
+
+<h2>Event scripts</h2>
+
+CTDB clustering for NFS relies on two event scripts /etc/ctdb/events.d/60.nfs and /etc/ctdb/events.d/61.nfstickle.<br>
+
+These two scripts are provided by the RPM package and there should not be any need to change them.
+
+<h2><strong>IMPORTANT</strong></h2>
+
+Never ever mount the same nfs share on a client from two different nodes in the cluster at the same time!<br><br>
+
+The client side caching in NFS is very fragile and assumes/relies on the fact that an object can only be accessed through one single path at a time.
+
+
+<!--#include virtual="footer.html" -->
+
diff --git a/ctdb/web/prerequisites.html b/ctdb/web/prerequisites.html
new file mode 100644 (file)
index 0000000..5a56300
--- /dev/null
@@ -0,0 +1,30 @@
+<!--#set var="TITLE" value="CTDB prerequisites" -->
+<!--#include virtual="header.html" -->
+
+<h1>Prerequisites</h1>
+
+Before you can start using CTDB you must first install and configure a
+bunch of linux boxes.<br><br>
+
+After that you need to install and configure a cluster filesystem and
+mount that cluster filesystem on all the linux boxes that will form
+your cluster.<br><br>
+
+Also, ensure that the cluster filesystem supports correct
+posix locking semantics. A simple way to test this is to run the <a
+href="https://wiki.samba.org/index.php/Ping_pong">ping_pong</a> utility
+bundled with CTDB.<br><br>
+
+<h1>Cluster filesystems</h1>
+We have primarily used the GPFS filesystem for our testing but any
+cluster filesystem should work as long as it provides correct file
+locking.<br><br>
+
+While we primarily test with GPFS, CTDB should work with almost any
+other cluster filesystem as well.<br><br>
+
+Please let us know your experiences in using other cluster filesystems.
+
+
+<!--#include virtual="footer.html" -->
+
diff --git a/ctdb/web/samba.html b/ctdb/web/samba.html
new file mode 100644 (file)
index 0000000..fb17d0f
--- /dev/null
@@ -0,0 +1,97 @@
+<!--#set var="TITLE" value="CTDB and Samba" -->
+<!--#include virtual="header.html" -->
+
+<h1>Setting up clustered samba</h1>
+
+It is assumed you have already installed the ctdb version of samba and also installed, configured and tested CTDB.
+
+<h2>Create a user account</h2>
+
+First you need to initialise the Samba password database so that you have some user that can authenticate to the samba service.<br>
+Do this by running:
+<pre>
+  smbpasswd -a root
+</pre>
+
+Samba with clustering must use the tdbsam or ldap SAM passdb backends (it must not use the default smbpasswd backend), or must be configured to be a member of a domain.<br>
+The rest of the configuration of Samba is exactly as it is done on a normal system.<br><br>
+See the docs on http://samba.org/ for details.
+
+<h2>Critical smb.conf parameters</h2>
+
+A clustered Samba install must set some specific configuration parameters
+<pre>
+  clustering = yes
+  idmap backend = tdb2
+</pre>
+
+<h2>Using smbcontrol</h2>
+
+You can check for connectivity to the smbd daemons on each node using smbcontrol
+<pre>
+  smbcontrol smbd ping
+</pre>
+
+<h2>Using Samba4 smbtorture</h2>
+
+The Samba4 version of smbtorture has several tests that can be used to
+benchmark a CIFS cluster. You can download Samba 4 from the Samba website.
+
+The particular tests that are helpful for cluster benchmarking are the RAW-BENCH-OPEN, RAW-BENCH-LOCK and BENCH-NBENCH tests.<br>
+These tests take a unclist that allows you to spread the workload out over more than one node. For example:
+
+<pre>
+  smbtorture //localhost/data -Uuser%password  RAW-BENCH-LOCK --unclist=unclist.txt --num-progs=32 -t60
+</pre>
+
+The file unclist.txt should contain a list of server names in your cluster prefixed by //. For example
+<pre>
+ //192.168.1.1
+ //192.168.1.2
+ //192.168.2.1
+ //192.168.2.2
+</pre>
+
+For NBENCH testing you need a client.txt file.<br>
+A suitable file can be found in the dbench distribution at http://samba.org/ftp/tridge/dbench/
+
+
+<h3>CTDB_MANAGES_SAMBA</h3>
+This is a parameter in /etc/sysconfig/ctdb<br><br>
+When this parameter is set to "yes" CTDB will start/stop/restart the local samba daemon as the cluster configuration changes.<br><br>
+When this parameter is set you should also make sure that samba is NOT started by default by the linux system when it boots, e.g.
+<pre>
+  chkconfig smb off
+</pre>
+on a Redhat system and
+<pre>
+  chkconfig smb off
+  chkconfig nmb off
+</pre>
+on a SuSE system.
+
+Example:
+<pre>
+  CTDB_MANAGES_SAMBA="yes"
+</pre>
+
+It is strongly recommended that you set this parameter to "yes" if you intend to use clustered samba.
+
+<h3>CTDB_MANAGES_WINBIND</h3>
+This is a parameter in /etc/sysconfig/ctdb<br><br>
+When this parameter is set to "yes" CTDB will start/stop/restart the local winbind daemon as the cluster configuration changes.<br><br>
+When this parameter is set you should also make sure that winbind is NOT started by default by the linux system when it boots:
+<pre>
+  chkconfig winbind off
+</pre>
+
+Example:
+<pre>
+  CTDB_MANAGES_WINBIND="yes"
+</pre>
+
+It is strongly recommended that you set this parameter to "yes" if you
+intend to use clustered samba in DOMAIN or ADS security mode.
+
+<!--#include virtual="footer.html" -->
+
diff --git a/ctdb/web/testing.html b/ctdb/web/testing.html
new file mode 100644 (file)
index 0000000..d0d39a3
--- /dev/null
@@ -0,0 +1,112 @@
+<!--#set var="TITLE" value="CTDB Testing" -->
+<!--#include virtual="header.html" -->
+
+<H2 align="center">Starting and testing CTDB</h2>
+
+The CTDB log is in /var/log/log.ctdb so look in this file if something
+did not start correctly.<p>
+
+You can ensure that ctdb is running on all nodes using
+<pre>
+  onnode all service ctdb start
+</pre>
+Verify that the CTDB daemon started properly. There should normally be at least 2 processes started for CTDB, one for the main daemon and one for the recovery daemon.
+<pre>
+  onnode all pidof ctdbd
+</pre>
+
+Once all CTDB nodes have started, verify that they are correctly
+talking to each other.<p>
+
+There should be one TCP connection from the private ip address on each
+node to TCP port 4379 on each of the other nodes in the cluster.
+<pre>
+  onnode all netstat -tn | grep 4379
+</pre>
+
+
+<h2>Automatically restarting CTDB</h2>
+
+If you wish to cope with software faults in ctdb, or want ctdb to
+automatically restart when an administrator kills it, then you may
+wish to add a cron entry for root like this:
+
+<pre>
+ * * * * * /etc/init.d/ctdb cron > /dev/null 2>&1
+</pre>
+
+
+<h2>Testing CTDB</h2>
+
+Once your cluster is up and running, you may wish to know how to test that it is functioning correctly. The following tests may help with that
+
+<h3>The ctdb tool</h3>
+
+The ctdb package comes with a utility called ctdb that can be used to
+view the behaviour of the ctdb cluster.<p>
+
+If you run it with no options it will provide some terse usage information. The most commonly used commands are:
+<pre>
+ ctdb status
+ ctdb ip
+ ctdb ping
+</pre>
+
+<h3>ctdb status</h3>
+
+The status command provides basic information about the cluster and the status of the nodes. When you run it you will get some output like:
+
+<pre>
+<strong>Number of nodes:4
+vnn:0 10.1.1.1       OK (THIS NODE)
+vnn:1 10.1.1.2       OK
+vnn:2 10.1.1.3       OK
+vnn:3 10.1.1.4       OK</strong>
+Generation:1362079228
+Size:4
+hash:0 lmaster:0
+hash:1 lmaster:1
+hash:2 lmaster:2
+hash:3 lmaster:3
+<strong>Recovery mode:NORMAL (0)</strong>
+Recovery master:0
+</pre>
+
+The important parts are in bold. This tells us that all 4 nodes are in
+a healthy state.<p>
+
+It also tells us that recovery mode is normal, which means that the
+cluster has finished a recovery and is running in a normal fully
+operational state.<p>
+
+Recovery state will briefly change to "RECOVERY" when there has been a
+node failure or something is wrong with the cluster.<p>
+
+If the cluster remains in RECOVERY state for very long (many seconds)
+there might be something wrong with the configuration. See
+/var/log/log.ctdb.
+
+<h3>ctdb ip</h3>
+
+This command prints the current status of the public ip addresses and which physical node is currently serving that ip.
+
+<pre>
+Number of nodes:4
+192.168.1.1         0
+192.168.1.2         1
+192.168.2.1         2
+192.168.2.2         3
+</pre>
+
+<h3>ctdb ping</h3>
+This command tries to "ping" each of the CTDB daemons in the cluster.
+<pre>
+  ctdb ping -n all
+
+  response from 0 time=0.000050 sec  (13 clients)
+  response from 1 time=0.000154 sec  (27 clients)
+  response from 2 time=0.000114 sec  (17 clients)
+  response from 3 time=0.000115 sec  (59 clients)
+</pre>
+
+<!--#include virtual="footer.html" -->