s3/cleanupd: use smbd_cleanupd.tdb
authorRalph Boehme <slow@samba.org>
Thu, 21 Jul 2016 17:08:47 +0000 (19:08 +0200)
committerStefan Metzmacher <metze@samba.org>
Thu, 28 Jul 2016 03:00:18 +0000 (05:00 +0200)
Instead of using messaging to send individual cleanup events, it works
this way:

o parent smbd stores cleanup events (ie exitted children) in
  smbd_cleanup.tdb

o it sends cleanupd an empty MSG_SMB_NOTIFY_CLEANUP message

o cleanupd does a traverse on the smbd_cleanupd.tdb and collects all
  childs in a list

o after the traverse cleanupd walks the list and does the real work

It would have been possible to optimize for the common case by passing
info about exitted childs with the message (as was done before this
patch), adding a new message type for triggering a db traverse that
would be used when cleanupd had to be restarted and cleanup events may
have been accumulated in cleanup.tdb.

But this could be subject to subtle race conditions and could loose
events if cleanupd dies randomly.

Bug: https://bugzilla.samba.org/show_bug.cgi?id=12022

Signed-off-by: Ralph Boehme <slow@samba.org>
Reviewed-by: Volker Lendecke <vl@samba.org>
source3/smbd/server.c
source3/smbd/smbd_cleanupd.c

index a2ac77e638a02651f99c6d327fb7a1ff75d6181c..2e005e6dbba8500a8873d39e6916d14bd24e26ed 100644 (file)
@@ -51,6 +51,7 @@
 #include "smbd/notifyd/notifyd.h"
 #include "smbd/smbd_cleanupd.h"
 #include "lib/util/sys_rw.h"
+#include "cleanupdb.h"
 
 #ifdef CLUSTER_SUPPORT
 #include "ctdb_protocol.h"
@@ -654,6 +655,9 @@ static void cleanup_timeout_fn(struct tevent_context *event_ctx,
 static void cleanupd_started(struct tevent_req *req)
 {
        bool ok;
+       NTSTATUS status;
+       struct smbd_parent_context *parent = tevent_req_callback_data(
+               req, struct smbd_parent_context);
 
        ok = cleanupd_init_recv(req);
        TALLOC_FREE(req);
@@ -661,6 +665,15 @@ static void cleanupd_started(struct tevent_req *req)
                DBG_ERR("Failed to restart cleanupd, giving up\n");
                return;
        }
+
+       status = messaging_send(parent->msg_ctx,
+                               parent->cleanupd,
+                               MSG_SMB_NOTIFY_CLEANUP,
+                               &data_blob_null);
+       if (!NT_STATUS_IS_OK(status)) {
+               DBG_ERR("messaging_send returned %s\n",
+                       nt_errstr(status));
+       }
 }
 
 static void remove_child_pid(struct smbd_parent_context *parent,
@@ -668,8 +681,8 @@ static void remove_child_pid(struct smbd_parent_context *parent,
                             bool unclean_shutdown)
 {
        struct smbd_child_pid *child;
-       struct iovec iov[2];
        NTSTATUS status;
+       bool ok;
 
        for (child = parent->children; child != NULL; child = child->next) {
                if (child->pid == pid) {
@@ -706,8 +719,6 @@ static void remove_child_pid(struct smbd_parent_context *parent,
        }
 
        if (pid == procid_to_pid(&parent->notifyd)) {
-               bool ok;
-
                DBG_WARNING("Restarting notifyd\n");
                ok = smbd_notifyd_init(parent->msg_ctx, false,
                                       &parent->notifyd);
@@ -717,15 +728,22 @@ static void remove_child_pid(struct smbd_parent_context *parent,
                return;
        }
 
-       iov[0] = (struct iovec) { .iov_base = (uint8_t *)&pid,
-                                 .iov_len = sizeof(pid) };
-       iov[1] = (struct iovec) { .iov_base = (uint8_t *)&unclean_shutdown,
-                                 .iov_len = sizeof(bool) };
+       ok = cleanupdb_store_child(pid, unclean_shutdown);
+       if (!ok) {
+               DBG_ERR("cleanupdb_store_child failed\n");
+               return;
+       }
 
-       status = messaging_send_iov(parent->msg_ctx, parent->cleanupd,
-                                   MSG_SMB_NOTIFY_CLEANUP,
-                                   iov, ARRAY_SIZE(iov), NULL, 0);
-       DEBUG(10, ("messaging_send_iov returned %s\n", nt_errstr(status)));
+       if (!server_id_is_disconnected(&parent->cleanupd)) {
+               status = messaging_send(parent->msg_ctx,
+                                       parent->cleanupd,
+                                       MSG_SMB_NOTIFY_CLEANUP,
+                                       &data_blob_null);
+               if (!NT_STATUS_IS_OK(status)) {
+                       DBG_ERR("messaging_send returned %s\n",
+                               nt_errstr(status));
+               }
+       }
 
        if (unclean_shutdown) {
                /* a child terminated uncleanly so tickle all
index c940ef1f5fa1f341f12e4a917f8d267cc148d55e..0deb8b0c68f658b8afc69c7f0d6f004ac9c9ded5 100644 (file)
@@ -25,6 +25,7 @@
 #include "smbprofile.h"
 #include "serverid.h"
 #include "locking/proto.h"
+#include "cleanupdb.h"
 
 struct smbd_cleanupd_state {
        pid_t parent_pid;
@@ -102,6 +103,39 @@ static void smbd_cleanupd_unlock(struct messaging_context *msg,
        brl_revalidate(msg, private_data, msg_type, server_id, data);
 }
 
+struct cleanup_child {
+       struct cleanup_child *prev, *next;
+       pid_t pid;
+       bool unclean;
+};
+
+struct cleanupdb_traverse_state {
+       TALLOC_CTX *mem_ctx;
+       bool ok;
+       struct cleanup_child *childs;
+};
+
+static int cleanupdb_traverse_fn(const pid_t pid,
+                                const bool unclean,
+                                void *private_data)
+{
+       struct cleanupdb_traverse_state *cleanup_state =
+               (struct cleanupdb_traverse_state *)private_data;
+       struct cleanup_child *child = NULL;
+
+       child = talloc_zero(cleanup_state->mem_ctx, struct cleanup_child);
+       if (child == NULL) {
+               DBG_ERR("talloc_zero failed\n");
+               return -1;
+       }
+
+       child->pid = pid;
+       child->unclean = unclean;
+       DLIST_ADD(cleanup_state->childs, child);
+
+       return 0;
+}
+
 static void smbd_cleanupd_process_exited(struct messaging_context *msg,
                                         void *private_data, uint32_t msg_type,
                                         struct server_id server_id,
@@ -111,43 +145,73 @@ static void smbd_cleanupd_process_exited(struct messaging_context *msg,
                private_data, struct tevent_req);
        struct smbd_cleanupd_state *state = tevent_req_data(
                req, struct smbd_cleanupd_state);
-       pid_t pid;
-       struct server_id child_id;
-       bool unclean_shutdown;
        int ret;
+       struct cleanupdb_traverse_state cleanup_state;
+       TALLOC_CTX *frame = talloc_stackframe();
+       struct cleanup_child *child = NULL;
 
-       if (data->length != (sizeof(pid) + sizeof(unclean_shutdown))) {
-               DBG_WARNING("Got invalid length: %zu\n", data->length);
-               return;
-       }
-
-       memcpy(&pid, data->data, sizeof(pid));
-       memcpy(&unclean_shutdown, data->data + sizeof(pid),
-              sizeof(unclean_shutdown));
-
-       DBG_DEBUG("%d exited %sclean\n", (int)pid,
-                 unclean_shutdown ? "un" : "");
+       cleanup_state = (struct cleanupdb_traverse_state) {
+               .mem_ctx = frame
+       };
 
        /*
-        * Get child_id before messaging_cleanup which wipes the
-        * unique_id. Not that it really matters here for functionality (the
-        * child should have properly cleaned up :-)) though, but it looks
-        * nicer.
+        * This merely collect childs in a list, whatever we're
+        * supposed to cleanup for every child, it has to take place
+        * *after* the db traverse in a list loop. This is to minimize
+        * locking interaction between the traverse and writers (ie
+        * the parent smbd).
         */
-       child_id = pid_to_procid(pid);
-
-       smbprofile_cleanup(pid, state->parent_pid);
-
-       ret = messaging_cleanup(msg, pid);
+       ret = cleanupdb_traverse_read(cleanupdb_traverse_fn, &cleanup_state);
+       if (ret < 0) {
+               DBG_ERR("cleanupdb_traverse_read failed\n");
+               TALLOC_FREE(frame);
+               return;
+       }
 
-       if ((ret != 0) && (ret != ENOENT)) {
-               DBG_DEBUG("messaging_cleanup returned %s\n", strerror(ret));
+       if (ret == 0) {
+               DBG_ERR("got 0 cleanup events, expected at least 1\n");
+               TALLOC_FREE(frame);
+               return;
        }
 
-       if (!serverid_deregister(child_id)) {
-               DEBUG(1, ("Could not remove pid %d from serverid.tdb\n",
-                         (int)pid));
+       for (child = cleanup_state.childs;
+            child != NULL;
+            child = child->next)
+       {
+               struct server_id child_id;
+               bool ok;
+
+               ok = cleanupdb_delete_child(child->pid);
+               if (!ok) {
+                       DBG_ERR("failed to delete pid %d\n", (int)child->pid);
+               }
+
+               /*
+                * Get child_id before messaging_cleanup which wipes
+                * the unique_id. Not that it really matters here for
+                * functionality (the child should have properly
+                * cleaned up :-)) though, but it looks nicer.
+                */
+               child_id = pid_to_procid(child->pid);
+
+               smbprofile_cleanup(child->pid, state->parent_pid);
+
+               ret = messaging_cleanup(msg, child->pid);
+
+               if ((ret != 0) && (ret != ENOENT)) {
+                       DBG_DEBUG("messaging_cleanup returned %s\n",
+                                 strerror(ret));
+               }
+
+               if (!serverid_deregister(child_id)) {
+                       DBG_ERR("Could not remove pid %d from serverid.tdb\n",
+                               (int)child->pid);
+               }
+
+               DBG_DEBUG("cleaned up pid %d\n", (int)child->pid);
        }
+
+       TALLOC_FREE(frame);
 }
 
 NTSTATUS smbd_cleanupd_recv(struct tevent_req *req)