merge from ronnie

author Andrew Tridgell <tridge@samba.org>

Sun, 2 Dec 2007 23:19:24 +0000 (10:19 +1100)

committer Andrew Tridgell <tridge@samba.org>

Sun, 2 Dec 2007 23:19:24 +0000 (10:19 +1100)
author Andrew Tridgell <tridge@samba.org>
Sun, 2 Dec 2007 23:19:24 +0000 (10:19 +1100)
committer Andrew Tridgell <tridge@samba.org>
Sun, 2 Dec 2007 23:19:24 +0000 (10:19 +1100)
diff --cc ctdb/server/ctdb_monitor.c

index 0e2dc29c6a0a871600218e5ecf48990ea2f10220,dfa91abe71e8dc5190fa6502874a1bead0860fe4..bdb3d45eda06b6d2deabca2e2ddd845ce8413698
--- 1/ctdb/server/ctdb_monitor.c
--- 2/ctdb/server/ctdb_monitor.c
+++ b/ctdb/server/ctdb_monitor.c
@@@ -199,7 -192,28 +192,28 @@@ static void ctdb_check_health(struct ev
         }       
   }
   
- /* stop any monitoring */
+ /* 
+   (Temporarily) Disabling monitoring will stop the monitor event scripts
+   from running, but node health checks will still occur
+ */
+ void ctdb_disable_monitoring(struct ctdb_context *ctdb)
+ {
+       ctdb->monitoring_mode  = CTDB_MONITORING_DISABLED;
- -      DEBUG(1,("Monitoring has been disabled\n"));
++      DEBUG(2,("Monitoring has been disabled\n"));
+ }
+ 
+ /* 
+    Re-enable running monitor events after they have been disabled
+  */
+ void ctdb_enable_monitoring(struct ctdb_context *ctdb)
+ {
+       ctdb->monitoring_mode  = CTDB_MONITORING_ACTIVE;
- -      DEBUG(1,("Monitoring has been enabled\n"));
++      DEBUG(2,("Monitoring has been enabled\n"));
+ }
+ 
+ /* stop any monitoring 
+    this should only be done when shutting down the daemon
+ */
   void ctdb_stop_monitoring(struct ctdb_context *ctdb)
   {
         talloc_free(ctdb->monitor_context);
@@@ -243,7 -269,7 +269,7 @@@ int32_t ctdb_control_modflags(struct ct
         node->flags &= ~m->clear;
   
         if (node->flags == old_flags) {
-               /* no change */
- -              DEBUG(0, ("Control modflags on node %u - Unchanged - flags 0x%x\n", ctdb->pnn, node->flags));
++              DEBUG(2, ("Control modflags on node %u - Unchanged - flags 0x%x\n", ctdb->pnn, node->flags));
                 return 0;
         }
   
diff --cc ctdb/server/ctdb_recover.c

index 3721facdba6ae1c484e76ca797c4f23e2e1c2eec,1c9f6a91bdf81bfd96259bd58c0e6aafc00345a0..8b2dfb758304217ffa883ee58f0c6e4502eaabfd
--- 1/ctdb/server/ctdb_recover.c
--- 2/ctdb/server/ctdb_recover.c
+++ b/ctdb/server/ctdb_recover.c
@@@ -415,8 -415,6 +415,8 @@@ static void ctdb_recovered_callback(str
   {
         struct ctdb_set_recmode_state *state = talloc_get_type(p, struct ctdb_set_recmode_state);
   
-       ctdb_start_monitoring(ctdb);
++      ctdb_enable_monitoring(state->ctdb);
+ +
         if (status == 0) {
                 ctdb->recovery_mode = state->recmode;
         } else {
@@@ -492,7 -490,10 +492,10 @@@ static void set_recmode_handler(struct 
                                          state, 
                                          ctdb_recovered_callback, 
                                          state, "recovered");
- -      ctdb_enable_monitoring(state->ctdb);
- -
+ 
         if (ret != 0) {
++              ctdb_enable_monitoring(state->ctdb);
++
                 ctdb_request_control_reply(state->ctdb, state->c, NULL, -1, "failed to run eventscript from set_recmode");
                 talloc_free(state);
                 return;
diff --cc ctdb/server/ctdb_recoverd.c

index e54c53d935ee885a6ab61246a8b2e8016d015ff0,88f22543668a1ee3a6c736e4eed15bcccecc0fa0..8e297e9f52c2a6fff8b4a247c891d36d1b0004e8
--- 1/ctdb/server/ctdb_recoverd.c
--- 2/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@@ -739,13 -760,32 +760,32 @@@ static void ctdb_wait_election(struct c
         }
   }
   
- -              /* either a new node is the culprit, or we've decide to forgive them */
+ /*
+   remember the trouble maker
+  */
+ static void ctdb_set_culprit(struct ctdb_recoverd *rec, uint32_t culprit)
+ {
+       struct ctdb_context *ctdb = rec->ctdb;
+ 
+       if (rec->last_culprit != culprit ||
+           timeval_elapsed(&rec->first_recover_time) > ctdb->tunable.recovery_grace_period) {
+               DEBUG(0,("New recovery culprit %u\n", culprit));
++              /* either a new node is the culprit, or we've decided to forgive them */
+               rec->last_culprit = culprit;
+               rec->first_recover_time = timeval_current();
+               rec->culprit_counter = 0;
+       }
+       rec->culprit_counter++;
+ }
   
   /*
-   update our local flags from all remote connected nodes. 
+   Update our local flags from all remote connected nodes. 
+   This is only run when we are or we believe we are the recovery master
    */
- static int update_local_flags(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+ static int update_local_flags(struct ctdb_recoverd *rec, struct ctdb_node_map *nodemap)
   {
         int j;
+       struct ctdb_context *ctdb = rec->ctdb;
         TALLOC_CTX *mem_ctx = talloc_new(ctdb);
   
         /* get the nodemap for all active remote nodes and verify
diff --cc ctdb/server/ctdb_takeover.c

index ec3455e4c0489a1c6be7f63c5565702ffbec6c12,90d692e355aa1b361898b54c6463a24d654b1249..a452da6424e09e54f24ca09b1aa105e646be37a2
--- 1/ctdb/server/ctdb_takeover.c
--- 2/ctdb/server/ctdb_takeover.c
+++ b/ctdb/server/ctdb_takeover.c
@@@ -131,8 -131,6 +131,8 @@@ static void takeover_ip_callback(struc
         char *ip = inet_ntoa(state->sin->sin_addr);
         struct ctdb_tcp_array *tcparray;
   
-       ctdb_start_monitoring(ctdb);
++      ctdb_enable_monitoring(ctdb);
+ +
         if (status != 0) {
                 DEBUG(0,(__location__ " Failed to takeover IP %s on interface %s\n",
                          ip, state->vnn->iface));
@@@ -247,7 -245,9 +247,9 @@@ int32_t ctdb_control_takeover_ip(struc
                                          vnn->iface, 
                                          inet_ntoa(pip->sin.sin_addr),
                                          vnn->public_netmask_bits);
- -      ctdb_enable_monitoring(ctdb);
+ 
         if (ret != 0) {
++              ctdb_enable_monitoring(ctdb);
                 DEBUG(0,(__location__ " Failed to takeover IP %s on interface %s\n",
                          inet_ntoa(pip->sin.sin_addr), vnn->iface));
                 talloc_free(state);
@@@ -299,8 -299,6 +301,8 @@@ static void release_ip_callback(struct 
         char *ip = inet_ntoa(state->sin->sin_addr);
         TDB_DATA data;
   
-       ctdb_start_monitoring(ctdb);
++      ctdb_enable_monitoring(ctdb);
+ +
         /* send a message to all clients of this node telling them
            that the cluster has been reconfigured and they should
            release any sockets on this IP */
@@@ -373,7 -371,9 +375,9 @@@ int32_t ctdb_control_release_ip(struct 
                                          vnn->iface, 
                                          inet_ntoa(pip->sin.sin_addr),
                                          vnn->public_netmask_bits);
- -      ctdb_enable_monitoring(ctdb);
- -
         if (ret != 0) {
++              ctdb_enable_monitoring(ctdb);
++
                 DEBUG(0,(__location__ " Failed to release IP %s on interface %s\n",
                          inet_ntoa(pip->sin.sin_addr), vnn->iface));
                 talloc_free(state);
author	Andrew Tridgell <tridge@samba.org>
	Sun, 2 Dec 2007 23:19:24 +0000 (10:19 +1100)
committer	Andrew Tridgell <tridge@samba.org>
	Sun, 2 Dec 2007 23:19:24 +0000 (10:19 +1100)
		1	2
ctdb/server/ctdb_monitor.c	patch \|	diff1 \|	diff2 \|	blob \| history
ctdb/server/ctdb_recover.c	patch \|	diff1 \|	diff2 \|	blob \| history
ctdb/server/ctdb_recoverd.c	patch \|	diff1 \|	diff2 \|	blob \| history
ctdb/server/ctdb_takeover.c	patch \|	diff1 \|	diff2 \|	blob \| history