ctdb-scripts: New consistent system memory and swap monitoring
author Martin Schwenke <martin@meltin.net>
Mon, 3 Aug 2015 07:22:08 +0000 (17:22 +1000)
committer Amitay Isaacs <amitay@samba.org>
Sat, 29 Aug 2015 15:06:25 +0000 (17:06 +0200)
New variables CTDB_MONITOR_MEMORY_USAGE and CTDB_MONITOR_SWAP_USAGE.
Both take a pair of <warn_threshold>:<unhealthy_threshold> where each
threshold is specified as a percentage.

This adds a callout to check_thresholds() that is run when the
unhealthy threshold is reached.

Add some combination tests.

Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
ctdb/config/events.d/05.system
ctdb/doc/ctdbd.conf.5.xml
ctdb/tests/eventscripts/05.system.monitor.011.sh
ctdb/tests/eventscripts/05.system.monitor.012.sh
ctdb/tests/eventscripts/05.system.monitor.013.sh
ctdb/tests/eventscripts/05.system.monitor.014.sh
ctdb/tests/eventscripts/05.system.monitor.015.sh
ctdb/tests/eventscripts/05.system.monitor.016.sh [new file with mode: 0755]
ctdb/tests/eventscripts/05.system.monitor.017.sh [new file with mode: 0755]
ctdb/tests/eventscripts/scripts/local.sh

index 770c0dc6055a8ade026304e67804fe7e5ebcc336..48946cc36f56e7971410de9bd71fc5466d3ac181 100644 (file)
@@ -22,6 +22,7 @@ check_thresholds ()
     _thing="$1"
     _thresholds="$2"
     _usage="$3"
+    _unhealthy_callout="$4"
 
     case "$_thresholds" in
        *:*)
@@ -35,7 +36,9 @@ check_thresholds ()
 
     if validate_percentage "$_unhealthy_threshold" "$_thing" ; then
         if [ "$_usage" -ge "$_unhealthy_threshold" ] ; then
-           die "ERROR: ${_thing} utilization ${_usage}% >= threshold ${_unhealthy_threshold}%"
+           echo "ERROR: ${_thing} utilization ${_usage}% >= threshold ${_unhealthy_threshold}%"
+           eval "$_unhealthy_callout"
+           exit 1
         fi
     fi
 
@@ -73,11 +76,21 @@ monitor_filesystem_usage ()
     done
 }
 
+dump_memory_info ()
+{
+    echo "CRITICAL: Shutting down CTDB!!!"
+    get_proc "meminfo"
+    ps auxfww
+    set_proc "sysrq-trigger" "m"
+    ctdb disable
+    sleep 3
+    ctdb shutdown
+}
+
 monitor_memory_usage ()
 {
-    if [ -z "$CTDB_MONITOR_FREE_MEMORY_WARN" -a \
-        -z "$CTDB_MONITOR_FREE_MEMORY" -a \
-        "$CTDB_CHECK_SWAP_IS_NOT_USED" != "yes" ] ; then
+    if [ -z "$CTDB_MONITOR_MEMORY_USAGE" -a \
+        -z "$CTDB_MONITOR_SWAP_USAGE" ] ; then
        return
     fi
 
@@ -98,35 +111,15 @@ END {
     _mem_usage="$1"
     _swap_usage="$2"
 
-    # Shutdown CTDB when memory is below the configured limit
-    if [ -n "$CTDB_MONITOR_FREE_MEMORY" ] ; then
-       if [ $_mem_usage -ge $CTDB_MONITOR_FREE_MEMORY ] ; then
-           echo "CRITICAL: OOM - ${_mem_usage}% usage >= ${CTDB_MONITOR_FREE_MEMORY}% (CTDB threshold)"
-           echo "CRITICAL: Shutting down CTDB!!!"
-           echo "$_meminfo"
-           ps auxfww
-           set_proc "sysrq-trigger" "m"
-           ctdb disable
-           sleep 3
-           ctdb shutdown
-       fi
-    fi
+    check_thresholds "System memory" \
+                    "$CTDB_MONITOR_MEMORY_USAGE" \
+                    "$_mem_usage" \
+                    dump_memory_info
 
-    # Warn when low on memory
-    if [ -n "$CTDB_MONITOR_FREE_MEMORY_WARN" ] ; then
-       if [ $_mem_usage -ge $CTDB_MONITOR_FREE_MEMORY_WARN ] ; then
-           echo "WARNING: memory usage is excessive - ${_mem_usage}% >=  ${CTDB_MONITOR_FREE_MEMORY_WARN}% (CTDB threshold)"
-       fi
-    fi
-
-    # We should never enter swap, so SwapTotal == SwapFree.
-    if [ "$CTDB_CHECK_SWAP_IS_NOT_USED" = "yes" ] ; then
-       if [ $_swap_usage -gt 0 ] ; then
-           echo We are swapping:
-           echo "$_meminfo"
-           ps auxfww
-       fi
-    fi
+    check_thresholds "System swap" \
+                    "$CTDB_MONITOR_SWAP_USAGE" \
+                    "$_swap_usage" \
+                    dump_memory_info
 }
 
 
index 63c84aa5ac3ebba11a80f94a18170f580d558c2c..0e38d6acf4126e2c38858e4a2c0521465d1c37e9 100644 (file)
@@ -1321,26 +1321,16 @@ CTDB_PER_IP_ROUTING_TABLE_ID_HIGH=9000
        </varlistentry>
 
        <varlistentry>
-         <term>CTDB_CHECK_SWAP_IS_NOT_USED=yes|no</term>
+         <term>CTDB_MONITOR_MEMORY_USAGE=<parameter>MEM-LIMITS</parameter></term>
          <listitem>
            <para>
-             Should a warning be logged if swap space is in use.
-           </para>
-           <para>
-             Default is no.
-           </para>
-         </listitem>
-       </varlistentry>
-
-       <varlistentry>
-         <term>CTDB_MONITOR_FREE_MEMORY=<parameter>NUM</parameter></term>
-         <listitem>
-           <para>
-             NUM is threshold of acceptable memory usage, expressed
-             as a percentage.  If this is set and memory usage
-             reaches this limit then some debug information will be
-             logged, the node will be disabled and then CTDB will be
-             shut down.
+             MEM-LIMITS takes the form
+             <parameter>WARN_LIMIT</parameter><optional>:<parameter>UNHEALTHY_LIMIT</parameter></optional>
+             indicating that warnings should be logged if memory
+             usage reaches WARN_LIMIT%.  If usage reaches
+             UNHEALTHY_LIMIT then the node should be flagged
+             unhealthy.  Either WARN_LIMIT or UNHEALTHY_LIMIT may be
+             left blank, meaning that check will be omitted.
            </para>
            <para>
              No default.
@@ -1349,12 +1339,16 @@ CTDB_PER_IP_ROUTING_TABLE_ID_HIGH=9000
        </varlistentry>
 
        <varlistentry>
-         <term>CTDB_MONITOR_FREE_MEMORY_WARN=<parameter>NUM</parameter></term>
+         <term>CTDB_MONITOR_SWAP_USAGE=<parameter>SWAP-LIMITS</parameter></term>
          <listitem>
            <para>
-             NUM is threshold of acceptable memory usage, expressed
-             as a percentage.  If this is set and memory usage
-             reaches this limit then a warning will be logged.
+             SWAP-LIMITS takes the form
+             <parameter>WARN_LIMIT</parameter><optional>:<parameter>UNHEALTHY_LIMIT</parameter></optional>
+              indicating that warnings should be logged if
+             swap usage reaches WARN_LIMIT%.  If usage reaches
+             UNHEALTHY_LIMIT then the node should be flagged
+             unhealthy.  Either WARN_LIMIT or UNHEALTHY_LIMIT may be
+             left blank, meaning that check will be omitted.
            </para>
            <para>
              No default.
index 79f55f06a914a860acfac1201baff861e95a825d..79ceb90b76897a47c58058d57a69b7d44bfaedd6 100755 (executable)
@@ -6,9 +6,8 @@ define_test "Memory check, bad situation, no checks enabled"
 
 setup_memcheck 100 100
 
-CTDB_MONITOR_FREE_MEMORY=""
-CTDB_MONITOR_FREE_MEMORY_WARN=""
-CTDB_CHECK_SWAP_IS_NOT_USED="no"
+CTDB_MONITOR_MEMORY_USAGE=""
+CTDB_MONITOR_SWAP_USAGE=""
 
 ok_null
 
index 6c06480824a989eb1f848050779b423ce3422fe1..bb2c7b578117213bb2439c6e907e5c43e97484a5 100755 (executable)
@@ -6,9 +6,8 @@ define_test "Memory check, good situation, all enabled"
 
 setup_memcheck
 
-CTDB_MONITOR_FREE_MEMORY="90"
-CTDB_MONITOR_FREE_MEMORY_WARN="80"
-CTDB_CHECK_SWAP_IS_NOT_USED="yes"
+CTDB_MONITOR_MEMORY_USAGE="80:90"
+CTDB_MONITOR_SWAP_USAGE="1:50"
 
 ok_null
 
index dc3d40d0fa94bafe68712ea029cc5b8b194f5a47..25fa780d89e66cb3eb84a7512b75622c61bc5195 100755 (executable)
@@ -4,16 +4,17 @@
 
 define_test "Memory check, bad situation, only swap check"
 
-setup_memcheck 100 10
+setup_memcheck 100 90
 
-CTDB_MONITOR_FREE_MEMORY=""
-CTDB_MONITOR_FREE_MEMORY_WARN=""
-CTDB_CHECK_SWAP_IS_NOT_USED="yes"
+CTDB_MONITOR_MEMORY_USAGE=""
+CTDB_MONITOR_SWAP_USAGE=":50"
 
-ok <<EOF
-We are swapping:
+required_result 1 <<EOF
+ERROR: System swap utilization 90% >= threshold 50%
+CRITICAL: Shutting down CTDB!!!
 $FAKE_PROC_MEMINFO
 $(ps foobar)
+CTDB says BYE!
 EOF
 
 simple_test
index 64c07416445b59a643dbf854ed08d664a2f39860..46955f34020dc0b5e4598e99362ee538aafe3482 100755 (executable)
@@ -6,12 +6,11 @@ define_test "Memory check, bad situation, only memory warning"
 
 setup_memcheck 90 10
 
-CTDB_MONITOR_FREE_MEMORY=""
-CTDB_MONITOR_FREE_MEMORY_WARN="85"
-CTDB_CHECK_SWAP_IS_NOT_USED="no"
+CTDB_MONITOR_MEMORY_USAGE="85:"
+CTDB_MONITOR_SWAP_USAGE=""
 
 ok <<EOF
-WARNING: memory usage is excessive - 90% >=  85% (CTDB threshold)
+WARNING: System memory utilization 90% >= threshold 85%
 EOF
 
 simple_test
index e950bbd276e6639ca860bcab87111d0375978492..3beac4cc91d98572210b0cb203182119e9b85c57 100755 (executable)
@@ -6,12 +6,11 @@ define_test "Memory check, bad situation, only memory critical"
 
 setup_memcheck 90 0
 
-CTDB_MONITOR_FREE_MEMORY="85"
-CTDB_MONITOR_FREE_MEMORY_WARN=""
-CTDB_CHECK_SWAP_IS_NOT_USED="no"
+CTDB_MONITOR_MEMORY_USAGE=":85"
+CTDB_MONITOR_SWAP_USAGE=""
 
-ok <<EOF
-CRITICAL: OOM - 90% usage >= 85% (CTDB threshold)
+required_result 1 <<EOF
+ERROR: System memory utilization 90% >= threshold 85%
 CRITICAL: Shutting down CTDB!!!
 $FAKE_PROC_MEMINFO
 $(ps foobar)
diff --git a/ctdb/tests/eventscripts/05.system.monitor.016.sh b/ctdb/tests/eventscripts/05.system.monitor.016.sh
new file mode 100755 (executable)
index 0000000..44dddc6
--- /dev/null
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "Memory check, bad situation, both memory checks, causes warning"
+
+setup_memcheck 87 0
+
+CTDB_MONITOR_MEMORY_USAGE="80:90"
+CTDB_MONITOR_SWAP_USAGE=""
+
+ok <<EOF
+WARNING: System memory utilization 87% >= threshold 80%
+EOF
+
+simple_test
diff --git a/ctdb/tests/eventscripts/05.system.monitor.017.sh b/ctdb/tests/eventscripts/05.system.monitor.017.sh
new file mode 100755 (executable)
index 0000000..f1b6a26
--- /dev/null
@@ -0,0 +1,42 @@
+#!/bin/sh
+
+. "${TEST_SCRIPTS_DIR}/unit.sh"
+
+define_test "Memory check, bad situation, both memory checks, causes unhealthy"
+
+setup_memcheck 87 0
+
+CTDB_MONITOR_MEMORY_USAGE="70:80"
+CTDB_MONITOR_SWAP_USAGE=""
+
+required_result 1 <<EOF
+ERROR: System memory utilization 87% >= threshold 80%
+CRITICAL: Shutting down CTDB!!!
+MemTotal:        3940712 kB
+MemFree:          225268 kB
+Buffers:          146120 kB
+Cached:          140904 kB
+SwapCached:        56016 kB
+Active:          2422104 kB
+Inactive:        1019928 kB
+Active(anon):    1917580 kB
+Inactive(anon):   523080 kB
+Active(file):     504524 kB
+Inactive(file):   496848 kB
+Unevictable:        4844 kB
+Mlocked:            4844 kB
+SwapTotal:       5857276 kB
+SwapFree:        5857276 kB
+...
+USER       PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
+root         2  0.0  0.0      0     0 ?        S    Aug28   0:00 [kthreadd]
+root         3  0.0  0.0      0     0 ?        S    Aug28   0:43  \_ [ksoftirqd/0]
+...
+root         1  0.0  0.0   2976   624 ?        Ss   Aug28   0:07 init [2]
+root       495  0.0  0.0   3888  1640 ?        Ss   Aug28   0:00 udevd --daemon
+...
+[MORE FAKE ps OUTPUT]
+CTDB says BYE!
+EOF
+
+simple_test
index ce1c2510bd0b729ad5a9f46023e6e7459774dc7e..57e022536cdeb44e168debf09ab0ab445f133254 100644 (file)
@@ -369,9 +369,8 @@ SwapTotal:       ${_swap_total} kB
 SwapFree:        ${_swap_free} kB
 ..."
 
-    export CTDB_MONITOR_FREE_MEMORY
-    export CTDB_MONITOR_FREE_MEMORY_WARN
-    export CTDB_CHECK_SWAP_IS_NOT_USED
+    export CTDB_MONITOR_MEMORY_USAGE
+    export CTDB_MONITOR_SWAP_USAGE
 }
 
 setup_fscheck ()