ctdb-scripts: Throttle system resource monitoring warnings
[sfrench/samba-autobuild/.git] / ctdb / config / events.d / 05.system
1 #!/bin/sh
2 # ctdb event script for checking local file system utilization
3
# Work out the CTDB base directory when the caller has not provided
# one: it is the parent of the directory containing this script.
# The assignment is kept separate from "export", and the inner command
# substitution is quoted, so that a path containing whitespace is not
# word-split.
if [ -z "$CTDB_BASE" ] ; then
    CTDB_BASE=$(cd -P "$(dirname "$0")" ; dirname "$PWD")
    export CTDB_BASE
fi

# Pull in the shared helpers used below (loadconfig, get_proc, ...)
. "${CTDB_BASE}/functions"
loadconfig

# NOTE(review): presumably this is what sets $service_state_dir, which
# check_thresholds() uses for its cache files - confirm in "functions"
ctdb_setup_service_state_dir "system-monitoring"
11
# Decide whether a value is usable as a percentage threshold
#
# $1 - value to check
# $2 - optional name of the check, quoted in the warning message
#
# Returns 0 if $1 is one or two decimal digits or exactly "100".
# Returns 1 otherwise: silently when $1 is empty, after printing a
# warning when $1 is anything else.
validate_percentage ()
{
    # An empty value is a failure that doesn't need a warning
    if [ -z "$1" ] ; then
        return 1
    fi

    case "$1" in
        100|[0-9][0-9]|[0-9])
            return 0
            ;;
    esac

    # Anything else is worth complaining about
    echo "WARNING: ${1} is an invalid percentage${2:+ in \"}${2}${2:+\"} check"
    return 1
}
21
# Compare a utilization figure against warning/unhealthy thresholds
#
# $1 - description of the thing being checked; used in messages and,
#      with "/" and " " replaced, in the name of the cache file
# $2 - thresholds in the form <warn>[:<unhealthy>], as percentages
# $3 - current utilization, as an integer percentage
# $4 - optional command, run via eval when the unhealthy threshold
#      is reached
#
# If the unhealthy threshold is valid and reached then an ERROR is
# printed, the callout is run, the usage is cached and the script
# exits with status 1.
#
# If the warning threshold is valid and reached then a WARNING is
# printed, but only when the usage differs from the cached value, so
# an unchanged usage does not repeat the warning.  When usage is below
# the warning threshold and a cache file exists, a NOTICE is printed;
# the cache file is removed either way.
#
# Cache files live in $service_state_dir, which is not set here.
check_thresholds ()
{
    _thing="$1"
    _thresholds="$2"
    _usage="$3"
    _unhealthy_callout="$4"

    case "$_thresholds" in
        *:*)
            _warn_threshold="${_thresholds%:*}"
            _unhealthy_threshold="${_thresholds#*:}"
            ;;
        *)
            _warn_threshold="$_thresholds"
            _unhealthy_threshold=""
            ;;
    esac

    # Derive a file name from the description: no slashes, no spaces
    _t=$(echo "$_thing" | sed -e 's@/@SLASH_@g' -e 's@ @_@g')
    _cache="${service_state_dir}/cache_${_t}"

    if validate_percentage "$_unhealthy_threshold" "$_thing" ; then
        if [ "$_usage" -ge "$_unhealthy_threshold" ] ; then
            echo "ERROR: ${_thing} utilization ${_usage}% >= threshold ${_unhealthy_threshold}%"
            eval "$_unhealthy_callout"
            echo "$_usage" >"$_cache"
            exit 1
        fi
    fi

    # Pass the description here too, so that a warning about an
    # invalid threshold says which check it belongs to
    if validate_percentage "$_warn_threshold" "$_thing" ; then
        if [ "$_usage" -ge "$_warn_threshold" ] ; then
            if [ -r "$_cache" ] ; then
                read -r _prev <"$_cache"
            else
                _prev=""
            fi
            # Throttle: only warn when the usage has changed
            if [ "$_usage" != "$_prev" ] ; then
                echo "WARNING: ${_thing} utilization ${_usage}% >= threshold ${_warn_threshold}%"
                echo "$_usage" >"$_cache"
            fi
        else
            # The cache file only exists if a warning or error was
            # previously logged, so note the recovery
            if [ -r "$_cache" ] ; then
                echo "NOTICE: ${_thing} utilization ${_usage}% < threshold ${_warn_threshold}%"
            fi
            rm -f "$_cache"
        fi
    fi
}
69
# Check utilization of every filesystem listed in
# CTDB_MONITOR_FILESYSTEM_USAGE, a whitespace-separated list of
# entries in the format
# <fs_mount>:<fs_warn_threshold>[:fs_unhealthy_threshold]
#
# An entry is skipped with a warning if its mount point is not a
# directory or if no utilization figure can be extracted from df.
monitor_filesystem_usage ()
{
    for _fs in $CTDB_MONITOR_FILESYSTEM_USAGE ; do
        # Everything before the first ":" is the mount point, the
        # remainder is handed to check_thresholds as the thresholds
        _fs_mount="${_fs%%:*}"
        _fs_thresholds="${_fs#*:}"

        [ -d "$_fs_mount" ] || {
            echo "WARNING: Directory ${_fs_mount} does not exist"
            continue
        }

        # Current utilization: the number in front of the "%" in the
        # df output
        _fs_usage=$(df -kP "$_fs_mount" | \
                           sed -n -e 's@.*[[:space:]]\([[:digit:]]*\)%.*@\1@p')

        if [ -n "$_fs_usage" ] ; then
            check_thresholds "Filesystem ${_fs_mount}" \
                             "$_fs_thresholds" \
                             "$_fs_usage"
        else
            echo "WARNING: Unable to get FS utilization for ${_fs_mount}"
        fi
    done
}
96
# Callout for the memory and swap checks: emit diagnostics
dump_memory_info ()
{
    # Contents of "meminfo", as provided by get_proc
    get_proc meminfo

    # Full process listing
    ps auxfww

    # Write "m" to "sysrq-trigger" via set_proc
    set_proc sysrq-trigger m
}
103
# Check memory and swap utilization against the thresholds in
# CTDB_MONITOR_MEMORY_USAGE and CTDB_MONITOR_SWAP_USAGE
#
# Does nothing if neither variable is set.  Otherwise utilization
# percentages are calculated from the "meminfo" data returned by
# get_proc and handed to check_thresholds, with dump_memory_info as
# the unhealthy callout.
#
# Free memory is MemAvailable if that is present and non-zero,
# otherwise MemFree + Cached + Buffers.  A total of 0 (for example, a
# system without swap) yields a utilization of 0 instead of a
# division by zero.
monitor_memory_usage ()
{
    if [ -z "$CTDB_MONITOR_MEMORY_USAGE" ] && \
       [ -z "$CTDB_MONITOR_SWAP_USAGE" ] ; then
        return
    fi

    _meminfo=$(get_proc "meminfo")
    # Deliberately unquoted: the 2 numbers printed by awk become the
    # positional parameters
    set -- $(echo "$_meminfo" | awk '
$1 == "MemAvailable:" { memavail += $2 }
$1 == "MemFree:"      { memfree  += $2 }
$1 == "Cached:"       { memfree  += $2 }
$1 == "Buffers:"      { memfree  += $2 }
$1 == "MemTotal:"     { memtotal  = $2 }
$1 == "SwapFree:"     { swapfree  = $2 }
$1 == "SwapTotal:"    { swaptotal = $2 }
END {
    if (memavail != 0) { memfree = memavail ; }
    memusage = 0
    swapusage = 0
    if (memtotal != 0) {
        memusage = int((memtotal - memfree) / memtotal * 100)
    }
    if (swaptotal != 0) {
        swapusage = int((swaptotal - swapfree) / swaptotal * 100)
    }
    print memusage, swapusage
}')
    _mem_usage="$1"
    _swap_usage="$2"

    check_thresholds "System memory" \
                     "$CTDB_MONITOR_MEMORY_USAGE" \
                     "$_mem_usage" \
                     dump_memory_info

    check_thresholds "System swap" \
                     "$CTDB_MONITOR_SWAP_USAGE" \
                     "$_swap_usage" \
                     dump_memory_info
}
138
139
# Event dispatch: the "monitor" event runs the filesystem and memory
# checks, every other event is passed to ctdb_standard_event_handler
if [ "$1" = "monitor" ] ; then
    monitor_filesystem_usage
    monitor_memory_usage
else
    ctdb_standard_event_handler "$@"
fi

exit 0