lib:ldb: Use correct integer types for sizes
[vlendec/samba-autobuild/.git] / ctdb / config / debug_locks.sh
old mode 100644 (file)
new mode 100755 (executable)
index 0dde861..6c730ee
 #!/bin/sh
 
-# This script parses /proc/locks and finds the processes that are holding
-# locks on CTDB databases.  For all those processes the script dumps a
-# stack trace using gstack.
+# This script attempts to find processes holding locks on a particular
+# CTDB database and dumps a stack trace for each such process.
 #
-# This script can be used only if Samba is configured to use fcntl locks
-# rather than mutex locks.
-
-# Create sed expression to convert inodes to names
-sed_cmd=$( ls -li /var/ctdb/*.tdb.* /var/ctdb/persistent/*.tdb.* |
-          sed -e "s#/var/ctdb[/persistent]*/\(.*\)#\1#" |
-          awk '{printf "s#[0-9]*:[0-9]*:%s #%s #\n", $1, $10}' )
-
-# Parse /proc/locks and extract following information
-#    pid process_name tdb_name offsets [W]
-out=$( cat /proc/locks |
-    grep -F "POSIX  ADVISORY  WRITE" |
-    awk '{ if($2 == "->") { print $6, $7, $8, $9, "W" } else { print $5, $6, $7, $8 } }' |
-    while read pid rest ; do
-       pname=$(readlink /proc/$pid/exe)
-       echo $pid $pname $rest
-    done | sed -e "$sed_cmd" | grep "\.tdb" )
-
-if [ -n "$out" ]; then
-    # Log information about locks
-    echo "$out" | logger -t "ctdbd-lock"
-
-    # Find processes that are waiting for locks
-    dbs=$(echo "$out" | grep "W$" | awk '{print $3}')
-    all_pids=""
-    for db in $dbs ; do
-       pids=$(echo "$out" | grep -v "W$" | grep "$db" | grep -v ctdbd | awk '{print $1}')
-       all_pids="$all_pids $pids"
-    done
-    pids=$(echo $all_pids | sort -u)
-
-    # For each process waiting, log stack trace
-    for pid in $pids ; do
-       gstack $pid | logger -t "ctdbd-lock $pid"
-#      gcore -o /var/log/core-deadlock-ctdb $pid
-    done
+# There are 2 cases:
+#
+# * Samba is configured to use fcntl locks
+#
+#   In this case /proc/locks is parsed to find potential lock holders
+#
+# * Samba is configured to use POSIX robust mutexes
+#
+#   In this case the helper program tdb_mutex_check is used to find
+#   potential lock holders.
+#
+#   This helper program uses a private glibc struct field, so is
+#   neither portable nor supported.  If this field is not available
+#   then the helper is not built.  Unexpected changes in internal
+#   glibc structures may cause unexpected results, including crashes.
+#   Bug reports for this helper program are not accepted without an
+#   accompanying patch.
+
+# Default CTDB_BASE to the physical directory containing this script
+# when the caller has not already provided a value.
+if [ -z "$CTDB_BASE" ] ; then
+       CTDB_BASE=$(d=$(dirname "$0") && cd -P "$d" && echo "$PWD")
+fi
+
+# Shared definitions.  Helpers used below but not defined in this
+# script (e.g. die, get_proc, script_log) are expected to come from
+# this file.
+. "${CTDB_BASE}/functions"
+
+# Exactly 4 arguments are required
+if [ $# -ne 4 ] ; then
+       die "usage: $0 <pid> { DB | RECORD } <tdb_path> { FCNTL | MUTEX }"
 fi
 
+# Give the positional arguments names
+lock_helper_pid="$1"
+tdb_path="$3"
+lock_type="$4"
+# The scope argument is accepted but nothing consults it yet
+# shellcheck disable=SC2034
+lock_scope="$2"
+
+# Provide a fallback gstack when none is installed.  type is at least
+# mentioned in POSIX and is more portable than which(1).
+# shellcheck disable=SC2039
+if ! type gstack >/dev/null 2>&1 ; then
+       # Print a backtrace of every thread of a process using gdb,
+       # keeping only the thread header and stack frame lines.
+       #
+       # $1 - PID of the process to trace
+       gstack ()
+       {
+               _pid="$1"
+
+               # Two -e patterns are used instead of a single pattern
+               # with \| because alternation is not defined for POSIX
+               # basic regular expressions (it is a GNU extension).
+               gdb -batch --quiet -nx "/proc/${_pid}/exe" "$_pid" \
+                   -ex "thread apply all bt" 2>/dev/null |
+                       grep -e '^#' -e '^Thread '
+       }
+fi
+
+# Load/cache database options from configuration file
+# NOTE(review): ctdb_get_db_options is not defined in this script; it is
+# presumably provided by the sourced "functions" file - confirm.
+ctdb_get_db_options
+
+# Print a stack trace for a single process.
+#
+# $1 - PID of the process to inspect
+#
+# A process in D state gets only its kernel stack printed, because
+# gstack would hang until the process exits D state.  The process can
+# still enter D state after the check, but that is unlikely: a process
+# stuck in D state is probably the reason this script was called.  A
+# kernel stack almost certainly won't help diagnose a deadlock, but it
+# will probably point at someone to blame.
+dump_stack ()
+{
+       _pid="$1"
+
+       echo "----- Stack trace for PID=${_pid} -----"
+
+       # Only the first character of the ps state column is examined
+       _proc_state=$(ps -p "$_pid" -o state= | cut -c 1)
+       case "$_proc_state" in
+       D)
+               echo "----- Process in D state, printing kernel stack only"
+               get_proc "${_pid}/stack"
+               ;;
+       *)
+               gstack "$_pid"
+               ;;
+       esac
+}
+
+# Dump a stack trace for each distinct PID in a list.
+#
+# $1 - whitespace-separated list of PIDs, duplicates allowed
+dump_stacks ()
+{
+       # Deliberately unquoted: word splitting collapses runs of
+       # whitespace, so tr emits one PID per line for sort -u.
+       # shellcheck disable=SC2086
+       _unique_pids=$(echo $1 | tr ' ' '\n' | sort -u)
+
+       for _pid in $_unique_pids; do
+               dump_stack "$_pid"
+       done
+}
+
+# Print the ID of $tdb_path as <major>:<minor>:<inode>, with the device
+# numbers in hex, for comparison against the file column of /proc/locks.
+#
+# Globals: tdb_path (read)
+# Output:  the file ID on stdout
+# Errors:  calls die if $tdb_path cannot be examined by stat
+get_tdb_file_id ()
+{
+       if ! _device_inode=$(stat -c "%d:%i" "$tdb_path" 2>/dev/null) ; then
+               die "Unable to stat \"${tdb_path}\""
+       fi
+       _device="${_device_inode%%:*}"
+       _inode="${_device_inode#*:}"
+
+       # Decode the device number the way major(3) and minor(3) do on
+       # Linux: the major number is bits 8-19, the minor number is
+       # bits 0-7 plus bits 20-31.  Treating everything above bit 7 as
+       # the major number breaks for minor numbers above 255, e.g.
+       # device 253:256 would be reported as 10fd:00.
+       _device_major=$(( (_device >> 8) & 0xfff ))
+       _device_minor=$(( (_device & 0xff) | ((_device >> 12) & 0xfff00) ))
+
+       printf '%02x:%02x:%u\n' "$_device_major" "$_device_minor" "$_inode"
+}
+
+# Report POSIX advisory write locks on $tdb_path found in /proc/locks.
+#
+# Globals: lock_helper_pid, tdb_path (read)
+#          lock_holder_pids (PIDs of lock holders are appended)
+#
+# Prints a "Waiter:" section if the lock helper process has a waiting
+# lock entry for the file, then a "Lock holders:" section listing all
+# other processes that have a lock entry for the file.
+debug_via_proc_locks ()
+{
+       # File ID in the form used by the relevant /proc/locks column
+       _file_id=$(get_tdb_file_id)
+       _tdb=$(basename "$tdb_path")
+
+       # Waiting entries have "->" as their second field, so all of
+       # the remaining fields are one column further right than in
+       # the entries matched for lock holders below.
+       _comm=$(ps -p "$lock_helper_pid" -o comm=)
+       _waiter=$(get_proc "locks" |
+                 awk -v pid="$lock_helper_pid" \
+                     -v file_id="$_file_id" \
+                     -v file="$_tdb" \
+                     -v comm="$_comm" \
+                     '$2 == "->" &&
+                      $3 == "POSIX" &&
+                      $4 == "ADVISORY" &&
+                      $5 == "WRITE" &&
+                      $6 == pid &&
+                      $7 == file_id { print $6, comm, file, $8, $9 }')
+       if [ -n "$_waiter" ] ; then
+               echo "Waiter:"
+               echo "$_waiter"
+       fi
+
+       # Every process other than the lock helper with a lock entry
+       # for the file, one per line as:
+       #    pid process_name tdb_name offsets
+       _holders=$(get_proc "locks" |
+                  awk -v pid="$lock_helper_pid" \
+                      -v file_id="$_file_id" \
+                      -v file="$_tdb" \
+                      '$2 == "POSIX" &&
+                       $3 == "ADVISORY" &&
+                       $4 == "WRITE" &&
+                       $5 != pid &&
+                       $6 == file_id { print $5, file, $7, $8 }' |
+                  while read -r _pid _rest ; do
+                          _pname=$(ps -p "$_pid" -o comm=)
+                          echo "$_pid $_pname $_rest"
+                  done)
+
+       if [ -n "$_holders" ] ; then
+               # Log information about locks
+               echo "Lock holders:"
+               echo "$_holders"
+
+               # Remember the holders so their stacks can be dumped
+               _holder_pids=$(echo "$_holders" | awk '{ print $1 }')
+
+               lock_holder_pids="${lock_holder_pids:+${lock_holder_pids} }${_holder_pids}"
+       fi
+}
+
+# Report processes holding mutex locks on $tdb_path, as found by the
+# tdb_mutex_check helper.
+#
+# Globals: CTDB_HELPER_BINDIR, tdb_path, lock_helper_pid (read)
+#          lock_holder_pids (PIDs of lock holders are appended)
+#
+# Prints a "Lock holders:" section.  Does nothing if the helper is not
+# installed or exits with failure.
+debug_via_tdb_mutex ()
+{
+       _helper="${CTDB_HELPER_BINDIR}/tdb_mutex_check"
+       if [ ! -x "$_helper" ] ; then
+               # Mutex helper not available - not supported?
+               # Avoid not found error...
+               return
+       fi
+
+       # Helper should always succeed
+       if ! _t=$("$_helper" "$tdb_path") ; then
+               return
+       fi
+
+       # Turn each "[<lock>] pid=<pid>" line into "<pid> <lock>" and
+       # drop all other lines
+       _out=$(echo "$_t" | sed -n -e 's#^\[\(.*\)\] pid=\(.*\)#\2 \1#p')
+
+       if [ -z "$_out" ]; then
+               # No holders listed, so show any trylock failures that
+               # the helper reported instead
+               if [ -n "$_t" ] ; then
+                       echo "$_t" | grep -F 'trylock failed'
+               fi
+               return
+       fi
+
+       # Get process names, append database file name.  The file name
+       # is the same for every line, so it is computed once here
+       # instead of once per lock holder.
+       _tdb=$(basename "$tdb_path")
+       _out=$(echo "$_out" |
+              while read -r _pid _rest ; do
+                      _pname=$(ps -p "$_pid" -o comm=)
+                      echo "${_pid} ${_pname} ${_tdb} ${_rest}"
+              done)
+
+       # Log information about locks
+       echo "Lock holders:"
+       echo "$_out"
+
+       # Get PIDs of processes that are holding locks, leaving out the
+       # lock helper itself
+       _pids=$(echo "$_out" |
+               awk -v pid="$lock_helper_pid" '$1 != pid {print $1}')
+
+       lock_holder_pids="${lock_holder_pids:+${lock_holder_pids} }${_pids}"
+}
+
+# Run the diagnosis in a subshell that holds a lock on file descriptor
+# 9, so that concurrent invocations do not report at the same time.
+# Everything the subshell prints is passed to script_log.
+(
+       # Give up straight away if the lock cannot be taken
+       if ! flock -n 9 ; then
+               exit 1
+       fi
+
+       echo "===== Start of debug locks PID=$$ ====="
+
+       lock_holder_pids=""
+
+       # /proc/locks is always checked, the mutex helper only when
+       # the lock type is MUTEX
+       debug_via_proc_locks
+       case "$lock_type" in
+       MUTEX)
+               debug_via_tdb_mutex
+               ;;
+       esac
+
+       dump_stacks "$lock_holder_pids"
+
+       echo "===== End of debug locks PID=$$ ====="
+)9>"${CTDB_SCRIPT_VARDIR}/debug_locks.lock" | script_log "ctdbd-lock"
+
 exit 0