1 # Hey Emacs, this is a -*- shell-script -*- !!! :-)
3 . "${TEST_SCRIPTS_DIR}/common.sh"
5 ######################################################################
9 if [ -n "$CTDB_TEST_REMOTE_DIR" ] ; then
10 CTDB_TEST_WRAPPER="${CTDB_TEST_REMOTE_DIR}/test_wrap"
12 _d=$(cd ${TEST_SCRIPTS_DIR}; echo $PWD)
13 CTDB_TEST_WRAPPER="$_d/test_wrap"
15 export CTDB_TEST_WRAPPER
17 # If $VALGRIND is set then use it whenever ctdb is called, but only if
18 # $CTDB is not already set.
19 [ -n "$CTDB" ] || export CTDB="${VALGRIND}${VALGRIND:+ }ctdb"
22 PATH="${TEST_SCRIPTS_DIR}:${PATH}"
24 ######################################################################
32 # Avoid making a test fail from this point onwards. The test is
36 echo "*** TEST COMPLETED (RC=$status) AT $(date '+%F %T'), CLEANING UP..."
38 eval "$ctdb_test_exit_hook" || true
39 unset ctdb_test_exit_hook
41 if $ctdb_test_restart_scheduled || ! cluster_is_healthy ; then
42 echo "Restarting CTDB (scheduled)..."
43 ctdb_stop_all || true # Might be restarting some daemons were shutdown
45 echo "Reconfiguring cluster..."
50 # This could be made unconditional but then we might get
51 # duplication from the recovery in ctdb_init(). We want to
52 # leave the recovery in ctdb_init() so that future tests that
53 # might do a manual restart mid-test will benefit.
54 echo "Forcing a recovery..."
55 onnode 0 $CTDB recover
61 ctdb_test_exit_hook_add ()
63 ctdb_test_exit_hook="${ctdb_test_exit_hook}${ctdb_test_exit_hook:+ ; }$*"
68 scriptname=$(basename "$0")
69 ctdb_test_restart_scheduled=false
71 trap "ctdb_test_exit" 0
74 ########################################
77 try_command_on_node ()
79 local nodespec="$1" ; shift
84 while [ "${nodespec#-}" != "$nodespec" ] ; do
85 if [ "$nodespec" = "-v" ] ; then
88 onnode_opts="${onnode_opts}${onnode_opts:+ }${nodespec}"
95 out=$(onnode -q $onnode_opts "$nodespec" "$cmd" 2>&1) || {
97 echo "Failed to execute \"$cmd\" on node(s) \"$nodespec\""
103 echo "Output of \"$cmd\":"
108 sanity_check_output ()
111 local regexp="$2" # Should be anchored as necessary.
116 local num_lines=$(echo "$output" | wc -l)
117 echo "There are $num_lines lines of output"
118 if [ $num_lines -lt $min_lines ] ; then
119 echo "BAD: that's less than the required number (${min_lines})"
124 local unexpected # local doesn't pass through status of command on RHS.
125 unexpected=$(echo "$output" | egrep -v "$regexp") || status=$?
127 # Note that this is reversed.
128 if [ $status -eq 0 ] ; then
129 echo "BAD: unexpected lines in output:"
130 echo "$unexpected" | cat -A
133 echo "Output lines look OK"
141 local ips="$1" # list of "ip node" lines
143 echo "Sanity checking IPs..."
147 while read x ipp ; do
148 [ "$ipp" = "-1" ] && break
149 if [ -n "$prev" -a "$ipp" != "$prev" ] ; then
156 echo "BAD: a node was -1 or IPs are only assigned to one node:"
158 echo "Are you running an old version of CTDB?"
162 # This returns a list of "ip node" lines in $out
166 try_command_on_node $node \
167 "$CTDB ip -X | awk -F'|' 'NR > 1 { print \$2, \$3 }'"
170 _select_test_node_and_ips ()
172 try_command_on_node any \
173 "$CTDB ip -X all | awk -F'|' 'NR > 1 { print \$2, \$3 }'"
175 test_node="" # this matches no PNN
178 while read ip pnn ; do
179 if [ -z "$test_node" -a "$pnn" != "-1" ] ; then
182 if [ "$pnn" = "$test_node" ] ; then
183 test_node_ips="${test_node_ips}${test_node_ips:+ }${ip}"
185 done <<<"$out" # bashism to avoid problem setting variable in pipeline.
187 echo "Selected node ${test_node} with IPs: ${test_node_ips}."
188 test_ip="${test_node_ips%% *}"
191 *:*) test_prefix="${test_ip}/128" ;;
192 *) test_prefix="${test_ip}/32" ;;
195 [ -n "$test_node" ] || return 1
198 select_test_node_and_ips ()
201 while ! _select_test_node_and_ips ; do
202 echo "Unable to find a test node with IPs assigned"
203 if [ $timeout -le 0 ] ; then
204 echo "BAD: Too many attempts"
208 timeout=$(($timeout - 1))
215 get_test_ip_mask_and_iface ()
218 try_command_on_node $test_node "$CTDB ip -v -X | awk -F'|' -v ip=$test_ip '\$2 == ip { print \$4 }'"
221 if [ -z "$TEST_LOCAL_DAEMONS" ] ; then
223 try_command_on_node $test_node ip addr show to $test_ip
230 echo "$test_ip/$mask is on $iface"
235 try_command_on_node -q all "$CTDB pnn"
239 # The subtlety is that "ctdb delip" will fail if the IP address isn't
240 # configured on a node...
241 delete_ip_from_all_nodes ()
249 for _pnn in $all_pnns ; do
250 all_ips_on_node $_pnn
251 while read _i _n ; do
252 if [ "$_ip" = "$_i" ] ; then
253 _nodes="${_nodes}${_nodes:+,}${_pnn}"
255 done <<<"$out" # bashism
258 try_command_on_node -pq "$_nodes" "$CTDB delip $_ip"
261 #######################################
266 for i in $(seq 1 $1) ; do
273 _cluster_is_healthy ()
275 $CTDB nodestatus all >/dev/null
278 _cluster_is_recovered ()
280 node_has_status 0 recovered
285 _cluster_is_healthy && _cluster_is_recovered
288 cluster_is_healthy ()
290 if onnode 0 $CTDB_TEST_WRAPPER _cluster_is_healthy ; then
291 echo "Cluster is HEALTHY"
292 if ! onnode 0 $CTDB_TEST_WRAPPER _cluster_is_recovered ; then
293 echo "WARNING: cluster in recovery mode!"
297 echo "Cluster is UNHEALTHY"
298 if ! ${ctdb_test_restart_scheduled:-false} ; then
299 echo "DEBUG AT $(date '+%F %T'):"
301 for i in "onnode -q 0 $CTDB status" "onnode -q 0 onnode all $CTDB scriptstatus" ; do
312 local timeout="${1:-120}"
314 echo "Waiting for cluster to become ready..."
316 wait_until $timeout onnode -q any $CTDB_TEST_WRAPPER _cluster_is_ready
319 # This function is becoming nicely overloaded. Soon it will collapse! :-)
325 local bits fpat mpat rpat
327 (unhealthy) bits="?|?|?|1|*" ;;
328 (healthy) bits="?|?|?|0|*" ;;
329 (disconnected) bits="1|*" ;;
330 (connected) bits="0|*" ;;
331 (banned) bits="?|1|*" ;;
332 (unbanned) bits="?|0|*" ;;
333 (disabled) bits="?|?|1|*" ;;
334 (enabled) bits="?|?|0|*" ;;
335 (stopped) bits="?|?|?|?|1|*" ;;
336 (notstopped) bits="?|?|?|?|0|*" ;;
337 (frozen) fpat='^[[:space:]]+frozen[[:space:]]+1$' ;;
338 (unfrozen) fpat='^[[:space:]]+frozen[[:space:]]+0$' ;;
339 (recovered) rpat='^Recovery mode:RECOVERY \(1\)$' ;;
340 (notlmaster) rpat="^hash:.* lmaster:${pnn}\$" ;;
342 echo "node_has_status: unknown status \"$status\""
346 if [ -n "$bits" ] ; then
349 out=$($CTDB -X status 2>&1) || return 1
354 # This needs to be done in 2 steps to avoid false matches.
355 local line_bits="${line#|${pnn}|*|}"
356 [ "$line_bits" = "$line" ] && continue
357 [ "${line_bits#${bits}}" != "$line_bits" ] && return 0
360 } <<<"$out" # Yay bash!
361 elif [ -n "$fpat" ] ; then
362 $CTDB statistics -n "$pnn" | egrep -q "$fpat"
363 elif [ -n "$rpat" ] ; then
364 ! $CTDB status -n "$pnn" | egrep -q "$rpat"
366 echo 'node_has_status: unknown mode, neither $bits nor $fpat is set'
371 wait_until_node_has_status ()
375 local timeout="${3:-30}"
376 local proxy_pnn="${4:-any}"
378 echo "Waiting until node $pnn has status \"$status\"..."
380 if ! wait_until $timeout onnode $proxy_pnn $CTDB_TEST_WRAPPER node_has_status "$pnn" "$status" ; then
381 for i in "onnode -q any $CTDB status" "onnode -q any onnode all $CTDB scriptstatus" ; do
391 # Useful for superficially testing IP failover.
392 # IPs must be on the given node.
393 # If the first argument is '!' then the IPs must not be on the given node.
397 if [ "$1" = "!" ] ; then
398 negating=true ; shift
400 local node="$1" ; shift
405 all_ips_on_node $node
408 for check in $ips ; do
410 while read ip pnn ; do
411 if [ "$check" = "$ip" ] ; then
412 if [ "$pnn" = "$node" ] ; then
413 if $negating ; then return 1 ; fi
415 if ! $negating ; then return 1 ; fi
417 ips="${ips/${ip}}" # Remove from list
420 # If we're negating and we didn't see the address then it
421 # isn't hosted by anyone!
423 ips="${ips/${check}}"
425 done <<<"$out" # bashism to avoid problem setting variable in pipeline.
428 ips="${ips// }" # Remove any spaces.
432 wait_until_ips_are_on_node ()
434 # Go to some trouble to print a useful description of what is happening
436 if [ "$1" == "!" ] ; then
443 [ "$i" != "!" ] || continue
444 if [ -z "$node" ] ; then
448 ips="${ips}${ips:+, }${i}"
450 echo "Waiting for ${ips} to ${not}be assigned to node ${node}"
452 wait_until 60 ips_are_on_node "$@"
461 all_ips_on_node $node
463 while read ip pnn ; do
464 if [ "$node" = "$pnn" ] ; then
467 done <<<"$out" # bashism to avoid problem setting variable in pipeline.
472 wait_until_node_has_some_ips ()
474 echo "Waiting for some IPs to be assigned to node ${test_node}"
476 wait_until 60 node_has_some_ips "$@"
479 wait_until_node_has_no_ips ()
481 echo "Waiting until no IPs are assigned to node ${test_node}"
483 wait_until 60 ! node_has_some_ips "$@"
486 #######################################
492 if [ -e /etc/redhat-release ] ; then
495 /etc/init.d/ctdb "$cmd"
499 # Stop/start CTDB on all nodes. Override for local daemons.
502 onnode -p all $CTDB_TEST_WRAPPER _service_ctdb stop
506 onnode -p all $CTDB_TEST_WRAPPER _service_ctdb start
509 # Nothing needed for a cluster. Override for local daemons.
517 onnode "$1" $CTDB_TEST_WRAPPER _service_ctdb start
522 onnode "$1" $CTDB_TEST_WRAPPER _service_ctdb stop
527 onnode "$1" $CTDB_TEST_WRAPPER _service_ctdb restart
533 for i in $(seq 1 5) ; do
535 echo "Start failed. Trying again in a few seconds..."
540 wait_until_ready || {
541 echo "Cluster didn't become ready. Restarting..."
545 echo "Setting RerecoveryTimeout to 1"
546 onnode -pq all "$CTDB setvar RerecoveryTimeout 1"
548 # In recent versions of CTDB, forcing a recovery like this
549 # blocks until the recovery is complete. Hopefully this will
550 # help the cluster to stabilise before a subsequent test.
551 echo "Forcing a recovery..."
552 onnode -q 0 $CTDB recover
555 if ! onnode -q any $CTDB_TEST_WRAPPER _cluster_is_recovered ; then
556 echo "Cluster has gone into recovery again, waiting..."
557 wait_until 30/2 onnode -q any $CTDB_TEST_WRAPPER _cluster_is_recovered
561 # Cluster is still healthy. Good, we're done!
562 if ! onnode 0 $CTDB_TEST_WRAPPER _cluster_is_healthy ; then
563 echo "Cluster became UNHEALTHY again [$(date)]"
564 onnode -p all ctdb status -X 2>&1
565 onnode -p all ctdb scriptstatus 2>&1
570 echo "Doing a sync..."
571 onnode -q 0 $CTDB sync
577 echo "Cluster UNHEALTHY... too many attempts..."
578 onnode -p all ctdb status -X 2>&1
579 onnode -p all ctdb scriptstatus 2>&1
581 # Try to make the calling test fail
586 ctdb_restart_when_done ()
588 ctdb_test_restart_scheduled=true
593 echo "${CTDB_BASE:-${CTDB_SCRIPTS_BASE}}"
596 #######################################
598 wait_for_monitor_event ()
603 echo "Waiting for a monitor event on node ${pnn}..."
605 try_command_on_node "$pnn" $CTDB scriptstatus || {
606 echo "Unable to get scriptstatus from node $pnn"
610 local ctdb_scriptstatus_original="$out"
611 wait_until 120 _ctdb_scriptstatus_changed
614 _ctdb_scriptstatus_changed ()
616 try_command_on_node "$pnn" $CTDB scriptstatus || {
617 echo "Unable to get scriptstatus from node $pnn"
621 [ "$out" != "$ctdb_scriptstatus_original" ]
624 #######################################
628 select_test_node_and_ips
630 nfs_first_export=$(showmount -e $test_ip | sed -n -e '2s/ .*//p')
632 echo "Creating test subdirectory..."
633 try_command_on_node $test_node "TMPDIR=$nfs_first_export mktemp -d"
635 try_command_on_node $test_node "chmod 777 $nfs_test_dir"
637 nfs_mnt_d=$(mktemp -d)
638 nfs_local_file="${nfs_mnt_d}/${nfs_test_dir##*/}/TEST_FILE"
639 nfs_remote_file="${nfs_test_dir}/TEST_FILE"
641 ctdb_test_exit_hook_add nfs_test_cleanup
643 echo "Mounting ${test_ip}:${nfs_first_export} on ${nfs_mnt_d} ..."
644 mount -o timeo=1,hard,intr,vers=3 \
645 "[${test_ip}]:${nfs_first_export}" ${nfs_mnt_d}
650 rm -f "$nfs_local_file"
651 umount -f "$nfs_mnt_d"
653 onnode -q $test_node rmdir "$nfs_test_dir"
656 #######################################
658 # If the given IP is hosted then print 3 items: maskbits, iface and family
664 *:*) _family="inet6" ; _bits=128 ;;
665 *) _family="inet" ; _bits=32 ;;
668 ip addr show to "${_addr}/${_bits}" 2>/dev/null | \
669 awk -v family="${_family}" \
670 'NR == 1 { iface = $2; sub(":$", "", iface) } \
671 $1 ~ /inet/ { mask = $2; sub(".*/", "", mask); \
672 print mask, iface, family }'
677 _addr="${1%/*}" # Remove optional maskbits
679 set -- $(ip_maskbits_iface $_addr)
680 if [ -n "$1" ] ; then
683 echo "Removing public address $_addr/$_maskbits from device $_iface"
684 ip addr del "$_ip/$_maskbits" dev "$_iface" >/dev/null 2>&1 || true
695 #######################################
697 # $1: pnn, $2: DB name
700 try_command_on_node -v $1 $CTDB getdbstatus "$2" |
701 sed -n -e "s@^path: @@p"
704 # $1: pnn, $2: DB name
705 db_ctdb_cattdb_count_records ()
707 # Count the number of keys, excluding any that begin with '_'.
708 # This excludes at least the sequence number record in
709 # persistent/replicated databases. The trailing "|| :" forces
710 # the command to succeed when no records are matched.
711 try_command_on_node $1 \
712 "$CTDB cattdb $2 | grep -c '^key([0-9][0-9]*) = \"[^_]' || :"
716 # $1: pnn, $2: DB name, $3: key string, $4: value string, $5: RSN (default 7)
719 _tdb=$(db_get_path $1 "$2")
721 try_command_on_node $1 $CTDB tstore "$_tdb" "$3" "$4" "$_rsn"
724 # $1: pnn, $2: DB name, $3: dbseqnum (must be <= 255, i.e. fit in one byte)
725 db_ctdb_tstore_dbseqnum ()
727 # "__db_sequence_number__" + trailing 0x00
728 _key='0x5f5f64625f73657175656e63655f6e756d6265725f5f00'
730 # Construct 8 byte (uint64_t) database sequence number. This
731 # probably breaks if $3 > 255
732 _value=$(printf "0x%02x%014x" $3 0)
734 db_ctdb_tstore $1 "$2" "$_key" "$_value"
737 #######################################
739 # Enables all of the event scripts used in cluster tests, except for
740 # the mandatory scripts
741 ctdb_enable_cluster_test_event_scripts ()
752 for s in $scripts ; do
753 try_command_on_node all ctdb event script enable legacy "$s"
757 ########################################
759 # Make sure that $CTDB is set.
762 local="${TEST_SUBDIR}/scripts/local.bash"
763 if [ -r "$local" ] ; then