prev="$ipp"
done <<<"$ips"
- echo "BAD: a node was -1 or IPs are only assigned to one node"
+ echo "BAD: a node was -1 or IPs are only assigned to one node:"
+ echo "$ips"
echo "Are you running an old version of CTDB?"
return 1
}
# This returns a list of "ip node" lines in $out
+# Argument: $1 - node to run the command on (passed to try_command_on_node)
+# The first line of the "$CTDB ip -X" output is skipped; fields 2 and 3 of
+# the remaining lines (split on '|') are printed separated by a space.
all_ips_on_node()
{
- local node=$@
- try_command_on_node $node "$CTDB ip -Y -n all | cut -d ':' -f1-3 | sed -e '1d' -e 's@^:@@' -e 's@:@ @g'"
+ local node="$1"
+ try_command_on_node $node \
+ "$CTDB ip -X | awk -F'|' 'NR > 1 { print \$2, \$3 }'"
}
_select_test_node_and_ips ()
{
- all_ips_on_node 0
+ try_command_on_node any \
+ "$CTDB ip -X -n all | awk -F'|' 'NR > 1 { print \$2, \$3 }'"
test_node="" # this matches no PNN
test_node_ips=""
return 0
}
+# Sets: mask, iface
+# Reads: test_node, test_ip, TEST_LOCAL_DAEMONS
+# Prints a line of the form "<test_ip>/<mask> is on <iface>".
+get_test_ip_mask_and_iface ()
+{
+ # Find the interface
+ # (field 4 of the '|'-separated line whose field 2 equals $test_ip)
+ try_command_on_node $test_node "$CTDB ip -v -X | awk -F'|' -v ip=$test_ip '\$2 == ip { print \$4 }'"
+ iface="$out"
+
+ if [ -z "$TEST_LOCAL_DAEMONS" ] ; then
+ # Find the netmask
+ try_command_on_node $test_node ip addr show to $test_ip
+ # Keep the text after the last '/' in the output...
+ mask="${out##*/}"
+ # ...and cut it off at the first space.
+ mask="${mask%% *}"
+ else
+ # TEST_LOCAL_DAEMONS is set: no lookup is done, a fixed value is used
+ mask="24"
+ fi
+
+ echo "$test_ip/$mask is on $iface"
+}
+
#######################################
# Wait until either timeout expires or command succeeds. The command
-# will be tried once per second.
+# will be tried once per second, unless timeout has format T/I, where
+# I is the recheck interval.
wait_until ()
{
local timeout="$1" ; shift # "$@" is the command...
+ local interval=1
+ case "$timeout" in
+ */*)
+ interval="${timeout#*/}"
+ timeout="${timeout%/*}"
+ esac
+
local negate=false
if [ "$1" = "!" ] ; then
negate=true
echo "OK"
return 0
fi
- echo -n .
- t=$(($t - 1))
- sleep 1
+ local i
+ for i in $(seq 1 $interval) ; do
+ echo -n .
+ done
+ t=$(($t - $interval))
+ sleep $interval
done
echo "*TIMEOUT*"
+# Succeeds when "$CTDB nodestatus all" exits with status 0; its standard
+# output is discarded.  The recovery-mode check is no longer part of this
+# function.
_cluster_is_healthy ()
{
- $CTDB nodestatus all >/dev/null && \
- node_has_status 0 recovered
+ $CTDB nodestatus all >/dev/null
+}
+
+# Succeeds when node_has_status succeeds for "all" with status "recovered".
+_cluster_is_recovered ()
+{
+ node_has_status all recovered
+}
+
+# Succeeds only when _cluster_is_healthy and _cluster_is_recovered both
+# succeed; the second is not run if the first fails.
+_cluster_is_ready ()
+{
+ _cluster_is_healthy && _cluster_is_recovered
}
cluster_is_healthy ()
{
if onnode 0 $CTDB_TEST_WRAPPER _cluster_is_healthy ; then
echo "Cluster is HEALTHY"
+ if ! onnode 0 $CTDB_TEST_WRAPPER _cluster_is_recovered ; then
+ echo "WARNING: cluster in recovery mode!"
+ fi
return 0
else
echo "Cluster is UNHEALTHY"
fi
}
+# Wait until _cluster_is_ready, run through onnode on any node, succeeds.
+# Argument: $1 - timeout handed to wait_until (default: 120)
+# The exit status is that of wait_until.
-wait_until_healthy ()
+wait_until_ready ()
{
local timeout="${1:-120}"
- echo "Waiting for cluster to become healthy..."
+ echo "Waiting for cluster to become ready..."
- wait_until $timeout onnode -q any $CTDB_TEST_WRAPPER _cluster_is_healthy
+ wait_until $timeout onnode -q any $CTDB_TEST_WRAPPER _cluster_is_ready
}
# This function is becoming nicely overloaded. Soon it will collapse! :-)
local bits fpat mpat rpat
case "$status" in
- (unhealthy) bits="?:?:?:1:*" ;;
- (healthy) bits="?:?:?:0:*" ;;
- (disconnected) bits="1:*" ;;
- (connected) bits="0:*" ;;
- (banned) bits="?:1:*" ;;
- (unbanned) bits="?:0:*" ;;
- (disabled) bits="?:?:1:*" ;;
- (enabled) bits="?:?:0:*" ;;
- (stopped) bits="?:?:?:?:1:*" ;;
- (notstopped) bits="?:?:?:?:0:*" ;;
+ (unhealthy) bits="?|?|?|1|*" ;;
+ (healthy) bits="?|?|?|0|*" ;;
+ (disconnected) bits="1|*" ;;
+ (connected) bits="0|*" ;;
+ (banned) bits="?|1|*" ;;
+ (unbanned) bits="?|0|*" ;;
+ (disabled) bits="?|?|1|*" ;;
+ (enabled) bits="?|?|0|*" ;;
+ (stopped) bits="?|?|?|?|1|*" ;;
+ (notstopped) bits="?|?|?|?|0|*" ;;
(frozen) fpat='^[[:space:]]+frozen[[:space:]]+1$' ;;
(unfrozen) fpat='^[[:space:]]+frozen[[:space:]]+0$' ;;
(monon) mpat='^Monitoring mode:ACTIVE \(0\)$' ;;
(monoff) mpat='^Monitoring mode:DISABLED \(1\)$' ;;
- (recovered) rpat='^Recovery mode:NORMAL \(0\)$' ;;
+ (recovered) rpat='^Recovery mode:RECOVERY \(1\)$' ;;
*)
echo "node_has_status: unknown status \"$status\""
return 1
if [ -n "$bits" ] ; then
local out x line
- out=$($CTDB -Y status 2>&1) || return 1
+ out=$($CTDB -X status 2>&1) || return 1
{
read x
while read line ; do
# This needs to be done in 2 steps to avoid false matches.
- local line_bits="${line#:${pnn}:*:}"
+ local line_bits="${line#|${pnn}|*|}"
[ "$line_bits" = "$line" ] && continue
[ "${line_bits#${bits}}" != "$line_bits" ] && return 0
done
elif [ -n "$mpat" ] ; then
$CTDB getmonmode -n "$pnn" | egrep -q "$mpat"
elif [ -n "$rpat" ] ; then
- $CTDB status -n "$pnn" | egrep -q "$rpat"
+ ! $CTDB status -n "$pnn" | egrep -q "$rpat"
else
echo 'node_has_status: unknown mode, neither $bits nor $fpat is set'
return 1
}
# Useful for superficially testing IP failover.
-# IPs must be on nodes matching nodeglob.
-# If the first argument is '!' then the IPs must not be on nodes
-# matching nodeglob.
-ips_are_on_nodeglob ()
+# IPs must be on the given node.
+# If the first argument is '!' then the IPs must not be on the given node.
+ips_are_on_node ()
{
local negating=false
if [ "$1" = "!" ] ; then
negating=true ; shift
fi
- local nodeglob="$1" ; shift
+ local node="$1" ; shift
local ips="$*"
local out
- all_ips_on_node 1
+ all_ips_on_node $node
+ local check
for check in $ips ; do
+ local ip pnn
while read ip pnn ; do
if [ "$check" = "$ip" ] ; then
- case "$pnn" in
- ($nodeglob) if $negating ; then return 1 ; fi ;;
- (*) if ! $negating ; then return 1 ; fi ;;
- esac
+ if [ "$pnn" = "$node" ] ; then
+ if $negating ; then return 1 ; fi
+ else
+ if ! $negating ; then return 1 ; fi
+ fi
ips="${ips/${ip}}" # Remove from list
break
fi
[ -z "$ips" ]
}
+# Wait, via wait_until with a timeout of 60, for ips_are_on_node to succeed.
+# Arguments: [!] node ip...
+#   All arguments are forwarded unchanged to ips_are_on_node.
+#   A leading "!" changes the progress message to "no longer be assigned".
-wait_until_ips_are_on_nodeglob ()
+wait_until_ips_are_on_node ()
{
- echo "Waiting for IPs to fail over..."
+ # Go to some trouble to print a useful description of what is happening
+ local not=""
+ # Use "=" rather than "==": it is the portable string comparison for [ ]
+ # and matches the other "!" checks in this file.
+ if [ "$1" = "!" ] ; then
+ not="no longer "
+ fi
+ local node=""
+ local ips=""
+ local i
+ for i ; do
+ # "!" is not part of the description
+ [ "$i" != "!" ] || continue
+ # The first remaining argument is the node...
+ if [ -z "$node" ] ; then
+ node="$i"
+ continue
+ fi
+ # ...and the rest are IPs, joined with ", "
+ ips="${ips}${ips:+, }${i}"
+ done
+ echo "Waiting for ${ips} to ${not}be assigned to node ${node}"
- wait_until 60 ips_are_on_nodeglob "$@"
+ wait_until 60 ips_are_on_node "$@"
}
node_has_some_ips ()
local out
- all_ips_on_node 1
+ all_ips_on_node $node
while read ip pnn ; do
if [ "$node" = "$pnn" ] ; then
+# Wait, via wait_until with a timeout of 60, for node_has_some_ips to
+# succeed.  All arguments are forwarded unchanged to node_has_some_ips.
+# NOTE(review): $1 is presumed to be the node being waited on - confirm
+# against node_has_some_ips.  The message names $1 so that it matches the
+# node actually checked, and falls back to $test_node when no argument
+# is given.
wait_until_node_has_some_ips ()
{
- echo "Waiting for node to have some IPs..."
+ echo "Waiting for some IPs to be assigned to node ${1:-${test_node}}"
wait_until 60 node_has_some_ips "$@"
}
continue
}
- wait_until_healthy || {
- echo "Cluster didn't become healthy. Restarting..."
+ wait_until_ready || {
+ echo "Cluster didn't become ready. Restarting..."
continue
}
# help the cluster to stabilise before a subsequent test.
echo "Forcing a recovery..."
onnode -q 0 $CTDB recover
- sleep_for 1
+ sleep_for 2
+
+ if ! onnode -q any $CTDB_TEST_WRAPPER _cluster_is_recovered ; then
+ echo "Cluster has gone into recovery again, waiting..."
+ wait_until 30/2 onnode -q any $CTDB_TEST_WRAPPER _cluster_is_recovered
+ fi
+
# Cluster is still healthy. Good, we're done!
if ! onnode 0 $CTDB_TEST_WRAPPER _cluster_is_healthy ; then
echo "Cluster became UNHEALTHY again [$(date)]"
- onnode -p all ctdb status -Y 2>&1
+ onnode -p all ctdb status -X 2>&1
onnode -p all ctdb scriptstatus 2>&1
echo "Restarting..."
continue
done
echo "Cluster UNHEALTHY... too many attempts..."
- onnode -p all ctdb status -Y 2>&1
+ onnode -p all ctdb status -X 2>&1
onnode -p all ctdb scriptstatus 2>&1
# Try to make the calling test fail