ctdb/tests/scripts/ctdb_test_functions.bash

   1 # Hey Emacs, this is a -*- shell-script -*- !!!  :-)
   2
   3 fail ()
   4 {
   5     echo "$*"
   6     exit 1
   7 }
   8
   9 ######################################################################
  10
  11 ctdb_test_begin ()
  12 {
  13     local name="$1"
  14
  15     teststarttime=$(date '+%s')
  16     testduration=0
  17
  18     echo "--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--"
  19     echo "Running test $name ($(date '+%T'))"
  20     echo "--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--"
  21 }
  22
  23 ctdb_test_end ()
  24 {
  25     local name="$1" ; shift
  26     local status="$1" ; shift
  27     # "$@" is command-line
  28
  29     local interp="SKIPPED"
  30     local statstr=" (reason $*)"
  31     if [ -n "$status" ] ; then
  32         if [ $status -eq 0 ] ; then
  33             interp="PASSED"
  34             statstr=""
  35             echo "ALL OK: $*"
  36         else
  37             interp="FAILED"
  38             statstr=" (status $status)"
  39             testfailures=$(($testfailures+1))
  40         fi
  41     fi
  42
  43     testduration=$(($(date +%s)-$teststarttime))
  44
  45     echo "=========================================================================="
  46     echo "TEST ${interp}: ${name}${statstr} (duration: ${testduration}s)"
  47     echo "=========================================================================="
  48
  49 }
  50
  51 test_exit ()
  52 {
  53     exit $(($testfailures+0))
  54 }
  55
  56 ctdb_check_time_logs ()
  57 {
  58     local threshold=20
  59
  60     local jump=false
  61     local prev=""
  62     local ds_prev=""
  63     local node=""
  64
  65     out=$(onnode all tail -n 20 /var/log/ctdb.test.time.log 2>&1)
  66
  67     if [ $? -eq 0 ] ; then
  68         local line
  69         while read line ; do
  70             case "$line" in
  71                 \>\>\ NODE:\ *\ \<\<)
  72                     node="${line#>> NODE: }"
  73                     node=${node% <<*}
  74                     ds_prev=""
  75                     ;;
  76                 *\ *)
  77                     set -- $line
  78                     ds_curr="$1${2:0:1}"
  79                     if [ -n "$ds_prev" ] && \
  80                         [ $(($ds_curr - $ds_prev)) -ge $threshold ] ; then
  81                         echo "Node $node had time jump of $(($ds_curr - $ds_prev))ds between $(date +'%T' -d @${ds_prev%?}) and $(date +'%T' -d @${ds_curr%?})"
  82                         jump=true
  83                     fi
  84                     prev="$line"
  85                     ds_prev="$ds_curr"
  86                     ;;
  87             esac
  88         done <<<"$out"
  89     else
  90         echo Error getting time logs
  91     fi
  92     if $jump ; then
  93         echo "Check time sync (test client first):"
  94         date
  95         onnode -p all date
  96         echo "Information from test client:"
  97         hostname
  98         top -b -n 1
  99         echo "Information from cluster nodes:"
 100         onnode all "top -b -n 1 ; echo '/proc/slabinfo' ; cat /proc/slabinfo"
 101     fi
 102 }
 103
 104 ctdb_test_exit ()
 105 {
 106     local status=$?
 107
 108     trap - 0
 109
 110     [ $(($testfailures+0)) -eq 0 -a $status -ne 0 ] && testfailures=$status
 111     status=$(($testfailures+0))
 112
 113     # Avoid making a test fail from this point onwards.  The test is
 114     # now complete.
 115     set +e
 116
 117     echo "*** TEST COMPLETED (RC=$status) AT $(date '+%F %T'), CLEANING UP..."
 118
 119     if [ -n "$CTDB_TEST_REAL_CLUSTER" -a $status -ne 0 ] ; then
 120         ctdb_check_time_logs
 121     fi
 122
 123     eval "$ctdb_test_exit_hook" || true
 124     unset ctdb_test_exit_hook
 125
 126     if $ctdb_test_restart_scheduled || ! cluster_is_healthy ; then
 127
 128         restart_ctdb
 129     else
 130         # This could be made unconditional but then we might get
 131         # duplication from the recovery in restart_ctdb.  We want to
 132         # leave the recovery in restart_ctdb so that future tests that
 133         # might do a manual restart mid-test will benefit.
 134         echo "Forcing a recovery..."
 135         onnode 0 $CTDB recover
 136     fi
 137
 138     exit $status
 139 }
 140
 141 ctdb_test_exit_hook_add ()
 142 {
 143     ctdb_test_exit_hook="${ctdb_test_exit_hook}${ctdb_test_exit_hook:+ ; }$*"
 144 }
 145
 146 ctdb_test_run ()
 147 {
 148     local name="$1" ; shift
 149
 150     [ -n "$1" ] || set -- "$name"
 151
 152     ctdb_test_begin "$name"
 153
 154     local status=0
 155     "$@" || status=$?
 156
 157     ctdb_test_end "$name" "$status" "$*"
 158
 159     return $status
 160 }
 161
 162 ctdb_test_usage()
 163 {
 164     local status=${1:-2}
 165
 166     cat <<EOF
 167 Usage: $0 [option]
 168
 169 Options:
 170     -h, --help          show this screen.
 171     -v, --version       show test case version.
 172     --category          show the test category (ACL, CTDB, Samba ...).
 173     -d, --description   show test case description.
 174     --summary           show short test case summary.
 175     -x                  trace test using set -x
 176 EOF
 177
 178     exit $status
 179 }
 180
 181 ctdb_test_version ()
 182 {
 183     [ -n "$CTDB_DIR" ] || fail "Can not determine version."
 184
 185     (cd "$CTDB_DIR" && git describe)
 186 }
 187
 188 ctdb_test_cmd_options()
 189 {
 190     [ -n "$1" ] || return 0
 191
 192     case "$1" in
 193         -h|--help)        ctdb_test_usage 0   ;;
 194         -v|--version)     ctdb_test_version   ;;
 195         --category)       echo "CTDB"         ;;
 196         -d|--description) test_info           ;;
 197         -x)               set -x ; return 0   ;;
 198         *)
 199             echo "Error: Unknown parameter = $1"
 200             echo
 201             ctdb_test_usage 2
 202             ;;
 203     esac
 204
 205     exit 0
 206 }
 207
 208 ctdb_test_init ()
 209 {
 210     scriptname=$(basename "$0")
 211     testfailures=0
 212     ctdb_test_restart_scheduled=false
 213
 214     ctdb_test_cmd_options $@
 215
 216     trap "ctdb_test_exit" 0
 217 }
 218
 219 ctdb_test_check_real_cluster ()
 220 {
 221     [ -n "$CTDB_TEST_REAL_CLUSTER" ] && return 0
 222
 223     echo "ERROR: This test must be run on a real/virtual cluster, not local daemons."
 224     return 1
 225 }
 226
 227 ########################################
 228
 229 # Sets: $out
 230 try_command_on_node ()
 231 {
 232     local nodespec="$1" ; shift
 233
 234     local verbose=false
 235     local onnode_opts=""
 236
 237     while [ "${nodespec#-}" != "$nodespec" ] ; do
 238         if [ "$nodespec" = "-v" ] ; then
 239             verbose=true
 240         else
 241             onnode_opts="$nodespec"
 242         fi
 243         nodespec="$1" ; shift
 244     done
 245
 246     local cmd="$*"
 247
 248     out=$(onnode -q $onnode_opts "$nodespec" "$cmd" 2>&1) || {
 249
 250         echo "Failed to execute \"$cmd\" on node(s) \"$nodespec\""
 251         echo "$out"
 252         return 1
 253     }
 254
 255     if $verbose ; then
 256         echo "Output of \"$cmd\":"
 257         echo "$out"
 258     fi
 259 }
 260
 261 sanity_check_output ()
 262 {
 263     local min_lines="$1"
 264     local regexp="$2" # Should be anchored as necessary.
 265     local output="$3"
 266
 267     local ret=0
 268
 269     local num_lines=$(echo "$output" | wc -l)
 270     echo "There are $num_lines lines of output"
 271     if [ $num_lines -lt $min_lines ] ; then
 272         echo "BAD: that's less than the required number (${min_lines})"
 273         ret=1
 274     fi
 275
 276     local status=0
 277     local unexpected # local doesn't pass through status of command on RHS.
 278     unexpected=$(echo "$output" | egrep -v "$regexp") || status=$?
 279
 280     # Note that this is reversed.
 281     if [ $status -eq 0 ] ; then
 282         echo "BAD: unexpected lines in output:"
 283         echo "$unexpected" | cat -A
 284         ret=1
 285     else
 286         echo "Output lines look OK"
 287     fi
 288
 289     return $ret
 290 }
 291
 292 sanity_check_ips ()
 293 {
 294     local ips="$1" # list of "ip node" lines
 295
 296     echo "Sanity checking IPs..."
 297
 298     local x ipp prev
 299     prev=""
 300     while read x ipp ; do
 301         [ "$ipp" = "-1" ] && break
 302         if [ -n "$prev" -a "$ipp" != "$prev" ] ; then
 303             echo "OK"
 304             return 0
 305         fi
 306         prev="$ipp"
 307     done <<<"$ips"
 308
 309     echo "BAD: a node was -1 or IPs are only assigned to one node"
 310     echo "Are you running an old version of CTDB?"
 311     return 1
 312 }
 313
 314 # This returns a list of "ip node" lines in $out
 315 all_ips_on_node()
 316 {
 317     local node=$@
 318     try_command_on_node $node "$CTDB ip -Y -n all | cut -d ':' -f1-3 | sed -e '1d' -e 's@^:@@' -e 's@:@ @g'"
 319 }
 320
 321 select_test_node_and_ips ()
 322 {
 323     all_ips_on_node 0
 324
 325     # When selecting test_node we just want a node that has public
 326     # IPs.  This will work and is economically semi-random.  :-)
 327     local x
 328     read x test_node <<<"$out"
 329
 330     test_node_ips=""
 331     local ip pnn
 332     while read ip pnn ; do
 333         if [ "$pnn" = "$test_node" ] ; then
 334             test_node_ips="${test_node_ips}${test_node_ips:+ }${ip}"
 335         fi
 336     done <<<"$out" # bashism to avoid problem setting variable in pipeline.
 337
 338     echo "Selected node ${test_node} with IPs: ${test_node_ips}."
 339     test_ip="${test_node_ips%% *}"
 340 }
 341
 342 #######################################
 343
 344 # Wait until either timeout expires or command succeeds.  The command
 345 # will be tried once per second.
 346 wait_until ()
 347 {
 348     local timeout="$1" ; shift # "$@" is the command...
 349
 350     local negate=false
 351     if [ "$1" = "!" ] ; then
 352         negate=true
 353         shift
 354     fi
 355
 356     echo -n "<${timeout}|"
 357     local t=$timeout
 358     while [ $t -gt 0 ] ; do
 359         local rc=0
 360         "$@" || rc=$?
 361         if { ! $negate && [ $rc -eq 0 ] ; } || \
 362             { $negate && [ $rc -ne 0 ] ; } ; then
 363             echo "|$(($timeout - $t))|"
 364             echo "OK"
 365             return 0
 366         fi
 367         echo -n .
 368         t=$(($t - 1))
 369         sleep 1
 370     done
 371
 372     echo "*TIMEOUT*"
 373
 374     return 1
 375 }
 376
 377 sleep_for ()
 378 {
 379     echo -n "=${1}|"
 380     for i in $(seq 1 $1) ; do
 381         echo -n '.'
 382         sleep 1
 383     done
 384     echo '|'
 385 }
 386
 387 _cluster_is_healthy ()
 388 {
 389     local out x count line
 390
 391     out=$($CTDB -Y status 2>/dev/null) || return 1
 392
 393     {
 394         read x
 395         count=0
 396         while read line ; do
 397             # We need to see valid lines if we're going to be healthy.
 398             [ "${line#:[0-9]}" != "$line" ] && count=$(($count + 1))
 399             # A line indicating a node is unhealthy causes failure.
 400             [ "${line##:*:*:*1:}" != "$line" ] && return 1
 401         done
 402         [ $count -gt 0 ] && return $?
 403     } <<<"$out" # Yay bash!
 404 }
 405
 406 cluster_is_healthy ()
 407 {
 408     if onnode 0 $CTDB_TEST_WRAPPER _cluster_is_healthy ; then
 409         echo "Cluster is HEALTHY"
 410         return 0
 411     else
 412         echo "Cluster is UNHEALTHY"
 413         if ! ${ctdb_test_restart_scheduled:-false} ; then
 414             echo "DEBUG AT $(date '+%F %T'):"
 415             local i
 416             for i in "onnode -q 0 $CTDB status" "onnode -q 0 onnode all $CTDB scriptstatus" ; do
 417                 echo "$i"
 418                 $i || true
 419             done
 420         fi
 421         return 1
 422     fi
 423 }
 424
 425 wait_until_healthy ()
 426 {
 427     local timeout="${1:-120}"
 428
 429     echo "Waiting for cluster to become healthy..."
 430
 431     wait_until 120 _cluster_is_healthy
 432 }
 433
 434 # This function is becoming nicely overloaded.  Soon it will collapse!  :-)
 435 node_has_status ()
 436 {
 437     local pnn="$1"
 438     local status="$2"
 439
 440     local bits fpat mpat
 441     case "$status" in
 442         (unhealthy)    bits="?:?:?:1:*" ;;
 443         (healthy)      bits="?:?:?:0:*" ;;
 444         (disconnected) bits="1:*" ;;
 445         (connected)    bits="0:*" ;;
 446         (banned)       bits="?:1:*" ;;
 447         (unbanned)     bits="?:0:*" ;;
 448         (disabled)     bits="?:?:1:*" ;;
 449         (enabled)      bits="?:?:0:*" ;;
 450         (stopped)      bits="?:?:?:?:1:*" ;;
 451         (notstopped)   bits="?:?:?:?:0:*" ;;
 452         (frozen)       fpat='^[[:space:]]+frozen[[:space:]]+1$' ;;
 453         (unfrozen)     fpat='^[[:space:]]+frozen[[:space:]]+0$' ;;
 454         (monon)        mpat='^Monitoring mode:ACTIVE \(0\)$' ;;
 455         (monoff)       mpat='^Monitoring mode:DISABLED \(1\)$' ;;
 456         *)
 457             echo "node_has_status: unknown status \"$status\""
 458             return 1
 459     esac
 460
 461     if [ -n "$bits" ] ; then
 462         local out x line
 463
 464         out=$($CTDB -Y status 2>&1) || return 1
 465
 466         {
 467             read x
 468             while read line ; do
 469                 # This needs to be done in 2 steps to avoid false matches.
 470                 local line_bits="${line#:${pnn}:*:}"
 471                 [ "$line_bits" = "$line" ] && continue
 472                 [ "${line_bits#${bits}}" != "$line_bits" ] && return 0
 473             done
 474             return 1
 475         } <<<"$out" # Yay bash!
 476     elif [ -n "$fpat" ] ; then
 477         $CTDB statistics -n "$pnn" | egrep -q "$fpat"
 478     elif [ -n "$mpat" ] ; then
 479         $CTDB getmonmode -n "$pnn" | egrep -q "$mpat"
 480     else
 481         echo 'node_has_status: unknown mode, neither $bits nor $fpat is set'
 482         return 1
 483     fi
 484 }
 485
 486 wait_until_node_has_status ()
 487 {
 488     local pnn="$1"
 489     local status="$2"
 490     local timeout="${3:-30}"
 491     local proxy_pnn="${4:-any}"
 492
 493     echo "Waiting until node $pnn has status \"$status\"..."
 494
 495     if ! wait_until $timeout onnode $proxy_pnn $CTDB_TEST_WRAPPER node_has_status "$pnn" "$status" ; then
 496         for i in "onnode -q any $CTDB status" "onnode -q any onnode all $CTDB scriptstatus" ; do
 497             echo "$i"
 498             $i || true
 499         done
 500
 501         return 1
 502     fi
 503
 504 }
 505
 506 # Useful for superficially testing IP failover.
 507 # IPs must be on nodes matching nodeglob.
 508 ips_are_on_nodeglob ()
 509 {
 510     local nodeglob="$1" ; shift
 511     local ips="$*"
 512
 513     local out
 514
 515     all_ips_on_node 1
 516
 517     while read ip pnn ; do
 518         for check in $ips ; do
 519             if [ "$check" = "$ip" ] ; then
 520                 case "$pnn" in
 521                     ($nodeglob) : ;;
 522                     (*) return 1  ;;
 523                 esac
 524                 ips="${ips/${ip}}" # Remove from list
 525             fi
 526         done
 527     done <<<"$out" # bashism to avoid problem setting variable in pipeline.
 528
 529     ips="${ips// }" # Remove any spaces.
 530     [ -z "$ips" ]
 531 }
 532
 533 wait_until_ips_are_on_nodeglob ()
 534 {
 535     echo "Waiting for IPs to fail over..."
 536
 537     wait_until 60 ips_are_on_nodeglob "$@"
 538 }
 539
 540 node_has_some_ips ()
 541 {
 542     local node="$1"
 543
 544     local out
 545
 546     all_ips_on_node 1
 547
 548     while read ip pnn ; do
 549         if [ "$node" = "$pnn" ] ; then
 550             return 0
 551         fi
 552     done <<<"$out" # bashism to avoid problem setting variable in pipeline.
 553
 554     return 1
 555 }
 556
 557 wait_until_node_has_some_ips ()
 558 {
 559     echo "Waiting for node to have some IPs..."
 560
 561     wait_until 60 node_has_some_ips "$@"
 562 }
 563
 564 get_src_socket ()
 565 {
 566     local proto="$1"
 567     local dst_socket="$2"
 568     local pid="$3"
 569     local prog="$4"
 570
 571     local pat="^${proto}[[:space:]]+[[:digit:]]+[[:space:]]+[[:digit:]]+[[:space:]]+[^[:space:]]+[[:space:]]+${dst_socket//./\\.}[[:space:]]+ESTABLISHED[[:space:]]+${pid}/${prog}[[:space:]]*\$"
 572     out=$(netstat -tanp |
 573         egrep "$pat" |
 574         awk '{ print $4 }')
 575
 576     [ -n "$out" ]
 577 }
 578
 579 wait_until_get_src_socket ()
 580 {
 581     local proto="$1"
 582     local dst_socket="$2"
 583     local pid="$3"
 584     local prog="$4"
 585
 586     echo "Waiting for ${prog} to establish connection to ${dst_socket}..."
 587
 588     wait_until 5 get_src_socket "$@"
 589 }
 590
 591 #######################################
 592
 593 # filename will be in $tcpdump_filename, pid in $tcpdump_pid
 594 tcpdump_start ()
 595 {
 596     tcpdump_filter="$1" # global
 597
 598     echo "Running tcpdump..."
 599     tcpdump_filename=$(mktemp)
 600     ctdb_test_exit_hook_add "rm -f $tcpdump_filename"
 601
 602     # The only way of being sure that tcpdump is listening is to send
 603     # some packets that it will see.  So we use dummy pings - the -U
 604     # option to tcpdump ensures that packets are flushed to the file
 605     # as they are captured.
 606     local dummy_addr="127.3.2.1"
 607     local dummy="icmp and dst host ${dummy_addr} and icmp[icmptype] == icmp-echo"
 608     tcpdump -n -p -s 0 -e -U -w $tcpdump_filename -i any "($tcpdump_filter) or ($dummy)" &
 609     ctdb_test_exit_hook_add "kill $! >/dev/null 2>&1"
 610
 611     echo "Waiting for tcpdump output file to be ready..."
 612     ping -q "$dummy_addr" >/dev/null 2>&1 &
 613     ctdb_test_exit_hook_add "kill $! >/dev/null 2>&1"
 614
 615     tcpdump_listen_for_dummy ()
 616     {
 617         tcpdump -n -r $tcpdump_filename -c 1 "$dummy" >/dev/null 2>&1
 618     }
 619
 620     wait_until 10 tcpdump_listen_for_dummy
 621 }
 622
 623 # By default, wait for 1 matching packet.
 624 tcpdump_wait ()
 625 {
 626     local count="${1:-1}"
 627     local filter="${2:-${tcpdump_filter}}"
 628
 629     tcpdump_check ()
 630     {
 631         local found=$(tcpdump -n -r $tcpdump_filename "$filter" 2>/dev/null | wc -l)
 632         [ $found -ge $count ]
 633     }
 634
 635     echo "Waiting for tcpdump to capture some packets..."
 636     if ! wait_until 30 tcpdump_check ; then
 637         echo "DEBUG AT $(date '+%F %T'):"
 638         local i
 639         for i in "onnode -q 0 $CTDB status" "netstat -tanp" "tcpdump -n -e -r $tcpdump_filename" ; do
 640             echo "$i"
 641             $i || true
 642         done
 643         return 1
 644     fi
 645 }
 646
 647 tcpdump_show ()
 648 {
 649     local filter="${1:-${tcpdump_filter}}"
 650
 651     tcpdump -n -r $tcpdump_filename  "$filter" 2>/dev/null
 652 }
 653
 654 tcptickle_sniff_start ()
 655 {
 656     local src="$1"
 657     local dst="$2"
 658
 659     local in="src host ${dst%:*} and tcp src port ${dst##*:} and dst host ${src%:*} and tcp dst port ${src##*:}"
 660     local out="src host ${src%:*} and tcp src port ${src##*:} and dst host ${dst%:*} and tcp dst port ${dst##*:}"
 661     local tickle_ack="${in} and (tcp[tcpflags] & tcp-ack != 0) and (tcp[14] == 4) and (tcp[15] == 210)" # win == 1234
 662     local ack_ack="${out} and (tcp[tcpflags] & tcp-ack != 0)"
 663     tcptickle_reset="${in} and tcp[tcpflags] & tcp-rst != 0"
 664     local filter="(${tickle_ack}) or (${ack_ack}) or (${tcptickle_reset})"
 665
 666     tcpdump_start "$filter"
 667 }
 668
 669 tcptickle_sniff_wait_show ()
 670 {
 671     tcpdump_wait 1 "$tcptickle_reset"
 672
 673     echo "GOOD: here are some TCP tickle packets:"
 674     tcpdump_show
 675 }
 676
 677 gratarp_sniff_start ()
 678 {
 679     tcpdump_start "arp host ${test_ip}"
 680 }
 681
 682 gratarp_sniff_wait_show ()
 683 {
 684     tcpdump_wait 2
 685
 686     echo "GOOD: this should be the some gratuitous ARPs:"
 687     tcpdump_show
 688 }
 689
 690
 691 #######################################
 692
 693 daemons_stop ()
 694 {
 695     echo "Attempting to politely shutdown daemons..."
 696     onnode 1 $CTDB shutdown -n all || true
 697
 698     echo "Sleeping for a while..."
 699     sleep_for 1
 700
 701     if pgrep -f $CTDB_DIR/bin/ctdbd >/dev/null ; then
 702         echo "Killing remaining daemons..."
 703         pkill -f $CTDB_DIR/bin/ctdbd
 704
 705         if pgrep -f $CTDB_DIR/bin/ctdbd >/dev/null ; then
 706             echo "Once more with feeling.."
 707             pkill -9 $CTDB_DIR/bin/ctdbd
 708         fi
 709     fi
 710
 711     local var_dir=$CTDB_DIR/tests/var
 712     rm -rf $var_dir/test.db
 713 }
 714
 715 daemons_setup ()
 716 {
 717     local num_nodes="${CTDB_TEST_NUM_DAEMONS:-2}" # default is 2 nodes
 718
 719     local var_dir=$CTDB_DIR/tests/var
 720
 721     mkdir -p $var_dir/test.db/persistent
 722
 723     local nodes=$var_dir/nodes.txt
 724     local public_addresses=$var_dir/public_addresses.txt
 725     local no_public_addresses=$var_dir/no_public_addresses.txt
 726     rm -f $nodes $public_addresses $no_public_addresses
 727
 728     # If there are (strictly) greater than 2 nodes then we'll randomly
 729     # choose a node to have no public addresses.
 730     local no_public_ips=-1
 731     [ $num_nodes -gt 2 ] && no_public_ips=$(($RANDOM % $num_nodes))
 732     echo "$no_public_ips" >$no_public_addresses
 733
 734     local i
 735     for i in $(seq 1 $num_nodes) ; do
 736         if [ "${CTDB_USE_IPV6}x" != "x" ]; then
 737             echo ::$i >> $nodes
 738             ip addr add ::$i/128 dev lo
 739         else
 740             echo 127.0.0.$i >> $nodes
 741             # 2 public addresses on most nodes, just to make things interesting.
 742             if [ $(($i - 1)) -ne $no_public_ips ] ; then
 743                 echo "192.0.2.$i/24 lo" >> $public_addresses
 744                 echo "192.0.2.$(($i + $num_nodes))/24 lo" >> $public_addresses
 745             fi
 746         fi
 747     done
 748 }
 749
 750 daemons_start_1 ()
 751 {
 752     local pnn="$1"
 753     shift # "$@" gets passed to ctdbd
 754
 755     local var_dir=$CTDB_DIR/tests/var
 756
 757     local nodes=$var_dir/nodes.txt
 758     local public_addresses=$var_dir/public_addresses.txt
 759     local no_public_addresses=$var_dir/no_public_addresses.txt
 760
 761     local no_public_ips=-1
 762     [ -r $no_public_addresses ] && read no_public_ips <$no_public_addresses
 763
 764     if  [ "$no_public_ips" = $pnn ] ; then
 765         echo "Node $no_public_ips will have no public IPs."
 766     fi
 767
 768     local ctdb_options="--reclock=$var_dir/rec.lock --nlist $nodes --nopublicipcheck --event-script-dir=$CTDB_DIR/tests/events.d --logfile=$var_dir/daemons.log -d 0 --dbdir=$var_dir/test.db --dbdir-persistent=$var_dir/test.db/persistent --dbdir-state=$var_dir/test.db/state"
 769
 770     if [ $(id -u) -eq 0 ]; then
 771         ctdb_options="$ctdb_options --public-interface=lo"
 772     fi
 773
 774     if [ $pnn -eq $no_public_ips ] ; then
 775         ctdb_options="$ctdb_options --public-addresses=/dev/null"
 776     else
 777         ctdb_options="$ctdb_options --public-addresses=$public_addresses"
 778     fi
 779
 780     # Need full path so we can use "pkill -f" to kill the daemons.
 781     $VALGRIND $CTDB_DIR/bin/ctdbd --socket=$var_dir/sock.$pnn $ctdb_options "$@" ||return 1
 782 }
 783
 784 daemons_start ()
 785 {
 786     # "$@" gets passed to ctdbd
 787
 788     local num_nodes="${CTDB_TEST_NUM_DAEMONS:-2}" # default is 2 nodes
 789
 790     echo "Starting $num_nodes ctdb daemons..."
 791
 792     for i in $(seq 0 $(($num_nodes - 1))) ; do
 793         daemons_start_1 $i "$@"
 794     done
 795
 796     local var_dir=$CTDB_DIR/tests/var
 797
 798     if [ -L /tmp/ctdb.socket -o ! -S /tmp/ctdb.socket ] ; then
 799         ln -sf $var_dir/sock.0 /tmp/ctdb.socket || return 1
 800     fi
 801 }
 802
 803 #######################################
 804
 805 _ctdb_hack_options ()
 806 {
 807     local ctdb_options="$*"
 808
 809     # We really just want to pass CTDB_OPTIONS but on RH
 810     # /etc/sysconfig/ctdb can, and frequently does, set that variable.
 811     # So instead, we hack badly.  We'll add these as we use them.
 812     # Note that these may still be overridden by the above file... but
 813     # we tend to use the exotic options here... so that is unlikely.
 814
 815     case "$ctdb_options" in
 816         *--start-as-stopped*)
 817             export CTDB_START_AS_STOPPED="yes"
 818     esac
 819 }
 820
 821 _restart_ctdb ()
 822 {
 823     _ctdb_hack_options "$@"
 824
 825     if [ -e /etc/redhat-release ] ; then
 826         service ctdb restart
 827     else
 828         /etc/init.d/ctdb restart
 829     fi
 830 }
 831
 832 _ctdb_start ()
 833 {
 834     _ctdb_hack_options "$@"
 835
 836     /etc/init.d/ctdb start
 837 }
 838
 839 setup_ctdb ()
 840 {
 841     if [ -n "$CTDB_NODES_SOCKETS" ] ; then
 842         daemons_setup
 843     fi
 844 }
 845
 846 # Common things to do after starting one or more nodes.
 847 _ctdb_start_post ()
 848 {
 849     onnode -q 1  $CTDB_TEST_WRAPPER wait_until_healthy || return 1
 850
 851     echo "Setting RerecoveryTimeout to 1"
 852     onnode -pq all "$CTDB setvar RerecoveryTimeout 1"
 853
 854     # In recent versions of CTDB, forcing a recovery like this blocks
 855     # until the recovery is complete.  Hopefully this will help the
 856     # cluster to stabilise before a subsequent test.
 857     echo "Forcing a recovery..."
 858     onnode -q 0 $CTDB recover
 859     sleep_for 1
 860     echo "Forcing a recovery..."
 861     onnode -q 0 $CTDB recover
 862
 863     echo "ctdb is ready"
 864 }
 865
 866 # This assumes that ctdbd is not running on the given node.
 867 ctdb_start_1 ()
 868 {
 869     local pnn="$1"
 870     shift # "$@" is passed to ctdbd start.
 871
 872     echo -n "Starting CTDB on node ${pnn}..."
 873
 874     if [ -n "$CTDB_NODES_SOCKETS" ] ; then
 875         daemons_start_1 $pnn "$@"
 876     else
 877         onnode $pnn $CTDB_TEST_WRAPPER _ctdb_start "$@"
 878     fi
 879
 880     # If we're starting only 1 node then we're doing something weird.
 881     ctdb_restart_when_done
 882 }
 883
 884 restart_ctdb ()
 885 {
 886     # "$@" is passed to ctdbd start.
 887
 888     echo -n "Restarting CTDB"
 889     if $ctdb_test_restart_scheduled ; then
 890         echo -n " (scheduled)"
 891     fi
 892     echo "..."
 893
 894     local i
 895     for i in $(seq 1 5) ; do
 896         if [ -n "$CTDB_NODES_SOCKETS" ] ; then
 897             daemons_stop
 898             daemons_start "$@"
 899         else
 900             onnode -p all $CTDB_TEST_WRAPPER _restart_ctdb "$@"
 901         fi || {
 902             echo "Restart failed.  Trying again in a few seconds..."
 903             sleep_for 5
 904             continue
 905         }
 906
 907         onnode -q 1  $CTDB_TEST_WRAPPER wait_until_healthy || {
 908             echo "Cluster didn't become healthy.  Restarting..."
 909             continue
 910         }
 911
 912         local debug_out=$(onnode -p all ctdb status -Y 2>&1; onnode -p all ctdb scriptstatus 2>&1)
 913
 914         echo "Setting RerecoveryTimeout to 1"
 915         onnode -pq all "$CTDB setvar RerecoveryTimeout 1"
 916
 917         # In recent versions of CTDB, forcing a recovery like this
 918         # blocks until the recovery is complete.  Hopefully this will
 919         # help the cluster to stabilise before a subsequent test.
 920         echo "Forcing a recovery..."
 921         onnode -q 0 $CTDB recover
 922         sleep_for 1
 923         echo "Forcing a recovery..."
 924         onnode -q 0 $CTDB recover
 925
 926         # Cluster is still healthy.  Good, we're done!
 927         if ! onnode 0 $CTDB_TEST_WRAPPER _cluster_is_healthy ; then
 928             echo "Cluster become UNHEALTHY again.  Restarting..."
 929             continue
 930         fi
 931
 932         echo "ctdb is ready"
 933         return 0
 934     done
 935
 936     echo "Cluster UNHEALTHY...  too many attempts..."
 937     echo "$debug_out"
 938     # Try to make the calling test fail
 939     status=1
 940     return 1
 941 }
 942
 943 ctdb_restart_when_done ()
 944 {
 945     ctdb_test_restart_scheduled=true
 946 }
 947
 948 #######################################
 949
 950 install_eventscript ()
 951 {
 952     local script_name="$1"
 953     local script_contents="$2"
 954
 955     if [ -n "$CTDB_TEST_REAL_CLUSTER" ] ; then
 956         # The quoting here is *very* fragile.  However, we do
 957         # experience the joy of installing a short script using
 958         # onnode, and without needing to know the IP addresses of the
 959         # nodes.
 960         onnode all "f=\"\${CTDB_BASE:-/etc/ctdb}/events.d/${script_name}\" ; echo \"Installing \$f\" ; echo '${script_contents}' > \"\$f\" ; chmod 755 \"\$f\""
 961     else
 962         f="${CTDB_DIR}/tests/events.d/${script_name}"
 963         echo "$script_contents" >"$f"
 964         chmod 755 "$f"
 965     fi
 966 }
 967
 968 uninstall_eventscript ()
 969 {
 970     local script_name="$1"
 971
 972     if [ -n "$CTDB_TEST_REAL_CLUSTER" ] ; then
 973         onnode all "rm -vf \"\${CTDB_BASE:-/etc/ctdb}/events.d/${script_name}\""
 974     else
 975         rm -vf "${CTDB_DIR}/tests/events.d/${script_name}"
 976     fi
 977 }
 978
 979 #######################################
 980
 981 # This section deals with the 99.ctdb_test eventscript.
 982
 983 # Metafunctions: Handle a ctdb-test file on a node.
 984 # given event.
 985 ctdb_test_eventscript_file_create ()
 986 {
 987     local pnn="$1"
 988     local type="$2"
 989
 990     try_command_on_node $pnn touch "/tmp/ctdb-test-${type}.${pnn}"
 991 }
 992
 993 ctdb_test_eventscript_file_remove ()
 994 {
 995     local pnn="$1"
 996     local type="$2"
 997
 998     try_command_on_node $pnn rm -f "/tmp/ctdb-test-${type}.${pnn}"
 999 }
1000
1001 ctdb_test_eventscript_file_exists ()
1002 {
1003     local pnn="$1"
1004     local type="$2"
1005
1006     try_command_on_node $pnn test -f "/tmp/ctdb-test-${type}.${pnn}" >/dev/null 2>&1
1007 }
1008
1009
1010 # Handle a flag file on a node that is removed by 99.ctdb_test on the
1011 # given event.
1012 ctdb_test_eventscript_flag ()
1013 {
1014     local cmd="$1"
1015     local pnn="$2"
1016     local event="$3"
1017
1018     ctdb_test_eventscript_file_${cmd} "$pnn" "flag-${event}"
1019 }
1020
1021
1022 # Handle a trigger that causes 99.ctdb_test to fail it's monitor
1023 # event.
1024 ctdb_test_eventscript_unhealthy_trigger ()
1025 {
1026     local cmd="$1"
1027     local pnn="$2"
1028
1029     ctdb_test_eventscript_file_${cmd} "$pnn" "unhealthy-trigger"
1030 }
1031
1032 # Handle the file that 99.ctdb_test created to show that it has marked
1033 # a node unhealthy because it detected the above trigger.
1034 ctdb_test_eventscript_unhealthy_detected ()
1035 {
1036     local cmd="$1"
1037     local pnn="$2"
1038
1039     ctdb_test_eventscript_file_${cmd} "$pnn" "unhealthy-detected"
1040 }
1041
1042 # Handle a trigger that causes 99.ctdb_test to timeout it's monitor
1043 # event.  This should cause the node to be banned.
1044 ctdb_test_eventscript_timeout_trigger ()
1045 {
1046     local cmd="$1"
1047     local pnn="$2"
1048     local event="$3"
1049
1050     ctdb_test_eventscript_file_${cmd} "$pnn" "${event}-timeout"
1051 }
1052
1053 # Note that the eventscript can't use the above functions!
1054 ctdb_test_eventscript_install ()
1055 {
1056
1057     local script='#!/bin/sh
1058 out=$(ctdb pnn)
1059 pnn="${out#PNN:}"
1060
1061 rm -vf "/tmp/ctdb-test-flag-${1}.${pnn}"
1062
1063 trigger="/tmp/ctdb-test-unhealthy-trigger.${pnn}"
1064 detected="/tmp/ctdb-test-unhealthy-detected.${pnn}"
1065 timeout_trigger="/tmp/ctdb-test-${1}-timeout.${pnn}"
1066 case "$1" in
1067     monitor)
1068         if [ -e "$trigger" ] ; then
1069             echo "${0}: Unhealthy because \"$trigger\" detected"
1070             touch "$detected"
1071             exit 1
1072         elif [ -e "$detected" -a ! -e "$trigger" ] ; then
1073             echo "${0}: Healthy again, \"$trigger\" no longer detected"
1074             rm "$detected"
1075         fi
1076
1077         ;;
1078     *)
1079         if [ -e "$timeout_trigger" ] ; then
1080             echo "${0}: Sleeping for a long time because \"$timeout_trigger\" detected"
1081             sleep 9999
1082         fi
1083         ;;
1084         *)
1085
1086 esac
1087
1088 exit 0
1089 '
1090     install_eventscript "99.ctdb_test" "$script"
1091 }
1092
1093 ctdb_test_eventscript_uninstall ()
1094 {
1095     uninstall_eventscript "99.ctdb_test"
1096 }
1097
1098 # Note that this only works if you know all other monitor events will
1099 # succeed.  You also need to install the eventscript before using it.
1100 wait_for_monitor_event ()
1101 {
1102     local pnn="$1"
1103
1104     echo "Waiting for a monitor event on node ${pnn}..."
1105     ctdb_test_eventscript_flag create $pnn "monitor"
1106
1107     wait_until 120 ! ctdb_test_eventscript_flag exists $pnn "monitor"
1108
1109 }
1110
1111 # Make sure that $CTDB is set.
1112 : ${CTDB:=ctdb}