ctdb/tests/scripts/ctdb_test_functions.bash

   1 # Hey Emacs, this is a -*- shell-script -*- !!!  :-)
   2
   3 fail ()
   4 {
   5     echo "$*"
   6     exit 1
   7 }
   8
   9 ######################################################################
  10
  11 ctdb_test_begin ()
  12 {
  13     local name="$1"
  14
  15     teststarttime=$(date '+%s')
  16     testduration=0
  17
  18     echo "--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--"
  19     echo "Running test $name ($(date '+%T'))"
  20     echo "--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--==--"
  21 }
  22
  23 ctdb_test_end ()
  24 {
  25     local name="$1" ; shift
  26     local status="$1" ; shift
  27     # "$@" is command-line
  28
  29     local interp="SKIPPED"
  30     local statstr=" (reason $*)"
  31     if [ -n "$status" ] ; then
  32         if [ $status -eq 0 ] ; then
  33             interp="PASSED"
  34             statstr=""
  35             echo "ALL OK: $*"
  36         else
  37             interp="FAILED"
  38             statstr=" (status $status)"
  39             testfailures=$(($testfailures+1))
  40         fi
  41     fi
  42
  43     testduration=$(($(date +%s)-$teststarttime))
  44
  45     echo "=========================================================================="
  46     echo "TEST ${interp}: ${name}${statstr} (duration: ${testduration}s)"
  47     echo "=========================================================================="
  48
  49 }
  50
  51 test_exit ()
  52 {
  53     exit $(($testfailures+0))
  54 }
  55
  56 ctdb_check_time_logs ()
  57 {
  58     local threshold=20
  59
  60     local jump=false
  61     local prev=""
  62     local ds_prev=""
  63     local node=""
  64
  65     out=$(onnode all tail -n 20 /var/log/ctdb.test.time.log 2>&1)
  66
  67     if [ $? -eq 0 ] ; then
  68         local line
  69         while read line ; do
  70             case "$line" in
  71                 \>\>\ NODE:\ *\ \<\<)
  72                     node="${line#>> NODE: }"
  73                     node=${node% <<*}
  74                     ds_prev=""
  75                     ;;
  76                 *\ *)
  77                     set -- $line
  78                     ds_curr="$1${2:0:1}"
  79                     if [ -n "$ds_prev" ] && \
  80                         [ $(($ds_curr - $ds_prev)) -ge $threshold ] ; then
  81                         echo "Node $node had time jump of $(($ds_curr - $ds_prev))ds between $(date +'%T' -d @${ds_prev%?}) and $(date +'%T' -d @${ds_curr%?})"
  82                         jump=true
  83                     fi
  84                     prev="$line"
  85                     ds_prev="$ds_curr"
  86                     ;;
  87             esac
  88         done <<<"$out"
  89     else
  90         echo Error getting time logs
  91     fi
  92     if $jump ; then
  93         echo "Check time sync (test client first):"
  94         date
  95         onnode -p all date
  96         echo "Information from test client:"
  97         hostname
  98         top -b -n 1
  99         echo "Information from cluster nodes:"
 100         onnode all "top -b -n 1 ; echo '/proc/slabinfo' ; cat /proc/slabinfo"
 101     fi
 102 }
 103
 104 ctdb_test_exit ()
 105 {
 106     local status=$?
 107
 108     trap - 0
 109
 110     [ $(($testfailures+0)) -eq 0 -a $status -ne 0 ] && testfailures=$status
 111     status=$(($testfailures+0))
 112
 113     # Avoid making a test fail from this point onwards.  The test is
 114     # now complete.
 115     set +e
 116
 117     echo "*** TEST COMPLETED (RC=$status) AT $(date '+%F %T'), CLEANING UP..."
 118
 119     if [ -n "$CTDB_TEST_REAL_CLUSTER" -a $status -ne 0 ] ; then
 120         ctdb_check_time_logs
 121     fi
 122
 123     eval "$ctdb_test_exit_hook" || true
 124     unset ctdb_test_exit_hook
 125
 126     if $ctdb_test_restart_scheduled || ! cluster_is_healthy ; then
 127
 128         restart_ctdb
 129     else
 130         # This could be made unconditional but then we might get
 131         # duplication from the recovery in restart_ctdb.  We want to
 132         # leave the recovery in restart_ctdb so that future tests that
 133         # might do a manual restart mid-test will benefit.
 134         echo "Forcing a recovery..."
 135         onnode 0 $CTDB recover
 136     fi
 137
 138     exit $status
 139 }
 140
 141 ctdb_test_exit_hook_add ()
 142 {
 143     ctdb_test_exit_hook="${ctdb_test_exit_hook}${ctdb_test_exit_hook:+ ; }$*"
 144 }
 145
 146 ctdb_test_run ()
 147 {
 148     local name="$1" ; shift
 149
 150     [ -n "$1" ] || set -- "$name"
 151
 152     ctdb_test_begin "$name"
 153
 154     local status=0
 155     "$@" || status=$?
 156
 157     ctdb_test_end "$name" "$status" "$*"
 158
 159     return $status
 160 }
 161
 162 ctdb_test_usage()
 163 {
 164     local status=${1:-2}
 165
 166     cat <<EOF
 167 Usage: $0 [option]
 168
 169 Options:
 170     -h, --help          show this screen.
 171     -v, --version       show test case version.
 172     --category          show the test category (ACL, CTDB, Samba ...).
 173     -d, --description   show test case description.
 174     --summary           show short test case summary.
 175     -x                  trace test using set -x
 176 EOF
 177
 178     exit $status
 179 }
 180
 181 ctdb_test_version ()
 182 {
 183     [ -n "$CTDB_DIR" ] || fail "Can not determine version."
 184
 185     (cd "$CTDB_DIR" && git describe)
 186 }
 187
 188 ctdb_test_cmd_options()
 189 {
 190     [ -n "$1" ] || return 0
 191
 192     case "$1" in
 193         -h|--help)        ctdb_test_usage 0   ;;
 194         -v|--version)     ctdb_test_version   ;;
 195         --category)       echo "CTDB"         ;;
 196         -d|--description) test_info           ;;
 197         -x)               set -x ; return 0   ;;
 198         *)
 199             echo "Error: Unknown parameter = $1"
 200             echo
 201             ctdb_test_usage 2
 202             ;;
 203     esac
 204
 205     exit 0
 206 }
 207
 208 ctdb_test_init ()
 209 {
 210     scriptname=$(basename "$0")
 211     testfailures=0
 212     ctdb_test_restart_scheduled=false
 213
 214     ctdb_test_cmd_options $@
 215
 216     trap "ctdb_test_exit" 0
 217 }
 218
 219 ctdb_test_check_real_cluster ()
 220 {
 221     [ -n "$CTDB_TEST_REAL_CLUSTER" ] && return 0
 222
 223     echo "ERROR: This test must be run on a real/virtual cluster, not local daemons."
 224     return 1
 225 }
 226
 227 ########################################
 228
 229 # Sets: $out
 230 try_command_on_node ()
 231 {
 232     local nodespec="$1" ; shift
 233
 234     local verbose=false
 235     local onnode_opts=""
 236
 237     while [ "${nodespec#-}" != "$nodespec" ] ; do
 238         if [ "$nodespec" = "-v" ] ; then
 239             verbose=true
 240         else
 241             onnode_opts="$nodespec"
 242         fi
 243         nodespec="$1" ; shift
 244     done
 245
 246     local cmd="$*"
 247
 248     out=$(onnode -q $onnode_opts "$nodespec" "$cmd" 2>&1) || {
 249
 250         echo "Failed to execute \"$cmd\" on node(s) \"$nodespec\""
 251         echo "$out"
 252         return 1
 253     }
 254
 255     if $verbose ; then
 256         echo "Output of \"$cmd\":"
 257         echo "$out"
 258     fi
 259 }
 260
 261 sanity_check_output ()
 262 {
 263     local min_lines="$1"
 264     local regexp="$2" # Should be anchored as necessary.
 265     local output="$3"
 266
 267     local ret=0
 268
 269     local num_lines=$(echo "$output" | wc -l)
 270     echo "There are $num_lines lines of output"
 271     if [ $num_lines -lt $min_lines ] ; then
 272         echo "BAD: that's less than the required number (${min_lines})"
 273         ret=1
 274     fi
 275
 276     local status=0
 277     local unexpected # local doesn't pass through status of command on RHS.
 278     unexpected=$(echo "$output" | egrep -v "$regexp") || status=$?
 279
 280     # Note that this is reversed.
 281     if [ $status -eq 0 ] ; then
 282         echo "BAD: unexpected lines in output:"
 283         echo "$unexpected" | cat -A
 284         ret=1
 285     else
 286         echo "Output lines look OK"
 287     fi
 288
 289     return $ret
 290 }
 291
 292 sanity_check_ips ()
 293 {
 294     local ips="$1" # list of "ip node" lines
 295
 296     echo "Sanity checking IPs..."
 297
 298     local x ipp prev
 299     prev=""
 300     while read x ipp ; do
 301         [ "$ipp" = "-1" ] && break
 302         if [ -n "$prev" -a "$ipp" != "$prev" ] ; then
 303             echo "OK"
 304             return 0
 305         fi
 306         prev="$ipp"
 307     done <<<"$ips"
 308
 309     echo "BAD: a node was -1 or IPs are only assigned to one node"
 310     echo "Are you running an old version of CTDB?"
 311     return 1
 312 }
 313
 314 # This returns a list of "ip node" lines in $out
 315 all_ips_on_node()
 316 {
 317     local node=$@
 318     try_command_on_node $node "$CTDB ip -Y -n all | cut -d ':' -f1-3 | sed -e '1d' -e 's@^:@@' -e 's@:@ @g'"
 319 }
 320
 321 select_test_node_and_ips ()
 322 {
 323     all_ips_on_node 0
 324
 325     # When selecting test_node we just want a node that has public
 326     # IPs.  This will work and is economically semi-random.  :-)
 327     local x
 328     read x test_node <<<"$out"
 329
 330     test_node_ips=""
 331     local ip pnn
 332     while read ip pnn ; do
 333         if [ "$pnn" = "$test_node" ] ; then
 334             test_node_ips="${test_node_ips}${test_node_ips:+ }${ip}"
 335         fi
 336     done <<<"$out" # bashism to avoid problem setting variable in pipeline.
 337
 338     echo "Selected node ${test_node} with IPs: ${test_node_ips}."
 339     test_ip="${test_node_ips%% *}"
 340 }
 341
 342 #######################################
 343
 344 # Wait until either timeout expires or command succeeds.  The command
 345 # will be tried once per second.
 346 wait_until ()
 347 {
 348     local timeout="$1" ; shift # "$@" is the command...
 349
 350     local negate=false
 351     if [ "$1" = "!" ] ; then
 352         negate=true
 353         shift
 354     fi
 355
 356     echo -n "<${timeout}|"
 357     local t=$timeout
 358     while [ $t -gt 0 ] ; do
 359         local rc=0
 360         "$@" || rc=$?
 361         if { ! $negate && [ $rc -eq 0 ] ; } || \
 362             { $negate && [ $rc -ne 0 ] ; } ; then
 363             echo "|$(($timeout - $t))|"
 364             echo "OK"
 365             return 0
 366         fi
 367         echo -n .
 368         t=$(($t - 1))
 369         sleep 1
 370     done
 371
 372     echo "*TIMEOUT*"
 373
 374     return 1
 375 }
 376
 377 sleep_for ()
 378 {
 379     echo -n "=${1}|"
 380     for i in $(seq 1 $1) ; do
 381         echo -n '.'
 382         sleep 1
 383     done
 384     echo '|'
 385 }
 386
 387 _cluster_is_healthy ()
 388 {
 389     local out x count line
 390
 391     out=$($CTDB -Y status 2>/dev/null) || return 1
 392
 393     {
 394         read x
 395         count=0
 396         while read line ; do
 397             # We need to see valid lines if we're going to be healthy.
 398             [ "${line#:[0-9]}" != "$line" ] && count=$(($count + 1))
 399             # A line indicating a node is unhealthy causes failure.
 400             [ "${line##:*:*:*1:}" != "$line" ] && return 1
 401         done
 402         [ $count -gt 0 ] && return $?
 403     } <<<"$out" # Yay bash!
 404 }
 405
 406 cluster_is_healthy ()
 407 {
 408     if onnode 0 $CTDB_TEST_WRAPPER _cluster_is_healthy ; then
 409         echo "Cluster is HEALTHY"
 410         return 0
 411     else
 412         echo "Cluster is UNHEALTHY"
 413         if ! ${ctdb_test_restart_scheduled:-false} ; then
 414             echo "DEBUG AT $(date '+%F %T'):"
 415             local i
 416             for i in "onnode -q 0 $CTDB status" "onnode -q 0 onnode all $CTDB scriptstatus" ; do
 417                 echo "$i"
 418                 $i || true
 419             done
 420         fi
 421         return 1
 422     fi
 423 }
 424
 425 wait_until_healthy ()
 426 {
 427     local timeout="${1:-120}"
 428
 429     echo "Waiting for cluster to become healthy..."
 430
 431     wait_until 120 _cluster_is_healthy
 432 }
 433
 434 # This function is becoming nicely overloaded.  Soon it will collapse!  :-)
 435 node_has_status ()
 436 {
 437     local pnn="$1"
 438     local status="$2"
 439
 440     local bits fpat mpat
 441     case "$status" in
 442         (unhealthy)    bits="?:?:?:1:*" ;;
 443         (healthy)      bits="?:?:?:0:*" ;;
 444         (disconnected) bits="1:*" ;;
 445         (connected)    bits="0:*" ;;
 446         (banned)       bits="?:1:*" ;;
 447         (unbanned)     bits="?:0:*" ;;
 448         (disabled)     bits="?:?:1:*" ;;
 449         (enabled)      bits="?:?:0:*" ;;
 450         (stopped)      bits="?:?:?:?:1:*" ;;
 451         (notstopped)   bits="?:?:?:?:0:*" ;;
 452         (frozen)       fpat='^[[:space:]]+frozen[[:space:]]+1$' ;;
 453         (unfrozen)     fpat='^[[:space:]]+frozen[[:space:]]+0$' ;;
 454         (monon)        mpat='^Monitoring mode:ACTIVE \(0\)$' ;;
 455         (monoff)       mpat='^Monitoring mode:DISABLED \(1\)$' ;;
 456         *)
 457             echo "node_has_status: unknown status \"$status\""
 458             return 1
 459     esac
 460
 461     if [ -n "$bits" ] ; then
 462         local out x line
 463
 464         out=$($CTDB -Y status 2>&1) || return 1
 465
 466         {
 467             read x
 468             while read line ; do
 469                 # This needs to be done in 2 steps to avoid false matches.
 470                 local line_bits="${line#:${pnn}:*:}"
 471                 [ "$line_bits" = "$line" ] && continue
 472                 [ "${line_bits#${bits}}" != "$line_bits" ] && return 0
 473             done
 474             return 1
 475         } <<<"$out" # Yay bash!
 476     elif [ -n "$fpat" ] ; then
 477         $CTDB statistics -n "$pnn" | egrep -q "$fpat"
 478     elif [ -n "$mpat" ] ; then
 479         $CTDB getmonmode -n "$pnn" | egrep -q "$mpat"
 480     else
 481         echo 'node_has_status: unknown mode, neither $bits nor $fpat is set'
 482         return 1
 483     fi
 484 }
 485
 486 wait_until_node_has_status ()
 487 {
 488     local pnn="$1"
 489     local status="$2"
 490     local timeout="${3:-30}"
 491     local proxy_pnn="${4:-any}"
 492
 493     echo "Waiting until node $pnn has status \"$status\"..."
 494
 495     if ! wait_until $timeout onnode $proxy_pnn $CTDB_TEST_WRAPPER node_has_status "$pnn" "$status" ; then
 496         for i in "onnode -q any $CTDB status" "onnode -q any onnode all $CTDB scriptstatus" ; do
 497             echo "$i"
 498             $i || true
 499         done
 500
 501         return 1
 502     fi
 503
 504 }
 505
 506 # Useful for superficially testing IP failover.
 507 # IPs must be on nodes matching nodeglob.
 508 ips_are_on_nodeglob ()
 509 {
 510     local nodeglob="$1" ; shift
 511     local ips="$*"
 512
 513     local out
 514
 515     all_ips_on_node 1
 516
 517     while read ip pnn ; do
 518         for check in $ips ; do
 519             if [ "$check" = "$ip" ] ; then
 520                 case "$pnn" in
 521                     ($nodeglob) : ;;
 522                     (*) return 1  ;;
 523                 esac
 524                 ips="${ips/${ip}}" # Remove from list
 525             fi
 526         done
 527     done <<<"$out" # bashism to avoid problem setting variable in pipeline.
 528
 529     ips="${ips// }" # Remove any spaces.
 530     [ -z "$ips" ]
 531 }
 532
 533 wait_until_ips_are_on_nodeglob ()
 534 {
 535     echo "Waiting for IPs to fail over..."
 536
 537     wait_until 60 ips_are_on_nodeglob "$@"
 538 }
 539
 540 node_has_some_ips ()
 541 {
 542     local node="$1"
 543
 544     local out
 545
 546     all_ips_on_node 1
 547
 548     while read ip pnn ; do
 549         if [ "$node" = "$pnn" ] ; then
 550             return 0
 551         fi
 552     done <<<"$out" # bashism to avoid problem setting variable in pipeline.
 553
 554     return 1
 555 }
 556
 557 wait_until_node_has_some_ips ()
 558 {
 559     echo "Waiting for node to have some IPs..."
 560
 561     wait_until 60 node_has_some_ips "$@"
 562 }
 563
 564 get_src_socket ()
 565 {
 566     local proto="$1"
 567     local dst_socket="$2"
 568     local pid="$3"
 569     local prog="$4"
 570
 571     local pat="^${proto}[[:space:]]+[[:digit:]]+[[:space:]]+[[:digit:]]+[[:space:]]+[^[:space:]]+[[:space:]]+${dst_socket//./\\.}[[:space:]]+ESTABLISHED[[:space:]]+${pid}/${prog}[[:space:]]*\$"
 572     out=$(netstat -tanp |
 573         egrep "$pat" |
 574         awk '{ print $4 }')
 575
 576     [ -n "$out" ]
 577 }
 578
 579 wait_until_get_src_socket ()
 580 {
 581     local proto="$1"
 582     local dst_socket="$2"
 583     local pid="$3"
 584     local prog="$4"
 585
 586     echo "Waiting for ${prog} to establish connection to ${dst_socket}..."
 587
 588     wait_until 5 get_src_socket "$@"
 589 }
 590
 591 #######################################
 592
 593 # filename will be in $tcpdump_filename, pid in $tcpdump_pid
 594 tcpdump_start ()
 595 {
 596     tcpdump_filter="$1" # global
 597
 598     echo "Running tcpdump..."
 599     tcpdump_filename=$(mktemp)
 600     ctdb_test_exit_hook_add "rm -f $tcpdump_filename"
 601
 602     # The only way of being sure that tcpdump is listening is to send
 603     # some packets that it will see.  So we use dummy pings - the -U
 604     # option to tcpdump ensures that packets are flushed to the file
 605     # as they are captured.
 606     local dummy_addr="127.3.2.1"
 607     local dummy="icmp and dst host ${dummy_addr} and icmp[icmptype] == icmp-echo"
 608     tcpdump -n -p -s 0 -e -U -w $tcpdump_filename -i any "($tcpdump_filter) or ($dummy)" &
 609     ctdb_test_exit_hook_add "kill $! >/dev/null 2>&1"
 610
 611     echo "Waiting for tcpdump output file to be ready..."
 612     ping -q "$dummy_addr" >/dev/null 2>&1 &
 613     ctdb_test_exit_hook_add "kill $! >/dev/null 2>&1"
 614
 615     tcpdump_listen_for_dummy ()
 616     {
 617         tcpdump -n -r $tcpdump_filename -c 1 "$dummy" >/dev/null 2>&1
 618     }
 619
 620     wait_until 10 tcpdump_listen_for_dummy
 621 }
 622
 623 # By default, wait for 1 matching packet.
 624 tcpdump_wait ()
 625 {
 626     local count="${1:-1}"
 627     local filter="${2:-${tcpdump_filter}}"
 628
 629     tcpdump_check ()
 630     {
 631         local found=$(tcpdump -n -r $tcpdump_filename "$filter" 2>/dev/null | wc -l)
 632         [ $found -ge $count ]
 633     }
 634
 635     echo "Waiting for tcpdump to capture some packets..."
 636     if ! wait_until 30 tcpdump_check ; then
 637         echo "DEBUG AT $(date '+%F %T'):"
 638         local i
 639         for i in "onnode -q 0 $CTDB status" "netstat -tanp" "tcpdump -n -e -r $tcpdump_filename" ; do
 640             echo "$i"
 641             $i || true
 642         done
 643         return 1
 644     fi
 645 }
 646
 647 tcpdump_show ()
 648 {
 649     local filter="${1:-${tcpdump_filter}}"
 650
 651     tcpdump -n -r $tcpdump_filename  "$filter" 2>/dev/null
 652 }
 653
 654 tcptickle_sniff_start ()
 655 {
 656     local src="$1"
 657     local dst="$2"
 658
 659     local in="src host ${dst%:*} and tcp src port ${dst##*:} and dst host ${src%:*} and tcp dst port ${src##*:}"
 660     local out="src host ${src%:*} and tcp src port ${src##*:} and dst host ${dst%:*} and tcp dst port ${dst##*:}"
 661     local tickle_ack="${in} and (tcp[tcpflags] & tcp-ack != 0) and (tcp[14] == 4) and (tcp[15] == 210)" # win == 1234
 662     local ack_ack="${out} and (tcp[tcpflags] & tcp-ack != 0)"
 663     tcptickle_reset="${in} and tcp[tcpflags] & tcp-rst != 0"
 664     local filter="(${tickle_ack}) or (${ack_ack}) or (${tcptickle_reset})"
 665
 666     tcpdump_start "$filter"
 667 }
 668
 669 tcptickle_sniff_wait_show ()
 670 {
 671     tcpdump_wait 1 "$tcptickle_reset"
 672
 673     echo "GOOD: here are some TCP tickle packets:"
 674     tcpdump_show
 675 }
 676
 677 gratarp_sniff_start ()
 678 {
 679     tcpdump_start "arp host ${test_ip}"
 680 }
 681
 682 gratarp_sniff_wait_show ()
 683 {
 684     tcpdump_wait 2
 685
 686     echo "GOOD: this should be the some gratuitous ARPs:"
 687     tcpdump_show
 688 }
 689
 690
 691 #######################################
 692
 693 daemons_stop ()
 694 {
 695     echo "Attempting to politely shutdown daemons..."
 696     onnode 1 $CTDB shutdown -n all || true
 697
 698     echo "Sleeping for a while..."
 699     sleep_for 1
 700
 701     if pgrep -f $CTDB_DIR/bin/ctdbd >/dev/null ; then
 702         echo "Killing remaining daemons..."
 703         pkill -f $CTDB_DIR/bin/ctdbd
 704
 705         if pgrep -f $CTDB_DIR/bin/ctdbd >/dev/null ; then
 706             echo "Once more with feeling.."
 707             pkill -9 $CTDB_DIR/bin/ctdbd
 708         fi
 709     fi
 710
 711     local var_dir=$CTDB_DIR/tests/var
 712     rm -rf $var_dir/test.db
 713 }
 714
 715 daemons_setup ()
 716 {
 717     local num_nodes="${CTDB_TEST_NUM_DAEMONS:-2}" # default is 2 nodes
 718
 719     local var_dir=$CTDB_DIR/tests/var
 720
 721     mkdir -p $var_dir/test.db/persistent
 722
 723     local public_addresses=$var_dir/public_addresses.txt
 724     local no_public_addresses=$var_dir/no_public_addresses.txt
 725     rm -f $CTDB_NODES $public_addresses $no_public_addresses
 726
 727     # If there are (strictly) greater than 2 nodes then we'll randomly
 728     # choose a node to have no public addresses.
 729     local no_public_ips=-1
 730     [ $num_nodes -gt 2 ] && no_public_ips=$(($RANDOM % $num_nodes))
 731     echo "$no_public_ips" >$no_public_addresses
 732
 733     local i
 734     for i in $(seq 1 $num_nodes) ; do
 735         if [ "${CTDB_USE_IPV6}x" != "x" ]; then
 736             echo ::$i >> $nodes
 737             ip addr add ::$i/128 dev lo
 738         else
 739             echo 127.0.0.$i >> $CTDB_NODES
 740             # 2 public addresses on most nodes, just to make things interesting.
 741             if [ $(($i - 1)) -ne $no_public_ips ] ; then
 742                 echo "192.0.2.$i/24 lo" >> $public_addresses
 743                 echo "192.0.2.$(($i + $num_nodes))/24 lo" >> $public_addresses
 744             fi
 745         fi
 746     done
 747 }
 748
 749 daemons_start_1 ()
 750 {
 751     local pnn="$1"
 752     shift # "$@" gets passed to ctdbd
 753
 754     local var_dir=$CTDB_DIR/tests/var
 755
 756     local public_addresses=$var_dir/public_addresses.txt
 757     local no_public_addresses=$var_dir/no_public_addresses.txt
 758
 759     local no_public_ips=-1
 760     [ -r $no_public_addresses ] && read no_public_ips <$no_public_addresses
 761
 762     if  [ "$no_public_ips" = $pnn ] ; then
 763         echo "Node $no_public_ips will have no public IPs."
 764     fi
 765
 766     local ctdb_options="--reclock=$var_dir/rec.lock --nlist $CTDB_NODES --nopublicipcheck --event-script-dir=$CTDB_DIR/tests/events.d --logfile=$var_dir/daemons.log -d 3 --dbdir=$var_dir/test.db --dbdir-persistent=$var_dir/test.db/persistent --dbdir-state=$var_dir/test.db/state"
 767
 768     if [ -z "$CTDB_TEST_REAL_CLUSTER" ]; then
 769         ctdb_options="$ctdb_options --public-interface=lo"
 770     fi
 771
 772     if [ $pnn -eq $no_public_ips ] ; then
 773         ctdb_options="$ctdb_options --public-addresses=/dev/null"
 774     else
 775         ctdb_options="$ctdb_options --public-addresses=$public_addresses"
 776     fi
 777
 778     # Need full path so we can use "pkill -f" to kill the daemons.
 779     $VALGRIND $CTDB_DIR/bin/ctdbd --socket=$var_dir/sock.$pnn $ctdb_options "$@" ||return 1
 780 }
 781
 782 daemons_start ()
 783 {
 784     # "$@" gets passed to ctdbd
 785
 786     local num_nodes="${CTDB_TEST_NUM_DAEMONS:-2}" # default is 2 nodes
 787
 788     echo "Starting $num_nodes ctdb daemons..."
 789
 790     for i in $(seq 0 $(($num_nodes - 1))) ; do
 791         daemons_start_1 $i "$@"
 792     done
 793
 794     local var_dir=$CTDB_DIR/tests/var
 795
 796     if [ -L /tmp/ctdb.socket -o ! -S /tmp/ctdb.socket ] ; then
 797         ln -sf $var_dir/sock.0 /tmp/ctdb.socket || return 1
 798     fi
 799 }
 800
 801 #######################################
 802
 803 _ctdb_hack_options ()
 804 {
 805     local ctdb_options="$*"
 806
 807     # We really just want to pass CTDB_OPTIONS but on RH
 808     # /etc/sysconfig/ctdb can, and frequently does, set that variable.
 809     # So instead, we hack badly.  We'll add these as we use them.
 810     # Note that these may still be overridden by the above file... but
 811     # we tend to use the exotic options here... so that is unlikely.
 812
 813     case "$ctdb_options" in
 814         *--start-as-stopped*)
 815             export CTDB_START_AS_STOPPED="yes"
 816     esac
 817 }
 818
 819 _restart_ctdb ()
 820 {
 821     _ctdb_hack_options "$@"
 822
 823     if [ -e /etc/redhat-release ] ; then
 824         service ctdb restart
 825     else
 826         /etc/init.d/ctdb restart
 827     fi
 828 }
 829
 830 _ctdb_start ()
 831 {
 832     _ctdb_hack_options "$@"
 833
 834     /etc/init.d/ctdb start
 835 }
 836
 837 setup_ctdb ()
 838 {
 839     if [ -n "$CTDB_NODES_SOCKETS" ] ; then
 840         daemons_setup
 841     fi
 842 }
 843
 844 # Common things to do after starting one or more nodes.
 845 _ctdb_start_post ()
 846 {
 847     onnode -q 1  $CTDB_TEST_WRAPPER wait_until_healthy || return 1
 848
 849     echo "Setting RerecoveryTimeout to 1"
 850     onnode -pq all "$CTDB setvar RerecoveryTimeout 1"
 851
 852     # In recent versions of CTDB, forcing a recovery like this blocks
 853     # until the recovery is complete.  Hopefully this will help the
 854     # cluster to stabilise before a subsequent test.
 855     echo "Forcing a recovery..."
 856     onnode -q 0 $CTDB recover
 857     sleep_for 1
 858     echo "Forcing a recovery..."
 859     onnode -q 0 $CTDB recover
 860
 861     echo "ctdb is ready"
 862 }
 863
 864 # This assumes that ctdbd is not running on the given node.
 865 ctdb_start_1 ()
 866 {
 867     local pnn="$1"
 868     shift # "$@" is passed to ctdbd start.
 869
 870     echo -n "Starting CTDB on node ${pnn}..."
 871
 872     if [ -n "$CTDB_NODES_SOCKETS" ] ; then
 873         daemons_start_1 $pnn "$@"
 874     else
 875         onnode $pnn $CTDB_TEST_WRAPPER _ctdb_start "$@"
 876     fi
 877
 878     # If we're starting only 1 node then we're doing something weird.
 879     ctdb_restart_when_done
 880 }
 881
# Restart CTDB on all nodes and wait for the cluster to settle.
#
# ... - options passed to ctdbd start
#
# Up to 5 attempts are made.  An attempt is abandoned, and the next
# one started, if the restart itself fails, if the cluster does not
# become healthy, or if it is unhealthy again after the forced
# recoveries.
#
# Returns 0 once the cluster is ready.  After 5 failed attempts the
# saved status output is printed, $status is set to 1 and 1 is
# returned.  $status is not declared local here, so the assignment
# lands in whichever variable of that name is in scope in the caller.
restart_ctdb ()
{
    # "$@" is passed to ctdbd start.

    echo -n "Restarting CTDB"
    if $ctdb_test_restart_scheduled ; then
        echo -n " (scheduled)"
    fi
    echo "..."

    local i
    for i in $(seq 1 5) ; do
        # Local daemons are stopped and started directly; otherwise
        # _restart_ctdb is run on every node.  The "||" applies to
        # whichever branch of the "if" ran.
        if [ -n "$CTDB_NODES_SOCKETS" ] ; then
            daemons_stop
            daemons_start "$@"
        else
            onnode -p all $CTDB_TEST_WRAPPER _restart_ctdb "$@"
        fi || {
            echo "Restart failed.  Trying again in a few seconds..."
            sleep_for 5
            continue
        }

        onnode -q 1  $CTDB_TEST_WRAPPER wait_until_healthy || {
            echo "Cluster didn't become healthy.  Restarting..."
            continue
        }

        # Status is captured now so that it can be printed if every
        # attempt ends up failing.
        local debug_out=$(onnode -p all ctdb status -Y 2>&1; onnode -p all ctdb scriptstatus 2>&1)

        echo "Setting RerecoveryTimeout to 1"
        onnode -pq all "$CTDB setvar RerecoveryTimeout 1"

        # In recent versions of CTDB, forcing a recovery like this
        # blocks until the recovery is complete.  Hopefully this will
        # help the cluster to stabilise before a subsequent test.
        echo "Forcing a recovery..."
        onnode -q 0 $CTDB recover
        sleep_for 1
        echo "Forcing a recovery..."
        onnode -q 0 $CTDB recover

        # Cluster is still healthy.  Good, we're done!
        if ! onnode 0 $CTDB_TEST_WRAPPER _cluster_is_healthy ; then
            echo "Cluster become UNHEALTHY again.  Restarting..."
            continue
        fi

        echo "Doing a sync..."
        onnode -q 0 $CTDB sync

        echo "ctdb is ready"
        return 0
    done

    echo "Cluster UNHEALTHY...  too many attempts..."
    echo "$debug_out"
    # Try to make the calling test fail
    status=1
    return 1
}
 943
 944 ctdb_restart_when_done ()
 945 {
 946     ctdb_test_restart_scheduled=true
 947 }
 948
 949 #######################################
 950
 951 install_eventscript ()
 952 {
 953     local script_name="$1"
 954     local script_contents="$2"
 955
 956     if [ -n "$CTDB_TEST_REAL_CLUSTER" ] ; then
 957         # The quoting here is *very* fragile.  However, we do
 958         # experience the joy of installing a short script using
 959         # onnode, and without needing to know the IP addresses of the
 960         # nodes.
 961         onnode all "f=\"\${CTDB_BASE:-/etc/ctdb}/events.d/${script_name}\" ; echo \"Installing \$f\" ; echo '${script_contents}' > \"\$f\" ; chmod 755 \"\$f\""
 962     else
 963         f="${CTDB_DIR}/tests/events.d/${script_name}"
 964         echo "$script_contents" >"$f"
 965         chmod 755 "$f"
 966     fi
 967 }
 968
 969 uninstall_eventscript ()
 970 {
 971     local script_name="$1"
 972
 973     if [ -n "$CTDB_TEST_REAL_CLUSTER" ] ; then
 974         onnode all "rm -vf \"\${CTDB_BASE:-/etc/ctdb}/events.d/${script_name}\""
 975     else
 976         rm -vf "${CTDB_DIR}/tests/events.d/${script_name}"
 977     fi
 978 }
 979
 980 #######################################
 981
 982 # This section deals with the 99.ctdb_test eventscript.
 983
# Metafunctions: create, remove or test for a ctdb-test file on a
# node.
 986 ctdb_test_eventscript_file_create ()
 987 {
 988     local pnn="$1"
 989     local type="$2"
 990
 991     try_command_on_node $pnn touch "/tmp/ctdb-test-${type}.${pnn}"
 992 }
 993
 994 ctdb_test_eventscript_file_remove ()
 995 {
 996     local pnn="$1"
 997     local type="$2"
 998
 999     try_command_on_node $pnn rm -f "/tmp/ctdb-test-${type}.${pnn}"
1000 }
1001
# Silently test whether the file /tmp/ctdb-test-TYPE.PNN exists on a
# node.
#
# $1 - node number (pnn)
# $2 - type of file
ctdb_test_eventscript_file_exists ()
{
    local test_file="/tmp/ctdb-test-${2}.${1}"

    try_command_on_node $1 test -f "$test_file" >/dev/null 2>&1
}
1009
1010
# Handle a flag file on a node that is removed by 99.ctdb_test on the
# given event.
#
# $1 - what to do with the file: create, remove or exists
# $2 - node number (pnn)
# $3 - name of the event
ctdb_test_eventscript_flag ()
{
    local action="$1"
    local flag_type="flag-${3}"

    ctdb_test_eventscript_file_${action} "$2" "$flag_type"
}
1021
1022
# Handle a trigger that causes 99.ctdb_test to fail its monitor
# event.
#
# $1 - what to do with the trigger file: create, remove or exists
# $2 - node number (pnn)
ctdb_test_eventscript_unhealthy_trigger ()
{
    local action="$1"

    ctdb_test_eventscript_file_${action} "$2" "unhealthy-trigger"
}
1032
# Handle the file that 99.ctdb_test creates to show that it has
# marked a node unhealthy because it detected the unhealthy trigger.
#
# $1 - what to do with the file: create, remove or exists
# $2 - node number (pnn)
ctdb_test_eventscript_unhealthy_detected ()
{
    local action="$1"

    ctdb_test_eventscript_file_${action} "$2" "unhealthy-detected"
}
1042
# Handle a trigger that causes 99.ctdb_test to time out on the given
# event.  This should cause the node to be banned.
#
# $1 - what to do with the trigger file: create, remove or exists
# $2 - node number (pnn)
# $3 - name of the event
ctdb_test_eventscript_timeout_trigger ()
{
    local action="$1"
    local trigger_type="${3}-timeout"

    ctdb_test_eventscript_file_${action} "$2" "$trigger_type"
}
1053
# Install the 99.ctdb_test eventscript using install_eventscript.
#
# The script runs on the nodes and is given the event name as $1:
#  - on every event it removes the flag file for that event;
#  - on "monitor" it fails while the unhealthy trigger file exists,
#    creating the unhealthy-detected file, and removes that file once
#    the trigger has gone;
#  - on any other event it sleeps for a long time if the timeout
#    trigger file for that event exists.
#
# Note that the eventscript can't use the functions in this file!
# It also must not contain single quotes: see install_eventscript.
ctdb_test_eventscript_install ()
{

    local script='#!/bin/sh
out=$(ctdb pnn)
pnn="${out#PNN:}"

rm -vf "/tmp/ctdb-test-flag-${1}.${pnn}"

trigger="/tmp/ctdb-test-unhealthy-trigger.${pnn}"
detected="/tmp/ctdb-test-unhealthy-detected.${pnn}"
timeout_trigger="/tmp/ctdb-test-${1}-timeout.${pnn}"
case "$1" in
    monitor)
        if [ -e "$trigger" ] ; then
            echo "${0}: Unhealthy because \"$trigger\" detected"
            touch "$detected"
            exit 1
        elif [ -e "$detected" -a ! -e "$trigger" ] ; then
            echo "${0}: Healthy again, \"$trigger\" no longer detected"
            rm "$detected"
        fi

        ;;
    *)
        if [ -e "$timeout_trigger" ] ; then
            echo "${0}: Sleeping for a long time because \"$timeout_trigger\" detected"
            sleep 9999
        fi
        ;;
esac

exit 0
'
    install_eventscript "99.ctdb_test" "$script"
}
1093
# Remove the 99.ctdb_test eventscript using uninstall_eventscript.
ctdb_test_eventscript_uninstall ()
{
    local script_name="99.ctdb_test"

    uninstall_eventscript "$script_name"
}
1098
# Wait up to 120 seconds for a monitor event to run on a node.
#
# $1 - node number (pnn)
#
# A "monitor" flag file is created on the node and the wait ends
# once that file no longer exists.
#
# Note that this only works if you know all other monitor events will
# succeed.  You also need to install the eventscript before using it.
wait_for_monitor_event ()
{
    local node="$1"
    local event="monitor"

    echo "Waiting for a monitor event on node ${node}..."
    ctdb_test_eventscript_flag create $node "$event"

    wait_until 120 ! ctdb_test_eventscript_flag exists $node "$event"

}
1111
# Make sure that $CTDB is set: fall back to plain "ctdb" when it is
# unset or empty.
CTDB="${CTDB:-ctdb}"