#!/bin/bash
# Make running virsh commands on a cluster easier.
#
# Runs "virsh COMMAND <domain>" for every libvirt domain whose name matches
# CLUSTERNAME and, with --wait, polls "virsh dominfo" until every matched
# domain reports the state implied by the command.
#
# Exit status:
#   0   every virsh command succeeded (and, with --wait, the state was reached)
#   1   usage error, or no domain matched CLUSTERNAME
#   62  (ETIME) --wait timed out
#   N   otherwise, the exit status of the last virsh command that failed

prog=${0##*/}

####################
# show program usage
# Writes the help text to stderr and exits with status 1.
usage () {
    cat >&2 <<EOF
usage: $prog [options] COMMAND CLUSTERNAME

options:
  -x                      enable script debugging
  -n                      don't include the TSM node (if any)
  -w[TIMEOUT], --wait[=TIMEOUT]
                          for start/shutdown wait for desired state change
                          (TIMEOUT in seconds, attached to the option;
                          without it, wait forever)
  -h, --help              show this help

commands:
  start CLUSTERNAME       start cluster nodes
  destroy CLUSTERNAME     power off cluster nodes (may cause data loss)
  shutdown CLUSTERNAME    shutdown cluster nodes
  undefine CLUSTERNAME    remove cluster

CLUSTERNAME can be a glob-style pattern that specifies cluster nodes
EOF
    exit 1
}

############################
# parse command line options
#
# "w::" / "wait::" declare an OPTIONAL argument.  getopt only binds an
# optional argument when it is attached (-w30, --wait=30); when it is absent
# getopt emits an empty string in its place, so "$2" below is always present.
temp=$(getopt -n "$prog" -o "hxnw::" -l help -l wait:: -- "$@") || usage

# Standard getopt idiom: re-install the normalised, shell-quoted argument list.
eval set -- "$temp"

no_tsm=0
do_wait=false
timeout=""

while true ; do
    case "$1" in
        -x) set -x ; shift ;;
        -n) no_tsm=1 ; shift ;;
        -w|--wait) do_wait=true ; timeout="$2" ; shift 2 ;;
        --) shift ; break ;;
        -h|--help|*) usage ;; # "*" shouldn't happen, so this is reasonable.
    esac
done

if [ $# -lt 2 ]; then
    echo "Usage: vircmd COMMAND CLUSTERNAME" >&2
    exit 1
fi

cmd="$1"
cluster="$2"

if [[ $do_wait == true ]] ; then
    # Only commands with a well-defined end state can be waited for.
    case "$cmd" in
        start)    desired_state="running" ;;
        shutdown) desired_state="shut off" ;;
        *)
            {
                echo "waiting not supported with \"$cmd\""
                echo
            } >&2
            usage
            ;;
    esac

    # An empty timeout means "wait forever"; anything else must be a
    # non-negative integer or the numeric comparison in the wait loop fails.
    if [[ -n $timeout && ! $timeout =~ ^[0-9]+$ ]] ; then
        {
            echo "invalid timeout \"$timeout\""
            echo
        } >&2
        usage
    fi
fi

export VIRSH_DEFAULT_CONNECT_URI=qemu:///system

#######################################
# Append to $nodes every entry of $domains that matches a glob pattern.
# Globals:   domains (read), no_tsm (read), nodes (appended to)
# Arguments: $1 - glob pattern matched against each domain name
#######################################
get_nodes () {
    local pattern=$1
    local dom

    # $domains is a whitespace-separated list, so splitting is intended.
    for dom in $domains ; do
        # shellcheck disable=SC2254  # pattern is deliberately unquoted: it is a glob
        case "$dom" in
            $pattern)
                # Keep the node unless we're skipping TSM nodes and its
                # name contains "tsm".
                if [[ $no_tsm == 0 || $dom != *tsm* ]] ; then
                    nodes="${nodes} ${dom}"
                fi
                ;;
        esac
    done
}

# "virsh list --all" prints two header lines, then one row per domain with
# the name in the second column.
domains=$(virsh list --all | awk 'NR > 2 && NF >= 2 { print $2 }')
nodes=""

# If the cluster name doesn't have a wildcard then we need to be inventive:
# assume node names look like <cluster><letters...><digit>.
if [[ $cluster == *[\[\]\?\*]* ]] ; then
    get_nodes "$cluster"
else
    get_nodes "${cluster}[a-z]*[0-9]"
fi

if [[ -z $nodes ]] ; then
    echo "No nodes in cluster $cluster" >&2
    exit 1
fi

# Run the command on every node, remembering the last failure status.
max_attempts=5
rc=0
for node in $nodes ; do
    ret=0
    # We want to retry the command when we see an internal error.
    for (( attempt = 1; attempt <= max_attempts; attempt++ )) ; do
        out=$(virsh "$cmd" "$node" 2>&1)
        ret=$?
        # The output is captured so it can be inspected, but is always shown.
        printf '%s\n' "$out"

        if [[ $ret -ne 0 && $out == *"internal error"* ]] \
            && (( attempt < max_attempts )) ; then
            echo "Retrying \"virsh $cmd $node\" due to internal error"
            sleep 3
            continue
        fi
        break
    done
    [[ $ret -eq 0 ]] || rc=$ret
done

# Now comes the waiting... but we don't wait if there was an error.
if [[ $rc -ne 0 || $do_wait != true ]] ; then
    exit "$rc"
fi

pat="^State:[[:space:]]+${desired_state}\$"
fmt='%-20s %s\n'

# SECONDS is bash's built-in elapsed-time counter, so the timeout is measured
# in wall-clock seconds rather than in loop iterations.
SECONDS=0
while : ; do
    if [ -n "$timeout" ] && [ "$SECONDS" -ge "$timeout" ] ; then
        echo "Timed out after ${timeout}s waiting for nodes to enter state \"${desired_state}\":"
        echo
        # shellcheck disable=SC2059  # $fmt is a constant format string
        printf "$fmt" "Domain" "State"
        # shellcheck disable=SC2059
        printf "$fmt" "------" "-----"
        for node in $nodes ; do
            # LC_ALL=C keeps the "State:" label untranslated for parsing.
            state=$(LC_ALL=C virsh dominfo "$node" \
                | sed -nr -e 's@^State:[[:space:]]+@@p')
            # shellcheck disable=SC2059
            printf "$fmt" "$node" "$state"
        done
        exit 62 # ETIME
    fi

    all_good=true
    for node in $nodes ; do
        # Often "virsh dominfo" returns 1 and prints rubbish like this:
        #   error: operation failed: could not query memory balloon allocation
        # so stderr is dropped to avoid cluttering the output, and the exit
        # status is ignored: only a State line showing the desired state
        # counts as success.  A node whose state could not be read is
        # treated as not ready rather than silently assumed good.
        info=$(LC_ALL=C virsh dominfo "$node" 2>/dev/null)
        if ! grep -E -q "$pat" <<<"$info" ; then
            all_good=false
        fi
    done

    if [[ $all_good == true ]] ; then
        exit 0
    fi

    sleep 1
done