#!/bin/bash
#
# Event handler for "Gearman worker ..." services.
#
# Usage:
#   <script> "$SERVICENAME$" $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$
#
# Arguments:
#   $1 - service name; only names starting with "Gearman worker " are acted on,
#        the remainder of the name (the "zone") is the lookup key into the
#        tables below
#   $2 - service state: OK, WARNING, CRITICAL or UNKNOWN
#   $3 - service state type (accepted for interface compatibility, not used)
#   $4 - service check attempt number
#
# Actions (each remote command's output is mailed to MAIL_TO):
#   OK                  - stop the redundant worker paired with a single worker
#   CRITICAL, attempt 2 - restart the failed single worker on its own machine
#   CRITICAL, attempt 3 - start the redundant workers on the failover machine
#   CRITICAL, attempt 4 - restart the Gearman servers on mon204l and mon205l
#
# Exit status:
#   0 when nothing had to be done or every remote command and mail succeeded,
#   1 on bad usage / UNKNOWN / invalid state, otherwise the non-zero status of
#   the failing ssh or mail command.

readonly MAIL_TO="networking@Princeton.EDU"

# Print the usage line and exit with status 1.
usage() {
  echo "Usage: $0 "'"$SERVICENAME$" $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$'
  exit 1
}

#######################################
# Run a command on a remote host through ssh and mail its combined
# stdout/stderr.
# Globals:   MAIL_TO (read)
# Arguments: $1 - remote host
#            $2 - mail subject
#            $3 - command line to execute on the remote host
# Returns:   ssh's status when ssh failed, otherwise mail's status
#######################################
remote_and_mail() {
  local host=$1 subject=$2 remote_cmd=$3
  local -a stage_rc

  /usr/bin/ssh "$host" "$remote_cmd" 2>&1 | mail -s "$subject" "$MAIL_TO"
  # PIPESTATUS must be captured immediately: a plain $? would only ever
  # reflect mail and hide a failed ssh.
  stage_rc=("${PIPESTATUS[@]}")

  if ((stage_rc[0] != 0)); then
    return "${stage_rc[0]}"
  fi
  return "${stage_rc[1]}"
}

if [[ "$#" -ne 4 ]]; then
  usage
fi

# Load the ssh-agent environment so the ssh calls below can authenticate.
. /home/nagios/.ssh/.ssh-agent

sname="$1"
sstate="$2"
sattempt="$4"

# Failover machines: zone -> machine on which redundant workers are started.
declare -A cluster=(
  ["NS Untrust"]="ims204"
  ["151 Untrust"]="ims205"
  ["NS PrivateDMZ"]="ims220"
  ["151 PrivateDMZ"]="ims221"
  ["NS Trust"]="ims222"
  ["151 Trust"]="ims223"
  ["NS PublicDMZ"]="ims224"
  ["151 PublicDMZ"]="ims225"
)

# Single workers: "<machine> <zone> <worker id>" -> "<machine> <worker id>" of
# the redundant worker that covers for it (stopped again on recovery).
# Disabled entries:
#   ["ims205 151 Cross hei"]="ims205 hei"
#   ["ims204 NS Cross hei"]="ims204 hei"
declare -A worker=(
  ["ims204 151 Untrust alef"]="ims205 gimel"
  ["ims204 151 Untrust bet"]="ims205 dalet"
  ["ims205 NS Untrust alef"]="ims204 gimel"
  ["ims205 NS Untrust bet"]="ims204 dalet"
  ["ims218 151 PrivateDMZ alef"]="ims219 gimel"
  ["ims218 151 PrivateDMZ bet"]="ims219 dalet"
  ["ims219 NS PrivateDMZ alef"]="ims218 gimel"
  ["ims219 NS PrivateDMZ bet"]="ims218 dalet"
  ["ims220 151 PrivateDMZ alef"]="ims221 gimel"
  ["ims220 151 PrivateDMZ bet"]="ims221 dalet"
  ["ims221 NS PrivateDMZ alef"]="ims220 gimel"
  ["ims221 NS PrivateDMZ bet"]="ims220 dalet"
  ["ims222 151 Trust alef"]="ims223 gimel"
  ["ims222 151 Trust bet"]="ims223 dalet"
  ["ims223 NS Trust alef"]="ims222 gimel"
  ["ims223 NS Trust bet"]="ims222 dalet"
  ["ims224 151 PublicDMZ alef"]="ims225 gimel"
  ["ims224 151 PublicDMZ bet"]="ims225 dalet"
  ["ims225 NS PublicDMZ alef"]="ims224 gimel"
  ["ims225 NS PublicDMZ bet"]="ims224 dalet"
  ["ims316 EBT alef"]="ims317 gimel"
  ["ims316 EBT bet"]="ims317 dalet"
  ["ims317 EBT alef"]="ims316 gimel"
  ["ims317 EBT bet"]="ims316 dalet"
)

# The zone is whatever follows the "Gearman worker " prefix; it stays empty
# for any other service name.
zone=""
if [[ "$sname" == "Gearman worker "* ]]; then
  zone="${sname#"Gearman worker "}"
fi

# Resolve both tables once. An empty key is not a valid associative-array
# subscript, so the lookups are skipped when there is no zone.
standby_entry=""
cluster_host=""
if [[ -n "$zone" ]]; then
  standby_entry="${worker[$zone]:-}"
  cluster_host="${cluster[$zone]:-}"
fi

case "$sstate" in
  OK)
    # Recovery of a single worker: stop its redundant counterpart.
    # Zones without an entry in the worker table need no action.
    if [[ -z "$standby_entry" ]]; then
      exit 0
    fi
    # Dedicated names are used here; assigning to "worker" would write into
    # the lookup table itself.
    read -r standby_host standby_id _ <<<"$standby_entry"
    remote_and_mail "$standby_host" \
      "RECOVERY: Gearman worker $standby_id stop on $standby_host" \
      "/usr/bin/sudo /sbin/service mod_gearman_worker stop $standby_id force"
    exit $?
    ;;
  WARNING)
    # Don't care for a cluster - one worker is still good to go.
    ;;
  CRITICAL)
    case "$sattempt" in
      2)
        # Attempt to restart a failed single worker only.
        if [[ -z "$standby_entry" ]]; then
          exit 0
        fi
        mach="${zone%% *}" # first word of the zone: machine name
        wid="${zone##* }"  # last word of the zone: worker id
        remote_and_mail "$mach" \
          "Gearman worker $wid restart on $mach" \
          "/usr/bin/sudo /sbin/service mod_gearman_worker restart $wid"
        exit $?
        ;;
      3)
        # Start the redundant workers on the failover machine of the zone.
        if [[ -z "$cluster_host" ]]; then
          exit 0
        fi
        status=0
        for key in "${!worker[@]}"; do
          # Literal substring match of the zone inside the worker key.
          [[ "$key" == *"$zone"* ]] || continue
          # NOTE(review): several keys can match one zone (e.g. both the
          # ims219 and ims221 entries contain "NS PrivateDMZ"), in which case
          # the same worker id is started more than once on the failover
          # machine - confirm this is intended.
          read -r _ wid _ <<<"${worker[$key]}"
          remote_and_mail "$cluster_host" \
            "Starting redundant Gearman worker $wid on $cluster_host" \
            "/usr/bin/sudo /sbin/service mod_gearman_worker start $wid force" \
            || status=$?
        done
        exit "$status"
        ;;
      4)
        # Last resort - attempt to restart the Gearman servers.
        if [[ -z "$cluster_host" ]]; then
          exit 0
        fi
        status=0
        # systemctl expects the verb before the unit name.
        remote_and_mail mon204l \
          "LAST RESORT: Gearman server restart on mon204l" \
          "/usr/bin/sudo systemctl restart gearmand" \
          || status=$?
        sleep 10
        remote_and_mail mon205l \
          "LAST RESORT: Gearman server restart on mon205l" \
          "/usr/bin/sudo systemctl restart gearmand" \
          || status=$?
        exit "$status"
        ;;
      *)
        # Do nothing - too early to do anything.
        ;;
    esac
    ;;
  UNKNOWN)
    echo "Service state is unknown"
    exit 1
    ;;
  *)
    echo "Invalid service state passed"
    exit 1
    ;;
esac

exit 0