#!/bin/bash
#
# Event handler for "Gearman worker ..." service checks.
#
# Usage (macro names as printed by the usage message):
#   <script> "$SERVICENAME$" $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$
#
# NOTE(review): the macro syntax and the /home/nagios path suggest this is
# run by Nagios as a service event handler -- confirm against the command
# definition that invokes it.
#
# Behaviour by service state:
#   OK        a single worker recovered: stop the redundant worker that
#             covers for it on the peer host.
#   WARNING   nothing to do.
#   CRITICAL  attempt 2: restart the failed worker on its own host.
#             attempt 3: start every redundant worker of the failed cluster.
#             attempt 4: restart gearmand on both Gearman servers.
#             other attempts: nothing to do.
#   UNKNOWN / anything else: print a message and exit 1.
#
# Exit status: 1 on bad usage or an unknown/invalid state; otherwise 0 when
# there was nothing to do or every remote action and its notification mail
# succeeded, non-zero when any ssh or mail invocation failed.

if (( $# != 4 )); then
  printf 'Usage: %s "$SERVICENAME$" $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$\n' "$0"
  exit 1
fi

# Sourced before any ssh call is made.
# NOTE(review): presumably exports the ssh-agent environment for the
# nagios user -- confirm the contents of this file.
. /home/nagios/.ssh/.ssh-agent

sname="$1"
sstate="$2"
# shellcheck disable=SC2034  # accepted for interface compatibility; unused
stype="$3"
sattempt="$4"

# Recipients of every notification mail sent by this script.
recipients=(epm-list@Princeton.EDU igubenko@Princeton.EDU)

# Cluster (zone) name -> host on which that cluster's redundant workers are
# started (used as the ssh target on CRITICAL attempt 3).
declare -A cluster=(
  ["NS Untrust"]="ims318"
  ["151 Untrust"]="ims319"
  ["NS PublicDMZ"]="ims314"
  ["151 PublicDMZ"]="ims315"
  ["NS PrivateDMZ"]="ims320"
  ["151 PrivateDMZ"]="ims321"
  ["NS Trust"]="ims320"
  ["151 Trust"]="ims321"
)

# "<host> <zone> <worker id>" -> "<peer host> <redundant worker id>".
# The key identifies a single worker service; the value names the redundant
# worker that is started when the key's cluster fails and stopped again
# when the key's worker recovers.
declare -A worker=(
  ["ims318 151 Untrust alef"]="ims319 gimel"
  ["ims318 151 Untrust bet"]="ims319 dalet"
  ["ims319 NS Untrust alef"]="ims318 gimel"
  ["ims319 NS Untrust bet"]="ims318 dalet"
  ["ims314 151 PublicDMZ alef"]="ims315 gimel"
  ["ims314 151 PublicDMZ bet"]="ims315 dalet"
  ["ims315 NS PublicDMZ alef"]="ims314 gimel"
  ["ims315 NS PublicDMZ bet"]="ims314 dalet"
  ["ims320 151 PrivateDMZ alef"]="ims321 gimel"
  ["ims320 151 PrivateDMZ bet"]="ims321 dalet"
  ["ims321 NS PrivateDMZ hei"]="ims320 gimel"
  ["ims321 NS PrivateDMZ vav"]="ims320 dalet"
  ["ims320 151 Trust hei"]="ims321 zain"
  ["ims320 151 Trust vav"]="ims321 het"
  ["ims321 NS Trust alef"]="ims320 zain"
  ["ims321 NS Trust bet"]="ims320 het"
)

#######################################
# Run a command on a remote host over ssh and mail its combined
# stdout/stderr to the notification recipients.
# Globals:   recipients (read)
# Arguments: $1 - remote host
#            $2 - command line to run on the remote host
#            $3 - mail subject
# Returns:   ssh's exit status when ssh failed, otherwise mail's status.
#######################################
remote_exec_and_mail() {
  local host=$1 remote_cmd=$2 subject=$3
  local -a stage_rc

  /usr/bin/ssh "$host" "$remote_cmd" 2>&1 \
    | mail -s "$subject" "${recipients[@]}"
  # Must be captured immediately: any later command overwrites PIPESTATUS.
  stage_rc=("${PIPESTATUS[@]}")

  # A failed remote action must not be hidden by a successful mail.
  if (( stage_rc[0] != 0 )); then
    return "${stage_rc[0]}"
  fi
  return "${stage_rc[1]}"
}

# Service names look like "Gearman worker <key>"; <key> is either a cluster
# name (key of `cluster`) or a single-worker identifier (key of `worker`).
# Any other service name leaves the key empty.
zone=""
if [[ "$sname" == "Gearman worker "* ]]; then
  zone=${sname#"Gearman worker "}
fi

# Resolve both lookups once. An empty subscript is an error for associative
# arrays, so only index the tables when a key was actually extracted.
redundant=""
cluster_host=""
if [[ -n "$zone" ]]; then
  redundant=${worker[$zone]:-}
  cluster_host=${cluster[$zone]:-}
fi

case "$sstate" in
  OK)
    # Recovery of a single worker: stop the redundant worker covering for
    # it. Cluster services (and unknown names) have no entry -> no-op.
    if [[ -z "$redundant" ]]; then
      exit 0
    fi
    read -r peer_host peer_id _ <<< "$redundant"
    remote_exec_and_mail "$peer_host" \
      "/usr/bin/sudo /sbin/service mod_gearman_worker stop ${peer_id} force" \
      "RECOVERY: Gearman worker ${peer_id} stop on ${peer_host}"
    exit $?
    ;;

  WARNING)
    # Don't care for cluster - one worker is still good to go.
    ;;

  CRITICAL)
    case "$sattempt" in
      2)
        # Restart the failed worker itself; only applies to single-worker
        # services, i.e. names present in the worker table.
        if [[ -z "$redundant" ]]; then
          exit 0
        fi
        failed_host=${zone%% *}  # first word of the worker key
        failed_id=${zone##* }    # last word of the worker key
        remote_exec_and_mail "$failed_host" \
          "/usr/bin/sudo /sbin/service mod_gearman_worker restart ${failed_id}" \
          "Gearman worker ${failed_id} restart on ${failed_host}"
        exit $?
        ;;

      3)
        # Start every redundant worker registered for the failed cluster.
        if [[ -z "$cluster_host" ]]; then
          exit 0
        fi
        rc=0
        for worker_key in "${!worker[@]}"; do
          # Literal substring match of the cluster name inside the key.
          [[ "$worker_key" == *"$zone"* ]] || continue
          read -r _ spare_id _ <<< "${worker[$worker_key]}"
          remote_exec_and_mail "$cluster_host" \
            "/usr/bin/sudo /sbin/service mod_gearman_worker start ${spare_id} force" \
            "Starting redundant Gearman worker ${spare_id} on ${cluster_host}" \
            || rc=$?
        done
        exit "$rc"
        ;;

      4)
        # Last resort: restart the Gearman server on both server hosts.
        if [[ -z "$cluster_host" ]]; then
          exit 0
        fi
        rc=0
        for gearman_server in mon302l mon303l; do
          remote_exec_and_mail "$gearman_server" \
            'sudo systemctl restart gearmand 2>&1' \
            "LAST RESORT: Gearman server restart" \
            || rc=$?
        done
        exit "$rc"
        ;;

      *)
        # Do nothing - too early to do anything.
        ;;
    esac
    ;;

  UNKNOWN)
    echo "Service state is unknown"
    exit 1
    ;;

  *)
    echo "Invalid service state passed"
    exit 1
    ;;
esac

exit 0