You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Princeton/pu/libexec/eventhandlers/gearman_devqa_recovery

124 lines
3.4 KiB
Bash

#!/bin/bash
# Event handler for the "Gearman worker ..." service checks.
#
# Takes exactly four positional arguments, named after the macros shown in
# the usage text: service name, service state, service state type and the
# current check attempt number.
(( $# == 4 )) || {
  printf 'Usage: %s "$SERVICENAME$" $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$\n' "$0"
  exit 1
}

# Pull in the saved ssh-agent settings before any ssh call is made.
# NOTE(review): presumably this file exports SSH_AUTH_SOCK/SSH_AGENT_PID -- confirm.
source /home/nagios/.ssh/.ssh-agent

# Keep the arguments under descriptive names for the rest of the script.
# (stype is stored but not referenced anywhere else in the visible script.)
sname=$1
sstate=$2
stype=$3
sattempt=$4
# Failover hosts, keyed by zone name.  On the third and fourth CRITICAL
# attempts the zone parsed from the service name is looked up here; on the
# third attempt the value is the ssh target on which the redundant workers
# are started.
declare -A cluster
cluster=(
  ["NS Untrust"]=ims318
  ["151 Untrust"]=ims319
  ["NS PublicDMZ"]=ims314
  ["151 PublicDMZ"]=ims315
  ["NS PrivateDMZ"]=ims320
  ["151 PrivateDMZ"]=ims321
  ["NS Trust"]=ims320
  ["151 Trust"]=ims321
)

# Redundant-worker table.
#   key   : "<host> <zone> <worker id>" of a regular worker
#   value : "<host> <worker id>" of the redundant worker standing in for it
# The redundant worker is started on the third CRITICAL attempt and stopped
# again once the regular worker reports OK.
declare -A worker
worker=(
  ["ims318 151 Untrust alef"]='ims319 gimel'
  ["ims318 151 Untrust bet"]='ims319 dalet'
  ["ims319 NS Untrust alef"]='ims318 gimel'
  ["ims319 NS Untrust bet"]='ims318 dalet'

  ["ims314 151 PublicDMZ alef"]='ims315 gimel'
  ["ims314 151 PublicDMZ bet"]='ims315 dalet'
  ["ims315 NS PublicDMZ alef"]='ims314 gimel'
  ["ims315 NS PublicDMZ bet"]='ims314 dalet'

  ["ims320 151 PrivateDMZ alef"]='ims321 gimel'
  ["ims320 151 PrivateDMZ bet"]='ims321 dalet'
  ["ims321 NS PrivateDMZ hei"]='ims320 gimel'
  ["ims321 NS PrivateDMZ vav"]='ims320 dalet'

  ["ims320 151 Trust hei"]='ims321 zain'
  ["ims320 151 Trust vav"]='ims321 het'
  ["ims321 NS Trust alef"]='ims320 zain'
  ["ims321 NS Trust bet"]='ims320 het'
)
# ---------------------------------------------------------------------------
# State dispatch.
#
# Inputs (set earlier in this script): sname, sstate, sattempt, and the
# associative arrays worker and cluster.
#
#   OK                  stop the redundant worker paired with a recovered worker
#   WARNING             nothing to do
#   CRITICAL, attempt 2 restart the failed worker itself
#   CRITICAL, attempt 3 start the redundant workers for the failed zone
#   CRITICAL, attempt 4 restart the Gearman servers
#   CRITICAL, other     nothing to do
#   UNKNOWN / other     print a message and exit 1
#
# Exit status: 0 when nothing had to be done or every action succeeded;
# otherwise the status of the failing ssh (or mail) command.
# ---------------------------------------------------------------------------

# Run a command on a remote host and mail its output.
# Arguments: $1 - mail subject
#            $2 - host to ssh to
#            $3 - command line executed on that host
# Outputs:   ssh's stdout and stderr are piped into the mail body
# Returns:   ssh's exit status if ssh failed, otherwise mail's exit status
ssh_and_mail() {
  local subject=$1 host=$2 remote_cmd=$3
  local -a stage_rc
  /usr/bin/ssh "$host" "$remote_cmd" 2>&1 \
    | mail -s "$subject" epm-list@Princeton.EDU igubenko@Princeton.EDU
  # $? alone would only reflect mail; look at both pipeline stages.
  stage_rc=("${PIPESTATUS[@]}")
  if (( stage_rc[0] != 0 )); then
    return "${stage_rc[0]}"
  fi
  return "${stage_rc[1]}"
}

# Strip the "Gearman worker " prefix from the service name.  Splitting and
# re-joining collapses runs of whitespace without exposing the name to
# pathname expansion.  zone stays empty when the prefix is absent.
read -r -a name_words <<<"$sname"
norm_name="${name_words[*]}"
zone=""
if [[ $norm_name == "Gearman worker "* ]]; then
  zone=${norm_name#"Gearman worker "}
fi

# Look the zone up in both tables once.  An empty zone is never used as a
# subscript: bash rejects an empty associative-array key.
redundant=""     # "<host> <worker id>" when zone names an individual worker
cluster_host=""  # failover host when zone names a whole cluster
if [[ -n $zone ]]; then
  redundant=${worker[$zone]:-}
  cluster_host=${cluster[$zone]:-}
fi

case "$sstate" in
  OK) # Recovery
    # Only an individual worker has a redundant counterpart to stop;
    # a cluster-level service has no entry in the worker table.
    [[ -n $redundant ]] || exit 0
    peer_host=${redundant%% *}
    peer_wid=${redundant#* }
    ssh_and_mail "RECOVERY: Gearman worker $peer_wid stop on $peer_host" \
      "$peer_host" \
      "/usr/bin/sudo /sbin/service mod_gearman_worker stop $peer_wid force"
    exit $?
    ;;
  WARNING) # Don't care for cluster - one worker is still good to go
    ;;
  CRITICAL)
    case "$sattempt" in
      2) # Attempt to restart a failed worker only
        [[ -n $redundant ]] || exit 0   # not a worker
        # zone is "<host> <zone name> <worker id>": first and last words.
        mach=${zone%% *}
        wid=${zone##* }
        ssh_and_mail "Gearman worker $wid restart on $mach" "$mach" \
          "/usr/bin/sudo /sbin/service mod_gearman_worker restart $wid"
        exit $?
        ;;
      3) # Start redundant workers
        [[ -n $cluster_host ]] || exit 0   # not a cluster
        rc=0
        for wkey in "${!worker[@]}"; do
          # Literal substring match: every worker entry that serves this zone.
          [[ $wkey == *"$zone"* ]] || continue
          wid=${worker[$wkey]#* }
          # Remember a failure but still try the remaining workers.
          ssh_and_mail "Starting redundant Gearman worker $wid on $cluster_host" \
            "$cluster_host" \
            "/usr/bin/sudo /sbin/service mod_gearman_worker start $wid force" \
            || rc=$?
        done
        exit "$rc"
        ;;
      4) # Last resort - attempt to restart Gearman server
        [[ -n $cluster_host ]] || exit 0   # not a cluster
        rc=0
        for server in mon302l mon303l; do
          ssh_and_mail "LAST RESORT: Gearman server restart" "$server" \
            'sudo systemctl restart gearmand 2>&1' || rc=$?
        done
        exit "$rc"
        ;;
      *)
        # Do nothing - too early to do anything
        ;;
    esac
    ;;
  UNKNOWN)
    echo "Service state is unknown"
    exit 1
    ;;
  *)
    echo "Invalid service state passed"
    exit 1
    ;;
esac
exit 0