You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Princeton/pu/libexec/eventhandlers/gearman_recovery

137 lines
3.8 KiB
Bash

#!/bin/bash
#
# Event handler for "Gearman worker ..." service checks.
#
# Arguments (monitoring macros, in this order):
#   $1 - "$SERVICENAME$"      service name
#   $2 - $SERVICESTATE$       service state (OK, WARNING, CRITICAL, UNKNOWN)
#   $3 - $SERVICESTATETYPE$   state type
#   $4 - $SERVICEATTEMPT$     current check attempt number

# Anything other than exactly four arguments is a usage error.
if (( $# != 4 )); then
  printf 'Usage: %s "$SERVICENAME$" $SERVICESTATE$ $SERVICESTATETYPE$ $SERVICEATTEMPT$\n' "$0"
  exit 1
fi

# Load the nagios user's saved ssh-agent settings before any ssh call is made
# (presumably SSH_AUTH_SOCK / SSH_AGENT_PID -- confirm against that file).
source /home/nagios/.ssh/.ssh-agent

# Keep the positional parameters under readable names.
sname=$1
sstate=$2
stype=$3
sattempt=$4
# Failover hosts ("clusterines"): maps a cluster zone name to the host on
# which that zone's redundant workers are started.
declare -A cluster=(
  ['NS Untrust']='ims204'
  ['151 Untrust']='ims205'

  ['NS PrivateDMZ']='ims220'
  ['151 PrivateDMZ']='ims221'

  ['NS Trust']='ims222'
  ['151 Trust']='ims223'

  ['NS PublicDMZ']='ims224'
  ['151 PublicDMZ']='ims225'
)
# ["ims205 151 Cross hei"]="ims205 hei" \
# ["ims204 NS Cross hei"]="ims204 hei" \
# Redundant-worker map, consulted when stopping (and starting) failover workers.
#   key   - service name with its "Gearman worker " prefix removed:
#           "<host> <zone words...> <worker id>"
#   value - "<host> <worker id>" of the redundant worker paired with the key
# Entries are grouped below by host pair.
declare -A worker=(
  ['ims204 151 Untrust alef']='ims205 gimel'
  ['ims204 151 Untrust bet']='ims205 dalet'
  ['ims205 NS Untrust alef']='ims204 gimel'
  ['ims205 NS Untrust bet']='ims204 dalet'

  ['ims218 151 PrivateDMZ alef']='ims219 gimel'
  ['ims218 151 PrivateDMZ bet']='ims219 dalet'
  ['ims219 NS PrivateDMZ alef']='ims218 gimel'
  ['ims219 NS PrivateDMZ bet']='ims218 dalet'

  ['ims220 151 PrivateDMZ alef']='ims221 gimel'
  ['ims220 151 PrivateDMZ bet']='ims221 dalet'
  ['ims221 NS PrivateDMZ alef']='ims220 gimel'
  ['ims221 NS PrivateDMZ bet']='ims220 dalet'

  ['ims222 151 Trust alef']='ims223 gimel'
  ['ims222 151 Trust bet']='ims223 dalet'
  ['ims223 NS Trust alef']='ims222 gimel'
  ['ims223 NS Trust bet']='ims222 dalet'

  ['ims224 151 PublicDMZ alef']='ims225 gimel'
  ['ims224 151 PublicDMZ bet']='ims225 dalet'
  ['ims225 NS PublicDMZ alef']='ims224 gimel'
  ['ims225 NS PublicDMZ bet']='ims224 dalet'

  ['ims316 EBT alef']='ims317 gimel'
  ['ims316 EBT bet']='ims317 dalet'
  ['ims317 EBT alef']='ims316 gimel'
  ['ims317 EBT bet']='ims316 dalet'
)
# ---------------------------------------------------------------------------
# Dispatch on service state and attempt number.
#
# Reads:  sname, sstate, sattempt and the `cluster` / `worker` tables.
# Exits:  0 when there is nothing to do;
#         1 for UNKNOWN or unrecognised service states;
#         otherwise the status of the ssh|mail pipeline(s) that were run.
# ---------------------------------------------------------------------------

# Runs one command on a remote host and mails its combined stdout/stderr.
# Arguments: $1 - remote host
#            $2 - mail subject
#            $3 - command line to execute on the remote host
# Returns:   the pipeline's status, i.e. that of `mail` (its last stage)
#            unless pipefail has been enabled elsewhere.
remote_and_mail() {
  local host=$1 subject=$2 remote_cmd=$3
  /usr/bin/ssh "$host" "$remote_cmd" 2>&1 \
    | mail -s "$subject" networking@Princeton.EDU
}

# The zone is whatever follows the "Gearman worker " prefix of the service
# name; it stays empty when the prefix is absent.
zone=""
if [[ "$sname" == "Gearman worker "* ]]; then
  zone=${sname#"Gearman worker "}
fi

# Look both tables up once. An empty associative-array subscript is an error
# in bash ("bad array subscript"), so only index with a non-empty zone.
peer_entry=""     # "<host> <worker id>" of the redundant worker for $zone
failover_host=""  # host that runs the redundant workers of cluster $zone
if [[ -n "$zone" ]]; then
  peer_entry=${worker[$zone]-}
  failover_host=${cluster[$zone]-}
fi

case "$sstate" in
  OK) # Recovery
    # Only an individual worker (a key of `worker`) has a redundant
    # counterpart to stop; cluster-level services have nothing to do here.
    if [[ -z "$peer_entry" ]]; then
      exit 0
    fi
    # Split "<host> <worker id>". A separate variable is used for the id so
    # the `worker` table itself is never overwritten.
    read -r peer_host peer_wid _ <<<"$peer_entry"
    remote_and_mail "$peer_host" \
      "RECOVERY: Gearman worker $peer_wid stop on $peer_host" \
      "/usr/bin/sudo /sbin/service mod_gearman_worker stop $peer_wid force"
    exit $?
    ;;
  WARNING) # Don't care for cluster - one worker is still good to go
    ;;
  CRITICAL)
    case "$sattempt" in
      2) # Attempt to restart a failed worker only
        if [[ -z "$peer_entry" ]]; then # not a worker
          exit 0
        fi
        mach=${zone%% *} # first word of the zone: host running the worker
        wid=${zone##* }  # last word of the zone: worker id
        remote_and_mail "$mach" \
          "Gearman worker $wid restart on $mach" \
          "/usr/bin/sudo /sbin/service mod_gearman_worker restart $wid"
        exit $?
        ;;
      3) # Start redundant workers
        if [[ -z "$failover_host" ]]; then # not a cluster
          exit 0
        fi
        # NOTE(review): the ims218/ims219 entries share zone names with
        # ims220/ims221, so PrivateDMZ zones match four keys and each start
        # command is sent twice to the same failover host -- confirm intent.
        rc=0
        for wrkr in "${!worker[@]}"; do
          # Literal substring match: the zone is data, not a pattern.
          if [[ "$wrkr" == *"$zone"* ]]; then
            read -r _ wid _ <<<"${worker[$wrkr]}"
            remote_and_mail "$failover_host" \
              "Starting redundant Gearman worker $wid on $failover_host" \
              "/usr/bin/sudo /sbin/service mod_gearman_worker start $wid force" \
              || rc=$?
          fi
        done
        # Report a failure from any iteration, not just whichever ran last
        # (associative-array iteration order is unspecified).
        exit "$rc"
        ;;
      4) # Last resort - attempt to restart Gearman server
        if [[ -z "$failover_host" ]]; then # not a cluster
          exit 0
        fi
        # systemctl takes the verb before the unit name.
        rc=0
        remote_and_mail mon204l \
          "LAST RESORT: Gearman server restart on mon204l" \
          '/usr/bin/sudo systemctl restart gearmand' || rc=$?
        sleep 10
        remote_and_mail mon205l \
          "LAST RESORT: Gearman server restart on mon205l" \
          '/usr/bin/sudo systemctl restart gearmand' || rc=$?
        exit "$rc"
        ;;
      *)
        # Do nothing - too early to do anything
        ;;
    esac
    ;;
  UNKNOWN)
    echo "Service state is unknown"
    exit 1
    ;;
  *)
    echo "Invalid service state passed"
    exit 1
    ;;
esac
exit 0