You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Princeton/pu/libexec/eventhandlers/slave_conn_reset

95 lines
2.7 KiB
Plaintext

#!/bin/bash
# This script bounces the Selenium Hub service and the Gearmand service on the
# slave to kick the connections back to the master.
#
# Arguments:
#   $1 - host name the handler was fired for
#   $2 - service state of that host (OK / WARNING / CRITICAL / UNKNOWN)
#   $3 - service state of its peer
#   $4 - service check attempt number (only written to the log)
#
# Exit status:
#   0  nothing to do, or the slave was bounced successfully
#   1  bad usage, unknown host, unknown/invalid state, failed or garbled probe
#   otherwise the status of the remote bounce (slave path) or of the exec'ed
#   submit_check_result (master path).

if [[ "$#" -ne 4 ]]; then
  echo "Usage: $0 <SLAVE_\$HOSTNAME> <SLAVE_\$SERVICESTATE_FOR_GEARMAN> <MASTER_\$SERVICESTATE_FOR_GEARMAN> <SLAVE_\$SERVICESTATEATTEMPT>"
  exit 1
fi

# NOTE(review): presumably exports the ssh-agent environment needed by the
# ssh calls below - confirm against the contents of this file.
. /home/nagios/.ssh/.ssh-agent

slave_name="$1"
slave_state="$2"
master_state="$3"
n_slave_attempts="$4"
logfile="/usr/local/monitoring/log/hub_recovery"
submit_check_result="/usr/local/monitoring/libexec/eventhandlers/submit_check_result"

# Counts ESTABLISHED TCP connections whose address column ends in 4730 or 4444
# (presumably the Gearman and Selenium Hub ports - verify). "grep | wc -l" is
# kept on purpose: unlike "grep -c" the pipeline exits 0 when nothing matches,
# so a non-zero ssh status really means that the probe itself failed.
probe_cmd='netstat -tn | grep "4\(730\|444\)[[:space:]]\+.*ESTABLISHED" | wc -l'

# Commands executed on the slave to restart both services.
bounce_cmd='
echo "Killing Selenium Hub"
/usr/bin/sudo /bin/systemctl stop selenium
echo "Stopping Gearman"
/usr/bin/sudo /bin/systemctl stop gearmand
echo "Giving it 20 seconds..."
sleep 20
echo "Starting Selenium"
/usr/bin/sudo /bin/systemctl start selenium
echo "Starting Gearman"
/usr/bin/sudo /bin/systemctl start gearmand
exit 0
'

{
  echo
  echo "*****************"
  echo "$(date): Running event handler..."
  echo "Args:###$slave_name###$slave_state###$master_state###$n_slave_attempts###"
  echo
} >> "$logfile"

if [[ "$slave_name" == "mon205l" || "$slave_name" == "mon303l" ]]; then
  # Fired for a known slave (mon205l pairs with mon204l, mon303l with mon302l):
  # continue with the state handling below.
  :
elif [[ "$slave_state" == "OK" && "$master_state" != "OK" ]]; then
  ### In this section "slave_*" actually refers to the master, and "$master_*" to the slave
  # We are master. Provoke slave to reset
  case "$slave_name" in
    mon204l)
      peer="mon205l"
      gearman_service="Gearman"
      selenium_service="Selenium"
      ;;
    mon302l)
      peer="mon303l"
      gearman_service="Gearman DevQA"
      selenium_service="Selenium DevQA"
      ;;
    *)
      # Without this guard an unknown host produced an empty, unquoted host
      # argument and submit_check_result was called with shifted arguments.
      echo "$(date): No slave known for host '$slave_name'; nothing submitted" >> "$logfile"
      exit 1
      ;;
  esac
  "$submit_check_result" "$peer" "$gearman_service" 0 "Need to bounce slave..." &
  exec "$submit_check_result" "$peer" "$selenium_service" 0 "Need to bounce slave..."
else
  # Transient
  exit 0
fi

# What is slave state?
case "$slave_state" in
  "WARNING" | "CRITICAL")
    res=$(/usr/bin/ssh "$slave_name" "$probe_cmd")
    stat="$?"
    ## SSH failed???
    if [[ "$stat" -ne 0 ]]; then
      echo "$(date): Connection probe on $slave_name failed with status $stat" >> "$logfile"
      exit 1
    fi
    # Master is having issues - we should keep on running
    if [[ "$master_state" != "OK" ]]; then exit 0; fi

    # The count comes from a remote host: make sure it is a plain decimal
    # number before it reaches an arithmetic context.
    res="${res//[[:space:]]/}"
    if [[ ! "$res" =~ ^[0-9]+$ ]]; then
      echo "$(date): Unexpected connection count from $slave_name: '$res'" >> "$logfile"
      exit 1
    fi

    if (( 10#$res > 10 )); then
      res=$(/usr/bin/ssh "$slave_name" "$bounce_cmd" 2>&1)
      stat="$?"
      {
        printf '%s\n' "$res"
        echo "******************"
        echo
      } >> "$logfile"
      printf '%s\n' "$res" | /bin/mail -s "WARNING: Gearman and/or Selenium entities connected to inactive slave. Bounced slave processes on $slave_name" epm-list@Princeton.EDU igubenko@Princeton.EDU
      exit "$stat"
    fi
    exit 0
    ;;
  UNKNOWN)
    echo "Service state is unknown"
    exit 1
    ;;
  *)
    echo "Invalid service state passed"
    exit 1
    ;;
esac
exit 0