#!/bin/bash
#
# This script bounces the Selenium Hub service and the Gearmand service on the
# slave to kick the connections back to the master.
#
# It is written as a monitoring event handler and expects exactly four
# arguments:
#   $1  SLAVE_$HOSTNAME                   host the handler was triggered for
#   $2  SLAVE_$SERVICESTATE_FOR_GEARMAN   service state on that host
#   $3  MASTER_$SERVICESTATE_FOR_GEARMAN  service state on its peer
#   $4  SLAVE_$SERVICESTATEATTEMPT        attempt counter (only logged)
#
# Flow:
#   * $1 is a slave (mon205l, mon303l): when its state is WARNING/CRITICAL,
#     the master is OK and more than 10 matching connections are still
#     established on the slave, restart selenium and gearmand there, log the
#     output and mail it.
#   * $1 is a master (mon204l, mon302l) that is OK while its peer is not:
#     submit passive "OK" results for the slave's two services.
#   * Anything else is treated as transient.
#
# Exit status:
#   0  nothing to do, or the bounce completed
#   1  bad usage, ssh probe failure, unknown/invalid state, or no slave is
#      configured for the given master
#   otherwise the status of the ssh bounce command / of submit_check_result

if [[ "$#" -ne 4 ]]; then
  echo "Usage: $0 <SLAVE_\$HOSTNAME> <SLAVE_\$SERVICESTATE_FOR_GEARMAN> <MASTER_\$SERVICESTATE_FOR_GEARMAN> <SLAVE_\$SERVICESTATEATTEMPT>"
  exit 1
fi

# Presumably exports the ssh-agent environment (SSH_AUTH_SOCK etc.) needed by
# the ssh calls below -- the file's content is not visible from this script.
# shellcheck source=/dev/null
. /home/nagios/.ssh/.ssh-agent

slave_name="$1"
slave_state="$2"
master_state="$3"
n_slave_attempts="$4"

readonly logfile="/usr/local/monitoring/log/hub_recovery"
readonly submit_check_result="/usr/local/monitoring/libexec/eventhandlers/submit_check_result"

# Record every invocation together with its raw arguments.
{
  echo
  echo "*****************"
  echo "$(date): Running event handler..."
  echo "Args:###$slave_name###$slave_state###$master_state###$n_slave_attempts###"
  echo
} >> "$logfile"

case "$slave_name" in
  mon205l | mon303l)
    # Known slaves (their masters are mon204l and mon302l respectively).
    # Fall through to the slave-state handling below.
    ;;

  *)
    if [[ "$slave_state" != "OK" || "$master_state" == "OK" ]]; then
      # Transient
      exit 0
    fi

    ### In this section "slave_*" actually refers to the master, and
    ### "$master_*" to the slave.
    # We are master. Provoke slave to reset
    case "$slave_name" in
      mon204l)
        peer_host="mon205l"
        gearman_service="Gearman"
        selenium_service="Selenium"
        ;;
      mon302l)
        peer_host="mon303l"
        gearman_service="Gearman DevQA"
        selenium_service="Selenium DevQA"
        ;;
      *)
        # Without a known peer there is no host to submit a result for;
        # calling submit_check_result with an empty host would only shift
        # its arguments.
        echo "$(date): No slave configured for master '$slave_name'; nothing submitted" >> "$logfile"
        exit 1
        ;;
    esac

    # Arguments look like <host> <service> <return code> <output>
    # (NOTE(review): confirm against submit_check_result).
    # The first submission runs in the background; exec then replaces this
    # shell with the second one, so its exit status becomes the script's.
    "$submit_check_result" "$peer_host" "$gearman_service" 0 "Need to bounce slave..." &
    exec "$submit_check_result" "$peer_host" "$selenium_service" 0 "Need to bounce slave..."
    ;;
esac

# What is slave state?
case "$slave_state" in
  "WARNING" | "CRITICAL")
    # Count ESTABLISHED TCP connections on the slave whose address column ends
    # in 4730 or 4444 (presumably the Gearman and Selenium Hub ports).
    res=$(/usr/bin/ssh "$slave_name" 'netstat -tn | grep "4\(730\|444\)[[:space:]]\+.*ESTABLISHED" | wc -l')
    stat="$?"

    ## SSH failed???
    if [[ "$stat" -ne 0 ]]; then
      exit 1
    fi

    # Master is having issues - we should keep on running
    if [[ "$master_state" != "OK" ]]; then
      exit 0
    fi

    if [[ "$res" -gt 10 ]]; then
      # Stop both services, wait, then start them again. The remote script
      # always ends with "exit 0", so a non-zero status here means ssh itself
      # failed. stderr is folded into the captured output for the log/mail.
      res=$(/usr/bin/ssh "$slave_name" '
        echo "Killing Selenium Hub"
        /usr/bin/sudo /bin/systemctl stop selenium
        echo "Stopping Gearman"
        /usr/bin/sudo /bin/systemctl stop gearmand
        echo "Giving it 20 seconds..."
        sleep 20
        echo "Starting Selenium"
        /usr/bin/sudo /bin/systemctl start selenium
        echo "Starting Gearman"
        /usr/bin/sudo /bin/systemctl start gearmand
        exit 0
      ' 2>&1)
      stat="$?"

      {
        printf '%s\n' "$res"
        echo "******************"
        echo
      } >> "$logfile"

      printf '%s\n' "$res" \
        | /bin/mail -s "WARNING: Gearman and/or Selenium entities connected to inactive slave. Bounced slave processes on $slave_name" epm-list@Princeton.EDU igubenko@Princeton.EDU
      exit "$stat"
    fi

    exit 0
    ;;

  UNKNOWN)
    echo "Service state is unknown"
    exit 1
    ;;

  *)
    # NOTE(review): a slave in state "OK" has no arm of its own and therefore
    # also ends up here -- confirm that exiting 1 is intended in that case.
    echo "Invalid service state passed"
    exit 1
    ;;
esac

exit 0