#!/bin/bash
#
# Event handler: bounces the Selenium Hub service and the Gearmand service
# on the slave to kick the connections back to the master.
#
# Usage:
#   <script> <slave_name> <slave_state> <master_state> <n_slave_attempts>
#
# Arguments:
#   $1  slave_name        host the event refers to
#   $2  slave_state       service state of that host (OK/WARNING/CRITICAL/UNKNOWN)
#   $3  master_state      service state of the peer master
#   $4  n_slave_attempts  attempt counter; only written to the log
#
# Exit status:
#   0  nothing to do, or the slave services were bounced successfully
#   1  bad usage, SSH failure, unknown host, or unknown/invalid state
#   N  exit status of the bounce SSH session when a bounce was attempted
#
# NOTE: strict mode (set -e / pipefail) is intentionally not enabled: the
# script captures and inspects exit statuses by hand.

if [[ "$#" -ne 4 ]]; then
  echo "Usage: $0 <slave_name> <slave_state> <master_state> <n_slave_attempts>" >&2
  exit 1
fi

# Presumably exports the ssh-agent environment used by the ssh calls below;
# the file is not part of this script -- TODO confirm.
# shellcheck source=/dev/null
. /home/nagios/.ssh/.ssh-agent

slave_name="$1"
slave_state="$2"
master_state="$3"
n_slave_attempts="$4"

logfile="/usr/local/monitoring/log/hub_recovery"
submit_check_result="/usr/local/monitoring/libexec/eventhandlers/submit_check_result"
# Bounce the slave only when more than this many matching connections exist.
conn_threshold=10

# Append one line (possibly empty) to the log file.
log() {
  printf '%s\n' "$*" >> "$logfile"
}

log ""
log "*****************"
log "$(date): Running event handler..."
log "Args:###$slave_name###$slave_state###$master_state###$n_slave_attempts###"
log ""

if [[ "$slave_name" == "mon205l" ]]; then
  # NOTE(review): "master" is not read anywhere in this script; kept in case
  # the sourced environment or a future change relies on it.
  master="mon204l"
elif [[ "$slave_name" == "mon303l" ]]; then
  master="mon302l"
elif [[ "$slave_state" == "OK" && "$master_state" != "OK" ]]; then
  ### In this section "slave_*" actually refers to the master,
  ### and "$master_*" to the slave.
  # We are master. Provoke slave to reset.
  declare -A masmap=(["mon204l"]="mon205l" ["mon302l"]="mon303l")
  declare -A sermap1=(["mon204l"]="Gearman" ["mon302l"]="Gearman DevQA")
  declare -A sermap2=(["mon204l"]="Selenium" ["mon302l"]="Selenium DevQA")

  peer="${masmap[$slave_name]:-}"
  if [[ -z "$peer" ]]; then
    # Without this guard an unmapped host would expand to nothing and shift
    # every argument of submit_check_result by one position.
    log "No slave is configured for host '$slave_name'; not submitting results"
    exit 1
  fi

  # The first result is submitted in the background; the second replaces
  # this shell, so its exit status becomes the handler's exit status.
  "$submit_check_result" "$peer" "${sermap1[$slave_name]}" 0 "Need to bounce slave..." &
  exec "$submit_check_result" "$peer" "${sermap2[$slave_name]}" 0 "Need to bounce slave..."
else
  # Transient
  exit 0
fi

# Commands executed on the slave to restart both services. The remote script
# always ends with "exit 0", so a non-zero status from the ssh call means the
# session itself failed or was cut short.
bounce_cmds='
echo "Killing Selenium Hub"
/usr/bin/sudo /bin/systemctl stop selenium
echo "Stopping Gearman"
/usr/bin/sudo /bin/systemctl stop gearmand
echo "Giving it 20 seconds..."
sleep 20
echo "Starting Selenium"
/usr/bin/sudo /bin/systemctl start selenium
echo "Starting Gearman"
/usr/bin/sudo /bin/systemctl start gearmand
exit 0
'

# What is slave state?
case "$slave_state" in
  "WARNING" | "CRITICAL")
    # Count ESTABLISHED TCP connections on the slave whose address column
    # ends in 4730 or 4444. "grep | wc -l" is deliberate: "grep -c" exits 1
    # on zero matches, which the status check below would treat as an SSH
    # failure.
    res=$(/usr/bin/ssh "$slave_name" 'netstat -tn | grep "4\(730\|444\)[[:space:]]\+.*ESTABLISHED" | wc -l')
    stat="$?"

    ## SSH failed???
    if [[ "$stat" -ne 0 ]]; then
      exit 1
    fi

    # Master is having issues - we should keep on running
    if [[ "$master_state" != "OK" ]]; then
      exit 0
    fi

    if [[ "$res" -gt "$conn_threshold" ]]; then
      res=$(/usr/bin/ssh "$slave_name" "$bounce_cmds" 2>&1)
      stat="$?"

      log "$res"
      log "******************"
      log ""

      printf '%s\n' "$res" | /bin/mail -s "WARNING: Gearman and/or Selenium entities connected to inactive slave. Bounced slave processes on $slave_name" epm-list@Princeton.EDU igubenko@Princeton.EDU
      exit "$stat"
    fi
    exit 0
    ;;
  UNKNOWN)
    echo "Service state is unknown"
    exit 1
    ;;
  *)
    echo "Invalid service state passed"
    exit 1
    ;;
esac

exit 0