You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

369 lines
10 KiB

#!/bin/sh
# set -x
# Program name: metricator_cleaner.sh
# Purpose - Front-end script for metricator_cleaner.py and metricator_cleaner.pl; it launches the Python or the Perl script depending on interpreter availability
# See metricator_cleaner.py | metricator_cleaner.pl
# Author - Guilhem Marchand
# Version 2.0.1
# For AIX / Linux / Solaris
#################################################
## Your Customizations Go Here ##
#################################################
# Print the current date and time as dd-mm-YYYY HH:MM:SS.
# Used as the timestamp prefix of the log lines written by this script.
log_date() {
    date "+%d-%m-%Y %H:%M:%S"
}
# Host name as reported by the system, used in every log line
HOST=`hostname`

# Operating system name as reported by uname
UNAME=`uname`

# SPLUNK_HOME is mandatory: report the problem and stop when it is empty or unset
[ -n "${SPLUNK_HOME}" ] || {
    echo "`log_date`, ${HOST} ERROR, SPLUNK_HOME variable is not defined"
    exit 1
}
# APP path discovery
# The add-on is looked up under etc/apps first, then under etc/peer-apps;
# the first directory that exists becomes APP.
APP=""
for app_parent in apps peer-apps; do
    if [ -d "$SPLUNK_HOME/etc/$app_parent/TA-metricator-for-nmon" ]; then
        APP=$SPLUNK_HOME/etc/$app_parent/TA-metricator-for-nmon
        break
    fi
done

# Neither location exists: nothing can be done without the add-on directory
if [ -z "$APP" ]; then
    echo "`log_date`, ${HOST} ERROR, the APP directory could not be defined, is the TA-metricator-for-nmon installed ?"
    exit 1
fi
# Source the configuration file given as $1 when it exists, do nothing otherwise.
# The path is quoted so that a location containing whitespace is handled as a
# single file name.
source_nmon_conf () {
    if [ -f "$1" ]; then
        . "$1"
    fi
}

# source default nmon.conf
if [ -f "$APP/default/nmon.conf" ]; then
    # During initial deployment, the nmon.conf needs to be managed properly by the metricator_consumer.sh
    # wait for this to be done: as long as the file still contains a [nmon] stanza
    # header it is not sourced and the script exits successfully
    if grep '\[nmon\]' "$APP/default/nmon.conf" >/dev/null; then
        echo "`log_date`, ${HOST} INFO, initial deployment condition detected, safe exiting."
        exit 0
    fi
    . "$APP/default/nmon.conf"
fi

# source local nmon.conf, if any (sourced after the default one, so its values win)
source_nmon_conf "$APP/local/nmon.conf"

# On a per server basis, you can also set in /etc/nmon.conf (sourced last)
source_nmon_conf /etc/nmon.conf
# Manage FQDN option
# When nmonparser_options contains --use_fqdn, HOST is re-evaluated per OS;
# otherwise HOST is simply reset to the output of hostname.
if echo $nmonparser_options | grep '\-\-use_fqdn' >/dev/null; then
    # Only relevant for Linux OS, the other known systems keep the plain hostname
    case $UNAME in
        Linux)
            HOST=`hostname -f` ;;
        AIX|SunOS)
            HOST=`hostname` ;;
    esac
else
    HOST=`hostname`
fi
# Replace HOST by the host value found in the Splunk inputs.conf given as $1.
# Only the first line starting with "host =" is used, spaces are removed from
# the value. HOST is left untouched when the file does not exist or does not
# define a host, so that log lines never end up with an empty host.
# Globals: HOST (written), splunk_host (scratch)
override_host_from_splunk () {
    splunk_host=""
    if [ -f "$1" ]; then
        splunk_host=`grep '^host =' "$1" | awk -F\= 'NR == 1 {print $2}' | sed 's/ //g'`
    fi
    if [ -n "$splunk_host" ]; then
        HOST=$splunk_host
    fi
}

# Manage host override option based on Splunk hostname defined
case $override_sys_hostname in
    "1")
        # Retrieve the Splunk host value
        override_host_from_splunk "$SPLUNK_HOME/etc/system/local/inputs.conf"
        ;;
esac
#
# Interpreter choice
#

# Print the location of command $1 when it can be found, print nothing and
# return non-zero otherwise.
# `command -v` is used rather than `which`: some `which` implementations report
# a missing command with a message such as "no python3 in ...", and matching
# that message against *python* / *perl* wrongly flagged the interpreter as
# available.
locate_interpreter () {
    command -v "$1" 2>/dev/null
}

# Decide which interpreter runs the cleaner.
# Arguments:
#   $1 - OS name as printed by uname (optional, defaults to the output of uname)
# Globals written:
#   PYTHON2, PYTHON3, PERL         - location of each interpreter, empty when missing
#   PYTHON_available, PERL_available - "true" or "false"
#   INTERPRETER                    - "python3", "python" or "perl"
# Rules:
#   - python3 is preferred over python
#   - on AIX, Perl has priority whenever it is available
#   - on any other OS, Python has priority and Perl is the fallback
select_interpreter () {
    PYTHON=0
    PYTHON2=`locate_interpreter python`
    PYTHON3=`locate_interpreter python3`
    PERL=`locate_interpreter perl`

    # Set the default interpreter
    INTERPRETER="python"

    # Handle Python
    PYTHON_available="false"
    if [ -n "$PYTHON3" ]; then
        PYTHON_available="true"
        INTERPRETER="python3"
    elif [ -n "$PYTHON2" ]; then
        PYTHON_available="true"
        INTERPRETER="python"
    fi

    # Handle Perl
    if [ -n "$PERL" ]; then
        PERL_available="true"
    else
        PERL_available="false"
    fi

    case ${1:-`uname`} in
        # AIX priority is Perl
        "AIX")
            if [ "$PERL_available" = "true" ]; then
                INTERPRETER="perl"
            fi
            ;;
        # Other OS, priority is Python
        *)
            if [ "$PYTHON_available" = "false" ]; then
                INTERPRETER="perl"
            fi
            ;;
    esac
}

select_interpreter
# POSIX process run time in seconds (for Solaris only)
# Arguments:
#   $1 - pid of the process
# Outputs:
#   the elapsed time of the process in seconds on stdout; nothing when ps
#   returns no elapsed time for that pid
# ps reports the elapsed time as [[dd-]hh:]mm:ss. The day separator is turned
# into ":" so that the number of fields tells which units are present. The
# conversion is done by awk, which reads zero-padded fields such as "08" or
# "09" as decimal numbers (shell arithmetic would reject them as invalid octal).
P_RUNTIME () {
    LC_ALL=POSIX ps -o etime= -p "$1" | tr '-' ':' | awk -F: '
        NF == 2 { print $1 * 60 + $2 }
        NF == 3 { print $1 * 3600 + $2 * 60 + $3 }
        NF == 4 { print $1 * 86400 + $2 * 3600 + $3 * 60 + $4 }'
}
####################################################################
############# Main Program ############
####################################################################
# Store arguments sent to script
# The positional parameters are flattened into a single string here; that string
# is expanded unquoted when the cleaner is started at the end of the script, so
# an argument that itself contains whitespace is not kept as one word.
userargs=$@
###### Maintenance tasks ######

# Stop a process: send SIGTERM, wait, then send SIGKILL when ps still lists it.
# Arguments:
#   $1 - pid of the process to stop
#   $2 - description used in the SIGTERM log line
#   $3 - description used in the SIGKILL log line (optional, defaults to $2)
#   $4 - seconds to wait between both signals (optional, defaults to 5)
# Outputs:
#   one WARN log line on stdout per signal sent
terminate_process () {
    tp_pid=$1
    tp_term_reason=$2
    tp_kill_reason=${3:-$2}
    tp_grace=${4:-5}
    echo "`log_date`, ${HOST} WARN, ${tp_term_reason} due to: `ps auxwww | grep $tp_pid | grep -v grep` killing (SIGTERM) process $tp_pid"
    kill $tp_pid
    # Allow some time for the process to end
    sleep $tp_grace
    # re-check the status
    if ps -p ${tp_pid} -oetime= >/dev/null; then
        echo "`log_date`, ${HOST} WARN, ${tp_kill_reason} due to: `ps auxwww | grep $tp_pid | grep -v grep` failed to stop, killing (-9) process $tp_pid"
        kill -9 $tp_pid
    fi
}

case `uname` in
"AIX"|"Linux"|"SunOS")

    #
    # Maintenance task 1: verify if we have nmon processes running over the allowed period
    # This issue seems to happen sometimes specially on AIX servers
    # If an nmon process has not been terminated after its grace period, the process will be killed
    #
    echo "`log_date`, ${HOST} INFO, starting maintenance task 1: verify nmon processes running over expected time period"

    # get the allowed runtime in seconds for an nmon process according to the configuration
    # (interval * snapshot of the active mode) and add a 10 minute grace period
    case ${mode_fifo} in
    "1")
        run_interval=${fifo_interval}
        run_snapshot=${fifo_snapshot} ;;
    *)
        run_interval=${interval}
        run_snapshot=${snapshot} ;;
    esac
    endtime=`expr "${run_interval}" \* "${run_snapshot}" + 600 2>/dev/null`

    case ${endtime} in
    ""|*[!0-9]*)
        # Without a valid limit no process can be declared too old: skip the check
        # rather than comparing against an empty value
        echo "`log_date`, ${HOST} WARN, the nmon interval / snapshot values are not valid numbers, maintenance task 1 is skipped"
        oldPidList=""
        ;;
    *)
        # get the list of running processes (one ps snapshot)
        case $UNAME in
        "AIX"|"Linux")
            oldPidList=`ps -eo user,pid,command,etime,args | grep "nmon" | grep "splunk" | grep "var/log/metricator" | grep -v metricator_reader | grep -v grep | awk '{ print $2 }'` ;;
        "SunOS")
            oldPidList=`ps auxwww | grep "sadc" | grep "splunk" | grep "var/log/metricator" | grep -v metricator_reader | grep -v grep | awk '{ print $2 }'` ;;
        *)
            oldPidList="" ;;
        esac
        ;;
    esac

    for pid in $oldPidList; do
        # only when the process is still running
        [ -d /proc/${pid} ] || continue

        # get the process runtime in seconds
        pid_runtime=0
        case $UNAME in
        "AIX"|"Linux")
            pid_runtime=`ps -p ${pid} -oetime= | tr '-' ':' | awk -F: '{ total=0; m=1; } { for (i=0; i < NF; i++) {total += $(NF-i)*m; m *= i >= 2 ? 24 : 60 }} {print total}'`
            ;;
        "SunOS")
            pid_runtime=`P_RUNTIME ${pid}`
            ;;
        esac

        # additional protection: ignore an empty or non numeric runtime
        # (for instance when the process ended in the meantime)
        case ${pid_runtime} in
        ""|*[!0-9]*)
            ;;
        *)
            if [ "${pid_runtime}" -gt "${endtime}" ]; then
                terminate_process "$pid" "old nmon process found"
            fi
            ;;
        esac
    done

    #
    # Maintenance task 2
    # - manage any fifo reader orphan processes (no associated nmon process)
    # - manage any fifo reader duplicated (abnormal situation)
    #
    echo "`log_date`, ${HOST} INFO, starting maintenance task 2: verify orphan or duplicated fifo_reader processes"

    # Number of processes expected for one reader, depending on the interpreter
    case $INTERPRETER in
    "perl")
        readerNbProc=2 ;;
    "python"|"python3")
        readerNbProc=3 ;;
    esac

    for instance in fifo1 fifo2; do

        # get the list of running reader processes for this fifo; the list and the
        # count come from the same ps snapshot so that they always agree
        oldPidList=`ps auxwww | grep "nmon" | grep "splunk" | grep metricator_reader | grep ${instance} | grep -v grep | awk '{ print $2 }'`
        [ -n "$oldPidList" ] || continue
        oldPidNb=`echo "$oldPidList" | grep -c .`

        # search for associated nmon process
        case $UNAME in
        "AIX"|"Linux")
            ps auxwww | grep "nmon" | grep "splunk" | grep "var/log/metricator" | grep -v metricator_reader | grep ${instance} >/dev/null
            ;;
        "SunOS")
            ps auxwww | grep "sadc" | grep "splunk" | grep "var/log/metricator" | grep -v metricator_reader | grep ${instance} >/dev/null
            ;;
        esac
        if [ $? -ne 0 ] && [ "$oldPidNb" -eq "$readerNbProc" ]; then
            # no process found, kill the reader processes
            for pid in $oldPidList; do
                terminate_process "$pid" "orphan reader process found (no associated nmon process)" "orphan reader process (no associated nmon process)"
            done

        # More reader processes than expected for one reader: something went wrong
        elif [ "$oldPidNb" -gt "$readerNbProc" ]; then
            echo "`log_date`, ${HOST} WARN, multiple reader for the same fifo were detected, this is an abnormal situation and reader will be killed."
            for pid in $oldPidList; do
                terminate_process "$pid" "duplicated reader process found"
            done
        fi

    done
    ;;

# End of per OS case
esac
###### End maintenance tasks ######
###### Start cleaner ######
# Hand over to the cleaner matching the selected interpreter.
# - The script path is quoted so that an installation path containing
#   whitespace is passed as a single word.
# - ${userargs} is deliberately left unquoted: it holds the script arguments
#   flattened into one string and has to be split back into words.
# - The Perl cleaner is executed directly, the Python one through $INTERPRETER.
# The script always exits 0, whatever the cleaner returned.
case ${INTERPRETER} in
"python"|"python3")
    $INTERPRETER "$APP/bin/metricator_cleaner.py" ${userargs} ;;
"perl")
    "$APP/bin/metricator_cleaner.pl" ${userargs} ;;
esac
exit 0