You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
369 lines
10 KiB
369 lines
10 KiB
#!/bin/sh
|
|
|
|
# set -x
|
|
|
|
# Program name: metricator_cleaner.sh
|
|
# Purpose - Frontal script to metricator_cleaner.py and metricator_cleaner.pl, will launch Python or Perl script depending on interpreter availability
|
|
# See metricator_cleaner.py | metricator_cleaner.pl
|
|
# Author - Guilhem Marchand
|
|
|
|
# Version 2.0.1
|
|
|
|
# For AIX / Linux / Solaris
|
|
|
|
#################################################
|
|
## Your Customizations Go Here ##
|
|
#################################################
|
|
|
|
# format date output to strftime dd/mm/YYYY HH:MM:SS
|
|
log_date () {
|
|
date "+%d-%m-%Y %H:%M:%S"
|
|
}
|
|
|
|
# hostname
|
|
HOST=`hostname`
|
|
|
|
# Which type of OS are we running
|
|
UNAME=`uname`
|
|
|
|
if [ -z "${SPLUNK_HOME}" ]; then
|
|
echo "`log_date`, ${HOST} ERROR, SPLUNK_HOME variable is not defined"
|
|
exit 1
|
|
fi
|
|
|
|
# APP path discovery
|
|
if [ -d "$SPLUNK_HOME/etc/apps/TA-metricator-for-nmon" ]; then
|
|
APP=$SPLUNK_HOME/etc/apps/TA-metricator-for-nmon
|
|
|
|
elif [ -d "$SPLUNK_HOME/etc/peer-apps/TA-metricator-for-nmon" ];then
|
|
APP=$SPLUNK_HOME/etc/peer-apps/TA-metricator-for-nmon
|
|
|
|
else
|
|
echo "`log_date`, ${HOST} ERROR, the APP directory could not be defined, is the TA-metricator-for-nmon installed ?"
|
|
exit 1
|
|
fi
|
|
|
|
# source default nmon.conf
|
|
if [ -f $APP/default/nmon.conf ]; then
|
|
# During initial deployment, the nmon.conf needs to be managed properly by the metricator_consumer.sh
|
|
# wait for this to be done
|
|
grep '\[nmon\]' $APP/default/nmon.conf >/dev/null
|
|
if [ $? -eq 0 ]; then
|
|
echo "`log_date`, ${HOST} INFO, initial deployment condition detected, safe exiting."
|
|
exit 0
|
|
else
|
|
. $APP/default/nmon.conf
|
|
fi
|
|
fi
|
|
|
|
# source local nmon.conf, if any
|
|
|
|
# Search for a local nmon.conf file located in $SPLUNK_HOME/etc/apps/TA-metricator-for-nmon/local
|
|
if [ -f $APP/local/nmon.conf ]; then
|
|
. $APP/local/nmon.conf
|
|
fi
|
|
|
|
# On a per server basis, you can also set in /etc/nmon.conf
|
|
if [ -f /etc/nmon.conf ]; then
|
|
. /etc/nmon.conf
|
|
fi
|
|
|
|
# Manage FQDN option
|
|
echo $nmonparser_options | grep '\-\-use_fqdn' >/dev/null
|
|
if [ $? -eq 0 ]; then
|
|
# Only relevant for Linux OS
|
|
case $UNAME in
|
|
Linux)
|
|
HOST=`hostname -f` ;;
|
|
AIX)
|
|
HOST=`hostname` ;;
|
|
SunOS)
|
|
HOST=`hostname` ;;
|
|
esac
|
|
else
|
|
HOST=`hostname`
|
|
fi
|
|
|
|
# Manage host override option based on Splunk hostname defined
|
|
case $override_sys_hostname in
|
|
"1")
|
|
# Retrieve the Splunk host value
|
|
HOST=`cat $SPLUNK_HOME/etc/system/local/inputs.conf | grep '^host =' | awk -F\= '{print $2}' | sed 's/ //g'`
|
|
;;
|
|
esac
|
|
|
|
#
|
|
# Interpreter choice
|
|
#
|
|
|
|
PYTHON=0
|
|
PYTHON2=0
|
|
PYTHON3=0
|
|
PERL=0
|
|
# Set the default interpreter
|
|
INTERPRETER="python"
|
|
|
|
# Get the version for both worlds
|
|
PYTHON2=`which python 2>&1`
|
|
PYTHON3=`which python3 2>&1`
|
|
PERL=`which perl 2>&1`
|
|
|
|
# Handle Python
|
|
PYTHON_available="false"
|
|
case $PYTHON3 in
|
|
*python*)
|
|
PYTHON_available="true"
|
|
INTERPRETER="python3" ;;
|
|
*)
|
|
case $PYTHON2 in
|
|
*python*)
|
|
PYTHON_available="true"
|
|
INTERPRETER="python" ;;
|
|
esac
|
|
;;
|
|
esac
|
|
|
|
# Handle Perl
|
|
case $PERL in
|
|
*perl*)
|
|
PERL_available="true"
|
|
;;
|
|
*)
|
|
PERL_available="false"
|
|
;;
|
|
esac
|
|
|
|
case `uname` in
|
|
|
|
# AIX priority is Perl
|
|
"AIX")
|
|
case $PERL_available in
|
|
"true")
|
|
INTERPRETER="perl" ;;
|
|
"false")
|
|
INTERPRETER="$INTERPRETER" ;;
|
|
esac
|
|
;;
|
|
|
|
# Other OS, priority is Python
|
|
*)
|
|
case $PYTHON_available in
|
|
"true")
|
|
INTERPRETER="$INTERPRETER" ;;
|
|
"false")
|
|
INTERPRETER="perl" ;;
|
|
esac
|
|
;;
|
|
esac
|
|
|
|
# POSIX process run time in seconds (for Solaris only)
|
|
P_RUNTIME () {
|
|
t=`LC_ALL=POSIX ps -o etime= -p $1 | awk '{print $1}'`
|
|
d=0 h=0
|
|
case $t in *-*) d=$((0 + ${t%%-*})); t=${t#*-};; esac
|
|
case $t in *:*:*) h=$((0 + ${t%%:*})); t=${t#*:};; esac
|
|
s=$((10#$d*86400 + 10#$h*3600 + 10#${t%%:*}*60 + 10#${t#*:}))
|
|
echo $s
|
|
}
|
|
|
|
####################################################################
|
|
############# Main Program ############
|
|
####################################################################
|
|
|
|
# Store arguments sent to script
|
|
userargs=$@
|
|
|
|
###### Maintenance tasks ######
|
|
|
|
#
|
|
# Maintenance task1
|
|
#
|
|
|
|
# Maintenance task 1: verify if we have nmon processes running over the allowed period
|
|
# This issue seems to happen sometimes specially on AIX servers
|
|
|
|
# If an nmon process has not been terminated after its grace period, the process will be killed
|
|
|
|
# get the allowed runtime in seconds for an nmon process according to the configuration
|
|
# and add a 10 minute grace period
|
|
|
|
case `uname` in
|
|
|
|
"AIX"|"Linux"|"SunOS")
|
|
|
|
echo "`log_date`, ${HOST} INFO, starting maintenance task 1: verify nmon processes running over expected time period"
|
|
|
|
endtime=0
|
|
|
|
case ${mode_fifo} in
|
|
"1")
|
|
endtime=`expr ${fifo_interval} \* ${fifo_snapshot}` ;;
|
|
*)
|
|
endtime=`expr ${interval} \* ${snapshot}` ;;
|
|
esac
|
|
|
|
endtime=`expr ${endtime} + 600`
|
|
|
|
# get the list of running processes
|
|
case $UNAME in
|
|
"AIX"|"Linux")
|
|
oldPidList=`ps -eo user,pid,command,etime,args | grep "nmon" | grep "splunk" | grep "var/log/metricator" | grep -v metricator_reader | grep -v grep | awk '{ print $2 }'`
|
|
ps -eo user,pid,command,etime,args | grep "nmon" | grep "splunk" | grep "var/log/metricator" | grep -v metricator_reader | grep -v grep >/dev/null ;;
|
|
"SunOS")
|
|
oldPidList=`ps auxwww | grep "sadc" | grep "splunk" | grep "var/log/metricator" | grep -v metricator_reader | grep -v grep | awk '{ print $2 }'`
|
|
ps auxwww | grep "sadc" | grep "splunk" | grep "var/log/metricator" | grep -v metricator_reader | grep -v grep >/dev/null ;;
|
|
esac
|
|
|
|
if [ $? -eq 0 ]; then
|
|
|
|
for pid in $oldPidList; do
|
|
|
|
pid_runtime=0
|
|
# only run the process is running
|
|
if [ -d /proc/${pid} ]; then
|
|
# get the process runtime in seconds
|
|
|
|
case $UNAME in
|
|
"AIX"|"Linux")
|
|
pid_runtime=`ps -p ${pid} -oetime= | tr '-' ':' | awk -F: '{ total=0; m=1; } { for (i=0; i < NF; i++) {total += $(NF-i)*m; m *= i >= 2 ? 24 : 60 }} {print total}'`
|
|
;;
|
|
"SunOS")
|
|
pid_runtime=`P_RUNTIME ${pid}`
|
|
;;
|
|
esac
|
|
|
|
# additional protection
|
|
case ${pid_runtime} in
|
|
"")
|
|
;;
|
|
*)
|
|
if [ ${pid_runtime} -gt ${endtime} ]; then
|
|
echo "`log_date`, ${HOST} WARN, old nmon process found due to: `ps auxwww | grep $pid | grep -v grep` killing (SIGTERM) process $pid"
|
|
kill $pid
|
|
|
|
# Allow some time for the process to end
|
|
sleep 5
|
|
|
|
# re-check the status
|
|
ps -p ${pid} -oetime= >/dev/null
|
|
|
|
if [ $? -eq 0 ]; then
|
|
echo "`log_date`, ${HOST} WARN, old nmon process found due to: `ps auxwww | grep $pid | grep -v grep` failed to stop, killing (-9) process $pid"
|
|
kill -9 $pid
|
|
fi
|
|
|
|
fi
|
|
;;
|
|
esac
|
|
fi
|
|
|
|
done
|
|
|
|
fi
|
|
|
|
#
|
|
# Maintenance task2
|
|
# set -x
|
|
# - manage any fifo reader orphan processes (no associated nmon process)
|
|
# - manage any fifo reader duplicated (abnormal situation)
|
|
|
|
echo "`log_date`, ${HOST} INFO, starting maintenance task 2: verify orphan or duplicated fifo_reader processes"
|
|
|
|
for instance in fifo1 fifo2; do
|
|
|
|
# Initiate
|
|
oldPidNb=0
|
|
|
|
case $INTERPRETER in
|
|
"perl")
|
|
readerNbProc=2 ;;
|
|
"python"|"python3")
|
|
readerNbProc=3 ;;
|
|
esac
|
|
|
|
# get the list of running processes
|
|
ps auxwww | grep "nmon" | grep "splunk" | grep metricator_reader | grep ${instance} >/dev/null
|
|
|
|
if [ $? -eq 0 ]; then
|
|
|
|
oldPidList=`ps auxwwww | grep "nmon" | grep "splunk" | grep metricator_reader | grep ${instance} | grep -v grep | awk '{ print $2 }'`
|
|
oldPidNb=`ps auxwww | grep "nmon" | grep "splunk" | grep metricator_reader | grep ${instance} | grep -v grep | wc -l | awk '{print $1}'`
|
|
|
|
# search for associated nmon process
|
|
case $UNAME in
|
|
"AIX"|"Linux")
|
|
ps auxwww | grep "nmon" | grep "splunk" | grep "var/log/metricator" | grep -v metricator_reader | grep ${instance} >/dev/null
|
|
;;
|
|
"SunOS")
|
|
ps auxwww | grep "sadc" | grep "splunk" | grep "var/log/metricator" | grep -v metricator_reader | grep ${instance} >/dev/null
|
|
;;
|
|
esac
|
|
|
|
if [ $? -ne 0 ] && [ $oldPidNb -eq $readerNbProc ]; then
|
|
|
|
# no process found, kill the reader processes
|
|
for pid in $oldPidList; do
|
|
echo "`log_date`, ${HOST} WARN, orphan reader process found (no associated nmon process) due to: `ps auxwww | grep $pid | grep -v grep` killing (SIGTERM) process $pid"
|
|
kill $pid
|
|
|
|
# Allow some time for the process to end
|
|
sleep 5
|
|
|
|
# re-check the status
|
|
ps -p ${pid} -oetime= >/dev/null
|
|
|
|
if [ $? -eq 0 ]; then
|
|
echo "`log_date`, ${HOST} WARN, orphan reader process (no associated nmon process) due to: `ps auxwww | grep $pid | grep -v grep` failed to stop, killing (-9) process $pid"
|
|
kill -9 $pid
|
|
fi
|
|
done
|
|
|
|
# If nmon is running but the number of reader processes is higher than 2 (shell parent + Python/Perl child), something went wrong
|
|
elif [ $oldPidNb -gt $readerNbProc ]; then
|
|
|
|
echo "`log_date`, ${HOST} WARN, multiple reader for the same fifo were detected, this is an abnormal situation and reader will be killed."
|
|
|
|
# no process found, kill the reader processes
|
|
for pid in $oldPidList; do
|
|
echo "`log_date`, ${HOST} WARN, duplicated reader process found due to: `ps auxwww | grep $pid | grep -v grep` killing (SIGTERM) process $pid"
|
|
kill $pid
|
|
|
|
# Allow some time for the process to end
|
|
sleep 5
|
|
|
|
# re-check the status
|
|
ps -p ${pid} -oetime= >/dev/null
|
|
|
|
if [ $? -eq 0 ]; then
|
|
echo "`log_date`, ${HOST} WARN, duplicated reader process found due to: `ps auxwww | grep $pid | grep -v grep` failed to stop, killing (-9) process $pid"
|
|
kill -9 $pid
|
|
fi
|
|
done
|
|
|
|
fi
|
|
|
|
fi
|
|
|
|
done
|
|
|
|
;;
|
|
|
|
# End of per OS case
|
|
esac
|
|
|
|
###### End maintenance tasks ######
|
|
|
|
###### Start cleaner ######
|
|
|
|
case ${INTERPRETER} in
|
|
|
|
"python"|"python3")
|
|
$INTERPRETER $APP/bin/metricator_cleaner.py ${userargs} ;;
|
|
|
|
"perl")
|
|
$APP/bin/metricator_cleaner.pl ${userargs} ;;
|
|
|
|
esac
|
|
|
|
exit 0
|