You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
SH-Deployer/apps/SA-ITOA/bin/command_fill_gaps_backfill.py

273 lines
12 KiB

# Copyright (C) 2005-2024 Splunk Inc. All Rights Reserved.
import sys
import csv
from splunk.clilib.bundle_paths import make_splunkhome_path
sys.path.append(make_splunkhome_path(['etc', 'apps', 'SA-ITOA', 'lib']))
sys.path.append(make_splunkhome_path(['etc', 'apps', 'SA-ITOA', 'lib', 'SA_ITOA_app_common']))
import itsi_path
from itsi_py3 import _
from ITOA.setup_logging import logger
from ITOA.splunk_search_chunk_protocol import SearchChunkProtocol
from ITOA.itoa_common import is_string_numeric, get_csv_dict_writer, get_log_message_for_exception
class FillDataGapsBackfillCommand(SearchChunkProtocol):
    """
    This custom search command is specifically used to fill gaps for KPI with "last available value",
    while backfilling the KPI. We need a custom search command to perform filling of gaps, coz
    streamstats SPL command would not stick in missing bucket or entity results. While backfilling
    KPI, we may encounter missing bucket or entity results. To fill those gaps (generate results for
    entities and/or timestamps/buckets) with "last reported value", for entity/service aggregate,
    we use this command.
    NOTE: Filling Gaps for KPI with "Custom Value" means overriding N/A values for KPI, with
    a custom value, which doesn't involve adding missing entity or timestamp events. Therefore,
    we don't use this custom search command in that scenario. setseverityfields command handles that
    case.
    """

    def __init__(self):
        """
        Perform the chunked-protocol handshake and parse the command arguments.
        """
        hand_shake_output_data = {
            'type': 'reporting'
        }
        super(FillDataGapsBackfillCommand, self).__init__(output_meta_data=hand_shake_output_data, logger=logger)
        # entity_split_field arg is needed when KPI has "split by entity" option enabled, i.e.
        # kpi generates entity results. kpi_type arg is not required in that case.
        self.entity_split_field = self.args.get('entity_split_field')
        self.alert_period = self.args.get('alert_period')
        self.is_service_aggregate = False
        # arg kpi_type is needed for "service_aggregate" KPI (no entity results)
        if self.args.get('kpi_type') == 'service_aggregate':
            self.is_service_aggregate = True
        # Cache of last reported alert values. For entity-split KPIs this maps
        # entity title -> {'alert_value': ..., '_time': ...}; for service
        # aggregate KPIs it holds the flat keys 'alert_value' and '_time'.
        self.cached_results = {}
        # _time of the most recently processed result row (string, as read from CSV).
        self.last_timestamp = None
        # Entity titles already seen at the timestamp currently being processed.
        self.existing_entity_results = set([])
        # Entity titles missing at the final timestamp; written out in post_processing.
        self.entity_results_to_add = None
        if self.is_service_aggregate:
            self.expected_fieldnames = ['_time', 'alert_value']
        else:
            self.expected_fieldnames = ['_time', 'alert_value', self.entity_split_field]

    def validate_search_args(self):
        """
        Validate search arguments

        @rtype: tuple
        @return: return boolean flag and messages
        """
        entity_split_field = self.args.get('entity_split_field')
        alert_period = self.args.get('alert_period')
        kpi_type = self.args.get('kpi_type')
        msgs = []
        if not kpi_type and not entity_split_field and not alert_period:
            message = _('Invalid options passed to "fillgapsbackfill" command; must have alert period and '
                        'kpi type or entity split field.')
            logger.error(message)
            msgs.append(message)
        if not kpi_type:
            # Validate entity_split_field
            if not entity_split_field:
                # BUG FIX: build the message before logging it. Previously
                # `logger.error(message)` ran before `message` was assigned in this
                # branch, raising UnboundLocalError whenever the combined check
                # above had not already set `message`.
                message = _('`entity_split_field` argument not provided to "fillgapsbackfill" search command.')
                logger.error(message)
                msgs.append(message)
        if not alert_period:
            message = _('`alert_period` argument not provided to "fillgapsbackfill" search command.')
            logger.error(message)
            msgs.append(message)
        # TypeError is caught too so a missing (None) alert_period is reported as
        # non-integer in addition to the "not provided" message above.
        try:
            int(alert_period)
        except (ValueError, TypeError):
            message = _('`alert_period` argument provided to "fillgapsbackfill" search command, is not integer.')
            logger.error(message)
            msgs.append(message)
        if len(msgs) > 0:
            return False, msgs
        else:
            return True, msgs

    def pre_processing(self):
        """
        Override function
        Convert alert period to integer from string

        @return:
        """
        self.alert_period = int(self.alert_period) * 60  # convert to seconds

    def run(self, metadata, reader, chunk):
        """
        Override function
        Read the chunk data and, override N/A values and fill gaps.

        @param metadata: metadata dict of the incoming chunk
        @param reader: csv DictReader over the chunk's results
        @param chunk: raw chunk payload
        @return:
        """
        out_metadata = {'finished': metadata.get('finished', False)}
        if reader.fieldnames:
            if self.is_service_aggregate:
                self._cache_and_fill_aggregate_results(self.writer, reader)
            else:
                self._cache_and_fill_entity_results(self.writer, reader)
        # if it is the last chunk, then check for missing entity results at current timestamp.
        # BUG FIX: this check applies only to entity-split KPIs. For service
        # aggregate KPIs, cached_results holds the flat keys 'alert_value'/'_time',
        # which would be mistaken here for missing entity titles and later crash
        # post_processing when indexed as per-entity cache entries.
        if not self.is_service_aggregate and metadata.get('finished', False):
            self.entity_results_to_add = set(self.cached_results.keys()).difference(self.existing_entity_results)
            if self.entity_results_to_add:
                # keep the pipeline open so post_processing can emit the filled rows
                out_metadata = {'finished': False}
        self.write_chunk(out_metadata, self.output_buf.getvalue())

    def post_processing(self):
        """
        Perform post processing.

        @return:
        """
        # check for missing entities at final timestamp in results. If there are any, fill them with cached entity results.
        if self.entity_results_to_add:
            output_buf = self.get_string_buffer()
            writer_last_bucket = get_csv_dict_writer(output_buf, fieldnames=self.expected_fieldnames)
            self._write_missing_entity_results(writer_last_bucket)
            self.write_chunk({'finished': True}, output_buf.getvalue())

    def _write_missing_bucket_results(self, writer, curr_timestamp):
        """
        Fill cached results for buckets or timestamps, for which results do not
        exist. Compares last result's timestamp and current result's timestamp,
        to find if there are missing results for buckets in between, as per
        the alert period of KPI.

        @param writer: csv writer object
        @param curr_timestamp: timestamp (int) of current results
        @return:
        """
        # last_timestamp is a string as read from CSV; alert_period is already in seconds
        new_time_bucket = int(self.last_timestamp) + self.alert_period
        if new_time_bucket >= curr_timestamp:
            # no whole bucket is missing between the two timestamps
            return
        logger.debug('Data gaps found between timestamp %s and timestamp %s. Filling results for missing timestamps.' %
                     (self.last_timestamp, curr_timestamp))
        while new_time_bucket < curr_timestamp:
            if self.is_service_aggregate:
                aggregate_result_to_add = {
                    'alert_value': self.cached_results['alert_value'],
                    '_time': new_time_bucket
                }
                writer.writerow(aggregate_result_to_add)
            else:
                # replay every cached entity's last value into the missing bucket
                for entity_title in self.cached_results:
                    entity_to_add = {
                        'alert_value': self.cached_results[entity_title]['alert_value'],
                        '_time': new_time_bucket,
                        self.entity_split_field: entity_title
                    }
                    writer.writerow(entity_to_add)
            new_time_bucket += self.alert_period

    def _write_missing_entity_results(self, writer):
        """
        Fill missing entity results for current timestamp/bucket using cached results.

        @param writer: csv writer object
        @return:
        """
        for entity_title in self.entity_results_to_add:
            entity_to_add = {
                'alert_value': self.cached_results[entity_title]['alert_value'],
                '_time': self.last_timestamp,
                self.entity_split_field: entity_title
            }
            writer.writerow(entity_to_add)

    def _cache_and_fill_entity_results(self, writer, reader):
        """
        Read entity results, cache them and fill missing entity and
        bucket/timestamp results.

        @param writer: csv writer object
        @param reader: csv DictReader object
        @return:
        """
        for result in reader:
            entity_name = result.get(self.entity_split_field)
            curr_timestamp = result.get('_time')
            if self.last_timestamp is None or curr_timestamp == self.last_timestamp:
                # collect entity results for a specific timestamp or bucket.
                self.existing_entity_results.add(entity_name)
            else:
                # timestamp changed: check the difference between last entity results and cached
                # entity results, and write the missing entity results in last timestamp
                self.entity_results_to_add = set(self.cached_results.keys()).difference(self.existing_entity_results)
                logger.debug('Data gaps found for entities="%s", at timestamp="%s"' %
                             (list(self.entity_results_to_add), self.last_timestamp))
                self._write_missing_entity_results(writer)
                self._write_missing_bucket_results(writer, int(curr_timestamp))
                self.existing_entity_results = set([entity_name])
            # cache latest alert value, if it is not a gap or N/A
            if entity_name in self.cached_results:
                if result.get('alert_value') and result.get('alert_value').strip() != 'N/A':
                    logger.debug('New alert value found for entity="%s" at timestamp="%s". Updating cache with alert '
                                 'value.' % (entity_name, curr_timestamp))
                    self.cached_results[entity_name] = {
                        'alert_value': result.get('alert_value'),
                        '_time': curr_timestamp
                    }
                else:
                    logger.debug('Data gap found for entity="%s" at timestamp="%s". Filling data gap with cached alert '
                                 'value of entity.' % (entity_name, curr_timestamp))
                    result['alert_value'] = self.cached_results[entity_name]['alert_value']
            else:
                logger.debug('New entity="%s" found at timestamp="%s". Adding entity\'s alert value to the cache.' %
                             (entity_name, curr_timestamp))
                self.cached_results[entity_name] = {
                    'alert_value': result.get('alert_value'),
                    '_time': curr_timestamp
                }
            self.last_timestamp = curr_timestamp
            writer.writerow(result)

    def _cache_and_fill_aggregate_results(self, writer, reader):
        """
        If KPI has only service aggregate results, read service aggregate results, cache
        them and, override N/A aggregate and missing bucket results.

        @param writer: csv writer object
        @param reader: csv DictReader object
        @return:
        """
        for result in reader:
            curr_timestamp = result.get('_time')
            if self.last_timestamp is not None and self.cached_results:
                self._write_missing_bucket_results(writer, int(curr_timestamp))
            if result.get('alert_value') and result.get('alert_value').strip() != 'N/A':
                logger.debug('New "service_aggregate" alert value found at timestamp="%s". Updating cache with alert '
                             'value.' % curr_timestamp)
                self.cached_results['alert_value'] = result.get('alert_value')
                self.cached_results['_time'] = curr_timestamp
            else:
                # N/A or empty value: fill it from the cache if we have one; a leading
                # run of N/A rows (empty cache) is passed through unchanged.
                if self.cached_results:
                    logger.debug('"service_aggregate" data gap found at timestamp="%s". Filling data gap with cached alert '
                                 'value.' % curr_timestamp)
                    result['alert_value'] = self.cached_results['alert_value']
            self.last_timestamp = curr_timestamp
            writer.writerow(result)
if __name__ == "__main__":
    # Run the command; on failure, report the error back through the chunked
    # protocol when the command object exists, otherwise re-raise.
    command = None
    try:
        command = FillDataGapsBackfillCommand()
        command.execute()
    except Exception as exc:
        logger.exception(exc)
        if command is None:
            # construction itself failed; nothing to report through, so propagate
            raise
        command.exit_with_error({'finished': True}, [get_log_message_for_exception(exc)])