You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

328 lines
11 KiB

import os
import sys
import time
# Bootstrap the Splunk-bundled Anaconda Python environment. This re-execs the
# interpreter, so it MUST run before any scientific-stack imports below
# (numpy/pandas) — do not reorder these lines.
import exec_anaconda
exec_anaconda.exec_anaconda()
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "lib"))
# Add the directory where this script resides to the Python path
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from util.constants import (
ITSI_TIMESTAMP_FORMAT,
COL_VALUE, COL_DATE,
DEFAULT_THRESHOLD,
DRIFT_DIRECTION_BOTH,
DRIFT_DIRECTION_UP,
DRIFT_DIRECTION_DOWN,
)
from util.csc_output import (
CONSTANT_INPUT,
DRIFT_STR_DICT,
INPUT_MIN_DATA_POINT,
INPUT_MIN_TIME_LENGTH,
SHORT_INPUT, EMPTY_INPUT,
summarize_drift_result
)
from util.csc_input import parse_timestamp
from algo.drift_detection import detect_drifts
from splunklib.searchcommands import dispatch, StreamingCommand, Configuration, Option, validators
import numpy as np
import pandas as pd
from logger import get_logger
from util.telemetry_logger import log_telemetry
# Module-level logger shared by the command classes below.
logger = get_logger()
from constants import (
ALERT_VALUE,
KPI_ID,
SERVICE_ID,
PART_OR_WHOLE,
DRIFT_TYPE,
THRESHOLD_TIME,
DRIFT_DETECTION_RESULTS_COLLECTION,
DRIFT_TIME_WINDOWS,
KVSTORE_KEY,
IS_DRIFT_DETECTED,
LAST_DRIFT_AT,
START_TIME,
END_TIME,
PERCENT_DRIFT
)
class DriftDetector:
    """
    Stateless helpers for normalizing KPI time series and running drift
    detection on them. All methods work on pandas DataFrames that follow
    the shared COL_DATE / COL_VALUE column conventions.
    """

    @staticmethod
    def prepare_dataframe(df, time_field, alert_value_field, timestamp_format):
        """
        Normalize *df* for drift detection: map the configured time/value
        columns onto the canonical names, blank out whitespace-only cells,
        parse timestamps, coerce values to float, drop incomplete rows, and
        return the frame indexed and sorted by timestamp.
        """
        df.rename(
            columns={time_field: COL_DATE, alert_value_field: COL_VALUE},
            inplace=True,
        )
        # Cells that are empty or all whitespace ('^' start / '$' end anchors)
        # become NaN so the dropna() below removes those rows.
        df = df.replace(r'^\s*$', np.nan, regex=True)
        df = parse_timestamp(df, timestamp_format)
        df[COL_VALUE] = df[COL_VALUE].astype(float)
        df.dropna(inplace=True)
        df.set_index(COL_DATE, inplace=True)
        return df.sort_index()

    @staticmethod
    def detect_drifts(df, threshold, threshold_direction):
        """
        Run the drift-detection algorithm over the canonical value column.
        Only the list of detected drifts is returned; the algorithm's other
        two outputs are discarded.
        """
        found, _, _ = detect_drifts(
            series=df[COL_VALUE],
            threshold=threshold,
            threshold_direction=threshold_direction,
        )
        return found
@Configuration()
class DriftDetectionCommand(StreamingCommand):
    """
    Streaming search command that buffers the incoming records of a single
    KPI's time series, runs drift detection once the input stream has
    finished, yields one output event per detected drift (or a warning
    event), and upserts a per-KPI summary into a KV Store collection.
    """
    # Names of the input fields carrying the value, KPI id and service id;
    # overridable so callers can map their own dataset columns.
    alert_value_field = Option(require=False, default=ALERT_VALUE)
    kpi_id_field = Option(require=False, default=KPI_ID)
    service_id_field = Option(require=False, default=SERVICE_ID)
    # The event timestamp is always taken from Splunk's '_time' field.
    time_field = '_time'
    timestamp_format = Option(require=False, default=ITSI_TIMESTAMP_FORMAT)
    threshold = Option(require=False, default=DEFAULT_THRESHOLD, validate=validators.Integer())
    threshold_direction = Option(
        require=False,
        default=DRIFT_DIRECTION_BOTH,
        validate=validators.Set(DRIFT_DIRECTION_BOTH, DRIFT_DIRECTION_UP, DRIFT_DIRECTION_DOWN)
    )
    # Add the two options for debugging/investigating purpose
    # The check_time_length option will be handy when testing a time series
    # with enough data points but its time span is less than the configured limit.
    check_time_length = Option(require=False, default=True, validate=validators.Boolean())
    # The output_epoch_time option will be handy when one needs to have more readable timestamps in an investigation
    output_epoch_time = Option(require=False, default=True, validate=validators.Boolean())
    # Name of the KV Store collection the results are persisted into.
    collection_name = DRIFT_DETECTION_RESULTS_COLLECTION

    def __init__(self):
        super().__init__()
        # Prepared DataFrame; populated by prepare_dataframe() once the
        # buffer is complete.
        self.df = None
        self.buffer = []  # Buffer to store records for a single KPI

    @Configuration()
    def map(self, records):
        # Identity map phase; all real work happens in stream().
        return records

    def get_or_create_collection(self):
        """Return the results KV Store collection, creating it if absent."""
        kvstore = self.service.kvstore
        return kvstore.create(self.collection_name) if self.collection_name not in kvstore else kvstore[
            self.collection_name]

    @staticmethod
    def _find_latest_end_time(data):
        """
        Finds the latest end_time in a list of dictionaries.
        :param data: A list of dictionaries, each containing an 'end_timestamp' key with an epoch value.
        :return: The latest end_timestamp value found in the data list, or None if no end_time is found.
        """
        max_end_time = None
        for entry in data:
            if END_TIME in entry and (max_end_time is None or entry[END_TIME] > max_end_time):
                # Update the max_end_time with the current entry's end_time value
                max_end_time = entry[END_TIME]
        return max_end_time

    def save_to_kvstore(self, kpi_id, service_id, data):
        """
        Upsert a single per-KPI summary document (keyed by kpi_id) into the
        results collection.
        :param kpi_id: KPI identifier; used as the KV Store document key.
        :param service_id: owning service identifier.
        :param data: list of formatted drift dicts (possibly empty).
        """
        # Get or create KV Store collection
        collection = self.get_or_create_collection()
        latest_end_timestamp = self._find_latest_end_time(data)
        document_to_upsert = [{
            KVSTORE_KEY: kpi_id,
            SERVICE_ID: service_id,
            # An empty drift list means no drift was detected.
            IS_DRIFT_DETECTED: bool(data),
            LAST_DRIFT_AT: latest_end_timestamp,
            DRIFT_TIME_WINDOWS: data  # 'data' is the list of dictionaries to be upserted
        }]
        collection.data.batch_save(*document_to_upsert)

    def validate_record_fields(self, record):
        # Method to validate required fields in the record
        # Raises ValueError naming the first missing field.
        required_fields = [self.time_field, self.alert_value_field, self.kpi_id_field, self.service_id_field]
        for field in required_fields:
            if field not in record:
                raise ValueError(f'The field {field} is not a field in the dataset, or its value is empty. \
Ensure the field is passed correctly to the {field} argument of detectdrift.')

    def buffer_record(self, record):
        # Keep only the four fields of interest; everything else in the
        # record is dropped.
        buffer_entry = {
            self.time_field: record[self.time_field],
            self.alert_value_field: record[self.alert_value_field],
            self.kpi_id_field: record[self.kpi_id_field],
            self.service_id_field: record[self.service_id_field]
        }
        self.buffer.append(buffer_entry)

    def handle_record(self, record):
        # Validate then buffer one incoming record.
        self.validate_record_fields(record)
        self.buffer_record(record)

    def prepare_dataframe(self):
        # Build self.df from the buffered records and normalize it via the
        # shared DriftDetector helper.
        df = pd.DataFrame.from_records(self.buffer)
        self.df = DriftDetector.prepare_dataframe(df, self.time_field, self.alert_value_field, self.timestamp_format)

    @property
    def kpi_id(self):
        """
        Returns the KPI ID from the DataFrame.
        Returns:
        str: The KPI ID.
        """
        # All buffered rows belong to one KPI, so the first row suffices.
        return self.df.iloc[0][KPI_ID]

    @property
    def service_id(self):
        """
        Returns the Service ID from the DataFrame.
        Returns:
        str: The Service ID.
        """
        return self.df.iloc[0][SERVICE_ID]

    def is_buffer_empty(self):
        # True when no records have been buffered yet.
        return len(self.buffer) == 0

    def format_drift_output(self, drift):
        # Convert one detected drift object into an output event dict.
        def time_output(ts):
            if self.output_epoch_time:  # convert output timestamps from pandas Timestamp to epoch time
                # -1 is the sentinel for a missing timestamp in epoch mode.
                return int(ts.timestamp()) if ts is not None else -1
            else:
                return str(ts)
        return {
            PART_OR_WHOLE: DRIFT_STR_DICT[drift.part_or_whole],
            DRIFT_TYPE: drift.drift_type,
            PERCENT_DRIFT: int(drift.percent_drift()),
            START_TIME: time_output(drift.start_time),
            END_TIME: time_output(drift.end_time),
            THRESHOLD_TIME: time_output(drift.threshold_time),
        }

    @staticmethod
    def generate_warning_response(message, reason_code):
        # Log the warning and return the "no drifts" event emitted to search.
        logger.warning(message)
        return {'No Drifts': 'True', 'Reason Code': reason_code}

    def telemetry_logging_results(self, results_summary):
        # Emit a telemetry event describing the detected drifts; no-op when
        # the summary is empty.
        def drift_to_telemetry_str(drift_types, drift_len):
            if len(drift_types) > 1:  # accumulated drift
                return f'Accumulated drift of {len(drift_types)} segments, total_length={drift_len}, segment_types=({" ".join(drift_types)})'
            else:
                return f'{drift_types[0]} drift, length={drift_len}'
        if len(results_summary) > 0:
            log_telemetry(
                event_type = 'drifts_found',
                kpi_id = self.kpi_id,
                service_id = self.service_id,
                drifts = [drift_to_telemetry_str(drift_types, drift_len) for (drift_types, drift_len) in results_summary]
            )

    def stream(self, records):
        """
        Buffer this chunk's records; once the command reports the input is
        finished, run drift detection over the full buffer, yield results,
        and persist them to the KV Store.
        """
        time_0 = time.time()
        for record in records:
            self.handle_record(record)
        if self.is_buffer_empty():
            yield self.generate_warning_response('The input KPI time series is empty. No Drifts.', EMPTY_INPUT)
        # NOTE(review): _finished is a protected splunklib StreamingCommand
        # attribute signalling the final chunk — presumably stable across the
        # pinned SDK version; verify on SDK upgrades.
        if not self.is_buffer_empty() and self._finished:
            self.prepare_dataframe()
            cnt_data_points = self.df[COL_VALUE].count()
            df_time_span = self.df.index[-1] - self.df.index[0]
            if self.check_time_length:
                # Too few points OR too short a time span both count as short.
                is_short = cnt_data_points < INPUT_MIN_DATA_POINT or self.df.index[-1] - self.df.index[0] < INPUT_MIN_TIME_LENGTH
            else:
                is_short = cnt_data_points < INPUT_MIN_DATA_POINT
            if is_short:
                yield self.generate_warning_response(f'The input KPI time series is too short ({cnt_data_points}, {df_time_span}). No Drifts.', SHORT_INPUT)
                return
            # A constant series (min == max after NaN removal) cannot drift.
            if self.df[COL_VALUE].min() == self.df[COL_VALUE].max():
                yield self.generate_warning_response('The input KPI time series is constant. No Drifts.',
                                                     CONSTANT_INPUT)
                return
            time_1 = time.time()
            log_telemetry(
                event_type = 'calling_detect_drifts',
                kpi_id = self.kpi_id,
                service_id = self.service_id,
                df_length = len(self.df),
                df_time_span = str(self.df.index[-1] - self.df.index[0]),
                threshold = self.threshold,
                threshold_direction = self.threshold_direction,
            )
            drifts_detected = DriftDetector.detect_drifts(self.df, self.threshold, self.threshold_direction)
            self.telemetry_logging_results(summarize_drift_result(drifts_detected))
            results = []
            for drift in drifts_detected:
                result = self.format_drift_output(drift)
                yield result
                results.append(result)
            time_2 = time.time()
            # Persist even an empty result list so IS_DRIFT_DETECTED is
            # recorded as False for this KPI.
            self.save_to_kvstore(kpi_id=self.kpi_id, service_id=self.service_id, data=results)
            log_telemetry(
                event_type = 'detectdrift_complete',
                kpi_id = self.kpi_id,
                service_id = self.service_id,
                total_time = f'{time.time() - time_0:.3f}s',
                data_prepare_time = f'{time_1 - time_0:.3f}s',
                detect_drifts_time = f'{time_2 - time_1:.3f}s',
                kvstore_time = f'{time.time() - time_2:.3f}s',
            )
# Hand control to splunklib: parse the search-command protocol from stdin and
# run DriftDetectionCommand over the incoming events.
dispatch(DriftDetectionCommand, sys.argv, sys.stdin, sys.stdout, __name__)