# SH-Deployer/apps/SA-ITSI-DriftDetection/bin/util/data_prepare.py

import numpy as np
import pandas as pd
from scipy.stats import iqr
from statsmodels.nonparametric.smoothers_lowess import lowess
from six.moves import range
from logger import get_logger
# Module-level logger, obtained from the project's get_logger helper using this module's name.
logger = get_logger(__name__)

# Ask pandas to raise an error on chained assignment rather than only warning,
# so accidental writes to a temporary copy fail loudly.
pd.options.mode.chained_assignment = 'raise'

from util.constants import EPSILON, FEATURE_SCALED, ZERO_LEVEL_TOLERANCE

class NotEnoughDataException(Exception):
    """Custom exception type; adds nothing on top of ``Exception``.

    NOTE(review): judging by the name it signals that too little data is
    available to proceed -- confirm against the raise sites.
    """

def percent_change(val_old, val_new, feature_scaled=FEATURE_SCALED):
    """Return the change from ``val_old`` to ``val_new`` expressed in percent.

    For an unscaled feature the change is taken relative to
    ``abs(val_old) + EPSILON`` so that a zero old value cannot divide by zero.

    For a scaled feature (scaled to 1.0 at the beginning of the look-back
    period) values whose magnitude is below ZERO_LEVEL_TOLERANCE are treated
    as "zero level": instead of dividing by a tiny number, which would yield
    an extraordinarily large percentage, the other value is reported directly
    as a percentage.

    Args:
        val_old (float): value before the change
        val_new (float): value after the change
        feature_scaled (bool, optional): whether the feature has been scaled
                    to 1.0 at the beginning of the look-back period.
                    Defaults to FEATURE_SCALED.

    Returns:
        float: percent change
    """
    old_magnitude = abs(val_old)

    if feature_scaled:
        # Old value is at zero level: report the new value itself as the change.
        if old_magnitude < ZERO_LEVEL_TOLERANCE:
            return 100 * val_new
        # New value dropped to zero level: report the old value as a decrease.
        if abs(val_new) < ZERO_LEVEL_TOLERANCE:
            return -100 * val_old
        denominator = old_magnitude
    else:
        # EPSILON keeps the division defined when val_old is exactly zero.
        denominator = old_magnitude + EPSILON

    return 100 * (val_new - val_old) / denominator

def calc_iqr(values):
    """
    Return a spread estimate for ``values`` using the first non-zero tier:

    1. the interquartile range;
    2. the 10%-90% quantile spread scaled by 0.6;
    3. the full min-max range scaled by 0.4, plus EPSILON.

    The later tiers only kick in when the earlier one is exactly zero, which
    happens when the majority of the values are equal.
    """
    spread = iqr(values)
    if spread != 0.0:
        return spread

    # IQR collapsed: widen the window to the 10%-90% quantiles.
    upper_decile = np.quantile(values, 0.9)
    lower_decile = np.quantile(values, 0.1)
    spread = (upper_decile - lower_decile) * 0.6
    if spread != 0.0:
        return spread

    # Still zero: use the full range; EPSILON is added on this tier only.
    return (values.max() - values.min()) * 0.4 + EPSILON

def smooth(xs, lowess_points):
    """Smooth a sequence with LOWESS; a thin wrapper of the lowess function from statsmodels

    The samples are treated as evenly spaced: the x-coordinates passed to
    lowess are simply 0, 1, ..., len(xs) - 1. No robustifying iterations are
    run (``it=0``).

    Args:
        xs (numpy array): input values to be smoothed
        lowess_points (int): number of points for smoothing; converted to the
                    fraction of the data that lowess expects

    Returns:
        numpy array: smoothed values
    """
    n_points = len(xs)
    # Force true division: on Python 2, int / int floors to 0, which would
    # silently reduce the smoothing window to (almost) nothing.
    frac = float(lowess_points) / n_points
    positions = np.arange(n_points, dtype=float)
    fitted = lowess(xs, positions, is_sorted=True, frac=frac, it=0)
    # lowess returns (x, fitted y) pairs; only the fitted values are needed.
    return fitted[:, 1]

def snd(values, lowess_points, drifts=None):
    """Calculate the Smoothed Normalized Deviation (SND) of a time series
        The SND of a time series is defined as the LOWESS-smoothed deviation of data points from the median normalized by the IQR

        When level drifts are given, the median and the IQR are computed separately for every
        segment between consecutive drift indices, so a level shift is not counted as deviation.

    Args:
        values (numpy array): time series
        lowess_points (int): number of points used for the LOWESS smoothing
        drifts (list[int], optional): list of level drifts, used as exclusive end indices of the
                    segments (assumed to be in ascending order). Defaults to None, i.e. the
                    whole series is a single segment.

    Returns:
        numpy array: smoothed_normalized_deviation
    """
    n_values = len(values)
    if drifts is None or len(drifts) == 0:
        boundaries = [n_values]
    else:
        boundaries = list(drifts)
        # Make sure the last segment reaches the end of the series; otherwise
        # the tail of the buffer below would stay uninitialised.
        if boundaries[-1] < n_values:
            boundaries.append(n_values)

    # Always a float buffer: with an integer input, empty_like would inherit
    # the integer dtype and silently truncate the normalized deviations.
    normalized_deviation = np.empty_like(values, dtype=float)
    left = 0
    for right in boundaries:
        if right <= left:
            # Empty segment (e.g. drift at index 0 or a repeated drift): nothing to normalize.
            continue
        segment = values[left : right]
        mid = np.median(segment)
        variation_unit = calc_iqr(segment)
        normalized_deviation[left : right] = np.abs(segment - mid) / variation_unit
        left = right

    smoothed = smooth(normalized_deviation, lowess_points=lowess_points)
    # The inputs are non-negative, but the local regression can still dip
    # below zero; replace such points with the smallest non-negative value.
    neg = smoothed < 0
    if neg.any():
        smoothed[neg] = np.min(smoothed[~neg])

    return smoothed