You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

94 lines
3.1 KiB

import numpy as np
import pandas as pd
from scipy.stats import iqr
from statsmodels.nonparametric.smoothers_lowess import lowess
from logger import get_logger
logger = get_logger(__name__)
pd.options.mode.chained_assignment = 'raise'
from util.constants import EPSILON, FEATURE_SCALED, ZERO_LEVEL_TOLERANCE
class NotEnoughDataException(Exception):
    """Exception type for not-enough-data conditions.

    A plain ``Exception`` subclass with no extra state or behavior. Nothing in
    the visible part of this module raises it.
    """
def percent_change(val_old, val_new, feature_scaled=FEATURE_SCALED):
    """Return the change from ``val_old`` to ``val_new`` as a percentage.

    Unscaled features use the relative change with respect to ``abs(val_old)``,
    with EPSILON added to the denominator.

    Scaled features (scaled to 1.0 at the beginning of the look-back period)
    get special treatment when either value is at "zero level", i.e. its
    absolute value is below ZERO_LEVEL_TOLERANCE. This avoids returning an
    extraordinarily large percent change:

    * old value at zero level -> ``100 * val_new``
    * otherwise, new value at zero level -> ``-100 * val_old``

    Args:
        val_old (float): value before the change
        val_new (float): value after the change
        feature_scaled (bool, optional): whether the feature has been scaled
            to 1.0 at the beginning of the look-back period. Defaults to
            FEATURE_SCALED.

    Returns:
        float: percent change
    """
    old_magnitude = abs(val_old)

    if feature_scaled:
        # The old value is checked first, so it wins when both are near zero.
        if old_magnitude < ZERO_LEVEL_TOLERANCE:
            return 100 * val_new
        if abs(val_new) < ZERO_LEVEL_TOLERANCE:
            return -100 * val_old
        denominator = old_magnitude
    else:
        # EPSILON is only added on the unscaled path.
        denominator = old_magnitude + EPSILON

    return 100 * (val_new - val_old) / denominator
def calc_iqr(values):
    """Return a spread estimate for ``values``, starting from the IQR.

    When many values are equal the plain interquartile range can be exactly
    zero, so two progressively wider fallbacks are tried:

    1. the interquartile range itself;
    2. 0.6 times the distance between the 10% and 90% quantiles;
    3. 0.4 times the full range (max - min), plus EPSILON.

    Args:
        values: data to measure; must offer ``.max()`` and ``.min()`` for the
            last fallback (e.g. a numpy array).

    Returns:
        float: the first non-zero estimate, or the range-based fallback.
    """
    spread = iqr(values)
    if spread != 0.0:
        return spread

    lower_decile = np.quantile(values, 0.1)
    upper_decile = np.quantile(values, 0.9)
    spread = (upper_decile - lower_decile) * 0.6
    if spread != 0.0:
        return spread

    # Last resort: EPSILON is added here, and only here.
    return (values.max() - values.min()) * 0.4 + EPSILON
def smooth(xs, lowess_points):
    """Smooth a sequence with the statsmodels ``lowess`` function.

    The sample positions ``0 .. len(xs) - 1`` serve as the x axis. The
    smoothing fraction passed to ``lowess`` is ``lowess_points / len(xs)``,
    and no residual-based reweighting iterations are requested (``it=0``).

    Args:
        xs (numpy array): input values to be smoothed
        lowess_points (int): number of points for smoothing

    Returns:
        numpy array: smoothed values
    """
    n_samples = len(xs)
    positions = range(n_samples)
    fitted = lowess(
        xs,
        positions,
        frac=lowess_points / n_samples,
        it=0,
        is_sorted=True,
    )
    # Only the second column (index 1) of the lowess result is returned.
    return fitted[:, 1]
def snd(values, lowess_points, drifts=None):
    """Calculate the Smoothed Normalized Deviation (SND) of a time series.

    The SND of a time series is defined as the LOWESS-smoothed deviation of
    data points from the median, normalized by the IQR. Median and IQR are
    computed separately for each segment delimited by ``drifts``.

    Args:
        values (numpy array): time series
        lowess_points (int): number of points for smoothing, forwarded to
            ``smooth``
        drifts (list[int], optional): right-hand (exclusive) segment
            boundaries; assumed to be in ascending order. If the last boundary
            is smaller than ``len(values)``, a final segment running to the
            end of the series is added. Defaults to None, which treats the
            whole series as a single segment.

    Returns:
        numpy array: smoothed_normalized_deviation
    """
    values = np.asarray(values)
    n_points = len(values)

    boundaries = [] if drifts is None else list(drifts)
    # Always cover the tail of the series; otherwise the points after the last
    # boundary would stay uninitialized in the output buffer.
    if not boundaries or boundaries[-1] < n_points:
        boundaries.append(n_points)

    # The deviations are fractional, so the buffer must be floating point.
    # Inheriting an integer dtype from `values` would truncate them.
    if np.issubdtype(values.dtype, np.floating):
        buffer_dtype = values.dtype
    else:
        buffer_dtype = np.float64
    normalized_deviation = np.empty(values.shape, dtype=buffer_dtype)

    left = 0
    for right in boundaries:
        segment = values[left:right]
        # An empty segment (e.g. a repeated boundary) has nothing to normalize.
        if len(segment):
            mid = np.median(segment)
            variation_unit = calc_iqr(segment)
            normalized_deviation[left:right] = np.abs(segment - mid) / variation_unit
        left = right

    smoothed = smooth(normalized_deviation, lowess_points=lowess_points)

    # Negative smoothed values are replaced by the smallest non-negative one.
    negative = smoothed < 0
    if negative.any():
        smoothed[negative] = np.min(smoothed[~negative])
    return smoothed