You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
94 lines
3.0 KiB
94 lines
3.0 KiB
from datetime import timedelta
|
|
import numpy as np
|
|
|
|
from util.data_prepare import (
|
|
COL_VALUE, COL_BND_LOW, COL_BND_UP, COL_EDGE_MASK,
|
|
ALGO_STD, ALGO_IQR, ALGO_QUANTILE, ALGO_PERCENT, ALGO_RANGE,
|
|
DEFAULT_Z, DEFAULT_IQR_MULTIPLIER, DEFAULT_QUANTILE,
|
|
)
|
|
from util.sub_sequence import df_to_hour_sequences, df_to_day_sequences
|
|
|
|
# threshold with the default z value
|
|
# (mean, std) calculated on current day segment,
|
|
# or hour segment if there is not enough data
|
|
def itsi_thresholding_np(df, algo=ALGO_STD, clip_lower=False):
    """Attach per-segment lower/upper threshold columns to ``df``.

    The frame is split into hour subsequences when its index spans three
    days or less, and into day subsequences otherwise.  A threshold pair is
    computed for every kept subsequence via ``get_thresholds`` and broadcast
    over that subsequence's rows.  Rows that fall before the first kept
    subsequence ("head") or after the last one ("tail") receive the min/max
    of their own values as bounds.

    Args:
        df: frame holding the ``COL_VALUE`` column.  Assumes a time-like,
            ascending index and that the kept subsequences cover one
            contiguous run of rows -- TODO confirm against the
            ``util.sub_sequence`` helpers.
        algo: thresholding method forwarded to ``get_thresholds``.
        clip_lower: when true, the lower bound is clipped so it never drops
            below the minimum of ``COL_VALUE``.

    Returns:
        The same ``df`` object, modified in place with ``COL_BND_LOW``,
        ``COL_BND_UP`` and ``COL_EDGE_MASK`` (all ones) columns.

    Note:
        An empty result from the splitting helpers is not handled here.
    """
    if df.index[-1] - df.index[0] <= timedelta(days=3):
        subs = df_to_hour_sequences(df)
    else:
        subs = df_to_day_sequences(df)

    # Avoid a possible partial subsequence at the beginning and the end, to
    # reduce false positives.  The length guards keep a single subsequence
    # (or the one left after trimming the head) from raising IndexError.
    if len(subs) > 1 and subs[0].length < subs[1].length:
        subs = subs[1:]
    if len(subs) > 1 and subs[-1].length < subs[-2].length:
        subs = subs[:-1]

    # Subsequences may have different lengths due to possible missing values.
    subs_total_len = sum(s.length for s in subs)
    bnd_up = np.empty(subs_total_len)
    bnd_low = np.empty(subs_total_len)

    idx = 0
    for sub in subs:
        threshold = get_thresholds(sub.values, algo)
        bnd_up[idx:idx + sub.length] = threshold[0]
        bnd_low[idx:idx + sub.length] = threshold[1]
        idx += sub.length

    row_count = df.shape[0]
    head_len = int((df.index < subs[0].start_time).sum())
    tail_len = row_count - head_len - bnd_up.shape[0]

    values = np.asarray(df[COL_VALUE])
    # Slice the tail from an explicit start position: ``[-tail_len:]`` would
    # select the whole column when tail_len is 0.
    head_low, head_up = _edge_bounds(values[:head_len])
    tail_low, tail_up = _edge_bounds(values[row_count - tail_len:])

    if clip_lower:
        bnd_low = np.clip(bnd_low, df[COL_VALUE].min(), None)
        # Upper-bound clipping is disabled; kept for reference:
        # bnd_up = np.clip(bnd_up, None, df[COL_VALUE].max())

    df[COL_BND_LOW] = np.concatenate((head_low, bnd_low, tail_low))
    df[COL_BND_UP] = np.concatenate((head_up, bnd_up, tail_up))

    df[COL_EDGE_MASK] = np.ones(row_count, dtype=int)

    return df


def _edge_bounds(edge_values):
    """Return constant (low, up) arrays for an edge segment.

    Each array has the segment's length and is filled with the segment's
    min / max.  An empty segment yields two empty arrays instead of calling
    ``min``/``max`` on it, which numpy rejects for zero-size input.
    """
    if edge_values.size == 0:
        return np.empty(0), np.empty(0)
    return (
        np.full(edge_values.size, edge_values.min()),
        np.full(edge_values.size, edge_values.max()),
    )
|
|
|
|
# logic adopted from itsiat Python code
|
|
def get_thresholds(values, method, level=DEFAULT_Z):
    """Compute threshold(s) for ``values`` with the given ``method``.

    Args:
        values: numeric samples of one segment.
        method: one of ``ALGO_STD``, ``ALGO_IQR``, ``ALGO_QUANTILE``,
            ``ALGO_RANGE`` or ``ALGO_PERCENT``.
        level: multiplier used by the STD, RANGE and PERCENT methods
            (defaults to ``DEFAULT_Z``).

    Returns:
        ``ALGO_STD``: ``(mean + level * std, mean - level * std)``.
        ``ALGO_IQR``: ``(q3 + k * iqr, q1 - k * iqr)`` with
        ``k = DEFAULT_IQR_MULTIPLIER``.
        ``ALGO_QUANTILE``: ``np.percentile(values, DEFAULT_QUANTILE)``.
        ``ALGO_RANGE``: ``min + (max - min) * level``.
        ``ALGO_PERCENT``: ``mean * (1 + level)``.

    Raises:
        UnboundLocalError: if ``method`` is None.
        ValueError: if ``method`` is not one of the supported methods.

    NOTE(review): only the STD and IQR branches visibly return an
    ``(upper, lower)`` pair; ``itsi_thresholding_np`` indexes the result
    with ``[0]`` and ``[1]``, so the other methods look incompatible with
    that caller unless ``DEFAULT_QUANTILE`` is a two-element sequence --
    confirm before relying on them there.
    """
    if method is None:
        raise UnboundLocalError("No method set for Policy.")

    if method == ALGO_STD:
        mid = np.mean(values)
        variation = np.std(values) * level
        return mid + variation, mid - variation

    if method == ALGO_IQR:
        t1, t3 = np.percentile(values, [25, 75])
        iqr = t3 - t1
        upper = t3 + DEFAULT_IQR_MULTIPLIER * iqr
        lower = t1 - DEFAULT_IQR_MULTIPLIER * iqr
        return upper, lower

    if method == ALGO_QUANTILE:
        return np.percentile(values, DEFAULT_QUANTILE)

    if method == ALGO_RANGE:
        dmax, dmin = max(values), min(values)
        span = dmax - dmin
        return dmin + (span * level)

    if method == ALGO_PERCENT:
        # Simple percentage as a baseline algorithm: the mean is the base
        # the percentage is applied to.
        return np.mean(values) * (1 + level)

    # The exception was previously constructed but never raised, so an
    # unknown method silently returned None.  str() keeps the message
    # building from failing on a non-string method.
    raise ValueError("Invalid thresholding method: " + str(method))
|