You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

94 lines
3.0 KiB

from datetime import timedelta
import numpy as np
from util.data_prepare import (
COL_VALUE, COL_BND_LOW, COL_BND_UP, COL_EDGE_MASK,
ALGO_STD, ALGO_IQR, ALGO_QUANTILE, ALGO_PERCENT, ALGO_RANGE,
DEFAULT_Z, DEFAULT_IQR_MULTIPLIER, DEFAULT_QUANTILE,
)
from util.sub_sequence import df_to_hour_sequences, df_to_day_sequences
# threshold with the default z value
# (mean, std) calculated on current day segment,
# or hour segment if there is not enough data
def itsi_thresholding_np(df, algo=ALGO_STD, clip_lower=False):
    """Attach per-subsequence lower/upper threshold columns to ``df``.

    The frame is split with ``df_to_hour_sequences`` when its index spans
    three days or less, and with ``df_to_day_sequences`` otherwise. For each
    kept subsequence, ``get_thresholds`` is evaluated on its values and the
    resulting bounds are broadcast over all rows of that subsequence.

    Rows that precede the first kept subsequence (the "head") and rows left
    over after the last kept one (the "tail") receive flat bounds equal to the
    min/max of their own values.

    Args:
        df: Frame with an index supporting subtraction and comparison against
            the subsequences' ``start_time``, and a ``COL_VALUE`` column.
            It is modified in place.
        algo: Thresholding method forwarded to ``get_thresholds``.
        clip_lower: When true, the computed lower bounds are raised to at
            least the minimum of the whole ``COL_VALUE`` column.

    Returns:
        The same ``df`` with ``COL_BND_LOW``, ``COL_BND_UP`` and
        ``COL_EDGE_MASK`` (all ones) columns set.

    Raises:
        ValueError: If the kept subsequences plus the head cover more rows
            than ``df`` contains.
    """
    span = df.index[-1] - df.index[0]
    if span <= timedelta(days=3):
        subs = df_to_hour_sequences(df)
    else:
        subs = df_to_day_sequences(df)

    # Avoid a possibly partial subsequence at the beginning and the end, to
    # reduce false positives. Each check needs a neighbour to compare with,
    # so it is skipped once only a single subsequence is left.
    if len(subs) > 1 and subs[0].length < subs[1].length:
        subs = subs[1:]
    if len(subs) > 1 and subs[-1].length < subs[-2].length:
        subs = subs[:-1]

    # Subsequences may have different lengths due to possible missing values.
    subs_total_len = sum(s.length for s in subs)
    bnd_up = np.empty(subs_total_len)
    bnd_low = np.empty(subs_total_len)
    idx = 0
    for sub in subs:
        threshold = get_thresholds(sub.values, algo)
        end = idx + sub.length
        bnd_up[idx:end] = threshold[0]
        bnd_low[idx:end] = threshold[1]
        idx = end

    n_rows = df.shape[0]
    head_len = int(np.count_nonzero(df.index < subs[0].start_time))
    tail_len = n_rows - head_len - subs_total_len
    if tail_len < 0:
        raise ValueError(
            "subsequences cover more rows than the input frame contains")

    values = np.array(df[COL_VALUE])
    head_low, head_up = _flat_edge_bounds(values[:head_len])
    # Slice from an explicit start: ``values[-0:]`` would select everything.
    tail_low, tail_up = _flat_edge_bounds(values[n_rows - tail_len:])

    if clip_lower:
        # Only the lower bound is clipped; the upper bound is left as computed.
        bnd_low = np.clip(bnd_low, df[COL_VALUE].min(), None)

    df[COL_BND_LOW] = np.concatenate((head_low, bnd_low, tail_low))
    df[COL_BND_UP] = np.concatenate((head_up, bnd_up, tail_up))
    df[COL_EDGE_MASK] = np.ones(n_rows, dtype=int)
    return df


def _flat_edge_bounds(edge_values):
    """Return constant (low, up) arrays as long as ``edge_values``; empty if it is empty."""
    n = edge_values.size
    if n == 0:
        # min()/max() raise ValueError on a zero-size array, so an absent
        # edge contributes nothing to the concatenated bounds.
        return np.empty(0), np.empty(0)
    return np.full(n, edge_values.min()), np.full(n, edge_values.max())
# logic adopted from itsiat Python code
def get_thresholds(values, method, level=DEFAULT_Z):
    """Compute threshold value(s) for ``values`` with the given method.

    Args:
        values: Sequence or array of numeric samples.
        method: One of ``ALGO_STD``, ``ALGO_IQR``, ``ALGO_QUANTILE``,
            ``ALGO_RANGE`` or ``ALGO_PERCENT``.
        level: Multiplier used by ``ALGO_STD`` (number of standard
            deviations), ``ALGO_RANGE`` (fraction of the value span) and
            ``ALGO_PERCENT`` (fraction added to the mean). Ignored by the
            other methods.

    Returns:
        ``ALGO_STD`` and ``ALGO_IQR`` return an ``(upper, lower)`` tuple.
        ``ALGO_QUANTILE`` returns the result of
        ``np.percentile(values, DEFAULT_QUANTILE)``; ``ALGO_RANGE`` and
        ``ALGO_PERCENT`` return a single value.

        NOTE(review): the single-value methods do not yield an
        ``(upper, lower)`` pair, while ``itsi_thresholding_np`` indexes the
        result with ``[0]`` and ``[1]`` -- confirm this is intended.

    Raises:
        UnboundLocalError: If ``method`` is ``None``.
        ValueError: If ``method`` is not one of the supported methods.
    """
    if method is None:
        raise UnboundLocalError("No method set for Policy.")

    if method == ALGO_STD:
        mid = np.mean(values)
        variation = np.std(values) * level
        return mid + variation, mid - variation

    if method == ALGO_IQR:
        t1, t3 = np.percentile(values, [25, 75])
        iqr = t3 - t1
        upper = t3 + DEFAULT_IQR_MULTIPLIER * iqr
        lower = t1 - DEFAULT_IQR_MULTIPLIER * iqr
        return upper, lower

    if method == ALGO_QUANTILE:
        return np.percentile(values, DEFAULT_QUANTILE)

    if method == ALGO_RANGE:
        dmax, dmin = max(values), min(values)
        span = dmax - dmin
        return dmin + (span * level)

    if method == ALGO_PERCENT:
        # Simple percentage as a baseline algorithm: the mean is the base
        # that the percentage is applied to.
        return np.mean(values) * (1 + level)

    # The original built this exception without raising it, so unknown
    # methods silently returned None. The f-string also keeps the message
    # from failing when ``method`` is not a string.
    raise ValueError(f"Invalid thresholding method: {method}")