from datetime import timedelta

import numpy as np

from util.data_prepare import (
    COL_VALUE,
    COL_BND_LOW,
    COL_BND_UP,
    COL_EDGE_MASK,
    ALGO_STD,
    ALGO_IQR,
    ALGO_QUANTILE,
    ALGO_PERCENT,
    ALGO_RANGE,
    DEFAULT_Z,
    DEFAULT_IQR_MULTIPLIER,
    DEFAULT_QUANTILE,
)
from util.sub_sequence import df_to_hour_sequences, df_to_day_sequences


def _edge_fill(values, reducer):
    """Return an array as long as *values*, filled with ``reducer(values)``.

    An empty *values* yields an empty array instead of calling the reducer,
    because ``np.min`` / ``np.max`` raise ``ValueError`` on zero-size input.
    """
    if values.size == 0:
        return np.empty(0)
    return np.full(values.size, reducer(values))


def itsi_thresholding_np(df, algo=ALGO_STD, clip_lower=False):
    """Add per-segment lower/upper threshold columns to *df*.

    The frame is split into hour subsequences when its index spans at most
    three days, and into day subsequences otherwise.  Thresholds are computed
    independently on each subsequence with ``get_thresholds`` (using its
    default level) and broadcast over that subsequence's rows.

    A leading or trailing subsequence that is shorter than its neighbour is
    treated as partial and dropped, to reduce false positives.  Rows not
    covered by any kept subsequence get flat bounds equal to the min/max of
    the values in that head or tail region.

    Args:
        df: DataFrame with a ``COL_VALUE`` column; its index must support
            subtraction and comparison with the subsequences' ``start_time``.
        algo: Thresholding method passed to ``get_thresholds``.
        clip_lower: When true, the computed lower bounds are clipped so they
            never fall below the minimum of ``COL_VALUE``.

    Returns:
        The same *df* object, modified in place with the ``COL_BND_LOW``,
        ``COL_BND_UP`` and ``COL_EDGE_MASK`` columns set.
    """
    if df.index[-1] - df.index[0] <= timedelta(days=3):
        subs = df_to_hour_sequences(df)
    else:
        subs = df_to_day_sequences(df)

    # Avoid a possible partial subsequence at the beginning and the end, to
    # reduce FP.  The length guards keep a lone subsequence from raising
    # IndexError when there is no neighbour to compare against.
    if len(subs) > 1 and subs[0].length < subs[1].length:
        subs = subs[1:]
    if len(subs) > 1 and subs[-1].length < subs[-2].length:
        subs = subs[:-1]

    # Subsequences may have different lengths due to possible missing values.
    subs_total_len = sum(s.length for s in subs)
    bnd_up = np.empty(subs_total_len)
    bnd_low = np.empty(subs_total_len)

    idx = 0
    for sub in subs:
        # NOTE(review): this indexing requires get_thresholds to return an
        # (upper, lower) pair; see the note on its scalar-returning methods.
        threshold = get_thresholds(sub.values, algo)
        end = idx + sub.length
        bnd_up[idx:end] = threshold[0]
        bnd_low[idx:end] = threshold[1]
        idx = end

    values = np.asarray(df[COL_VALUE])
    head_len = int(np.sum(df.index < subs[0].start_time))
    # Slice the tail from its start position: a "[-tail_len:]" slice would
    # select the whole column when the tail is empty.
    df_head = values[:head_len]
    df_tail = values[head_len + subs_total_len:]

    if clip_lower:
        # Only the lower bound is clipped; the upper bound is left untouched.
        bnd_low = np.clip(bnd_low, df[COL_VALUE].min(), None)

    df[COL_BND_LOW] = np.concatenate((
        _edge_fill(df_head, np.min),
        bnd_low,
        _edge_fill(df_tail, np.min),
    ))
    df[COL_BND_UP] = np.concatenate((
        _edge_fill(df_head, np.max),
        bnd_up,
        _edge_fill(df_tail, np.max),
    ))
    # The edge mask is set to 1 for every row.
    df[COL_EDGE_MASK] = np.ones(df.shape[0], dtype=int)
    return df


# logic adopted from itsiat Python code
def get_thresholds(values, method, level=DEFAULT_Z):
    """Compute threshold(s) for *values* using the given *method*.

    Args:
        values: Numeric samples of one segment.
        method: One of ``ALGO_STD``, ``ALGO_IQR``, ``ALGO_QUANTILE``,
            ``ALGO_RANGE`` or ``ALGO_PERCENT``.
        level: Multiplier used by the STD, RANGE and PERCENT methods.

    Returns:
        * ``ALGO_STD``: ``(mean + level * std, mean - level * std)``.
        * ``ALGO_IQR``: ``(q3 + k * iqr, q1 - k * iqr)`` with
          ``k = DEFAULT_IQR_MULTIPLIER``.
        * ``ALGO_QUANTILE``: ``np.percentile(values, DEFAULT_QUANTILE)``.
        * ``ALGO_RANGE``: the scalar ``min + (max - min) * level``.
        * ``ALGO_PERCENT``: the scalar ``mean * (1 + level)``.

        NOTE(review): RANGE and PERCENT return a single scalar rather than
        an (upper, lower) pair, so callers that index the result (such as
        ``itsi_thresholding_np``) cannot use them as-is -- confirm the
        intended lower bound before changing the return shape.

    Raises:
        UnboundLocalError: If *method* is ``None``.
        ValueError: If *method* is not a recognised thresholding method.
    """
    if method is None:
        raise UnboundLocalError("No method set for Policy.")

    if method == ALGO_STD:
        mid = np.mean(values)
        variation = np.std(values) * level
        return mid + variation, mid - variation

    if method == ALGO_IQR:
        t1, t3 = np.percentile(values, [25, 75])
        iqr = t3 - t1
        upper = t3 + DEFAULT_IQR_MULTIPLIER * iqr
        lower = t1 - DEFAULT_IQR_MULTIPLIER * iqr
        return upper, lower

    if method == ALGO_QUANTILE:
        return np.percentile(values, DEFAULT_QUANTILE)

    if method == ALGO_RANGE:
        dmax, dmin = np.max(values), np.min(values)
        span = dmax - dmin
        return dmin + (span * level)

    if method == ALGO_PERCENT:
        # Simple percentage as a baseline algorithm: use the mean as the
        # base of the percentage.
        return np.mean(values) * (1 + level)

    # The exception must actually be raised; formatting with an f-string
    # also keeps a non-string method from raising TypeError here.
    raise ValueError(f"Invalid thresholding method: {method}")