You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
258 lines
9.9 KiB
258 lines
9.9 KiB
from datetime import timedelta
|
|
import numpy as np
|
|
|
|
from util.dev_util import format_float_list
|
|
from util.data_prepare import COL_VALUE, COL_BND_LOW, COL_BND_UP, COL_EDGE_MASK , DEFAULT_Z, NotEnoughDataException
|
|
from util.sub_sequence import pack_sub_values_with_filter
|
|
from util.threshold_utils import subsequence_by_time_policy, sub_sequences_groupby, get_edge_length
|
|
from util import setup_logging
|
|
logger = setup_logging.get_logger()
|
|
|
|
# Percentile passed to pack_sub_values_with_filter when filtering historical
# subsequence values for the normal-behavior model (default for
# calc_history_normal_behavior_for_cluster).
HISTORY_NORMAL_PERCENTILE_THRESHOLD = 50

SPLIT_DELTA = 12 # minimum split size wrt number of data points

# =============================================================
# split long subsequences to smaller segments for thresholding
# =============================================================

# Edge-clip selectors for _calc_segment_statistics: trim the left edge,
# the right edge, or both edges of a segment before computing statistics.
CLIP_LEFT = 'L'
CLIP_RIGHT = 'R'
CLIP_BOTH = 'LR'

# Number of data points trimmed from each clipped edge.
DEFAULT_EDGE_CLIP = 1
|
|
|
|
def _calc_segment_statistics(sub_values, collection_coherence_score, clip='', calc_multiplier=True, sub_values_all=None):
    """Compute statistics of one segment over a stack of aligned subsequences.

    Args:
        sub_values: 2-D array; one row per historical subsequence, columns are
            aligned time points of the segment.
        collection_coherence_score: weight applied to the spread of the row
            means when widening the returned deviation.
        clip: which edge(s) to drop before computing statistics —
            CLIP_LEFT / CLIP_RIGHT / CLIP_BOTH / '' (DEFAULT_EDGE_CLIP points
            per clipped edge).
        calc_multiplier: when True, also compute the z multiplier; otherwise
            z is returned as 0.0.
        sub_values_all: unfiltered values; currently unused here, kept for
            interface compatibility with callers that pass it.

    Returns:
        Tuple of (mean of row means,
                  max row std + std of row means * collection_coherence_score,
                  z multiplier).

    Raises:
        NotEnoughDataException: if sub_values is not 2-D, has no rows, or is
            too short to survive clipping both edges.
    """
    if len(sub_values.shape) < 2 or sub_values.shape[0] < 1 or sub_values.shape[1] <= 2 * DEFAULT_EDGE_CLIP:
        raise NotEnoughDataException("Unexpected sub_values in _calc_segment_statistics")

    if clip == CLIP_LEFT:
        sub_values = sub_values[:, DEFAULT_EDGE_CLIP:]
    elif clip == CLIP_RIGHT:
        sub_values = sub_values[:, :-DEFAULT_EDGE_CLIP]
    elif clip == CLIP_BOTH:
        sub_values = sub_values[:, DEFAULT_EDGE_CLIP:-DEFAULT_EDGE_CLIP]

    means = np.mean(sub_values, axis=1)
    stds = np.std(sub_values, axis=1)

    if calc_multiplier:
        if stds.max() == 0.0:
            # Perfectly flat history: fall back to a small relative deviation
            # and the default z so the threshold band is not degenerate.
            logger.warning(f'WARN Zero Std in historical normal behavior. Use default Z {DEFAULT_Z:.2f}')
            stds = np.absolute(means) * 0.01
            z = DEFAULT_Z
        else:
            # Vectorized form of the original per-row loop: each row's z is its
            # maximum absolute deviation from the row mean in units of the row
            # std; rows with zero std contribute 0.
            max_devs = np.absolute(sub_values - means[:, None]).max(axis=1)
            zs = np.divide(max_devs, stds, out=np.zeros_like(max_devs), where=stds != 0.0)
            z = zs.max()
    else:
        z = 0.0

    return means.mean(), stds.max() + means.std() * collection_coherence_score, z
|
|
|
|
def _calc_subsequence_statistics(sub_values, collection_coherence_score, split_pnts, one_hour_len, sub_values_all=None):
    """Compute per-segment statistics for a subsequence.

    Without split points the whole subsequence is one segment with both edges
    clipped; with split points each [split, split) piece is handled separately
    (left edge clipped on the first piece, right edge on the trailing piece).

    Returns three parallel lists: means, deviations, and z multipliers.
    """

    def segment_stats(start, stop, clip):
        # Slice both the filtered and the unfiltered values to this segment.
        all_piece = sub_values_all[:, start:stop]
        return _calc_segment_statistics(sub_values[:, start:stop], collection_coherence_score, clip=clip, sub_values_all=all_piece)

    total_len = sub_values.shape[1]

    if split_pnts is None:
        # Single segment covering the whole subsequence.
        stat1, stat2, z = segment_stats(0, total_len, CLIP_BOTH)
        return [stat1], [stat2], [z]

    stats1, stats2, multipliers = [], [], []
    start = 0
    for pnt in split_pnts:
        stop = pnt * one_hour_len
        stat1, stat2, z = segment_stats(start, stop, CLIP_LEFT if start == 0 else '')
        stats1.append(stat1)
        stats2.append(stat2)
        multipliers.append(z)
        start = stop
    if start < total_len:
        # Trailing piece after the last split point.
        stat1, stat2, z = segment_stats(start, total_len, CLIP_RIGHT)
        stats1.append(stat1)
        stats2.append(stat2)
        multipliers.append(z)

    return stats1, stats2, multipliers
|
|
|
|
class HistoryNormal:
    """Historical normal behavior of one cluster of subsequences.

    Holds per-segment means, deviations and z multipliers; segments are
    delimited by split_pnts when present, otherwise there is one segment.
    """

    def __init__(self, sub_values, collection_coherence_score, split_pnts=None, one_hour_len=0, sub_values_all=None) -> None:
        assert len(sub_values.shape) == 2 and sub_values.shape[0] >= 1 and sub_values.shape[1] > 2 * DEFAULT_EDGE_CLIP, "Unexpected sub_values in HistoryNormal init"
        self.split_pnts = split_pnts
        self.one_hour_len = one_hour_len

        segment_means, segment_stds, segment_zs = _calc_subsequence_statistics(
            sub_values, collection_coherence_score, split_pnts, one_hour_len,
            sub_values_all=sub_values_all)

        self.means = np.array(segment_means)
        self.stds = np.array(segment_stds)
        self.zs = np.array(segment_zs)

    def __str__(self) -> str:
        if self.split_pnts is None:
            return f'HistoryNormal(mean={self.means[0]:.3f}, std={self.stds[0]:.3f}, z={self.zs[0]:.3f})'
        return f'HistoryNormal({self.split_pnts}, means={format_float_list(self.means)}, stds={format_float_list(self.stds)}, zs={format_float_list(self.zs)})'
|
|
|
|
|
|
def calc_history_normal_behavior_for_cluster(subs, collection_coherence_score, percentile_threshold=HISTORY_NORMAL_PERCENTILE_THRESHOLD):
    """Build a HistoryNormal model for one cluster of subsequences.

    Long subsequences are probed chunk by chunk (split_delta hours per chunk);
    adjacent chunks whose means differ by more than 0.3 * min(chunk std) become
    split points so each regime gets its own thresholds.

    Args:
        subs: subsequences of the cluster; length/duration taken from subs[0].
        collection_coherence_score: coherence weight forwarded to the statistics.
        percentile_threshold: percentile used when packing filtered values.

    Returns:
        HistoryNormal (segmented when a level shift is detected).

    Raises:
        NotEnoughDataException: if there is less than one data point per hour.
    """

    def is_too_short():  # TODO: tune these minimums
        return subs[0].length <= 10 or subs[0].duration <= timedelta(hours=4)

    sub_values_all, _ = pack_sub_values_with_filter(subs, 0)  # pack values of subsequence w/o filtering

    sub_filtered_values, percentiles = pack_sub_values_with_filter(subs, percentile_threshold)

    for (sub, percentile) in zip(subs, percentiles):
        sub.score_percentile = percentile  # set this value here, to show in plot later

    if is_too_short():  # no need to split
        return HistoryNormal(sub_filtered_values, collection_coherence_score, sub_values_all=sub_values_all)

    sub_len = subs[0].length
    sub_hour = int(subs[0].duration.total_seconds() // 3600)  # segment duration in hour
    one_hour_len = sub_len // sub_hour  # number of points for one hour duration
    if one_hour_len < 1:
        # Fewer than one point per hour (e.g. 11 points over a long duration):
        # hourly chunking is impossible; fail explicitly instead of dividing by zero.
        raise NotEnoughDataException("Unexpected sub_values in _calc_segment_statistics")

    if one_hour_len < SPLIT_DELTA:
        split_delta = SPLIT_DELTA // one_hour_len + 1
    else:
        split_delta = 1

    (means, stds) = ([], [])
    for i in range(0, sub_hour, split_delta):
        piece_data = sub_filtered_values[:, i * one_hour_len : min(sub_len, (i + split_delta) * one_hour_len)]
        # BUGFIX: the last chunk must be detected in hours (sub_hour), not in
        # points (sub_len) — the original comparison against sub_len meant the
        # final chunk's right edge was almost never clipped.
        clip = CLIP_LEFT if i == 0 else CLIP_RIGHT if i + split_delta >= sub_hour else ''
        mean, std, _ = _calc_segment_statistics(piece_data, collection_coherence_score, calc_multiplier=False, clip=clip)
        means.append(mean)  # mean is already a scalar; np.mean() wrapper removed
        stds.append(std)

    # Place a split point between chunks whose means shift by more than 0.3 of
    # the smaller chunk deviation (a level change in the normal behavior).
    split_pnts = []
    for i in range(1, len(means)):
        mean_diff = np.absolute(means[i] - means[i - 1])
        std = min(stds[i], stds[i - 1])
        if mean_diff > std * 0.3:
            split_pnts.append(i)

    if len(split_pnts) > 0:
        return HistoryNormal(sub_filtered_values, collection_coherence_score,
                             split_pnts=[i * split_delta for i in split_pnts], one_hour_len=one_hour_len,
                             sub_values_all=sub_values_all)
    else:  # no splitting
        return HistoryNormal(sub_filtered_values, collection_coherence_score, sub_values_all=sub_values_all)
|
|
|
|
def calc_history_normal_behavior(df, tp):
    """Compute a HistoryNormal per subsequence group of df under time policy tp.

    Args:
        df: input dataframe of the metric.
        tp: time policy; tp.score is used as the collection coherence score.

    Returns:
        Tuple of (dict label -> HistoryNormal, subsequences, label_method).
    """
    subs, label_method = subsequence_by_time_policy(df, tp)

    history_normal_dict = {}
    # Iterate the groupby mapping directly; wrapping .items() in list() was
    # an unnecessary materialization.
    for label, subs_group in sub_sequences_groupby(subs, label_method).items():
        history_normal = calc_history_normal_behavior_for_cluster(subs_group, tp.score)
        logger.debug(str(history_normal))  # redundant f'{str(...)}' removed
        history_normal_dict[label] = history_normal
    return history_normal_dict, subs, label_method
|
|
|
|
def itsi_thresholding(df, history_normal_dict, subs, label_method, clip_lower=False):
    """Fill df with per-point threshold bounds and an edge mask.

    For each subsequence the upper/lower bounds come from its group's
    HistoryNormal; points outside any subsequence (head/tail of df) are padded
    with the min/max of the corresponding raw values.

    Args:
        df: dataframe with COL_VALUE; COL_BND_LOW / COL_BND_UP / COL_EDGE_MASK
            columns are written in place.
        history_normal_dict: label -> HistoryNormal mapping.
        subs: subsequences covering the middle of df, in order.
        label_method: maps a subsequence to its group label.
        clip_lower: when True, clip the lower bound at the observed minimum.

    Returns:
        The same df, with the three threshold columns set.
    """

    # avoid possible partial subsequence at the beginning and the end, to reduce FP
    # (guarded: with fewer than two subsequences there is nothing to compare,
    # and the original indexing would raise IndexError)
    if len(subs) >= 2 and subs[0].length < subs[1].length:
        subs = subs[1:]
    if len(subs) >= 2 and subs[-1].length < subs[-2].length:
        subs = subs[:-1]

    subs_total_len = sum(s.length for s in subs)  # subs may have different length due to possible missing values
    bnd_up = np.empty(subs_total_len)
    bnd_low = np.empty(subs_total_len)

    edge_mask = np.empty(subs_total_len, dtype=int)  # use edge_mask to reduce FP at segments' edges
    edge_length = get_edge_length()

    idx = 0
    for sub in subs:
        sub_label = label_method(sub)
        history_normal = history_normal_dict[sub_label]

        threshold = get_thresholds_from_history(history_normal, sub.length)

        bnd_up[idx : idx + sub.length] = threshold[0]
        bnd_low[idx : idx + sub.length] = threshold[1]

        # Mask out edge_length points at both ends of each subsequence; too-short
        # subsequences are masked entirely.
        if sub.length > 2 * edge_length:
            edge_mask[idx : idx + sub.length] = np.concatenate((
                np.zeros(edge_length, dtype=int),
                np.ones(sub.length - 2 * edge_length, dtype=int),
                np.zeros(edge_length, dtype=int)))
        else:
            edge_mask[idx : idx + sub.length] = np.zeros(sub.length, dtype=int)

        idx += sub.length

    head_len = sum(df.index < subs[0].start_time)
    tail_len = df.shape[0] - head_len - bnd_up.shape[0]
    # BUGFIX: empty head/tail handled explicitly — the original crashed on
    # .min()/.max() of an empty head slice, and df[COL_VALUE][-0:] selected the
    # WHOLE column when tail_len == 0.
    df_head = np.array(df[COL_VALUE][:head_len])
    df_tail = np.array(df[COL_VALUE][-tail_len:]) if tail_len > 0 else np.empty(0)

    if clip_lower:
        bnd_low = np.clip(bnd_low, df[COL_VALUE].min(), None)

    head_low = np.full(head_len, df_head.min()) if head_len > 0 else np.empty(0)
    head_up = np.full(head_len, df_head.max()) if head_len > 0 else np.empty(0)
    tail_low = np.full(tail_len, df_tail.min()) if tail_len > 0 else np.empty(0)
    tail_up = np.full(tail_len, df_tail.max()) if tail_len > 0 else np.empty(0)

    df[COL_BND_LOW] = np.concatenate((head_low, bnd_low, tail_low))
    df[COL_BND_UP] = np.concatenate((head_up, bnd_up, tail_up))

    edge_mask = np.concatenate((
        np.zeros(head_len, dtype=int),
        edge_mask,
        np.zeros(tail_len, dtype=int)))
    df[COL_EDGE_MASK] = edge_mask

    return df
|
|
|
|
def get_thresholds_from_history(history_normal, length):
    """Expand a HistoryNormal model into per-point upper/lower bound arrays.

    Each segment's band is mean +/- std * z; segment boundaries are the model's
    split points converted from hours to point indices. Returns (bnd_up, bnd_low)
    arrays of the requested length.
    """

    def bounds_at(idx):
        # Half-width of the band for segment idx.
        half_width = history_normal.stds[idx] * history_normal.zs[idx]
        center = history_normal.means[idx]
        return center + half_width, center - half_width

    if history_normal.split_pnts is None:
        # Single segment: constant band over the whole length.
        up, low = bounds_at(0)
        return np.full(length, up), np.full(length, low)

    bnd_up = np.empty(length)
    bnd_low = np.empty(length)
    start = 0
    for seg_idx, split_pnt in enumerate(history_normal.split_pnts):
        stop = split_pnt * history_normal.one_hour_len
        up, low = bounds_at(seg_idx)
        bnd_up[start:stop] = up
        bnd_low[start:stop] = low
        start = stop
    if start < length:
        # Trailing segment after the last split point.
        up, low = bounds_at(-1)
        bnd_up[start:] = up
        bnd_low[start:] = low

    return bnd_up, bnd_low
|