You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

76 lines
2.9 KiB

from datetime import timedelta
import numpy as np
import scipy.stats as stats
from util.data_prepare import COL_VALUE, DAY_OF_WEEK_NAME, resample_with_offset
from six.moves import zip
class SubSequence:
def __init__(self, values, start_time, duration) -> None:
self.values = values
self.length = len(values)
self.start_time = start_time
self.duration = duration
self.silhouette = float('nan')
self.has_anomaly = False
def __str__(self):
return (
f'start_time={self.start_time} {DAY_OF_WEEK_NAME[self.start_time.dayofweek]} duration={self.duration}, ' +
f'silhouette={self.silhouette:7.3f} {"has-anomaly" if self.has_anomaly else ""}, ' +
f'values=[({self.length}) min={np.min(self.values):.3f} max={np.max(self.values):.3f} mean={np.mean(self.values):.3f} med={np.median(self.values):.3f}]')
def hour(self):
return self.start_time.hour
def dayofweek(self):
return self.start_time.dayofweek
def df_to_hour_sequences(df, hourDelta=1, offset=0):
assert hourDelta==1 or hourDelta==2 or hourDelta==3 or hourDelta==4, f'hourDelta={hourDelta} is NOT supported'
assert offset>=0 and offset<hourDelta, f'Invalid offset({offset}), should be [0, hourDelta({hourDelta})).'
dfre = resample_with_offset(df, f'{hourDelta}h', offset)
subsequence_list = [SubSequence(
np.array(x[1][COL_VALUE]),
duration=timedelta(hours=hourDelta),
start_time=x[0]) for x in dfre]
return subsequence_list[1:] # ignore beginning / ending partial block
def df_to_day_sequences(df, offset=0):
dfre = resample_with_offset(df, '24h', offset)
subsequence_list = [SubSequence(
np.array(x[1][COL_VALUE]),
duration=timedelta(days=1),
start_time=x[0]) for x in dfre]
return subsequence_list[1:] # ignore beginning / ending partial day
def df_to_half_day_sequences(df, start_hour):
dfre = resample_with_offset(df, '12h', start_hour)
subsequence_list = [SubSequence(
np.array(x[1][COL_VALUE]),
duration=timedelta(hours=12),
start_time=x[0]) for x in dfre]
return subsequence_list[1:] # ignore beginning / ending partial day
def pack_sub_values_with_filter(subs, percentile_threshold):
sub_len = len(subs[0].values)
if len(subs[-1].values) != sub_len:
subs = subs[:-1]
if percentile_threshold > 0:
scores = [sub.silhouette for sub in subs]
percentiles=[stats.percentileofscore(scores, sub.silhouette) for sub in subs]
subs = [sub for sub, percent in zip(subs, percentiles) if percent>=percentile_threshold]
else:
percentiles = []
cnt_subs = len(subs)
sub_values = np.empty((cnt_subs, sub_len))
for i, sub in enumerate(subs):
sub_values[i] = sub.values
return sub_values, percentiles