from datetime import timedelta

import numpy as np
import scipy.stats as stats
from six.moves import zip

from util.data_prepare import COL_VALUE, DAY_OF_WEEK_NAME, resample_with_offset


class SubSequence:
    """A window of values together with its start time and duration.

    ``silhouette`` starts as NaN and ``has_anomaly`` as False; nothing in this
    class updates them, so they are expected to be filled in by other code.
    """

    def __init__(self, values, start_time, duration) -> None:
        """Store the window data.

        Args:
            values: The samples of the window; must support ``len()`` and the
                numpy reductions used by ``__str__``.
            start_time: Start of the window. It must expose ``hour`` and
                ``dayofweek`` attributes (presumably a pandas Timestamp --
                TODO confirm).
            duration: Length of the window; stored as given and only used
                for display here.
        """
        self.values = values
        self.length = len(values)
        self.start_time = start_time
        self.duration = duration
        self.silhouette = float('nan')
        self.has_anomaly = False

    def __str__(self):
        """Return a one-line summary: start, duration, score and value stats."""
        return (
            f'start_time={self.start_time} {DAY_OF_WEEK_NAME[self.start_time.dayofweek]} duration={self.duration}, '
            + f'silhouette={self.silhouette:7.3f} {"has-anomaly" if self.has_anomaly else ""}, '
            + f'values=[({self.length}) min={np.min(self.values):.3f} max={np.max(self.values):.3f} mean={np.mean(self.values):.3f} med={np.median(self.values):.3f}]')

    def hour(self):
        """Return ``start_time.hour``."""
        return self.start_time.hour

    def dayofweek(self):
        """Return ``start_time.dayofweek``."""
        return self.start_time.dayofweek


def df_to_hour_sequences(df, hourDelta=1, offset=0):
    """Validate the arguments; the rest of the body is missing (see NOTE).

    Args:
        df: Input frame (not used by the surviving code).
        hourDelta: Must be 1, 2, 3 or 4.
        offset: Must be non-negative.

    Raises:
        AssertionError: If ``hourDelta`` or ``offset`` fails the checks.
        NotImplementedError: Always, once validation passes, because the
            original body is not available.
    """
    assert hourDelta in (1, 2, 3, 4), f'hourDelta={hourDelta} is NOT supported'
    # NOTE(review): the source text is damaged from here on. The second
    # assertion survives only as ``assert offset>=0 and offset``; the bound it
    # compared against and everything after it in this function were lost.
    # Restore the original body from version control.
    assert offset >= 0
    raise NotImplementedError(
        'df_to_hour_sequences: function body was lost in the source text; '
        'restore it from version control')


# NOTE(review): the ``def`` line of the function below was lost together with
# the text preceding it. The name and parameter list are placeholders inferred
# from the free variables of the surviving body (``subs``, ``sub_len``,
# ``percentile_threshold``), and the guard is assumed to be
# ``percentile_threshold > 0`` (only `` 0:`` survives). Restore the real
# signature before relying on this function.
def _subs_to_value_matrix(subs, sub_len, percentile_threshold):
    """Stack the values of ``subs`` into a 2-D array, optionally filtered.

    When the guard holds, each sub-sequence is ranked by the percentile of its
    ``silhouette`` among all silhouettes, and only those at or above
    ``percentile_threshold`` are kept.

    Returns:
        A tuple ``(sub_values, percentiles)``. ``sub_values`` has shape
        ``(number of kept subs, sub_len)``. ``percentiles`` has one entry per
        *input* sub-sequence (including the ones filtered out), or is empty
        when no filtering was done -- so it is not row-aligned with
        ``sub_values`` after filtering.
    """
    if percentile_threshold > 0:
        scores = [sub.silhouette for sub in subs]
        percentiles = [stats.percentileofscore(scores, sub.silhouette) for sub in subs]
        subs = [sub for sub, percent in zip(subs, percentiles) if percent >= percentile_threshold]
    else:
        percentiles = []

    cnt_subs = len(subs)
    sub_values = np.empty((cnt_subs, sub_len))
    # Row assignment requires each sub.values to hold exactly sub_len items.
    for i, sub in enumerate(subs):
        sub_values[i] = sub.values
    return sub_values, percentiles