You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

377 lines
16 KiB

import datetime as dt
import numpy as np
import pandas as pd
from string import Template
import sys
import time
from util.dev_util import str_int_list_no_bracket, enable_multi_resolution
from util.data_prepare import COL_VALUE, LOG_TRANSFORM_THRESHOLD, DAY_OF_WEEK_NAME
from util.data_prepare import down_sample, get_resolution, log_transform_statistics, remove_super_spikes
from util.pattern_evaluate import THRESHOLD_SILHOUETTE_LOW, THRESHOLD_SILHOUETTE_MEDIUM, THRESHOLD_SILHOUETTE_HIGH
from util.pattern_evaluate import check_weekly_pattern, evaluate_half_day, evaluate_hour_block
from util.threshold_utils import subsequence_by_time_policy, sub_sequences_groupby
from util.itsi_at_threshold import get_history_normal_behavior
from util import setup_logging
logger = setup_logging.get_logger()
# Day-of-week integers 0..6 (presumably Monday=0, the pandas convention -- confirm against DAY_OF_WEEK_NAME).
DAYS_IN_WEEK = [0, 1, 2, 3, 4, 5, 6]
# NOTE(review): 1440 = 24*60 and 10080 = 7*24*60, i.e. these are MINUTES per
# day/week despite the *_IN_SECOND names; the other durations built in this
# module are likewise hour-spans * 60. Names kept for backward compatibility.
DAY_IN_SECOND = 1440
WEEK_IN_SECOND = 10080
# Sentinel description strings returned when no schedule can be recommended.
NO_RECOMMENDATION = 'No Recommendation'
INSUFFICIENT_DATA = 'INSUFFICIENT_DATA'
NO_PATTERN = 'NO_PATTERN'
PATTERN_SWITCH = 'PATTERN_SWITCH'
# Cron templates: minute=0, hour=$start_hour, every day / only $dow_list days.
CRON_TEMPLATE_DAILY_PATTERN = Template('0 $start_hour * * *')
CRON_TEMPLATE_WEEKLY_PATTERN = Template('0 $start_hour * * $dow_list')
class TimePolicy:
    """Detected seasonality pattern for a metric time series.

    Attributes:
        hour_block_length: length of the repeating daily block in hours
            (0 when no intra-day block pattern was detected).
        offset: starting-hour offset of the pattern.
        has_weekend: True when a weekly seasonality was detected.
        offdays_start: first day-of-week of the two consecutive off-days
            (5 = the default weekend, rendered silently).
        score: silhouette score of the pattern (-2.0 = unevaluated).
        label_method: labeling method forwarded to subsequence grouping.
        log_trans: True when a log transform was applied during detection.
    """

    def __init__(self, hour_block_length, offset, has_weekend, offdays_start=5, score=-2.0, label_method=None, log_trans=False) -> None:
        self.hour_block_length = hour_block_length
        self.offset = offset
        self.has_weekend = has_weekend
        self.offdays_start = offdays_start
        self.score = score
        self.label_method = label_method
        self.log_trans = log_trans

    def __str__(self) -> str:
        """Human-readable one-line description of the pattern."""
        week_clause = 'weekly seasonality' if self.has_weekend else ''
        if self.hour_block_length == 0:
            # Hour-block pattern score was below threshold: weekly-only description.
            pieces = [week_clause]
            if self.offset != 0:
                pieces.append(f', offset={self.offset}h')
            if self.offdays_start != 5:
                next_day = (self.offdays_start + 1) % 7
                pieces.append(f', {DAY_OF_WEEK_NAME[self.offdays_start]} {DAY_OF_WEEK_NAME[next_day]} off')
            desc = ''.join(pieces)
        else:
            offset_clause = f', offset={self.offset}-hour' if self.offset != 0 else ''
            desc = f'daily {int(self.hour_block_length)}-hour blocks' + offset_clause + week_clause
        return desc or 'no pattern'
def _dict_key_for_weekly_cron(dow, start_hour):
return dow * 100 + start_hour
def get_cron_output(df, time_policy=None):
    """Convert a time policy into per-schedule cron threshold entries.

    Parameters:
        df: time-indexed DataFrame of the metric; at least 24h of data required.
        time_policy: optional TimePolicy; recommended from ``df`` when None.

    Returns:
        (entries, pattern_desc, score) where
        * entries is a list of (cron_string, duration, threshold) tuples sorted
          by their day-of-week/start-hour key (empty on failure);
        * pattern_desc is str(time_policy), or INSUFFICIENT_DATA / NO_PATTERN;
        * score is the silhouette score rounded to 3 digits (-1.0 when less
          than one day of data is available).

    NOTE(review): durations appear to be in minutes, not seconds -- they are
    computed as hour-spans * 60 and the whole-day duration is DAY_IN_SECOND
    (=1440); confirm the unit expected by downstream consumers.
    """
    # Checking for possible 'No Recommendation' reasons
    df_delta = df.index[-1] - df.index[0]
    if df_delta <= dt.timedelta(days=1):
        logger.warning(f'The difference between your ending time and starting time is {df_delta}. At least 1 day (24h) of data is required for time policy recommendation.')
        return [], INSUFFICIENT_DATA, -1.0
    if time_policy is None:
        time_policy = recommend_time_policy(df)
    if time_policy.score < THRESHOLD_SILHOUETTE_LOW:  # no pattern detected
        logger.warning(f'The Silhouette Score is {time_policy.score}, which is less than the minimum threshold of {THRESHOLD_SILHOUETTE_LOW}.')
        return [], NO_PATTERN, round(time_policy.score, 3)
    df_resolution = get_resolution(df)
    # Re-sample at the data's own resolution -- don't change resolution, just fill NAs.
    # NOTE(review): unpack_history_normal() passes df_resolution positionally as the
    # second argument instead; confirm both spellings hit the same parameter.
    df = down_sample(df, resolution=df_resolution, df_resolution=df_resolution)
    subs, label_method = subsequence_by_time_policy(df, time_policy)
    pattern_desc = str(time_policy)
    if not time_policy.has_weekend:
        # hour_block_length is only referenced on the no-weekly branch below,
        # so it is only bound here.
        hour_block_length = time_policy.hour_block_length
    cron_dura_thres_dict = {}
    day_offset = time_policy.offset
    weekly_with_offset = time_policy.has_weekend and time_policy.offset > 0
    if weekly_with_offset:
        # reset day_offset to re-use the code in the for-loop below;
        # the real offset is re-applied by compensate_weekly_with_offset() after the loop
        day_offset = 0
    for _, subs_group in sub_sequences_groupby(subs, label_method).items():
        history_normal = get_history_normal_behavior(subs_group)
        logger.debug(f'{str(history_normal)}')
        if time_policy.has_weekend:  # in this case, each subsequence is 1-day length
            dow_set = sorted(set([sub.dayofweek() for sub in subs_group]))
            dow_disp = str_int_list_no_bracket(dow_set)
            if history_normal.split_pnts is None:
                # Single threshold for the whole day on these days of week.
                key = _dict_key_for_weekly_cron(dow_set[0], day_offset)
                cron_dura_thres_dict[key] = (
                    CRON_TEMPLATE_WEEKLY_PATTERN.substitute(start_hour=day_offset, dow_list=dow_disp),
                    DAY_IN_SECOND,
                    history_normal.zs[0]
                )
            else:
                # One cron entry per intra-day segment; split_pnts are hour marks,
                # closed off with a final 24h boundary.
                left = 0
                split_pnts = history_normal.split_pnts + [24]
                for split_pnt, z in zip(split_pnts, history_normal.zs):
                    start_hour=(day_offset + left) % 24
                    key = _dict_key_for_weekly_cron(dow_set[0], start_hour)
                    cron_dura_thres_dict[key] = (
                        CRON_TEMPLATE_WEEKLY_PATTERN.substitute(start_hour=start_hour, dow_list=dow_disp),
                        (split_pnt - left)*60,
                        z
                    )
                    left = split_pnt
        else:  # no weekly pattern: one daily entry per starting hour of the block
            hour_set = sorted(set([sub.hour() for sub in subs_group]))
            if history_normal.split_pnts is None:
                # Single threshold for the whole hour block.
                for start_hour in hour_set:
                    cron_dura_thres_dict[start_hour] = (
                        CRON_TEMPLATE_DAILY_PATTERN.substitute(start_hour=start_hour),
                        hour_block_length * 60,
                        history_normal.zs[0]
                    )
            else:
                # Segment each hour block at the split points (closed off at block end).
                left = 0
                split_pnts = history_normal.split_pnts + [hour_block_length]
                for split_pnt, z in zip(split_pnts, history_normal.zs):
                    for start_hour in hour_set:
                        key = (start_hour + left) % 24
                        cron_dura_thres_dict[key] = (
                            CRON_TEMPLATE_DAILY_PATTERN.substitute(start_hour=key),
                            (split_pnt - left) * 60,
                            z
                        )
                    left = split_pnt
    if weekly_with_offset:
        # Apply the deferred hour offset, rolling entries into the next day where needed.
        cron_dura_thres_dict = compensate_weekly_with_offset(cron_dura_thres_dict, time_policy.offset)
    return [cron_dura_thres_dict[k] for k in sorted(cron_dura_thres_dict.keys())], pattern_desc, round(time_policy.score, 3)
def compensate_weekly_with_offset(cron_dura_thres_dict, day_offset):
    """Shift weekly cron entries forward by ``day_offset`` hours.

    get_cron_output() builds weekly entries with a zero hour offset first;
    this helper adds the real offset, and when an entry's start hour passes 24
    it is rolled over to the next day(s) of week.

    Parameters:
        cron_dura_thres_dict: dict keyed by dow*100 + start_hour with
            (cron_string, duration, threshold) values.
        day_offset: hour offset to add (0 < offset < 24 assumed -- single
            wrap-around only; TODO confirm at the caller).

    Returns:
        A new dict with adjusted keys and cron strings.
    """
    def _dow_plus_one(dow):
        # Day-of-week wrap-around: Saturday (6) rolls to Sunday (0).
        return (dow + 1) % 7
    def _cron_idx_dow(cron):
        # Index of the day-of-week field (everything after the last space).
        return cron.rindex(' ') + 1
    def _cron_get_dow_head(cron):
        # First day in the dow list; dow values are single digits 0-6.
        idx = _cron_idx_dow(cron)
        return cron[idx: idx+1]
    def _cron_get_dows(cron):
        # The full comma-separated dow list, e.g. '1,2,3'.
        return cron[_cron_idx_dow(cron) :]
    def _cron_update_dow(cron, dow):
        return cron[:_cron_idx_dow(cron)] + dow
    def _cron_hour_plus_offset(cron):
        # Cron strings start with '0 ' (minute field), so the hour begins at index 2.
        idx_right = cron.index(' ', 2)
        start_hour = int(cron[2:idx_right]) + day_offset
        return cron[:2] + str(start_hour) + cron[idx_right:], start_hour
    def _cron_hour_update(cron, hour):
        idx_right = cron.index(' ', 2)
        return cron[:2] + str(hour) + cron[idx_right:]
    def _unpack_dows_str(dows):
        return [int(d) for d in dows.split(',')]
    adjusted_dict = {}
    for (cron, dura, thres) in cron_dura_thres_dict.values():
        cron_a, start_hour = _cron_hour_plus_offset(cron)
        if start_hour < 24:  # still the same day
            key = _dict_key_for_weekly_cron(int(_cron_get_dow_head(cron_a)), start_hour)
            adjusted_dict[key] = (cron_a, dura, thres)
            continue
        # Start hour rolled past midnight: shift the schedule to the next day(s).
        start_hour -= 24
        dows = _cron_get_dows(cron_a)
        if len(dows) == 1:  # only one day, just move to the next day
            dows = str(_dow_plus_one(int(dows)))
            cron_a = _cron_update_dow(cron_a, dows)
            cron_a = _cron_hour_update(cron_a, start_hour)
            key = int(dows) * 100 + start_hour
            adjusted_dict[key] = (cron_a, dura, thres)
            continue
        # More than one day: besides +1 on every day, the last day is emitted as
        # its own entry -- presumably because it may wrap past Saturday back to 0,
        # which would break the sorted dow list / key scheme (verify intent).
        dow_list = _unpack_dows_str(dows)
        dow_list = [_dow_plus_one(d) for d in dow_list]
        cron_1 = CRON_TEMPLATE_WEEKLY_PATTERN.substitute(
            start_hour=start_hour,
            dow_list=str_int_list_no_bracket(dow_list[:-1])
        )
        key = _dict_key_for_weekly_cron(dow_list[0], start_hour)
        adjusted_dict[key] = (cron_1, dura, thres)
        cron_2 = CRON_TEMPLATE_WEEKLY_PATTERN.substitute(
            start_hour=start_hour,
            dow_list=str(dow_list[-1])
        )
        key = _dict_key_for_weekly_cron(dow_list[-1], start_hour)
        adjusted_dict[key] = (cron_2, dura, thres)
    return adjusted_dict
def unpack_history_normal(df, time_policy):
    """Expand the per-group thresholds into one flat hourly threshold list.

    For a weekly pattern the result covers the whole week (7 days * 24 hourly
    values); for a daily pattern it covers one day (24 hourly values). Returns
    [] when the policy score is below the no-pattern threshold.

    NOTE(review): assumes history_normal.zs has exactly one entry when
    split_pnts is None (the rest of this module indexes zs[0] in that case) --
    otherwise ``list(zs) * 24`` would over-produce entries; confirm upstream.
    """
    if time_policy.score < THRESHOLD_SILHOUETTE_LOW:  # no pattern detected
        return []
    df_resolution = get_resolution(df)
    # Re-sample at the data's own resolution -- don't change resolution, just fill NAs.
    # NOTE(review): get_cron_output() calls down_sample with both resolution= and
    # df_resolution= keywords; here df_resolution is passed positionally -- verify
    # both forms bind the same parameter.
    df = down_sample(df, df_resolution)
    subs, label_method = subsequence_by_time_policy(df, time_policy)
    if time_policy.has_weekend:
        dow_thresholds_dict = {}
    else:
        hour_thresholds_dict = {}
        hour_block_length = time_policy.hour_block_length
    for _, subs_group in sub_sequences_groupby(subs, label_method).items():
        history_normal = get_history_normal_behavior(subs_group)
        if time_policy.has_weekend:  # in this case, each subsequence is 1-day length
            dow_set = set([sub.dayofweek() for sub in subs_group])
            if history_normal.split_pnts is None:
                # Flat threshold across all 24 hours of the day.
                day_thresholds = list(history_normal.zs) * 24
            else:
                # One threshold per intra-day segment, repeated per hour.
                day_thresholds = []
                left = 0
                split_pnts = history_normal.split_pnts + [24]
                for split_pnt, z in zip(split_pnts, history_normal.zs):
                    day_thresholds += [z] * (split_pnt - left)
                    left = split_pnt
            for dow in dow_set:
                dow_thresholds_dict[dow] = day_thresholds
        else:  # no weekly pattern: thresholds per hour block
            hour_set = set([sub.hour() for sub in subs_group])
            if history_normal.split_pnts is None:
                hours_thresholds = list(history_normal.zs) * hour_block_length
            else:
                hours_thresholds = []
                left = 0
                split_pnts = history_normal.split_pnts + [hour_block_length]
                for split_pnt, z in zip(split_pnts, history_normal.zs):
                    hours_thresholds += [z] * (split_pnt - left)
                    left = split_pnt
            for hour in hour_set:
                hour_thresholds_dict[hour] = hours_thresholds
    thresholds = []
    if time_policy.has_weekend:
        # Concatenate per-day threshold lists in day-of-week order.
        for dow in range(7):
            thresholds += dow_thresholds_dict[dow]
    else:
        # Blocks were stored keyed by their (offset-shifted) start hour.
        for hour in range(0, 24, hour_block_length):
            thresholds += hour_thresholds_dict[hour + time_policy.offset]
    if time_policy.offset == 0:
        return thresholds
    else:
        # Rotate right by offset hours so index 0 corresponds to hour 0.
        return thresholds[-time_policy.offset:] + thresholds[:len(thresholds) - time_policy.offset]
def recommend_time_policy(df):
    """Recommend a TimePolicy (seasonality pattern) for the given time series.

    Pipeline: remove super spikes, 5-point centered smoothing, optional log
    transform (inverted before returning), then multi-resolution seasonality
    evaluation.

    Parameters:
        df: time-indexed DataFrame with the metric in COL_VALUE. The COL_VALUE
            column of the (spike-removed) frame is replaced by its smoothed
            version as a side effect.

    Returns:
        TimePolicy describing the detected pattern (score -2.0 / hour block 0
        when nothing was detected).
    """
    df = remove_super_spikes(df)
    # 5-point centered smoothing. BUG FIX: the previous chained assignment
    # (df[COL_VALUE].iloc[0] = ...) raised SettingWithCopyWarning and silently
    # fails to write under pandas copy-on-write; patch the smoothed Series
    # first, then assign the column once.
    smoothed = df[COL_VALUE].rolling(5, center=True).mean()
    # rolling(5, center=True) leaves NaN at the two edge samples on each side;
    # pad them with the nearest fully-smoothed value.
    smoothed.iloc[0] = smoothed.iloc[2]
    smoothed.iloc[1] = smoothed.iloc[2]
    smoothed.iloc[-1] = smoothed.iloc[-3]
    smoothed.iloc[-2] = smoothed.iloc[-3]
    df[COL_VALUE] = smoothed
    log_trans = False
    range_ratio, log_values, c = log_transform_statistics(df[COL_VALUE])
    logger.debug(f'range_ratio={range_ratio:.3f}')
    if range_ratio > LOG_TRANSFORM_THRESHOLD:
        logger.info('Apply Log transformation in recommend_time_policy()')
        df[COL_VALUE] = log_values
        log_trans = True
    hour_block_length, offset, has_weekend, offdays_start, score, label_method = _evaluate_seasonality_patterns_at_multi_resolution(df)
    if log_trans:  # invert log-trans on COL_VALUE (smoothing itself is not inverted)
        df[COL_VALUE] = np.exp(log_values) - c
    return TimePolicy(hour_block_length, offset, has_weekend, offdays_start, score, label_method, log_trans)
def _evaluate_seasonality_patterns_at_multi_resolution(df):
    """Run _evaluate_seasonality_patterns() at one or more resolutions, keep the best.

    With multi-resolution enabled, 15/30/60-minute down-samplings are tried
    (stopping early once the score exceeds THRESHOLD_SILHOUETTE_HIGH); otherwise
    a single pass at max(native resolution, 15min) is used.

    Returns:
        The same 6-tuple as _evaluate_seasonality_patterns():
        (hour_block_length, offset, has_weekend, offdays_start, score,
        label_method); defaults to (0, 0, False, 0, -2.0, None) when no
        candidate scores above 0.
    """
    df_resolution = get_resolution(df)
    if enable_multi_resolution():
        resolution_list = ['15min', '30min', '60min']
        if df_resolution > pd.Timedelta(resolution_list[2]):
            # Data is coarser than every candidate: evaluate at native resolution only.
            resolution_list = [df_resolution]
    else:  # disable multi-resolution by default to reduce latency
        resolution = pd.Timedelta('15min')
        if df_resolution > resolution:
            resolution_list = [df_resolution]
        else:
            resolution_list = [resolution]
    best_score = 0.0
    result = (0, 0, False, 0, -2.0, None)
    for resolution in resolution_list:
        if df_resolution > pd.Timedelta(resolution):
            # Cannot up-sample: skip candidate resolutions finer than the data.
            continue
        # NOTE(review): df is rebound here, so a later (coarser) pass re-samples
        # the previous pass's output rather than the original frame -- presumably
        # equivalent for this down-sampling; confirm.
        df = down_sample(df, resolution=resolution)
        hour_block_length, offset, has_weekend, offdays_start, score, label_method = _evaluate_seasonality_patterns(df)
        # time_policy = TimePolicy(hour_block_length, offset, has_weekend, offdays_start, score)
        # logger.debug(f'{str(time_policy)}; (resolution={resolution})')
        if score > best_score:
            best_score = score
            result = (hour_block_length, offset, has_weekend, offdays_start, score, label_method)
        if best_score > THRESHOLD_SILHOUETTE_HIGH: break  # TODO: early-exit threshold to revisit
    return result
def _evaluate_seasonality_patterns(df):
    """Evaluate weekly, hour-block and half-day seasonality candidates; pick the best.

    Returns:
        (hour_block_length, offset, has_weekend, offdays_start, score,
        label_method). hour_block_length is 0 when no intra-day pattern beats
        THRESHOLD_SILHOUETTE_LOW; label_method is only returned for the weekly
        pattern.
    """
    time_0 = time.time()
    best_score_week, offdays_start, offset, label_method = check_weekly_pattern(df)
    logger.debug(f'weekly pattern time spent: {time.time() - time_0:.2f}s')
    time_0 = time.time()
    hour_block_candidate = evaluate_hour_block(df)
    logger.debug(f'hour-block pattern time spent: {time.time() - time_0:.2f}s')
    time_0 = time.time()
    score_half, workhour_start = evaluate_half_day(df)
    logger.debug(f'half-day pattern time spent: {time.time() - time_0:.2f}s')
    if hour_block_candidate is None:
        hour_block_score = -2.0  # sentinel: no hour-block candidate found
    else:
        hour_block_score = hour_block_candidate.silhouette
    has_weekend = best_score_week > THRESHOLD_SILHOUETTE_LOW
    # ONLY add hour-block or workhour pattern when it improves the score.
    # BUG FIX: compare against hour_block_score, not hour_block_candidate.silhouette,
    # so a None candidate cannot raise AttributeError here.
    if has_weekend and best_score_week > hour_block_score and best_score_week > score_half:
        return 0, offset, has_weekend, offdays_start, best_score_week, label_method
    if hour_block_score >= score_half:
        best_score = hour_block_score
        # BUG FIX: guard against a None candidate before dereferencing its fields.
        if hour_block_candidate is not None and best_score >= THRESHOLD_SILHOUETTE_LOW:
            hour_block_length = hour_block_candidate.hourdelta
            offset = hour_block_candidate.offset
        else:
            hour_block_length = 0
            offset = 0
    else:
        best_score = score_half
        if best_score >= THRESHOLD_SILHOUETTE_LOW:
            hour_block_length = 12  # half-day pattern
            offset = workhour_start
        else:
            hour_block_length = 0
            offset = 0
    return hour_block_length, offset, False, offdays_start, best_score, None
if __name__ == '__main__':
    # Sample usage of the time policy recommendation on the NAB nyc_taxi dataset.
    import os
    import time
    # NOTE(review): the top of this file imports these names from
    # util.data_prepare; this script path imports them bare -- presumably the
    # script is run with util/ on sys.path. Verify before relying on it.
    from data_prepare import NAB_TIMESTAMP_FORMAT
    from data_prepare import load_time_series_from_file, down_sample
    curr_dir = os.path.dirname(os.path.realpath(__file__))
    timeseries_name = 'nyc_taxi'
    path_data = os.path.join(curr_dir, '../..', f'data/{timeseries_name}.csv')
    print('time series:', timeseries_name, file=sys.stderr)
    start_time = time.time()
    os.environ['TIMEPOLICY_VERBOSE'] = 'False'
    df = load_time_series_from_file(path_data, NAB_TIMESTAMP_FORMAT)
    df = down_sample(df)
    # BUG FIX: recommend_time_policy() returns a single TimePolicy, not a
    # (policy, scores) tuple -- the old two-target unpack raised TypeError.
    time_policy = recommend_time_policy(df)
    print(str(time_policy) + f' (time spent: {time.time() - start_time :.2f})', file=sys.stderr)