from datetime import datetime
import numbers

import numpy as np
import pandas as pd
from scipy.special import expit
from scipy.stats import iqr
from statsmodels.nonparametric.smoothers_lowess import lowess

from io import open
from six.moves import range

from util.dev_util import compare_versions
from util.setup_logging import get_logger

logger = get_logger(__name__)

pd.options.mode.chained_assignment = 'raise'

EPSILON = 1e-7

DEFAULT_Z = 3.3
DEFAULT_SENSITIVITY_MULTIPLIER = 3.0
DEFAULT_IQR_MULTIPLIER = 1.5
DEFAULT_QUANTILE = [99.5, 0.5]

THRESH_SUPER_SPIKE = 3.0
THRESH_SUPER_SPIKE_REGULARITY = 7

ALGO_STD = 'stdev'
ALGO_QUANTILE = 'quantile'
ALGO_RANGE = 'range'
ALGO_PERCENT = 'percentage'
ALGO_IQR = 'iqr'

COL_TIMESTAMP = 'timestamp'
COL_KPI_ID = 'kpi_id'
COL_SERVICE_ID = 'service_id'
COL_DATE = 'date'
COL_VALUE = 'value'
COL_HOUR = 'HourOfDay'
COL_DAY_OF_WEEK = 'DayOfWeek'

DAY_OF_WEEK_NAME = {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'}

NAB_TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'
ITSI_TIMESTAMP_FORMAT = '%Y-%m-%dT%H:%M:%S.000%z'
ITSI_TIMESTAMP_FORMAT_PD = '%Y-%m-%d %H:%M:%S%z'

COL_BND_LOW = 'bnd_low'
COL_BND_UP = 'bnd_up'

COL_ITSI_BND_LOW = 'itsi_bnd_low'
COL_ITSI_BND_UP = 'itsi_bnd_up'

COL_ANOMALY_LABEL = 'anomaly_label'  # binary 0-1 label
COL_ANOMALY_SCORE = 'anomaly_score'  # anomaly score, expressed as the distance from a data point to the anomaly boundary

COL_EDGE_MASK = 'edge_mask'  # binary 0-1 mask to mitigate false alarms at the edges of segments

class NotEnoughDataException(Exception):
    pass


def sigmoid(x, trans=1):
    """
    Use the sigmoid function to map x to [0, 1].
    The default trans=1 ensures that x=1 maps to 0.5.
    """
    # Use the expit function from scipy because it is more robust and efficient:
    # https://stackoverflow.com/questions/21106134/numpy-pure-functions-for-performance-caching
    return expit(x - trans)

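# Illustrative example (not part of the module's API): with the default
# trans=1, a normalized deviation of exactly 1 maps to 0.5, and larger
# deviations saturate toward 1:
#   >>> float(sigmoid(1))
#   0.5
#   >>> round(float(sigmoid(4)), 3)
#   0.953
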
SPL_DEFAULT_TIME_COL = '_time'
SPL_VAL_COL = {'count', 'value', 'alert_value'}


def is_time_column(df, i):
    if df.columns[i] == SPL_DEFAULT_TIME_COL:
        return True

    sample = df[df.columns[i]].iloc[0]
    try:
        _ = pd.to_datetime(sample)
        return True
    except Exception:  # parsing failed: not a time column
        return False

def is_val_column(df, i):
    if df.columns[i] in SPL_VAL_COL:
        return True

    sample = df[df.columns[i]].iloc[0]
    if isinstance(sample, numbers.Number):
        return True

    return False

def identify_time_and_value_columns(df):
    if len(df) < 1:
        raise RuntimeError('the dataframe is empty')

    cnt_columns = len(df.columns)
    if cnt_columns < 2:
        raise RuntimeError('the dataframe has fewer than 2 columns')

    if cnt_columns == 2:
        if df.columns[1].find('time') >= 0:  # handle CSV files whose time and value columns are in the opposite order
            df.columns = [COL_VALUE, COL_TIMESTAMP]
        else:
            df.columns = [COL_TIMESTAMP, COL_VALUE]
        return df

    time_col_idx = -1
    val_col_idx = -1
    for i in range(cnt_columns):
        if time_col_idx < 0 and is_time_column(df, i):
            time_col_idx = i
        if val_col_idx < 0 and is_val_column(df, i):
            val_col_idx = i
        if time_col_idx >= 0 and val_col_idx >= 0:
            break

    if time_col_idx >= 0 and val_col_idx >= 0:
        time_col = df.columns[time_col_idx]
        val_col = df.columns[val_col_idx]
        df = df[[time_col, val_col]]
        df.columns = [COL_TIMESTAMP, COL_VALUE]
        return df

    raise RuntimeError('cannot identify time and value columns in the dataframe')

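# Illustrative example (hypothetical column names): given an SPL-style result
# with extra columns, the time and value columns are picked out and renamed:
#   >>> df = pd.DataFrame({'host': ['web-1', 'web-1'],
#   ...                    '_time': ['2024-01-01 00:00:00', '2024-01-01 00:05:00'],
#   ...                    'count': [10, 12]})
#   >>> identify_time_and_value_columns(df).columns.tolist()
#   ['timestamp', 'value']
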
def is_unix_epoch_timestamp(timestamp):
    try:
        # Check that it can be converted to a number, and that it falls in a
        # reasonable range (assuming timestamps between 1970 and 2100)
        return 0 < float(timestamp) < 4102444800
    except (TypeError, ValueError):
        return False

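# Illustrative example: 4102444800 is the epoch second for 2100-01-01, so
# plausible second-resolution epochs pass and date strings do not:
#   >>> is_unix_epoch_timestamp('1609459200')
#   True
#   >>> is_unix_epoch_timestamp('2021-01-01')
#   False
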
def is_mixed_timezones(timestamps):
    """
    Reference for "Timezones and time offsets":
    https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html#to-datetime-tz-examples
    """
    if is_unix_epoch_timestamp(timestamps.iloc[0]):
        return False

    datetimes = pd.to_datetime(timestamps, infer_datetime_format=True)
    # In the case of mixed timezones, pd.to_datetime(..) defaults to returning a series of generic objects
    return datetimes.dtype == np.dtype(object)

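# Illustrative example (behavior of the pandas versions this module targets,
# where mixed UTC offsets yield an object-dtype result rather than an error):
#   >>> s = pd.Series(['2024-01-01 00:00:00+00:00', '2024-01-01 00:00:00+05:00'])
#   >>> is_mixed_timezones(s)
#   True
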
def aug_datetime(df, unit="ms", drop_raw_timestamp=True):
    is_mix_tz = is_mixed_timezones(df[COL_TIMESTAMP])

    try:
        df[COL_DATE] = pd.to_datetime(df[COL_TIMESTAMP], unit=unit, utc=is_mix_tz)
    except Exception:  # not a numeric epoch in the given unit: fall back to string parsing
        df[COL_DATE] = pd.to_datetime(df[COL_TIMESTAMP], infer_datetime_format=True, utc=is_mix_tz)

    if drop_raw_timestamp:
        df.drop(COL_TIMESTAMP, axis=1, inplace=True)
    df[COL_DAY_OF_WEEK] = df[COL_DATE].dt.dayofweek
    df[COL_HOUR] = df[COL_DATE].dt.hour

    df.dropna(inplace=True)  # TODO: try to remove the dropna
    df[COL_DAY_OF_WEEK] = df[COL_DAY_OF_WEEK].astype("uint8")
    df[COL_HOUR] = df[COL_HOUR].astype("uint8")

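# Illustrative example: aug_datetime mutates the frame in place, replacing the
# raw 'timestamp' column with the parsed 'date', 'DayOfWeek' and 'HourOfDay':
#   >>> df = pd.DataFrame({'timestamp': ['2024-01-01 08:00:00'], 'value': [1.0]})
#   >>> aug_datetime(df)
#   >>> sorted(df.columns)
#   ['DayOfWeek', 'HourOfDay', 'date', 'value']
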
def load_time_series_from_file(filename, filterCol=None):
    with open(filename) as f:
        df = pd.read_csv(f)

    if filterCol is not None and len(filterCol) == 2:
        col_name = filterCol[0]
        col_value = filterCol[1]
        df = df[df[col_name] == col_value]

    df = identify_time_and_value_columns(df)
    aug_datetime(df)
    df = df.set_index(COL_DATE)
    df = df.sort_index()

    return df

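# Illustrative usage (hypothetical file and filter values): load a CSV,
# optionally keeping only rows where a given column equals a given value,
# and get back a frame indexed by parsed datetime:
#   >>> df = load_time_series_from_file('kpi.csv', filterCol=('kpi_id', 'cpu_util'))
#   >>> df.columns.tolist()
#   ['value', 'DayOfWeek', 'HourOfDay']
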
def get_resolution(df):
    # Take the most frequent delta between consecutive timestamps (over the
    # first up-to-1000 points) as the native resolution of the series
    ts_diff = df.index[:min(df.shape[0], 1000)].to_series().diff()
    frequent_timestamp_delta_sorted = ts_diff.value_counts()

    if frequent_timestamp_delta_sorted.index[0] == pd.Timedelta(seconds=0):
        # skip zero deltas caused by duplicated timestamps
        assert len(frequent_timestamp_delta_sorted) > 1
        return frequent_timestamp_delta_sorted.index[1]
    else:
        return frequent_timestamp_delta_sorted.index[0]

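# Illustrative example: a regular 5-minute index yields a 5-minute resolution:
#   >>> idx = pd.to_datetime(['2024-01-01 00:00', '2024-01-01 00:05', '2024-01-01 00:10'])
#   >>> get_resolution(pd.DataFrame({'value': [1.0, 2.0, 3.0]}, index=idx))
#   Timedelta('0 days 00:05:00')
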
def down_sample(df, resolution='15min', df_resolution=None):
    # resolution is a 'rule' supported by https://pandas.pydata.org/docs/reference/api/pandas.Series.resample.html
    # examples: 3min or 3T : for a resolution of 3 minutes
    #           30S : for a resolution of 30 seconds

    if not df_resolution:
        df_resolution = get_resolution(df)
    if df_resolution > (pd.Timedelta(resolution) if isinstance(resolution, str) else resolution):
        logger.warning(f'The downsampling target resolution of {resolution} is finer than the resolution of the input time series of {df_resolution}; no change is made to the resolution of the input time series.')
        return df

    # Replace missing values with forward filling. Median filling and filling
    # with 0 were also considered; forward filling was chosen because it
    # requires less domain knowledge and fewer assumptions.
    df_resampled = df.resample(resolution).mean()
    df_resampled[COL_VALUE].fillna(method="ffill", inplace=True)

    df_resampled[COL_DAY_OF_WEEK].fillna(method="ffill", inplace=True)
    df_resampled[COL_HOUR].fillna(method="ffill", inplace=True)
    df_resampled[COL_DAY_OF_WEEK] = df_resampled[COL_DAY_OF_WEEK].astype("uint8")
    df_resampled[COL_HOUR] = df_resampled[COL_HOUR].astype("uint8")

    return df_resampled

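# Illustrative usage: coarsen a loaded 5-minute series to 15 minutes; values
# become bucket means and gaps are forward-filled:
#   >>> df = load_time_series_from_file('kpi.csv')   # hypothetical file
#   >>> df_15min = down_sample(df, resolution='15min')
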
# Remove the trend on the basis of the per-day median, so that trends from
# day to day are removed but differences among the hours within a day are kept
def detrend_daily(df, offset=0):
    rslt = df.copy()
    overall_med = df[COL_VALUE].median()  # use the median to make it robust to extreme values
    dfre = resample_with_offset(df, '24h', offset)
    for _, sub_df in dfre:
        med = sub_df[COL_VALUE].median()
        rslt.loc[sub_df.index, COL_VALUE] = sub_df[COL_VALUE] - med + overall_med
    return rslt

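# Illustrative worked example: each day's values are shifted so the day's
# median matches the overall median, e.g. a day whose median sits 10 above the
# overall median has 10 subtracted from every point of that day, leaving the
# intraday shape untouched.
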
def resample_with_offset(df, resolution, offset_hours):
    pandas_version = pd.__version__
    comparison = compare_versions(pandas_version, '1.1.0')
    if comparison < 0:
        # pandas < 1.1.0 uses the 'base' keyword
        dfre = df.resample(resolution, base=offset_hours)
    else:
        # pandas >= 1.1.0 replaced 'base' with 'offset'
        dfre = df.resample(resolution, offset=f'{offset_hours}h')
    return dfre

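# Illustrative usage: with offset_hours=5, daily buckets run from 05:00 to
# 05:00 instead of midnight to midnight:
#   >>> for day_start, sub_df in resample_with_offset(df, '24h', 5):
#   ...     print(day_start, len(sub_df))
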
# Convert anomaly labels represented as strings to datetime
def anomaly_labels_to_datetime(anomaly_labels, timestamp_format):
    anomaly_datetime = []
    for anomaly in anomaly_labels:
        if isinstance(anomaly, tuple):  # anomaly region
            anomaly_datetime.append([datetime.strptime(endpoint, timestamp_format) for endpoint in anomaly])
        else:
            anomaly_datetime.append(datetime.strptime(anomaly, timestamp_format))
    return anomaly_datetime

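# Illustrative example: point labels become datetime objects, while
# (start, end) tuples become two-element lists of datetimes:
#   >>> anomaly_labels_to_datetime(['2014-04-10 07:15:00'], NAB_TIMESTAMP_FORMAT)
#   [datetime.datetime(2014, 4, 10, 7, 15)]
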
def smooth(xs, frac):
    # LOWESS against the positional index; frac is the fraction of the data
    # used for each local fit, and it=0 disables the robustifying iterations
    filtered = lowess(xs, range(len(xs)), is_sorted=True, frac=frac, it=0)
    return [x[1] for x in filtered]

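# Illustrative example: with frac=0.8, a single spike in an otherwise flat
# series is pulled toward its neighbors:
#   >>> smooth([1.0, 5.0, 1.0, 1.0, 1.0], frac=0.8)   # returns 5 smoothed values
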
def remove_super_spikes(df):
    """
    As a pre-processing step, remove super-high spikes so that the calculation
    of thresholds is not distorted by them
    """
    df_resolution = get_resolution(df)
    win_length = int(pd.Timedelta(days=1) / df_resolution)  # one-day-long window
    half_day = pd.Timedelta(hours=12)

    # Require a spike to recur on several days before treating it as regular,
    # capped by the number of full days covered by the data
    thresh_regularity = max(
        min(
            THRESH_SUPER_SPIKE_REGULARITY,
            int((df.index[-1] - df.index[0]) / pd.Timedelta(days=1)) - 1
        ),
        4
    )

    values = df[COL_VALUE].to_numpy()
    mid = np.median(values)
    variation_unit = iqr(values)
    if variation_unit == 0:
        variation_unit = values.std() + 1e-6
    deviation_normalized = np.abs(df[COL_VALUE] - mid) / variation_unit

    frac = min(1.0, 10 / len(deviation_normalized))  # lowess requires frac <= 1 for very short series
    smoothed = pd.Series(smooth(deviation_normalized.values, frac), index=deviation_normalized.index)
    deviation_normalized.update(smoothed)

    def _count_high_spikes_same_hour_other_day(idx_in, similar_spikes):
        cnt = 0
        # The difference between two timestamp indexes is a pandas.Timedelta.
        # There is no well-defined 'abs' and the negative case is also tricky:
        # https://stackoverflow.com/questions/31836788/subtracting-pandas-timestamps-absolute-value
        other_spikes = similar_spikes.loc[
            # select rows at least half a day away and with the same hour-of-day (+/- 1 hour)
            ((similar_spikes.index - idx_in > half_day) | (idx_in - similar_spikes.index > half_day)) &
            (abs(similar_spikes.index.hour - idx_in.hour) <= 1)
        ]
        if len(other_spikes) == 0:
            return cnt

        cnt = 1
        other_spikes_idx = other_spikes.index
        idx_pre = other_spikes_idx[0]
        for idx in other_spikes_idx:
            if idx - idx_pre > df_resolution:  # consecutive points do NOT count as multiple spikes
                cnt += 1
            idx_pre = idx

        return cnt

    def _is_local_minimal(values, idx):
        if idx == 0:
            return values.iloc[idx] < values.iloc[idx + 1]
        elif idx == len(values) - 1:
            return values.iloc[idx] < values.iloc[idx - 1]
        else:
            return (values.iloc[idx] < values.iloc[idx + 1]) and (values.iloc[idx] < values.iloc[idx - 1])

    def _locate_spike_range(idx_i):
        if isinstance(idx_i, int):
            left = idx_i
            right = idx_i
        else:  # idx_i could be a slice, if df.index.get_loc does not get a unique match
            left = idx_i.start
            right = idx_i.stop

        # walk outward from the peak until the deviation drops back to a local minimum below the threshold
        while left > 0 and deviation_normalized.iloc[left] > 0 and (deviation_normalized.iloc[left] > THRESH_SUPER_SPIKE or not _is_local_minimal(deviation_normalized, left)):
            left -= 1

        while right <= len(df) - 1 and deviation_normalized.iloc[right] > 0 and (deviation_normalized.iloc[right] > THRESH_SUPER_SPIKE or not _is_local_minimal(deviation_normalized, right)):
            right += 1
        return left, right

    def _calc_neighbor_values(left, right):
        left_neighbor = df[COL_VALUE].iloc[max(0, left - win_length):left]
        left_mean, left_std = left_neighbor.mean(), left_neighbor.std()
        right_neighbor = df[COL_VALUE].iloc[right:min(right + win_length, len(df))]
        right_mean, right_std = right_neighbor.mean(), right_neighbor.std()

        if len(left_neighbor) == 0:
            neighbor_mean = right_mean
        elif len(right_neighbor) == 0:
            neighbor_mean = left_mean
        else:
            if left_std + right_std > 0:
                # weight each side inversely to its variability: the steadier
                # side contributes more to the replacement value
                neighbor_mean = (left_mean * right_std + right_mean * left_std) / (left_std + right_std)
            else:
                neighbor_mean = (left_mean + right_mean) / 2

        return neighbor_mean

    cnt_iter = 0
    iter_limit = int(len(df) * 0.01)
    cnt_removed_spikes = 0
    removed_spikes = []
    regular_spikes = []
    while True:
        cnt_iter += 1
        if cnt_iter >= iter_limit:
            break

        if len(regular_spikes) > 0:
            deviation_normalized_filter_regular = deviation_normalized[~deviation_normalized.index.isin(regular_spikes)]
        else:
            deviation_normalized_filter_regular = deviation_normalized

        if len(deviation_normalized_filter_regular) == 0:
            break
        max_val = deviation_normalized_filter_regular.max()
        if max_val < THRESH_SUPER_SPIKE:
            break
        max_idx = deviation_normalized_filter_regular.idxmax()

        idx_i = deviation_normalized.index.get_loc(max_idx)
        # need to remove the whole spike, not just its highest part
        left, right = _locate_spike_range(idx_i)

        similar_spikes = df[(deviation_normalized > max(0.5 * max_val, THRESH_SUPER_SPIKE)) & (deviation_normalized < 2.0 * max_val)]
        # Only remove the super spikes that are unlikely to happen regularly on multiple days
        regularity_count = _count_high_spikes_same_hour_other_day(max_idx, similar_spikes)
        if regularity_count < thresh_regularity:
            idx_i = idx_i if isinstance(idx_i, int) else idx_i.start  # idx_i could be a slice when df.index.get_loc does not get a unique match
            removed_spikes.append(f'({max_idx}, {df[COL_VALUE].iloc[idx_i]:.3f}, {deviation_normalized.iloc[idx_i]:.3f})')
            df.loc[df.index[left:right], COL_VALUE] = _calc_neighbor_values(left, right)

            # mark the removed range as below threshold so it is not picked again
            deviation_normalized.iloc[left:right] = -0.3
            cnt_removed_spikes += 1
        else:
            regular_spikes += deviation_normalized.index[left:right].tolist()

    if cnt_removed_spikes > 0:
        logger.debug(f'cnt_removed_spikes={cnt_removed_spikes} (cnt_iter={cnt_iter}) {removed_spikes}')

    return df

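# Illustrative usage: remove_super_spikes mutates and returns the same frame,
# flattening irregular extreme spikes before thresholds are fitted:
#   >>> df = load_time_series_from_file('kpi.csv')   # hypothetical file
#   >>> df = remove_super_spikes(df)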