You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
134 lines
5.9 KiB
134 lines
5.9 KiB
from dataclasses import dataclass
|
|
import time
|
|
import numpy as np
|
|
import pandas as pd
|
|
import stumpy
|
|
from statsmodels.nonparametric.smoothers_lowess import lowess
|
|
from scipy.signal import argrelmin
|
|
|
|
from util.data_prepare import down_sample, get_resolution, COL_VALUE
|
|
|
|
from util import setup_logging
|
|
from six.moves import range, zip
|
|
logger = setup_logging.get_logger()
|
|
|
|
@dataclass(frozen=True)
class Params_PatternSwitchDetector:
    """Tuning parameters for pattern switch detection.

    The attributes are declared as annotated fields so that ``frozen=True``
    actually applies: without type annotations, ``dataclass`` ignores plain
    class attributes entirely (no fields are generated) and instances would
    silently remain mutable.
    """

    # segment (window) length used for the matrix profile computation
    MP_WINDOW_LENGTH: pd.Timedelta = pd.Timedelta(days=1)
    # size of the leading/trailing edge skipped when computing count-cross
    COUNT_CROSS_EDGE: pd.Timedelta = pd.Timedelta(days=1)
    # stride between consecutive count-cross evaluations
    COUNT_CROSS_STEP: pd.Timedelta = pd.Timedelta(hours=1)
    # LOWESS smoothing window applied to the count-cross series
    SMOOTH_LENGTH: pd.Timedelta = pd.Timedelta(weeks=1)
    # minimum relative-depth weight for a local minimum to count as a switch
    MINIMA_WEIGHT_THRESHOLD: float = 0.1


PARAM_PSD = Params_PatternSwitchDetector()
|
|
|
|
def detect_pattern_switch(df):
    """Detect pattern switch points in a time series based on the matrix profile.

    How the matrix profile is used for pattern switch detection:
    For a given point in the time series, the matrix profile API computes the minimum distance
    between the segment starting from the current point and all other segments of the same length
    in the time series. Additionally, it provides the position of the segment with the minimal
    distance. This position info can be treated as a pointer from the initial segment to the
    minimal-distance segment.

    From the concept of pointer to minimal-distance segment, we can define a test statistic:
    count of cross. The count of cross for a data point in the time series is defined as the
    count of those pointers that go across it.

    Intuitively, when encountering a pattern switch point within the time series, the segments
    before the switch resemble other segments before the switch point, and similarly for
    segments after the switch point. The count of cross at the switch point should therefore
    be very small.

    Consequently, the pattern switch detection is implemented by identifying significant
    local minima of the 'count of cross' test statistic.

    Args:
        df (pandas.DataFrame): dataframe containing a time series (indexed by timestamps,
            values in the COL_VALUE column)

    Returns:
        numpy.ndarray of pandas.Timestamp: detected pattern switch points; a plain empty
        list is returned when the series is too short to compute the test statistic.
    """
    time0 = time.time()
    df = down_sample(df)
    df_resolution = get_resolution(df)

    # convert the time-based parameters into numbers of samples at this resolution
    window_length = int(PARAM_PSD.MP_WINDOW_LENGTH / df_resolution)
    edge_length = int(PARAM_PSD.COUNT_CROSS_EDGE / df_resolution)
    step_size = int(PARAM_PSD.COUNT_CROSS_STEP / df_resolution)
    smooth_length = int(PARAM_PSD.SMOOTH_LENGTH / df_resolution)

    mp = stumpy.stump(df[COL_VALUE], m=window_length, normalize=True)
    counts = count_cross(
        mp[:, 1],  # column 1 of the profile: pointers to each segment's minimal-distance segment
        edge_length,
        step_size
    )
    if len(counts) == 0:
        return []

    # smooth the raw count-cross, to remove small local random variations
    frac = smooth_length / len(df)
    counts_s = np.array(smooth(counts, frac))

    # get the indexes of local minima
    min_idx_list = argrelmin(counts_s)
    min_idx_list = min_idx_list[0]

    # Calculate weights for each local minimum of the counts_s array.
    # When the counts_s array contains local maxima:
    #   for each local minimum, use its closest local maximum as the reference point and
    #   calculate the weight as the relative difference between the two.
    # When the counts_s array contains no local maxima:
    #   use the median of the counts_s array as the reference for the weight calculation instead.
    min_weights = []
    if len(min_idx_list) > 0:
        # local maxima of counts_s are the local minima of its negation
        max_idx_list = argrelmin(-counts_s)
        max_idx_list = max_idx_list[0]
        if len(max_idx_list) > 0:
            min_closest_neighbor = [max_idx_list[np.argmin(abs(min_idx - max_idx_list))] for min_idx in min_idx_list]
            # NOTE(review): assumes counts_s is non-zero at local minima (the raw counts are
            # filtered to positive values before smoothing) — confirm, otherwise this divides by zero
            min_weights = [(counts_s[min_neighbor] - counts_s[min_idx])/counts_s[min_idx] for (min_idx, min_neighbor) in zip(min_idx_list, min_closest_neighbor)]
        else:
            reference_value = np.median(counts_s)
            min_weights = [(reference_value - counts_s[min_idx])/counts_s[min_idx] for min_idx in min_idx_list]
    min_weights = np.array(min_weights)

    # convert indexes of local minima to timestamps: count-cross index i corresponds to the
    # time COUNT_CROSS_EDGE after the series start, plus i strides of COUNT_CROSS_STEP
    switch_time0 = df.index[0] + PARAM_PSD.COUNT_CROSS_EDGE
    switch_time = np.array([switch_time0 + pd.Timedelta(idx * PARAM_PSD.COUNT_CROSS_STEP) for idx in min_idx_list])
    # only return significant local minima, filtered by the weights
    rslt = switch_time[min_weights > PARAM_PSD.MINIMA_WEIGHT_THRESHOLD]

    rslt_str = 'no pattern switch detected' if len(rslt) == 0 else f'pattern switch detected: {rslt}'
    logger.debug(f'pattern switch detection time spent: {time.time() - time0:.2f}; {rslt_str}')

    return rslt
|
|
|
|
def count_cross(mp_idx, edge, step):
    """Calculate the test statistic: count-cross.

    The count-cross of position ``i`` is the number of matrix-profile pointers that
    "cross" it: pointers starting before ``i`` that point at or past it, plus pointers
    starting at or after ``i`` that point before it.

    Two technical details regarding the calculation of count-cross:
    1. Points at the beginning and the end of the time series naturally have a smaller
       count-cross. Instead of applying some normalization method, this implementation
       simply skips the two edges. This choice seems to work fine, and is faster.
    2. To reduce the latency of pattern switch detection, instead of calculating
       count-cross for all data points, it is calculated every 'step' points.

    Args:
        mp_idx (numpy array): array of pointers to the minimal-distance segment,
            returned from the matrix profile API
        edge (int): the size of the beginning and ending of the time series that
            should be skipped
        step (int): the size of the stride used to calculate count-cross

    Returns:
        numpy.ndarray: the positive count-cross values at the sampled positions, in
        order; a plain empty list when the series is too short
        (``len(mp_idx) <= 2 * edge``).
        NOTE(review): the ``counts > 0`` filter also drops any *computed* position
        whose count is exactly zero — confirm this cannot occur for real inputs, since
        the caller maps result indexes back to timestamps positionally.
    """
    if len(mp_idx) <= 2 * edge:
        return []

    counts = np.zeros_like(mp_idx, dtype=int)
    for i in range(edge, len(mp_idx) - edge, step):
        # np.count_nonzero over the vectorized comparisons is much faster than the
        # Python built-in sum over a numpy boolean array; the result is identical
        counts[i] = np.count_nonzero(mp_idx[:i] > i) + np.count_nonzero(mp_idx[i:] < i)
    return counts[counts > 0]
|
|
|
|
def smooth(xs, frac):
    """Smooth the series *xs* with LOWESS and return the smoothed values as a list.

    Args:
        xs: sequence of values to smooth
        frac: fraction of the data used for each local regression (LOWESS ``frac``)

    Returns:
        list: the smoothed values, one per input point
    """
    positions = list(range(len(xs)))
    # it=0: a single weighted regression pass, no robustifying iterations
    smoothed = lowess(xs, positions, is_sorted=True, frac=frac, it=0)
    # lowess returns (x, y_smoothed) rows; keep only the smoothed values
    return [y for _, y in smoothed]
|