You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

128 lines
4.3 KiB

#!/usr/bin/env python
import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import acf
import cexc
from base import BaseAlgo
from util import df_util
from util.algo_util import alpha_to_confidence_interval, confidence_interval_to_alpha
from util.param_util import convert_params
messages = cexc.get_messages_logger()
class ACF(BaseAlgo):
"""Compute autocorrelation function."""
def __init__(self, options):
self.handle_options(options)
params = options.get('params', {})
converted_params = convert_params(
params, ints=['k', 'conf_interval'], bools=['fft'], aliases={'k': 'nlags'}
)
# Set the default name to be used so that PACF can override
self.default_name = 'acf({})'
# Set the lags, alpha and fft parameters
self.nlags = converted_params.pop('nlags', 40)
self.fft = converted_params.pop('fft', False)
conf_int = converted_params.pop('conf_interval', 95)
if conf_int <= 0 or conf_int >= 100:
raise RuntimeError('conf_interval cannot be less than 1 or more than 99.')
if self.nlags <= 0:
raise RuntimeError('k must be greater than 0.')
self.alpha = confidence_interval_to_alpha(conf_int)
@staticmethod
def handle_options(options):
"""Ensure features are present but no target variable.
Args:
options (dict): algorithm options
Raises:
RuntimeError
"""
feature_vars = options.get('feature_variables', [])
target_vars = options.get('target_variable', [])
if len(feature_vars) != 1:
raise RuntimeError('You must specify one field.')
if len(target_vars) != 0:
raise RuntimeError('You cannot use from clause here.')
def _calculate(self, df):
"""Calculate the ACF.
Args:
X (dataframe): input data
Returns:
autocors (array): array of autocorrelations
conf_int (array): array of confidence intervals
"""
autocors, conf_int = acf(x=df.values, nlags=self.nlags, alpha=self.alpha, fft=self.fft)
return autocors, conf_int
def fit(self, df, options):
X = df.copy()
X, nans, _ = df_util.prepare_features(
X=X,
variables=self.feature_variables,
mlspl_limits=options.get('mlspl_limits'),
get_dummies=False,
)
number_of_nulls = nans.sum()
if number_of_nulls > 0:
messages.warn('{} events with nulls were dropped.'.format(number_of_nulls))
if self.nlags >= len(X):
raise RuntimeError('k must be less than number of events.')
# Only fields allowed (in case fields expanded through glob matching).
if len(self.feature_variables) > 1:
temp = 'You must specify only one field. Multiple fields found: {}'
err = temp.format(', '.join(self.feature_variables))
raise RuntimeError(err)
# Only numeric inputs allowed.
if X[self.feature_variables].dtypes.tolist()[0] == object:
temp = '{} contains non-numeric data. {} only accepts numeric data.'
err = temp.format(self.feature_variables[0], self.__class__.__name__)
raise RuntimeError(err)
# Get calculation
autocors, conf_int = self._calculate(X)
conf_int = conf_int - conf_int.mean(1)[:, None]
# autocors[:, None] converts 1D-array to 2D for concatenation match
autocors_2d = autocors[:, None]
stacked = np.concatenate([autocors_2d, conf_int], axis=1)
# Get the default name
output_name = options.get('output_name', self.feature_variables[0])
name = self.default_name.format(output_name)
# Lower and upper names
confidence_interval = alpha_to_confidence_interval(self.alpha)
lower_name = 'lower{}({})'.format(confidence_interval, name)
upper_name = 'upper{}({})'.format(confidence_interval, name)
# Splunk arranges columns via ascii ordering
# So the capital L on Lag ensures it will be in the leftmost column
output_names = ['Lag', name, lower_name, upper_name]
output = pd.DataFrame(stacked)
output = output.reset_index()
output.columns = output_names
return output