You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
Splunk_Deploiement/apps/Splunk_ML_Toolkit/bin/util/telemetry_util.py

554 lines
18 KiB

import hashlib
import json
import re
import time
import uuid
import numpy as np
import cexc
from util.constants import TELEMETRY_ID_REGEX
from util.error_util import safe_func
# Module-level telemetry logger; handlers/levels are configured by cexc.
logger = cexc.get_logger('telemetry_logger')
# Maps a showcase dataset/model key — the part of a model name after the
# 'example_' prefix (see log_example_details) — to the human-readable
# showcase title that is written to the telemetry log.
example_names = {
    'server_power': 'Predict Server Power Consumption',
    'app_usage': 'Predict VPN Usage',
    'housing': 'Predict Median House Value',
    'energy_output': 'Predict Power Plant Energy Output',
    'future_logins': 'Predict Future Logins',
    'future_vpn_sinusoidal': 'Predict Future VPN Usage (sinusoidal time)',
    'future_vpn_categorical': 'Predict Future VPN Usage (categorical time)',
    'disk_failures': 'Predict Hard Drive Failure',
    'malware': 'Predict the Presence of Malware',
    'churn': 'Predict Telecom Customer Churn',
    'diabetes': 'Predict the Presence of Diabetes',
    'vehicle_type': 'Predict Vehicle Make and Model',
    'external_anomalies': 'Predict External Anomalies',
    'server_response_time': 'Detect Outliers in Server Response Time',
    'numeric_employee_logins': 'Detect Outliers in Number of Logins (vs. Predicted Value)',
    'numeric_supermarket_purchases': 'Detect Outliers in Supermarket Purchases',
    'power_plant_humidity': 'Detect Outliers in Power Plant Humidity',
    'call_center_cyclical': 'Detect Cyclical Outliers in Call Center Data',
    'logins_cyclical': 'Detect Cyclical Outliers in Logins',
    'categorical_disk_failures': 'Detect Outliers in Disk Failures',
    'bitcoin_transactions': 'Detect Outliers in Bitcoin Transactions',
    'categorical_supermarket_purchases': 'Detect Outliers in Supermarket Purchases',
    'mortage_loans_data_ny': 'Detect Outliers in Mortgage Contracts',
    'diabetic_data': 'Detect Outliers in Diabetes Patient Records',
    'phone_usage': 'Detect Outliers in Mobile Phone Activity',
    'internet_traffic': 'Forecast Internet Traffic',
    'forecast_employee_logins': 'Forecast the Number of Employee Logins',
    'souvenir_sales': 'Forecast Monthly Sales',
    'bluetooth_devices': 'Forecast the Number of Bluetooth Devices',
    'exchange_rate_ARIMA': 'Forecast Exchange Rate TWI using ARIMA',
    'hard_drives': 'Cluster Hard Drives by SMART Metrics',
    'apps': 'Cluster Behavior by App Usage',
    'cluster_housing': 'Cluster Neighborhoods by Properties',
    'vehicles': 'Cluster Vehicles by Onboard Metrics',
    'powerplant': 'Cluster Power Plant Operating Regimes',
    'business_anomalies': 'Cluster Business Anomalies to Reduce Noise',
    'sf_app_usage': 'Forecast App Expenses',
    'sf_call_center': 'Forecast the Number of Calls to a Call Center',
    'sf_app_logons': 'Forecast App Logons with Special Days',
    'sf_app_usage_multiple': 'Forecast App Expenses from Multiple Variables',
    'soda_disk_failure': 'Find Anomalies in Hard Drive Metrics',
    'soda_supermarket': 'Find Anomalies in Supermarket Purchases',
    'property_descriptions': 'Cluster Houses by Property Descriptions',
    'mortgage_loans': 'Cluster Mortgage Loans',
    'disk_utilization': 'Predict Disk Utilization',
    'firewall_traffic': 'Predict the Presence of Vulnerabilities',
    'ai_assistant_example_I': 'Summarization',
    'ai_assistant_example_II': 'Field Extraction',
    'ai_assistant_example_III': 'Anomaly Detection',
}
algorithm_and_parameter_white_list = {
'ACF': {'k', 'conf_interval', 'fft'},
'ARIMA': {'order', 'forecast_k', 'conf_interval', 'holdback'},
'AutoPrediction': {
'target_type',
'test_split_ratio',
'random_state',
'n_estimators',
'max_depth',
'min_samples_split',
'max_leaf_nodes',
'max_features',
'criterion',
},
'BernoulliNB': {'alpha', 'binarize', 'fit_prior'},
'Birch': {'k'},
'DBSCAN': {'eps'},
'DecisionTreeClassifier': {
'random_state',
'max_depth',
'min_samples_split',
'max_leaf_nodes',
'criterion',
'splitter',
'max_features',
},
'DecisionTreeRegressor': {
'random_state',
'max_depth',
'min_samples_split',
'max_leaf_nodes',
'splitter',
'max_features',
},
'DensityFunction': {
'dist',
'show_density',
'threshold',
'lower_threshold',
'upper_threshold',
'metric',
'supervise_split_by',
},
'ElasticNet': {'fit_intercept', 'normalize', 'alpha', 'l1_ratio'},
'FieldSelector': {'param', 'type', 'mode'},
'GaussianNB': {},
'GMeans': {'kmax', 'random_state'},
'GradientBoostingClassifier': {
'loss',
'max_features',
'learning_rate',
'min_weight_fraction_leaf',
'n_estimators',
'max_depth',
'min_samples_split',
'min_samples_leaf',
'max_leaf_nodes',
'random_state',
},
'GradientBoostingRegressor': {
'loss',
'max_features',
'learning_rate',
'min_weight_fraction_leaf',
'alpha',
'subsample',
'n_estimators',
'max_depth',
'min_samples_split',
'min_samples_leaf',
'max_leaf_nodes',
'random_state',
},
'HashingVectorizer': {
'max_features',
'random_state',
'n_iters',
'k',
'stop_words',
'analyzer',
'norm',
'token_pattern',
'ngram_range',
'reduce',
},
'ICA': {'n_components', 'max_iter', 'random_state', 'tol', 'whiten', 'algorithm', 'fun'},
'Imputer': {'missing_values', 'strategy', 'field'},
'KernelPCA': {'k', 'degree', 'alpha', 'max_iteration', 'gamma', 'tolerance'},
'KernelRidge': {'gamma'},
'KMeans': {'k', 'random_state'},
'Lasso': {'alpha'},
'LinearRegression': {'fit_intercept', 'normalize'},
'LocalOutlierFactor': {
'n_neighbors',
'leaf_size',
'p',
'contamination',
'algorithm',
'metric',
'anomaly_score',
},
'LogisticRegression': {'fit_intercept', 'probabilities'},
'MLPClassifier': {
'batch_size',
'max_iter',
'random_state',
'tol',
'momentum',
'activation',
'solver',
'learning_rate',
'hidden_layer_sizes',
},
'MultivariateOutlierDetection': {
'dist',
'show_density',
'threshold',
'lower_threshold',
'upper_threshold',
'metric',
},
'NPR': {},
'OneClassSVM': {'gamma', 'coef0', 'tol', 'nu', 'degree', 'shrinking', 'kernel'},
'PACF': {'k', 'conf_interval', 'method'},
'PCA': {'k'},
'RandomForestClassifier': {
'random_state',
'n_estimators',
'max_depth',
'min_samples_split',
'max_leaf_nodes',
'max_features',
'criterion',
},
'RandomForestRegressor': {
'random_state',
'n_estimators',
'max_depth',
'min_samples_split',
'max_leaf_nodes',
'max_features',
},
'Ridge': {'fit_intercept', 'normalize', 'alpha'},
'RobustScaler': {'with_centering', 'with_scaling', 'quantile_range'},
'SGDClassifier': {
'fit_intercept',
'random_state',
'n_iter',
'l1_ratio',
'alpha',
'eta0',
'power_t',
'loss',
'penalty',
'learning_rate',
},
'SGDRegressor': {
'fit_intercept',
'random_state',
'n_iter',
'l1_ratio',
'alpha',
'eta0',
'power_t',
'penalty',
'learning_rate',
},
'SpectralClustering': {'gamma', 'affinity', 'k', 'random_state'},
'StandardScaler': {'with_mean', 'with_std'},
'StateSpaceForecast': {
'conf_interval',
'forecast_k',
'holdback',
'output_fit',
'period',
'specialdays',
'update_last',
},
'SVM': {'gamma', 'C'},
'SystemIdentification': {
'time_field',
'dynamics',
'layers',
'conf_interval',
'horizon',
'epochs',
'random_state',
'shuffle',
},
'TFIDF': {
'max_features',
'max_df',
'min_df',
'ngram_range',
'stop_words',
'analyzer',
'norm',
'token_pattern',
},
'XMeans': {'kmax', 'random_state'},
'onnx': {},
}
# Apps whose names may appear verbatim in telemetry. Any other app context is
# logged as 'Other' (see log_app_details) and has its algorithm/scoring
# details suppressed (see _log_algorithm_and_param_info and
# _log_scoring_and_param_info).
apps_white_list = {
    'search',
    'dga_analysis',
    'Splunk_ML_Toolkit',
    'Splunk_ML_Toolkit_beta',
    'Splunk_ML_Toolkit_advisory',
    'itsi',
    'SplunkEnterpriseSecuritySuite',
    'SA_mltk_contrib_app',
    'Splunk_Essentials_Predictive_Maintenance_for_IOT',
    'Splunk-SE-Fraud-Detection',
}
scoring_and_parameter_white_list = {
'accuracy_score': {'normalize'},
'confusion_matrix': {},
'f1_score': {'average', 'pos_label'},
'precision_score': {'average', 'pos_label'},
'precision_recall_fscore_support': {'pos_label', 'average', 'beta'},
'recall_score': {'average', 'pos_label'},
'roc_auc_score': {},
'roc_curve': {'pos_label', 'drop_intermediate'},
'silhouette_score': {'metric'},
'pairwise_distances': {'metric', ' output'},
'explained_variance_score': {'multioutput'},
'mean_absolute_error': {},
'mean_squared_error': {},
'r2_score': {'multioutput'},
'describe': {'ddof', 'bias'},
'moment': {'moment'},
'pearsonr': {},
'spearmanr': {},
'tmean': {'lower_limit', 'upper_limit'},
'trim': {'tail', 'proportiontocut'},
'tvar': {'ddof', 'lower_limit', 'upper_limit'},
'adfuller': {'autolag', 'regression', 'maxlag', 'alpha'},
'anova': {'type', 'output', 'test', 'robust', 'formula', 'scale'},
'energy_distance': {},
'f_oneway': {'alpha'},
'kpss': {'regression', 'lags', 'alpha'},
'kstest': {'cdf', 'mode', 'alternative', 'alpha', 'scale', 'df', 's', 'loc'},
'ks_2samp': {'alpha'},
'mannwhitneyu': {'alternative', 'use_continuity', 'alpha'},
'normaltest': {'alpha'},
'ttest_1samp': {'alpha', 'popmean'},
'ttest_ind': {'equal_var', 'alpha'},
'ttest_rel': {'alpha'},
'wasserstein_distance': {},
'wilcoxon': {'zero_method', 'correction', 'alpha'},
}
@safe_func
def log_algo_details(app_name, algo, algo_options):
    """Log feature-count telemetry for a fit, then the algorithm/param info.

    Counts how many feature fields were produced by each preprocessing step
    (recognized by their field-name prefixes) and emits one debug line, then
    delegates algorithm/parameter logging to _log_algorithm_and_param_info.
    """
    features = algo.feature_variables

    def count_prefixed(prefix):
        # Number of feature fields produced by a given preprocessing step.
        return sum(1 for name in features if name.startswith(prefix))

    n_ss = count_prefixed('SS_')  # StandardScaler
    n_rs = count_prefixed('RS_')  # RobustScaler
    n_pc = count_prefixed('PC_')  # PCA / KernelPCA
    n_fs = count_prefixed('fs_')  # FieldSelector
    n_tfidf = sum(1 for name in features if '_tfidf_' in name)  # TFIDF
    n_prefixed = n_ss + n_rs + n_pc + n_tfidf + n_fs
    logger.debug(
        f'num_fields={len(features)}, num_fields_prefixed={n_prefixed}, num_fields_SS={n_ss}, num_fields_RS={n_rs}, num_fields_PC={n_pc}, num_fields_fs={n_fs}, num_fields_tfidf={n_tfidf}'
    )
    _log_algorithm_and_param_info(
        app_name, algo_options.get('algo_name'), algo_options.get('params')
    )
@safe_func
def log_scoring_details(app_name, scoring, scoring_options):
    """Log the number of scored fields, then the scoring/param info."""
    num_fields = len(scoring.variables)
    logger.debug(f'num_fields={num_fields}')
    _log_scoring_and_param_info(
        app_name, scoring_options.get('scoring_name'), scoring_options.get('params')
    )
@safe_func
def log_uuid():
    """Log a freshly generated random UUID (one per invocation)."""
    logger.debug("UUID=%s", uuid.uuid4())
@safe_func
def log_apply_time(interval):
    """Log the wall-clock duration (seconds) of an apply command."""
    logger.debug("command=apply, apply_time=%f", interval)
@safe_func
def log_ai_commander_time(interval):
    """Log the wall-clock duration (seconds) of an ai command."""
    logger.debug("command=ai, ai_processing_time=%f", interval)
@safe_func
def log_ai_commander_info(provider, model, rows, rows_processed_time):
    """Log provider/model identity and row-throughput telemetry for an ai command."""
    logger.debug(
        "command=ai, provider=%s, model=%s, rows=%d, rows_processed_time=%f",
        provider,
        model,
        rows,
        rows_processed_time,
    )
@safe_func
def log_fit_time(interval):
    """Log the wall-clock duration (seconds) of a fit command."""
    logger.debug("command=fit, fit_time=%f", interval)
@safe_func
def log_scoring_time(interval):
    """Log the wall-clock duration (seconds) of a score command."""
    logger.debug("command=score, scoringTimeSec=%f", interval)
@safe_func
def log_partial_fit():
    """Record that the current fit was invoked with partial_fit enabled."""
    logger.debug("partialFit=True")
@safe_func
def log_experiment_details(model_name):
    """If the model belongs to an experiment ('_exp_' prefix), log its id and stage.

    The experiment id is extracted with TELEMETRY_ID_REGEX; the pipeline stage
    is the trailing '_<digits>' suffix of the model name, when present.
    """
    if not model_name.startswith('_exp_'):
        return
    id_match = re.match(TELEMETRY_ID_REGEX, model_name)
    if id_match:
        logger.debug("experiment_id=%s" % id_match.group(1))
    stage_match = re.search(r'(?:_)(\d+)$', model_name)
    if stage_match:
        logger.debug("pipeline_stage=%d" % int(stage_match.group(1)))
@safe_func
def log_example_details(model_name):
    """Log the friendly showcase title when the model is a bundled example."""
    prefix = 'example_'
    if not model_name.startswith(prefix):
        return
    title = example_names.get(model_name[len(prefix):])
    if title is not None:
        logger.debug("example_name='%s'" % title)
@safe_func
def log_model_id(model_name):
    """Log a SHA-256 digest of the model name (never the raw name itself)."""
    digest = hashlib.sha256(str(model_name).encode()).hexdigest()
    logger.debug("modelId=%s" % digest)
@safe_func
def log_apply_details(app_name, algo_name, model_options):
    """Log algorithm/parameter telemetry for an apply invocation."""
    _log_algorithm_and_param_info(app_name, algo_name, model_options.get('params'))
@safe_func
def log_app_details(app_name):
    """Log the app context, collapsing non-whitelisted apps to 'Other'."""
    context = app_name if app_name in apps_white_list else 'Other'
    logger.debug("app_context=%s" % context)
def _log_algorithm_and_param_info(app_name, algo_name, params):
    """Log the algorithm name and its whitelisted parameters for telemetry.

    Args:
        app_name (str): originating Splunk app context. Only whitelisted apps
            get algorithm details logged in the clear.
        algo_name (str): name of the algorithm being fit/applied.
        params (dict or None): raw parameter dict from the search options.
    """
    if app_name in apps_white_list:
        # Log the name of the algorithm which exists in the white list, also log its parameters if in the white list
        if algo_name in algorithm_and_parameter_white_list:
            # Keep only parameters explicitly whitelisted for this algorithm;
            # None (not {}) when there are no params at all.
            params_in_white_list = (
                {
                    p: v
                    for p, v in list(params.items())
                    if p in algorithm_and_parameter_white_list[algo_name]
                }
                if params
                else None
            )
            # Change format of params from dictionary to string while logging.
            # NOTE(review): when params_in_white_list is None this yields the
            # truthy string 'null', so the final line reads params={null} —
            # presumably what the telemetry parser expects; confirm before
            # changing.
            params_to_log = json.dumps(params_in_white_list)
            # Log also the number of customer parameters which are not white listed
            num_custom_params = (
                len(params) - len(params_in_white_list) if params and params_to_log else 0
            )
            params_to_log = (
                f'{params_to_log}, num_custom_params: {num_custom_params}'
                if num_custom_params
                else params_to_log
            )
            # Wrap in an outer pair of braces (the json.dumps output keeps its
            # own braces as well).
            params_to_log = '{%s}' % params_to_log if params_to_log else None
            logger.debug(f'algo_name={algo_name}, params={params_to_log}')
        # Log the hash of the algorithm name if it is not in the white list and do not log its parameters
        else:
            hash_algo_name = hashlib.sha256(f'{algo_name}'.encode())
            logger.debug(
                "algo_name={}, params=not_available".format(hash_algo_name.hexdigest())
            )
    else:
        # Non-whitelisted app: never log third-party algorithm details.
        logger.debug("algo_name=custom_app_algo, params=not_available")
def _log_scoring_and_param_info(app_name, scoring_name, params):
    """Log the scoring method name and its whitelisted parameters for telemetry.

    Args:
        app_name (str): originating Splunk app context. Only whitelisted apps
            get scoring details logged in the clear.
        scoring_name (str): name of the scoring method being run.
        params (dict or None): raw parameter dict from the search options.
    """
    if app_name in apps_white_list:
        # Log the name of the scoring which exists in the white list, also log its parameters if in the white list
        if scoring_name in scoring_and_parameter_white_list:
            # Keep only parameters explicitly whitelisted for this scoring
            # method; None (not {}) when there are no params at all.
            params_in_white_list = (
                {
                    p: v
                    for p, v in list(params.items())
                    if p in scoring_and_parameter_white_list[scoring_name]
                }
                if params
                else None
            )
            # Change format of params from dictionary to string while logging.
            # NOTE(review): when params_in_white_list is None this yields the
            # truthy string 'null', so the final line reads params={null} —
            # presumably what the telemetry parser expects; confirm before
            # changing.
            params_to_log = json.dumps(params_in_white_list)
            # Log also the number of custom parameters which are not white listed
            num_custom_params = (
                len(params) - len(params_in_white_list) if params and params_to_log else 0
            )
            params_to_log = (
                f'{params_to_log}, num_custom_params: {num_custom_params}'
                if num_custom_params
                else params_to_log
            )
            # Wrap in an outer pair of braces (the json.dumps output keeps its
            # own braces as well).
            params_to_log = '{%s}' % params_to_log if params_to_log else None
            logger.debug(f'scoringName={scoring_name}, params={params_to_log}')
        # Log the hash of the scoring name if it is not in the white list and do not log its parameters
        else:
            hash_scoring_name = hashlib.sha256(f'{scoring_name}'.encode())
            logger.debug(
                "scoringName={}, params=not_available".format(hash_scoring_name.hexdigest())
            )
    else:
        # Non-whitelisted app: never log third-party scoring details.
        logger.debug("scoringName=custom_app_scoring, params=not_available")
def _field_value(df, fieldname, row_idx=-1):
try:
return df[fieldname].values[row_idx]
except (KeyError, IndexError):
return ''
@safe_func
def log_sourcetype_inference(df, row_idx=-1):
    """
    Log information required for sourcetype inference.

    Extracts the punctuation "skeleton" of one raw event (all word characters
    removed) together with that event's sourcetype, so sourcetype can later be
    inferred from punctuation patterns.

    Args:
        df (DataFrame): pandas data frame
        row_idx (int): index of the row to log.
            Negative index indicates a random row.
    Returns:
        True if logged, False otherwise.
    """
    if row_idx < 0:
        try:
            row_idx = np.random.randint(df.shape[0])
        except ValueError:
            # If we can't get a random row, just log the first row.
            # (np.random.randint raises ValueError when df has zero rows.)
            row_idx = 0
    raw = _field_value(df, '_raw', row_idx)
    if raw:
        # Strip every word character, leaving only punctuation and whitespace.
        full_punct = re.sub(r'\w', '', raw)
        # Escape newlines/tabs so the punct string stays on one log line.
        full_punct = re.sub(r'\n', '\\n', full_punct)
        full_punct = re.sub(r'\t', '\\t', full_punct)
        # Spaces become the letter 's' — presumably the telemetry pipeline's
        # encoding convention for punct strings; confirm before changing.
        full_punct = re.sub(r' ', 's', full_punct)
    else:
        full_punct = ''
    sourcetype = _field_value(df, '_sourcetype', row_idx)
    if full_punct and sourcetype:
        logger.debug(
            "Sourcetype inference: Punct {} has sourcetype {}".format(full_punct, sourcetype)
        )
        return True
    else:
        return False
class Timer(object):
    """Context manager that measures the wall-clock duration of its body.

    After the ``with`` block exits, ``start``, ``end`` and ``interval``
    (elapsed seconds) are available on the instance.
    """

    def __enter__(self):
        # Record entry time; returning self lets callers read the attributes.
        self.start = time.time()
        return self

    def __exit__(self, *exc_info):
        # Always record exit time, even when the body raised.
        self.end = time.time()
        self.interval = self.end - self.start