### mlspl.conf

#########
# This .conf file contains configuration for the "fit" and "apply"
# commands included with the Machine Learning Toolkit.
#
# Put global settings in the [default] stanza and algorithm-specific
# settings in a stanza named for the algorithm
# (e.g. [LinearRegression] for the LinearRegression algorithm).

[default]

# Action to perform when new value(s) of a categorical/explanatory
# variable are encountered in partial_fit:
#   default : set all values of the column that corresponds to the new
#             categorical value to 0's
#   skip    : skip over rows that contain the new value(s) and raise a warning
#   stop    : stop the operation by raising an error
handle_new_cat = default

# Maximum number of distinct values in categorical fields imposed
# for one-hot encoding.
max_distinct_cat_values = 100

# Maximum number of distinct values in categorical fields imposed
# for use in classifiers.
max_distinct_cat_values_for_classifiers = 100

# Maximum number of distinct values in categorical fields imposed
# for use in scoring methods.
max_distinct_cat_values_for_scoring = 100

# Maximum time (in seconds) to spend in the "fit" phase of an
# algorithm (including down-sampling the input). This does not relate
# to the other phases of a search (e.g. retrieving events from an
# index).
max_fit_time = 600

# max_inputs specifies the maximum number of events an algorithm will
# consider when fitting a model. If this limit is exceeded and
# use_sampling is true, the fit command downsamples its input using
# the reservoir sampling algorithm before fitting a model. If
# use_sampling is false and this limit is exceeded, the fit command
# throws an error.
max_inputs = 100000

# Maximum allowed memory usage (in megabytes) by the fit or apply
# commands while fitting or applying a model, respectively.
max_memory_usage_mb = 4000

# Maximum allowed size (in megabytes) of a model created by the fit
# command. Some algorithms (e.g. SVM and RandomForest) may create
# unusually large models, which can lead to performance problems with
# bundle replication.
max_model_size_mb = 30

# Maximum time (in seconds) to spend in the "score" phase of a scoring
# method (including down-sampling the input). This does not relate
# to the other phases of a search (e.g. retrieving events from an
# index).
max_score_time = 600

# Whether to use reservoir sampling for data sets that exceed
# max_inputs, or to instead throw an error.
use_sampling = true
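# Illustrative example (an assumption, not part of the shipped defaults):
# any global setting above can be overridden for a single algorithm by
# placing it in that algorithm's stanza, as the stanzas below do. The
# stanza name and values here are hypothetical.
#
#   [LinearRegression]
#   max_inputs = 500000
#   use_sampling = true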
### Algorithm-specific configuration

[ARIMA]
use_sampling = false

[Birch]
# Works well at 20000, but models are quite large.
max_inputs = 2000

[DecisionTreeClassifier]
summary_depth_limit = 5
summary_return_json = false

[DecisionTreeRegressor]
summary_depth_limit = 5
summary_return_json = false

[DensityFunction]
# The portion of the area under the fitted probability density function
# curve that is treated as anomalous by default.
default_prob_threshold = 0.01
# The maximum number of fields that can be provided in the "by" clause.
max_fields_in_by_clause = 8
# The maximum number of groups created with the "by" clause.
max_groups = 5000
# The maximum number of data points used as the parameter size for the
# Gaussian KDE density function.
max_kde_parameter_size = 10000
# The maximum number of thresholds that can be provided at the same time.
max_threshold_num = 5
# The minimum number of data points required to fit a density function.
min_data_size_to_fit = 50

[GradientBoostingClassifier]
max_model_size_mb = 30

[KernelPCA]
max_inputs = 5000

[KernelRidge]
max_inputs = 5000

[MLPClassifier]
max_fit_time = 1800

[NPR]
# The maximum allowed size of the NPR matrix, in terms of the number of
# matrix cells (rows times columns). The size of the NPR matrix equals
# the number of unique values of the feature variable times the number
# of unique values of the target variable. For example, if |X| = 1000
# and |Y| = 100, the NPR matrix size is 100,000.
# Increasing the value of npr_max_matrix_size results in longer
# fit/apply times and larger model files.
npr_max_matrix_size = 10000000

[OneClassSVM]
max_inputs = 10000

[SpectralClustering]
# This algorithm is especially slow.
max_fit_time = 1800
max_inputs = 2000

[StateSpaceForecast]
max_inputs = 50000

[SVM]
# Works well at 20000, but models are quite large.
max_inputs = 10000

[TFIDF]
max_inputs = 200000

[score:classification]

[score:clustering]

[score:pairwise]
max_fields = 50

[score:regression]

[score:statsfunctions]

[score:statstest]
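# Illustrative example (an assumption about deployment layout): rather than
# editing this default file, overrides are typically placed in a "local"
# copy of mlspl.conf, e.g.
# $SPLUNK_HOME/etc/apps/Splunk_ML_Toolkit/local/mlspl.conf (the exact path
# can vary per deployment). The stanza and value below are hypothetical.
#
#   [DensityFunction]
#   max_groups = 10000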