### mlspl.conf

#########
# This .conf file contains configuration for the "fit" and "apply"
# commands included with the Machine Learning Toolkit.
#
# Put global settings in the [default] stanza and algorithm-specific
# settings in a stanza named for the algorithm
# (e.g. [LinearRegression] for the LinearRegression algorithm).

[default]

# Action to perform when new value(s) of a categorical/explanatory
# variable are encountered in partial_fit:
#   default : set all values of the column that corresponds to the new
#             categorical value to 0's
#   skip    : skip over rows that contain the new value(s) and raise a warning
#   stop    : stop the operation by raising an error
handle_new_cat = default

# Maximum number of distinct values in categorical fields imposed
# for one-hot encoding.
max_distinct_cat_values = 100

# Maximum number of distinct values in categorical fields imposed
# for use in classifiers.
max_distinct_cat_values_for_classifiers = 100

# Maximum number of distinct values in categorical fields imposed
# for use in scoring methods.
max_distinct_cat_values_for_scoring = 100

# Maximum time (in seconds) to spend in the "fit" phase of an
# algorithm (including down-sampling the input). This does not relate
# to the other phases of a search (e.g. retrieving events from an
# index).
max_fit_time = 600

# max_inputs specifies the maximum number of events an algorithm will
# consider when fitting a model. If this limit is exceeded and
# use_sampling is true, the fit command downsamples its input using
# the reservoir sampling algorithm before fitting a model. If
# use_sampling is false and this limit is exceeded, the fit command
# throws an error.
max_inputs = 100000

# Maximum allowed memory usage (in megabytes) by the fit or apply
# commands while fitting or applying a model, respectively.
max_memory_usage_mb = 4000

# Maximum allowed size (in megabytes) of a model created by the fit
# command. Some algorithms (e.g. SVM and RandomForest) may create
# unusually large models, which can lead to performance problems with
# bundle replication.
max_model_size_mb = 30

# Maximum time (in seconds) to spend in the "score" phase of a scoring
# method (including down-sampling the input). This does not relate
# to the other phases of a search (e.g. retrieving events from an
# index).
max_score_time = 600

# Whether to use reservoir sampling for data sets that exceed
# max_inputs, or to instead throw an error.
use_sampling = true
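# Illustrative example (an assumption, not part of the shipped defaults):
# any global setting above can be overridden for a single algorithm by
# placing it in that algorithm's stanza, as the stanzas below do. The
# stanza name and values here are hypothetical.
#
#   [LinearRegression]
#   max_inputs = 500000
#   use_sampling = true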
### Algorithm-specific configuration

[ARIMA]
use_sampling = false

[Birch]
# Works well at 20000, but models are quite large.
max_inputs = 2000

[DecisionTreeClassifier]
summary_depth_limit = 5
summary_return_json = false

[DecisionTreeRegressor]
summary_depth_limit = 5
summary_return_json = false

[DensityFunction]
# The portion of the area under the fitted probability density function
# curve that is treated as anomalous by default.
default_prob_threshold = 0.01
# The maximum number of fields that can be provided in the "by" clause.
max_fields_in_by_clause = 8
# The maximum number of groups created with the "by" clause.
max_groups = 5000
# The maximum number of data points used as the parameter size for the
# Gaussian KDE density function.
max_kde_parameter_size = 10000
# The maximum number of thresholds that can be provided at the same time.
max_threshold_num = 5
# The minimum number of data points required to fit a density function.
min_data_size_to_fit = 50

[GradientBoostingClassifier]
max_model_size_mb = 30

[KernelPCA]
max_inputs = 5000

[KernelRidge]
max_inputs = 5000

[MLPClassifier]
max_fit_time = 1800

[NPR]
# The maximum allowed size of the NPR matrix, in terms of the number of
# matrix cells (rows times columns). The size of the NPR matrix equals
# the number of unique values of the feature variable times the number
# of unique values of the target variable. For example, if |X| = 1000
# and |Y| = 100, the NPR matrix size is 100,000.
# Increasing the value of npr_max_matrix_size results in longer
# fit/apply times and larger model files.
npr_max_matrix_size = 10000000

[OneClassSVM]
max_inputs = 10000

[SpectralClustering]
# This algorithm is especially slow.
max_fit_time = 1800
max_inputs = 2000

[StateSpaceForecast]
max_inputs = 50000

[SVM]
# Works well at 20000, but models are quite large.
max_inputs = 10000

[TFIDF]
max_inputs = 200000

[score:classification]

[score:clustering]

[score:pairwise]
max_fields = 50

[score:regression]

[score:statsfunctions]

[score:statstest]
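# Illustrative example (an assumption about deployment layout): rather than
# editing this default file, overrides are typically placed in a "local"
# copy of mlspl.conf, e.g.
# $SPLUNK_HOME/etc/apps/Splunk_ML_Toolkit/local/mlspl.conf (the exact path
# can vary per deployment). The stanza and value below are hypothetical.
#
#   [DensityFunction]
#   max_groups = 10000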