You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
148 lines
4.6 KiB
148 lines
4.6 KiB
### mlspl.conf #########
|
|
# This .conf file contains configuration for the "fit" and "apply"
|
|
# commands included with the Machine Learning Toolkit.
|
|
#
|
|
# Put global settings in the [default] stanza and algorithm-specific
|
|
# settings in a stanza named for the algorithm
|
|
# (e.g. [LinearRegression] for the LinearRegression algorithm).
|
|
|
|
[default]
|
|
# Action to perform when new value(s) for a categorical variable/explanatory variable are encountered in partial_fit
|
|
# default : set all values of the column that corresponds to the new categorical value to 0's
|
|
# skip : skip over rows that contain the new value(s) and raise a warning
|
|
# stop : stop the operation by raising an error
|
|
handle_new_cat = default
|
|
|
|
# Maximum number of distinct values in categorical fields imposed
|
|
# for one-hot encoding.
|
|
max_distinct_cat_values = 100
|
|
|
|
# Maximum number of distinct values in categorical fields imposed
|
|
# for use in classifiers.
|
|
max_distinct_cat_values_for_classifiers = 100
|
|
|
|
# Maximum number of distinct values in categorical fields imposed
|
|
# for use in scoring methods.
|
|
max_distinct_cat_values_for_scoring = 100
|
|
|
|
# Maximum time (in seconds) to spend in the "fit" phase of an
|
|
# algorithm (including down-sampling the input). This does not relate
|
|
# to the other phases of a search (e.g. retrieving events from an
|
|
# index).
|
|
max_fit_time = 600
|
|
|
|
# max_inputs specifies the maximum number of events an algorithm will
|
|
# consider when fitting a model. If this limit is exceeded and
|
|
# use_sampling is true, then the fit command will downsample its input
|
|
# using the "reservoir sampling" algorithm before fitting a model. If
|
|
# use_sampling is false and this limit is exceeded, the fit command
|
|
# will throw an error.
|
|
max_inputs = 100000
|
|
|
|
# Maximum allowed memory usage by the fit or apply commands (in
|
|
# megabytes) while fitting or applying a model, respectively.
|
|
max_memory_usage_mb = 4000
|
|
|
|
# Maximum allowed size of a model (in megabytes) created by the fit
|
|
# command. Some algorithms (e.g. SVM and RandomForest) may create
|
|
# unusually large models, which can lead to performance problems with
|
|
# bundle replication.
|
|
max_model_size_mb = 30
|
|
|
|
# Maximum time (in seconds) to spend in the "score" phase of a scoring
|
|
# method (including down-sampling the input). This does not relate
|
|
# to the other phases of a search (e.g. retrieving events from an
|
|
# index).
|
|
max_score_time = 600
|
|
|
|
# Whether to use reservoir sampling for data sets that exceed
|
|
# max_inputs or to instead throw an error.
|
|
use_sampling = true
|
|
|
|
|
|
### Algorithm-specific configuration
|
|
[ARIMA]
|
|
use_sampling = false
|
|
|
|
[Birch]
|
|
# Works well at 20000, but models are quite large.
|
|
max_inputs = 2000
|
|
|
|
[DecisionTreeClassifier]
|
|
summary_depth_limit = 5
|
|
summary_return_json = false
|
|
|
|
[DecisionTreeRegressor]
|
|
summary_depth_limit = 5
|
|
summary_return_json = false
|
|
|
|
[DensityFunction]
|
|
# The default value for the portion of the area under the fitted probability density function curve that is designated as the anomalous area.
|
|
default_prob_threshold = 0.01
|
|
# The maximum number of fields that can be provided in the "by" clause.
|
|
max_fields_in_by_clause = 8
|
|
# The maximum number of groups created with the "by" clause.
|
|
max_groups = 5000
|
|
# The maximum number of data points used as the parameter size for the Gaussian KDE density function.
|
|
max_kde_parameter_size = 10000
|
|
# The maximum number of thresholds that can be provided at the same time.
|
|
max_threshold_num = 5
|
|
# The minimum number of data points required to fit a density function.
|
|
min_data_size_to_fit = 50
|
|
|
|
[GradientBoostingClassifier]
|
|
max_model_size_mb = 30
|
|
|
|
[KernelPCA]
|
|
max_inputs = 5000
|
|
|
|
[KernelRidge]
|
|
max_inputs = 5000
|
|
|
|
[MLPClassifier]
|
|
max_fit_time = 1800
|
|
|
|
[NPR]
|
|
# The maximum allowed size of the NPR matrix, in terms of number of matrix cells (rows times columns).
|
|
# The size of the NPR matrix equals the number of unique values of the feature variable times the number
|
|
# of unique values of the target variable. For example, if |X| = 1000 and |Y| = 100, then the NPR matrix size is 100,000.
|
|
# Increasing the value of npr_max_matrix_size results in longer fit/apply times and larger model files.
|
|
npr_max_matrix_size = 10000000
|
|
|
|
[OneClassSVM]
|
|
max_inputs = 10000
|
|
|
|
[SpectralClustering]
|
|
# This algorithm is especially slow.
|
|
max_fit_time = 1800
|
|
max_inputs = 2000
|
|
|
|
[StateSpaceForecast]
|
|
max_inputs = 50000
|
|
|
|
[SVM]
|
|
# Works well at 20000, but models are quite large.
|
|
max_inputs = 10000
|
|
|
|
[TFIDF]
|
|
max_inputs = 200000
|
|
|
|
[score:classification]
|
|
|
|
[score:clustering]
|
|
|
|
[score:pairwise]
|
|
max_fields = 50
|
|
|
|
[score:regression]
|
|
|
|
[score:statsfunctions]
|
|
|
|
[score:statstest]
|
|
|
|
[ai:LLMIntegrations]
|
|
# Maximum number of retries when making an LLM API call if a rate limit error occurs
|
|
max_retries = 6
|
|
# Backoff factor used when retrying an LLM API call if a rate limit error occurs
|
|
backoff_factor = 2
|