### mlspl.conf #########
# This .conf file contains configuration for the "fit" and "apply"
# commands included with the Machine Learning Toolkit.
#
# Put global settings in the [default] stanza and algorithm-specific
# settings in a stanza named for the algorithm
# (e.g. [LinearRegression] for the LinearRegression algorithm).
[default]
# Action to perform when a new value for a categorical (explanatory) variable is encountered in partial_fit:
# default : set all values of the column that corresponds to the new categorical value to 0's
# skip : skip over rows that contain the new value(s) and raise a warning
# stop : stop the operation by raising an error
handle_new_cat = default
# Maximum number of distinct values in categorical fields imposed
# for one-hot encoding.
max_distinct_cat_values = 100
# Maximum number of distinct values in categorical fields imposed
# for use in classifiers.
max_distinct_cat_values_for_classifiers = 100
# Maximum number of distinct values in categorical fields imposed
# for use in scoring methods.
max_distinct_cat_values_for_scoring = 100
# Maximum time (in seconds) to spend in the "fit" phase of an
# algorithm (including down-sampling the input). This does not relate
# to the other phases of a search (e.g. retrieving events from an
# index).
max_fit_time = 600
# max_inputs specifies the maximum number of events an algorithm will
# consider when fitting a model. If this limit is exceeded and
# use_sampling is true, then the fit command will downsample its input
# using the "reservoir sampling" algorithm before fitting a model. If
# use_sampling is false and this limit is exceeded, the fit command
# will throw an error.
max_inputs = 100000
# Maximum allowed memory usage by the fit or apply commands (in
# megabytes) while fitting or applying a model, respectively.
max_memory_usage_mb = 4000
# Maximum allowed size of a model (in megabytes) created by the fit
# command. Some algorithms (e.g. SVM and RandomForest) may create
# unusually large models, which can lead to performance problems with
# bundle replication.
max_model_size_mb = 30
# Maximum time (in seconds) to spend in the "score" phase of a scoring
# method (including down-sampling the input). This does not relate
# to the other phases of a search (e.g. retrieving events from an
# index).
max_score_time = 600
# Whether to use reservoir sampling for data sets that exceed
# max_inputs or to instead throw an error.
use_sampling = true
### Algorithm-specific configuration
[ARIMA]
use_sampling = false
[Birch]
# Works well at 20000, but models are quite large.
max_inputs = 2000
[DecisionTreeClassifier]
summary_depth_limit = 5
summary_return_json = false
[DecisionTreeRegressor]
summary_depth_limit = 5
summary_return_json = false
[DensityFunction]
# The default value for the area under the fitted probability density function curve, that is assigned as anomalous area.
default_prob_threshold = 0.01
# The maximum number of fields that can be provided in the "by" clause.
max_fields_in_by_clause = 8
# The maximum number of groups created with the "by" clause.
max_groups = 5000
# The maximum number of data points as the parameter size for Gaussian KDE density function.
max_kde_parameter_size = 10000
# The maximum number of thresholds that can be provided at the same time.
max_threshold_num = 5
# The minimum number of data points required to fit a density function.
min_data_size_to_fit = 50
[GradientBoostingClassifier]
max_model_size_mb = 30
[KernelPCA]
max_inputs = 5000
[KernelRidge]
max_inputs = 5000
[MLPClassifier]
max_fit_time = 1800
[NPR]
# The maximum allowed size of the NPR matrix, in terms of number of matrix cells (rows times columns).
# The size of the NPR matrix equals the number of unique values of the feature variable times the number
# of unique values of the target variable. For example if |X| = 1000 and |Y|=100 then NPR matrix size is 100,000.
# Increasing the value of npr_max_matrix_size results in longer fit/apply times and larger model files.
npr_max_matrix_size = 10000000
[OneClassSVM]
max_inputs = 10000
[SpectralClustering]
# This algorithm is especially slow.
max_fit_time = 1800
max_inputs = 2000
[StateSpaceForecast]
max_inputs = 50000
[SVM]
# Works well at 20000, but models are quite large.
max_inputs = 10000
[TFIDF]
max_inputs = 200000
[score:classification]
[score:clustering]
[score:pairwise]
max_fields = 50
[score:regression]
[score:statsfunctions]
[score:statstest]
[ai:LLMIntegrations]
# Maximum number of retries for an LLM API call when a rate-limit error occurs
max_retries = 6
# Backoff factor used when retrying an LLM API call after a rate-limit error
backoff_factor = 2