|
|
# Version 8.2.3
|
|
|
# DO NOT EDIT THIS FILE!
|
|
|
# Changes to default files will be lost on update and are difficult to
|
|
|
# manage and support.
|
|
|
#
|
|
|
# Please make any changes to system defaults by overriding them in
|
|
|
# apps or $SPLUNK_HOME/etc/system/local
|
|
|
# (See "Configuration file precedence" in the web documentation).
|
|
|
#
|
|
|
# To override a specific setting, copy the name of the stanza and
|
|
|
# setting to the file where you wish to override it.
|
|
|
#
|
|
|
# This file configures the splunkd health report.
|
|
|
#
|
|
|
[distributed_health_reporter]
|
|
|
disabled = 0
|
|
|
|
|
|
[health_reporter]
|
|
|
full_health_log_interval = 30
|
|
|
suppress_status_update_ms = 300
|
|
|
latency_tracker_log_interval_sec = 30
|
|
|
aggregate_ingestion_latency_health = 0
|
|
|
alert.disabled = 0
|
|
|
alert.actions = email
|
|
|
alert.min_duration_sec = 60
|
|
|
alert.threshold_color = red
|
|
|
alert.suppress_period = 10m
|
|
|
|
|
|
[alert_action:email]
|
|
|
disabled = 0
|
|
|
action.to =
|
|
|
action.cc =
|
|
|
action.bcc =
|
|
|
|
|
|
[alert_action:webhook]
|
|
|
disabled = 0
|
|
|
action.url =
|
|
|
|
|
|
[alert_action:pagerduty]
|
|
|
disabled = 0
|
|
|
action.integration_url_override =
|
|
|
|
|
|
[alert_action:mobile]
|
|
|
disabled = 0
|
|
|
action.alert_recipients =
|
|
|
|
|
|
[alert_action:victorops]
|
|
|
disabled = 0
|
|
|
action.message_type = CRITICAL
|
|
|
action.entity_id =
|
|
|
action.record_id =
|
|
|
action.routing_key_override =
|
|
|
|
|
|
[tree_view:health_subset]
|
|
|
|
|
|
[clustering]
|
|
|
disabled = 0
|
|
|
health_report_period = 20
|
|
|
|
|
|
|
|
|
# Health Report Features <In features tree order>
|
|
|
|
|
|
# File Monitor Input
|
|
|
[feature:batchreader]
|
|
|
display_name = Batch Reader
|
|
|
indicator:data_out_rate:description = This indicator reflects the number of consecutive times the Batch File Reader was unable to insert data into Splunk's processing queues for a period of 5 seconds. By default, this indicator becomes Yellow when this input stalls for 5 seconds, and Red after 10 seconds.
|
|
|
indicator:data_out_rate:yellow = 1
|
|
|
indicator:data_out_rate:red = 2
|
|
|
alert.disabled = 1
|
|
|
|
|
|
[feature:tailreader]
|
|
|
display_name = Tail Reader
|
|
|
indicator:data_out_rate:description = This indicator reflects the number of consecutive times the Tail File Reader was unable to insert data into Splunk's processing queues for a period of 5 seconds. By default, this indicator becomes Yellow when this input stalls for 5 seconds, and Red after 10 seconds.
|
|
|
indicator:data_out_rate:yellow = 1
|
|
|
indicator:data_out_rate:red = 2
|
|
|
alert.disabled = 1
|
|
|
|
|
|
# Data Forwarding
|
|
|
[feature:s2s_autolb]
|
|
|
display_name = Auto Load Balanced TCP Output
|
|
|
indicator:s2s_connections:description = This indicator gauges whether this forwarder can successfully connect to all indexers configured in outputs.conf. By default, this indicator becomes Yellow when 20% of indexers are unreachable, and Red at 70%.
|
|
|
indicator:s2s_connections:yellow = 20
|
|
|
indicator:s2s_connections:red = 70
|
|
|
|
|
|
# Indexer Clustering
|
|
|
[feature:cluster_bundles]
|
|
|
display_name = Cluster Bundles
|
|
|
indicator:cluster_bundles:description = This indicator reflects whether there were validation errors in the last bundle that was pushed to cluster peers. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:cluster_bundles:yellow = 1
|
|
|
|
|
|
[feature:data_durability]
|
|
|
display_name = Data Durability
|
|
|
indicator:cluster_replication_factor:description = This indicator reflects whether or not the configured replication factor is met for an indexer cluster. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:cluster_replication_factor:red = 1
|
|
|
indicator:cluster_search_factor:description = This indicator reflects whether or not the configured search factor is met for an indexer cluster. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:cluster_search_factor:red = 1
|
|
|
|
|
|
[feature:data_searchable]
|
|
|
display_name = Data Searchable
|
|
|
indicator:data_searchable:description = This indicator reflects whether ALL indexed data in a cluster is available to be searched. Red occurs when one or more buckets of data lack a primary (searchable) copy. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:data_searchable:red = 1
|
|
|
|
|
|
[feature:indexers]
|
|
|
display_name = Indexers
|
|
|
indicator:detention:description = This indicator tracks whether any indexer cluster members are in detention mode. Yellow occurs when not less than 'indicator:detention:yellow' number of members are in manual detention, Red when not less than 'indicator:detention:red' number of members are in automatic detention. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:detention:yellow = 1
|
|
|
indicator:detention:red = 1
|
|
|
indicator:missing_peers:description = This indicator tracks whether any indexer cluster members are in transition. Yellow occurs when not less than 'indicator:missing_peers:yellow' number of members are in status like: stopping, stopped, decommissioning, pending or restarting, Red when not less than 'indicator:missing_peers:red' number of members are down. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:missing_peers:yellow = 1
|
|
|
indicator:missing_peers:red = 1
|
|
|
|
|
|
[feature:indexing_ready]
|
|
|
display_name = Indexing Ready
|
|
|
indicator:indexing_ready:description = This indicator becomes Green when indexer clustering becomes functional. This happens when enough peers join the cluster. Once Green, this indicator stays Green until the cluster master is restarted. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:indexing_ready:red = 1
|
|
|
|
|
|
# Indexer Cluster Peers and Search Head Clustering
|
|
|
[feature:master_connectivity]
|
|
|
display_name = Master Connectivity
|
|
|
indicator:master_connectivity:description = This indicator reflects whether the cluster peer can successfully connect to the cluster master. Any failure results in Red. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:master_connectivity:red = 1
|
|
|
|
|
|
[feature:replication_failures]
|
|
|
display_name = Replication Failures
|
|
|
indicator:replication_failures:description = This indicator tracks whether the cluster peer is encountering repeated bucket replication failures. Yellow occurs after 5 consecutive failures, Red after 10.
|
|
|
indicator:replication_failures:red = 10
|
|
|
indicator:replication_failures:yellow = 5
|
|
|
|
|
|
[feature:searchheadconnectivity]
|
|
|
display_name = Search Head Connectivity
|
|
|
indicator:master_connectivity:description = This indicator reflects whether or not this search head can successfully connect to the cluster master. When Red, searches might be inaccurate due to outdated cluster information. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:master_connectivity:red = 1
|
|
|
indicator:master_version_compatibility:description = This indicator checks version compatibility between the cluster master and search head. Yellow occurs when the cluster master version is older than the search head version. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:master_version_compatibility:yellow = 1
|
|
|
|
|
|
[feature:shc_members_overview]
|
|
|
display_name = SHC Cluster Members
|
|
|
indicator:status:description = This indicator tracks whether the required number of search head cluster members are up and running. Green occurs when all members are up, Yellow when 'indicator:status:yellow' members are down for less than (2* heartbeat_timeout) amount of time, and Red if 'indicator:status:red' members are down for more than (2*heartbeat_timeout) amount of time.
|
|
|
indicator:status:yellow = 1
|
|
|
indicator:status:red = 1
|
|
|
indicator:replication_factor:description = This indicator tracks whether enough search head cluster members exist to honor the configured search artifact replication factor. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:replication_factor:yellow = 1
|
|
|
indicator:detention:description = This indicator tracks whether any search head cluster members are in detention mode. Yellow occurs when not less than 'indicator:detention:yellow' number of members are in manual detention, Red when not less than 'indicator:detention:red' number of members are in automatic detention. Green occurs when no members are in manual/automatic detention.
|
|
|
indicator:detention:yellow = 1
|
|
|
indicator:detention:red = 1
|
|
|
|
|
|
[feature:shc_captain_election_overview]
|
|
|
display_name = SHC Captain Election
|
|
|
indicator:dynamic_captain_quorum:description = This indicator tracks whether quorum majority required to re-elect a dynamic captain has been lost. Yellow occurs when half or more members are down, Green otherwise. This feature can be disabled when a static captain is being used instead of a dynamic captain. However, we recommend keeping the feature enabled if static captain is being used only for disaster recovery.
|
|
|
indicator:dynamic_captain_quorum:yellow = 1
|
|
|
|
|
|
[feature:shc_captain_connection]
|
|
|
display_name = Captain Connection
|
|
|
indicator:captain_connection:description = This indicator checks whether a search head cluster member is able to communicate with the captain or not. Red occurs when a member cannot communicate with the captain, green otherwise.
|
|
|
indicator:captain_connection:red = 1
|
|
|
indicator:captain_existence:description = This indicator checks for the existence of a valid captain in the search head cluster. Red occurs when there is no valid captain in the SHC, green otherwise.
|
|
|
indicator:captain_existence:red = 1
|
|
|
|
|
|
[feature:shc_captain_common_baseline]
|
|
|
display_name = Common Baseline
|
|
|
indicator:common_baseline:description = This indicator checks whether the captain shares a common baseline with all the search head cluster members or not. This indicator is red if a shared baseline is missing between the captain and any of the members, green otherwise.
|
|
|
indicator:common_baseline:red = 1
|
|
|
|
|
|
[feature:shc_snapshot_creation]
|
|
|
display_name = Snapshot Creation
|
|
|
indicator:snapshot_creation:description = This indicator checks whether snapshots were created on each search head cluster member within a reasonable time. This indicator is green if snapshot creation happens in less than (indicator:snapshot_creation:yellow * conf_replication_summary.period) minutes, yellow if snapshot creation takes between (indicator:snapshot_creation:yellow * conf_replication_summary.period) and (indicator:snapshot_creation:red * conf_replication_summary.period) minutes, and red if it takes more than (indicator:snapshot_creation:red * conf_replication_summary.period) minutes.
|
|
|
indicator:snapshot_creation:yellow = 10
|
|
|
indicator:snapshot_creation:red = 20
|
|
|
|
|
|
[feature:slave_state]
|
|
|
display_name = Slave State
|
|
|
indicator:slave_state:description = This indicator gauges whether the cluster peer is in an abnormal state. For example, manual detention will result in Yellow, and automatic detention will result in Red. You can disable this indicator by setting the threshold to 0.
|
|
|
indicator:slave_state:red = 1
|
|
|
indicator:slave_state:yellow = 1
|
|
|
|
|
|
[feature:slave_version]
|
|
|
display_name = Slave Version
|
|
|
indicator:slave_version:description = This indicator checks version compatibility between the cluster master and cluster peer. Red occurs when the cluster master version is older than the cluster peer version.
|
|
|
indicator:slave_version:red = 1
|
|
|
|
|
|
# Index Processor
|
|
|
[feature:splunkoptimize_processes]
|
|
|
display_name = Bucket Optimization
|
|
|
indicator:concurrent_optimize_processes_percent:description = This indicator tracks whether index optimization is falling behind. By default, this indicator becomes Yellow when 100% of the maximum allowed "splunk-optimize" processes are running.
|
|
|
indicator:concurrent_optimize_processes_percent:yellow = 100
|
|
|
|
|
|
[feature:buckets]
|
|
|
display_name = Buckets
|
|
|
indicator:buckets_created_last_60m:description = This indicator gauges whether incoming data is being appropriately bucketed within the Splunk index. By default, Red occurs when any index has created more than 60 buckets within the last hour. A high rate of bucket creation can cause severe search performance degradation, and might indicate poorly configured data processing (for example, timestamping).
|
|
|
indicator:buckets_created_last_60m:red = 60
|
|
|
indicator:buckets_created_last_60m:yellow = 40
|
|
|
indicator:percent_small_buckets_created_last_24h:description = This indicator tracks the percentage of small buckets created over the last 24 hours. A small bucket is defined as less than 10% of the 'maxDataSize' setting in indexes.conf.
|
|
|
indicator:percent_small_buckets_created_last_24h:red = 50
|
|
|
indicator:percent_small_buckets_created_last_24h:yellow = 30
|
|
|
|
|
|
[feature:disk_space]
|
|
|
display_name = Disk Space
|
|
|
indicator:disk_space_remaining_multiple_minfreespace:description = This indicator tracks whether all Splunk index filesystems contain sufficient free space to continue indexing. This calculation is based upon the 'minFreeSpace' setting in server.conf. By default, Yellow occurs when a filesystem's free space falls below (2* 'minFreeSpace'), and Red occurs when it falls below 'minFreeSpace'. If the index being reported on is a remote s2-enabled index, by default, Yellow occurs when a filesystem's free space falls below (1 * 'minFreeSpace'), and Red occurs when a filesystem's free space drops to 0.
|
|
|
indicator:disk_space_remaining_multiple_minfreespace:red = 1
|
|
|
indicator:disk_space_remaining_multiple_minfreespace:yellow = 2
|
|
|
|
|
|
# Dynamic Data Active Archive
|
|
|
[feature:ddaa_archived_buckets]
|
|
|
display_name = Dynamic Data Archived Buckets
|
|
|
indicator:archived_buckets_failed_last_24h:description = This indicator tracks the number of buckets that were attempted to be archived into Glacier but failed. Green occurs when fewer than 40 buckets in the last 24 hours have failed, yellow occurs when 40 or more buckets have failed, and red occurs when 80 or more buckets have failed.
|
|
|
indicator:archived_buckets_failed_last_24h:yellow = 40
|
|
|
indicator:archived_buckets_failed_last_24h:red = 80
|
|
|
|
|
|
[feature:searches_skipped]
|
|
|
display_name = Search Scheduler Searches Skipped
|
|
|
indicator:percent_searches_skipped_high_priority_last_24h:description = This indicator tracks the skip rate for high priority scheduled searches. These are scheduled searches where the priority field is set to "higher" or "highest". By default, this indicator is yellow if the skipped search ratio over the last 24 hours is 5%, and red if it is 10%.
|
|
|
indicator:percent_searches_skipped_high_priority_last_24h:yellow = 5
|
|
|
indicator:percent_searches_skipped_high_priority_last_24h:red = 10
|
|
|
indicator:percent_searches_skipped_non_high_priority_last_24h:description = This indicator tracks the skip rate for scheduled searches whose priority field is set to "default". By default, this indicator is yellow if the skipped search ratio over the last 24 hours is 10%, and red if it is 20%.
|
|
|
indicator:percent_searches_skipped_non_high_priority_last_24h:yellow = 10
|
|
|
indicator:percent_searches_skipped_non_high_priority_last_24h:red = 20
|
|
|
tree_view:health_subset = enabled
|
|
|
|
|
|
[feature:searches_delayed]
|
|
|
display_name = Search Scheduler Searches Delayed
|
|
|
indicator:percent_searches_delayed_high_priority_last_24h:description = This indicator tracks the delayed search rate for high priority scheduled searches. These are scheduled searches where the priority field is set to "higher" or "highest". By default, this indicator is yellow if the delayed search ratio over the last 24 hours is 5%, and red if it is 10%.
|
|
|
indicator:percent_searches_delayed_high_priority_last_24h:yellow = 5
|
|
|
indicator:percent_searches_delayed_high_priority_last_24h:red = 10
|
|
|
indicator:percent_searches_delayed_non_high_priority_last_24h:description = This indicator tracks the delayed search rate for scheduled searches whose priority field is set to "default". By default, this indicator is yellow if the delayed search ratio over the last 24 hours is 10%, and red if it is 20%.
|
|
|
indicator:percent_searches_delayed_non_high_priority_last_24h:yellow = 10
|
|
|
indicator:percent_searches_delayed_non_high_priority_last_24h:red = 20
|
|
|
tree_view:health_subset = enabled
|
|
|
|
|
|
[feature:search_lag]
|
|
|
display_name = Search Scheduler Search Lag
|
|
|
indicator:percent_searches_lagged_high_priority_last_24h:description = This indicator tracks the lag rate of high priority scheduled searches. Search lag is a delay that occurs before a search starts. High priority scheduled searches are scheduled searches whose priority is set to "higher" or "highest". By default, this indicator is yellow if the search lag rate over the last 24 hours exceeds 10%.
|
|
|
indicator:percent_searches_lagged_high_priority_last_24h:yellow = 10
|
|
|
indicator:percent_searches_lagged_non_high_priority_last_24h:description = This indicator tracks the lag rate for scheduled searches whose priority is set to "default". Search lag is a delay that occurs before a search starts. By default, this indicator is yellow if the search lag rate over the last 24 hours exceeds 40%.
|
|
|
indicator:percent_searches_lagged_non_high_priority_last_24h:yellow = 40
|
|
|
indicator:count_extremely_lagged_searches_last_hour:description = This indicator checks whether there are any extremely lagged searches in the last hour. These are scheduled searches which have been unable to run within their configured interval. By default, this indicator is never Yellow, and is Red if there is at least 1 extremely lagged scheduled search. This is a numeric threshold rather than a percentage. Setting both thresholds to 0 will disable this indicator.
|
|
|
indicator:count_extremely_lagged_searches_last_hour:yellow = 0
|
|
|
indicator:count_extremely_lagged_searches_last_hour:red = 1
|
|
|
tree_view:health_subset = enabled
|
|
|
|
|
|
[feature:wlm_system_check]
|
|
|
display_name = System Check
|
|
|
indicator:system_check:description = This indicator checks whether the underlying Linux operating system is set up properly for workload management.
|
|
|
indicator:system_check:red = 1
|
|
|
|
|
|
[feature:wlm_configuration_check]
|
|
|
display_name = Configuration Check
|
|
|
indicator:configuration_check:description = This indicator checks whether the workload management configuration, including pools and rules, is valid.
|
|
|
indicator:configuration_check:yellow = 1
|
|
|
indicator:configuration_check:red = 2
|
|
|
|
|
|
[feature:ingestion_latency]
|
|
|
display_name = Ingestion Latency
|
|
|
indicator:ingestion_latency_lag_sec:description = This indicator tracks the difference between the logging time and processing time of locally generated events to determine ingestion latency. By default, this indicator will turn Yellow at 15 seconds of latency, and Red if the latency reaches 180 seconds. Setting both values to 0 will disable this indicator.
|
|
|
indicator:ingestion_latency_lag_sec:yellow = 15
|
|
|
indicator:ingestion_latency_lag_sec:red = 180
|
|
|
indicator:ingestion_latency_gap_multiplier:description = This indicator tracks locally generated events, and uses the time elapsed since the last event was ingested to determine the ingestion latency gap. By default, this indicator will turn Yellow when the latency gap reaches 45 seconds, and Red at 210 seconds. To calculate the warning value, multiply these values by their corresponding ingestion_latency_lag_sec and then add 30 (by default, 1 * 15 + 30 = 45 for Yellow and 1 * 180 + 30 = 210 for Red). Setting both values to 0 will disable this indicator.
|
|
|
indicator:ingestion_latency_gap_multiplier:yellow = 1
|
|
|
indicator:ingestion_latency_gap_multiplier:red = 1
|
|
|
|
|
|
[feature:ingestion_latency_reported]
|
|
|
display_name = Ingestion Latency Reported
|
|
|
indicator:ingestion_latency_indexer_health:description = This indicator tracks the aggregated health of ingestion latencies as reported by forwarders. Thresholds are percentages. Once 100*(number of forwarders reporting yellow + number of forwarders reporting red)/(total number of forwarders) exceeds the yellow threshold, the indexer reports the aggregated color as yellow. Once 100*(number of forwarders reporting red)/(total number of forwarders) exceeds the red threshold, the indexer reports the aggregated color as red.
|
|
|
indicator:ingestion_latency_indexer_health:yellow = 1
|
|
|
indicator:ingestion_latency_indexer_health:red = 1
|
|
|
|
|
|
[feature:iowait]
|
|
|
display_name = IOWait
|
|
|
indicator:avg_cpu__max_perc_last_3m:description = This indicator tracks the average IOWait percentage across all CPUs on the machine running the Splunk Enterprise instance, over the last 3 minute window. By default, this indicator will turn Yellow if the percentage exceeds 1% and Red if it exceeds 3% during this window.
|
|
|
indicator:avg_cpu__max_perc_last_3m:red = 3
|
|
|
indicator:avg_cpu__max_perc_last_3m:yellow = 1
|
|
|
indicator:single_cpu__max_perc_last_3m:description = This indicator tracks the IOWait percentage for the single most bottle-necked CPU on the machine running the Splunk Enterprise instance, over the last 3 minute window. By default, this indicator will turn Yellow if the percentage exceeds 5% and Red if it exceeds 10% during this window.
|
|
|
indicator:single_cpu__max_perc_last_3m:red = 10
|
|
|
indicator:single_cpu__max_perc_last_3m:yellow = 5
|
|
|
indicator:sum_top3_cpu_percs__max_last_3m:description = This indicator tracks the sum of IOWait percentage for the three most bottle-necked CPUs on the machine running the Splunk Enterprise instance, over the last 3 minute window. By default, this indicator will turn Yellow if the sum exceeds 7% and Red if it exceeds 15% during this window.
|
|
|
indicator:sum_top3_cpu_percs__max_last_3m:red = 15
|
|
|
indicator:sum_top3_cpu_percs__max_last_3m:yellow = 7
|
|
|
|