You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1072 lines
43 KiB
1072 lines
43 KiB
#!/usr/bin/env python
|
|
# coding=utf-8
|
|
#
|
|
# Copyright (C) 2005-2023 Splunk Inc. All Rights Reserved.
|
|
|
|
# CORE PYTHON IMPORTS
|
|
import logging
|
|
import sys
|
|
import os
|
|
import time
|
|
import json
|
|
try:
|
|
from queue import PriorityQueue, Empty
|
|
except:
|
|
from Queue import PriorityQueue, Empty
|
|
import threading
|
|
import socket
|
|
import signal
|
|
# Splunk Python does not bundle UUID, so we've included it in the hydra bin, but it is a core python module
|
|
import uuid
|
|
from . import six
|
|
|
|
# CORE SPLUNK IMPORTS
|
|
# import splunk
|
|
try:
|
|
from cheroot import wsgi
|
|
from cheroot.ssl import builtin
|
|
from cheroot.server import HTTPServer
|
|
except:
|
|
from cherrypy import wsgiserver
|
|
from splunk import getDefault
|
|
from splunk.clilib.bundle_paths import make_splunkhome_path
|
|
from splunk.clilib.cli_common import getConfStanza, getMergedConf
|
|
try:
|
|
from splunk.clilib.cli_common import decrypt
|
|
except:
|
|
pass
|
|
from splunk.util import normalizeBoolean
|
|
import splunk.version as ver
|
|
import ssl
|
|
|
|
# SA-HYDRA IMPORTS
|
|
from hydra import setupLogger
|
|
|
|
|
|
# END IMPORTS
|
|
# ------------------------------------------------------------------------------
|
|
|
|
|
|
# ===============================================================================
|
|
# Helper Class Definitions
|
|
# ===============================================================================
|
|
# Thread safe classes
|
|
class HydraCacheManager(object):
    """
    A threadsafe cache system with built in pruning for outdated caches.

    Entries are stored as name -> (touch_time, value). prune() drops an entry
    once it has gone untouched for cache_expiration seconds AND any special
    expiration registered for it has passed.
    """

    def __init__(self, cache_expiration=3600):
        """
        Build the cache manager initializing internal properties.
        ARGS:
            cache_expiration - time in seconds after which a cache should be removed if it has not been touched
        """
        self.cache_expiration = cache_expiration

        # This is where we keep special expirations
        # special_expirations looks like name -> special_expiration_time (absolute epoch seconds)
        self.special_expirations = {}
        # This is where we keep stuff
        # Cache looks like name -> (touch_time, value)
        self.cache = {}
        self.cache_lock = threading.Lock()
        # Keep track of the last time we pruned
        self.last_prune = time.time()

    def prune(self):
        """
        NOT THREADSAFE! execute only inside a lock.

        Prunes the cache of all old entries, will only execute if last_prune
        plus cache_expiration is less than now in epoch
        """
        now = time.time()
        if (self.last_prune + self.cache_expiration) >= now:
            return

        service_logger.info("[HydraCacheManager] current cache_length=%s, checking cache for outdated entries...",
                            len(self.cache))
        # Check what we need to prune. Names are collected first so self.cache
        # is not resized while it is being iterated.
        to_delete = []
        for name, cache_item in six.iteritems(self.cache):
            untouched_too_long = now > (cache_item[0] + self.cache_expiration)
            special_expired = now > self.special_expirations.get(name, 0)
            if untouched_too_long and special_expired:
                self.special_expirations.pop(name, None)
                to_delete.append(name)

        # Prune the offenders
        if to_delete:
            service_logger.info("[HydraCacheManager] found prune_count=%s entries to remove", len(to_delete))
            for name in to_delete:
                service_logger.debug("[HydraCacheManager] pruning cache_entry=%s", name)
                del self.cache[name]
        else:
            service_logger.debug("[HydraCacheManager] found no entries to prune")

        # Reset last prune time
        self.last_prune = now
        service_logger.info(
            "[HydraCacheManager] finished checking cache for outdated entries, final cache_length=%s",
            len(self.cache))

    def set_cache(self, name, value, expiration=None):
        """
        Sets the cache value for a particular item, thread safe
        ARGS:
            name - cache entry name
            value - value to store
            expiration - optional number of seconds (int or float) from now
                before which the entry must not be pruned; any other type is
                ignored
        """
        with self.cache_lock:
            now = time.time()
            self.cache[name] = (now, value)
            # Floats must be honoured as well as ints: serialize_cache writes the
            # remaining lifetime as "expiration_time - now", which comes back
            # from JSON as a float when recover_cache reloads it.
            if isinstance(expiration, (int, float)):
                self.special_expirations[name] = now + expiration
            self.prune()

    def get_cache(self, name):
        """
        Gets the cache value for a particular item, threadsafe.

        A hit refreshes the entry's touch time. Returns None when there is no
        entry of that name.
        """
        value = None
        with self.cache_lock:
            cache_item = self.cache.get(name, None)
            if cache_item is not None:
                value = cache_item[1]
                self.cache[name] = (time.time(), value)

        return value
|
|
|
|
|
|
class HydraJobExecutionInfoManager(object):
    '''
    Handle job execution time information by doing aggregation at target|task|metadata_id level
    Handles atomic job completion and error at the job name level.
    '''

    def __init__(self):
        # Dict which holds avg execution time of target|task|metadata_id,
        # value is a tuple of (avg time, total cycles reported for this key)
        self.job_aggregate_execution_info = {}
        self.job_aggregate_execution_info_lock = threading.Lock()

        # Dict which holds active jobs count based upon unique target|task|metadata_id
        # It has its own dict and lock so active job counting does not block the execution info dict
        self.active_job_category_count = {}
        self.active_job_category_lock = threading.Lock()

        # Lists which hold the completed and error'd out atomic jobs. Appends and the
        # get-and-clear swap all take atomic_jobs_lock, so an append can never land in
        # a list that get_atomic_job_info has already handed back to its caller.
        self.failed_atomic_jobs = []
        self.completed_atomic_jobs = []
        self.atomic_jobs_lock = threading.Lock()

    #===============================================================================
    # Atomic Job Information Methods
    #===============================================================================
    def add_completed_atomic_job(self, job_name):
        """
        Thread safe method for appending a completed atomic task's job to the
        completed atomic job list.

        @type job_name: str
        @param job_name: the name/id of the completed job

        @rtype: None
        @return: None
        """
        with self.atomic_jobs_lock:
            self.completed_atomic_jobs.append(job_name)

    def add_failed_atomic_job(self, job_name):
        """
        Thread safe method for appending a failed atomic task's job to the
        failed atomic job list.

        @type job_name: str
        @param job_name: the name/id of the failed job

        @rtype: None
        @return: None
        """
        with self.atomic_jobs_lock:
            self.failed_atomic_jobs.append(job_name)

    def get_atomic_job_info(self):
        """
        Get and clear the lists of the completed and failed atomic jobs as a
        dict.

        @rtype: dict
        @return: dict of the completed and failed atomic job lists { "completed_atomic_jobs": [...], "failed_atomic_jobs": [...] }
        """
        with self.atomic_jobs_lock:
            # Hand out the current lists and start fresh ones in one step
            completed, self.completed_atomic_jobs = self.completed_atomic_jobs, []
            failed, self.failed_atomic_jobs = self.failed_atomic_jobs, []

        return {"completed_atomic_jobs": completed, "failed_atomic_jobs": failed}

    #===============================================================================
    # Job Execution Time Information Methods
    #===============================================================================
    def _update_active_job_dict(self, key, difference=1, put_lock=True):
        '''
        Thread safe function to update active_job_category_count dict
        @param key : Unique key based upon target|task|metadata_id
        @param difference: amount added to the count (negative to decrease); the stored count never goes below zero
        @param put_lock: put lock while updating dict
        @return: nothing
        '''

        def do():
            updated = self.active_job_category_count.get(key, 0) + difference
            # if final value is less than zero, then set it to zero
            self.active_job_category_count[key] = max(updated, 0)

        service_logger.debug("[ReduceJobCount] Reducing job count of category=%s by=%s", key, difference)
        if put_lock:
            with self.active_job_category_lock:
                do()
        else:
            do()

    def _get_key(self, job_string):
        '''
        Extract the target|task|metadata_id key from job_string.
        Call this function in try/except block as we are assuming that job_string will have four |
        (a ValueError is raised from the unpacking when it has fewer)
        '''
        # Fields are name|target|task|metadata_id|<rest>; only the middle three form the key
        _name, target, task, metadata_id, _extra = job_string.split('|', 4)
        return target + "|" + task + "|" + metadata_id

    def abstract_activejob_category(self, things):
        '''
        Updating active job count by passing job batch count
        @param things: iterable of strings of the form <priority_number>:<serialized JobTuple>;
                       entries without a ":" are ignored
        '''
        for thing in things:
            if thing.find(":") != -1:
                _priority_number, job_string = thing.split(":", 1)
                self.update_activejob_category(job_string, 1)

    def update_activejob_category(self, job_string, difference, put_lock=True):
        '''
        Update unclaimed job count for a specific category.
        Any failure (e.g. a job string without enough fields) is logged, not raised.
        @param job_string: job string
        @param difference: count value in which unclaimed job count increases or decreases
        @param put_lock: put lock while updating dict
        '''

        try:
            service_logger.debug("[ReduceJobCount] Reducing job count of job=%s", job_string)
            key = self._get_key(job_string)
            self._update_active_job_dict(key, difference, put_lock)
            service_logger.debug("Updated active job count successfully.")
        except Exception as e:
            service_logger.exception(e)

    def update_job_execution_dict(self, job_string, exec_time, delete=False):
        '''
        Thread safe function to update job_aggregate_execution_info dict.
        Any failure is logged, not raised.
        @param job_string : job string
        @param exec_time: Job execution time
        @param delete: Delete flags if want to delete job_aggregate_execution_info
                        Pass other parameters as None if delete flag is true
        @return: nothing
        '''
        try:
            with self.job_aggregate_execution_info_lock:
                if delete:
                    self.job_aggregate_execution_info = {}
                else:
                    key = self._get_key(job_string)
                    latest = float(exec_time)
                    if key in self.job_aggregate_execution_info:
                        # value is tuple of (avg execution so far, reported execution count);
                        # the count is what lets us fold the new sample into the running average
                        avg_so_far, reported_count = self.job_aggregate_execution_info[key]
                        total_reported_count = reported_count + 1
                        avg_value = float((avg_so_far * reported_count + latest) / total_reported_count)
                        self.job_aggregate_execution_info[key] = (avg_value, total_reported_count)
                    else:
                        self.job_aggregate_execution_info[key] = (latest, 1)
                    service_logger.debug("[JobTime] Successfully updated, key=%s, execution time=%s", key, exec_time)
        except Exception as e:
            service_logger.error(e)
            service_logger.exception(e)

    def get_aggregate_info(self):
        '''
        Return aggregation information and reset the execution time aggregates.
        @return Dict which hold category key as target|task|metadata_id, value is list of following items
                    0 -- Avg execution time
                    1 -- Reported times
                    2 -- unclaimed job for this category
                otherwise empty dictionary
        '''
        # Take the reported aggregates and install an empty dict in ONE lock
        # acquisition. Snapshotting and clearing in two separate acquisitions would
        # discard any execution time reported in between.
        with self.job_aggregate_execution_info_lock:
            reported = self.job_aggregate_execution_info
            self.job_aggregate_execution_info = {}

        # Aggregate both dict in one so response size would be less
        temp_dict = {}
        for key, values in reported.items():
            temp_dict[key] = [values[0], values[1], 0]

        # active_job_category_count is only read here; it has to stay populated
        with self.active_job_category_lock:
            for key, value in self.active_job_category_count.items():
                if key in temp_dict:
                    temp_dict[key][2] = value
                else:
                    temp_dict[key] = [0, 0, value]

        return temp_dict
|
|
|
|
|
|
#Decorators
|
|
class HandleRequest(object):
    """
    decorator for exception handling, validating and logging requests properly
    """

    def __init__(self, expected_methods, enforce_auth=True):
        """
        Request validation utility for expected HTTP verbs
        ARGS:
            expected_methods - array of supported methods
            enforce_auth - when True the X-HYDRA-AUTH header must equal the gateway auth token
        """
        self.expected_methods = expected_methods
        self.enforce_auth = enforce_auth

    @staticmethod
    def _elapsed_ms(start, end):
        """
        Whole milliseconds between two time.time() readings (which are in seconds)
        """
        return int((end - start) * 1000)

    def __call__(self, fn):
        """
        Wrap the WSGI app fn with auth checking, method validation, access
        logging and a catch-all that answers 500.
        """
        def wrapped_fn(environ, start_response):
            start = time.time()

            #Access logging through start response calls
            def wrapped_start_response(status, response_headers):
                duration = self._elapsed_ms(start, time.time())
                access_logger.info("%s %s '%s' - - - %sms", environ["REQUEST_METHOD"], environ.get("SCRIPT_NAME", "/"),
                                   status, duration)
                return start_response(status, response_headers)

            def reject(status):
                # Every rejection is an empty text/plain response
                wrapped_start_response(status, [('Content-type', 'text/plain')])
                return []

            #Authentication
            if self.enforce_auth and environ.get("HTTP_X_HYDRA_AUTH", "") != hydra_gateway_auth_token:
                service_logger.error("authentication invalid or missing for path='%s'", environ.get("SCRIPT_NAME", "/"))
                return reject("401 Unauthorized")

            #Method validation
            if environ["REQUEST_METHOD"] not in self.expected_methods:
                service_logger.error("bad request for path='%s' got request_method='%s', expected_methods=%s",
                                     environ.get("SCRIPT_NAME", "/"), environ["REQUEST_METHOD"], self.expected_methods)
                return reject("400 Bad Request")

            #Actual request handling
            try:
                return fn(environ, wrapped_start_response)
            except Exception as e:
                service_logger.exception("Internal Server Error on request='%s %s' specific error: %s",
                                         environ["REQUEST_METHOD"], environ.get("SCRIPT_NAME", "/"), str(e))
                return reject("500 Internal Server Error")

        return wrapped_fn
|
|
|
|
#------------------------------------------------------------------------------
|
|
|
|
#===============================================================================
|
|
# Utilities & Globals
|
|
#===============================================================================
|
|
|
|
# One line per handled request, written by the HandleRequest decorator
access_logger = setupLogger(logger_name="hydra-access",
                            log_name="hydra_access.log",
                            log_format='%(asctime)s %(levelname)s %(message)s')

# Everything else the gateway has to say
service_logger = setupLogger(logger_name="hydra-gateway",
                             log_name="hydra_gateway.log",
                             log_format='%(asctime)s %(levelname)s [HydraWSGI:%(process)d] %(message)s')

# Shared state used by the request handlers
job_queue = PriorityQueue()
cache_manager = HydraCacheManager()
# Both tokens stay None until they are assigned at service start-up
hydra_gateway_auth_token = None
hydra_gateway_challenge_token = None
job_exec_info_manager = HydraJobExecutionInfoManager()

# Expired job count (not reported anywhere yet, kept for future reference)
expired_job_count = 0

# Lock guarding expired_job_count
expired_job_count_lock = threading.Lock()
|
|
|
|
# ------------------------------------------------------------------------------
|
|
|
|
# ===============================================================================
|
|
# Hydra Cache Services
|
|
# ===============================================================================
|
|
|
|
@HandleRequest(["GET", "POST"])
def control_cache(environ, start_response):
    """
    Service for shared cache. A GET will pull the current cache value,
    a POST will set the current cache value to the POST body of the
    cache.
    Required header of X-HYDRA-CACHE-NAME will determine which cache to
    set.
    Optional header of X-HYDRA-CACHE-EXPIRY will set a special expiration on
    the cache value.
    REQUEST
        -> GET /hydra/cache
        -> POST /hydra/cache
            -> BODY of the cache value to be set
    RESPONSE
        status 200: cache set or got successfully
            -> EMPTY OR TEXT <serialized cache>
        status 400: could not find cache name to get or set, or (POST only)
                    X-HYDRA-CACHE-EXPIRY / Content-Length is missing or not an integer
            -> EMPTY
        status 404: could not find cache of specified name
            -> EMPTY
    """
    response_headers = [("Content-type", "text/plain")]

    def respond(status, body=None):
        start_response(status, response_headers)
        return [] if body is None else [body]

    # Validate the request
    cache_name = environ.get("HTTP_X_HYDRA_CACHE_NAME", None)
    if cache_name is None:
        service_logger.error("bad request, no hydra cache name for path='%s'", environ.get("SCRIPT_NAME", "/"))
        return respond("400 Bad Request")

    # Parse the HTTP method
    if environ["REQUEST_METHOD"] == "GET":
        cached = cache_manager.get_cache(cache_name)
        if cached is None:
            return respond("404 Not Found")
        return respond("200 OK", cached.encode('utf-8'))

    # Implicit POST if not GET since we have method filtering
    expiration = environ.get("HTTP_X_HYDRA_CACHE_EXPIRY", None)
    try:
        if expiration is not None:
            expiration = int(expiration)
        # WSGI allows CONTENT_LENGTH to be absent or empty; either way we
        # cannot know how much body to read, so it is the client's error.
        content_length = int(environ["CONTENT_LENGTH"])
    except (KeyError, ValueError):
        service_logger.error("bad request, invalid cache expiry or content length for path='%s'",
                             environ.get("SCRIPT_NAME", "/"))
        return respond("400 Bad Request")

    req_in = environ.get("wsgi.input", None)
    content = req_in.read(content_length)
    cache_manager.set_cache(cache_name, content.decode('utf-8').strip("\n"), expiration=expiration)
    return respond("200 OK")
|
|
|
|
|
|
@HandleRequest(["POST"])
def post_cache_batch(environ, start_response):
    """
    Batch variant of the shared cache service: one POST sets many entries.

    The body is a list of \n delimited entries, each of the form
    <cache_name>\t<cache_value>. Entries without a tab are ignored.
    Optional header X-HYDRA-CACHE-EXPIRY applies one special expiration to
    every entry of the batch.
    REQUEST
        -> POST /hydra/cache/batch
            -> BODY of the \n delimited cache entries
    RESPONSE
        status 200: batch processed successfully
            -> EMPTY
    """
    expiry_header = environ.get("HTTP_X_HYDRA_CACHE_EXPIRY", None)
    expiration = None if expiry_header is None else int(expiry_header)

    raw_body = environ.get("wsgi.input", None).read(int(environ["CONTENT_LENGTH"]))
    entries = raw_body.decode('utf-8').strip("\n").split("\n")
    service_logger.info("[CacheBatchProcessor] parsed cache batch of count=%s", len(entries))

    for entry in entries:
        # Split on the first tab only, the value itself may contain tabs
        cache_name, tab, cache_value = entry.partition("\t")
        if tab:
            cache_manager.set_cache(cache_name, cache_value, expiration=expiration)

    start_response("200 OK", [("Content-type", "text/plain")])
    return []
|
|
|
|
|
|
# ===============================================================================
|
|
# Hydra Job Services
|
|
# ===============================================================================
|
|
@HandleRequest(["GET"])
def get_job_info(environ, start_response):
    """
    Service reporting the job queue length together with the information
    held by the HydraJobExecutionInfoManager (see its get_aggregate_info and
    get_atomic_job_info methods for the shape of the two nested dicts).
    REQUEST
        -> GET /hydra/job/info
    RESPONSE
        -> JSON { count: <QUEUE LENGTH>,
                  job_aggregate_execution_info : <get_aggregate_info() result>,
                  atomic_job_info : <get_atomic_job_info() result> }
    """
    payload = {
        # qsize() is only approximate while other threads use the queue, good enough here
        "count": job_queue.qsize(),
        "job_aggregate_execution_info": job_exec_info_manager.get_aggregate_info(),
        "atomic_job_info": job_exec_info_manager.get_atomic_job_info(),
    }
    body = json.dumps(payload).encode('utf-8')
    start_response("200 OK", [("Content-type", "application/json")])
    return [body]
|
|
|
|
|
|
@HandleRequest(["GET"])
def get_pop_job(environ, start_response):
    """
    Service for popping out the next job to do.
    Optional arg block if 0 will 404 if it couldn't get a job, if 1 will block
    for 3s before 404. defaults to 1
    REQUEST
        -> GET /hydra/job/pop
            -> GET ARGS:
                block - 0 or 1 (1 by default)
    RESPONSE
        status 200: job popped successfully
            -> TEXT <serialized JobTuple>
        status 404: could not get a job (empty or blocked)
            -> EMPTY
    """
    response_headers = [("Content-type", "text/plain")]

    # Parse get args
    block = True
    get_args = environ.get("QUERY_STRING", None)
    if get_args is not None and get_args != '':
        for arg in get_args.split("&"):
            kv_pair = arg.split("=")
            if kv_pair[0] == "block" and len(kv_pair) == 2:
                # Only a literal "0" turns blocking off
                block = kv_pair[1] != "0"
            else:
                service_logger.warning("[JobPopper] got unexpected get_arg=%s", kv_pair)

    # Pop a job off the queue
    try:
        priority_number, job_string = job_queue.get(block=block, timeout=3)
    except Empty:
        start_response("404 Not Found", response_headers)
        return []

    # The job has been claimed, so its category has one less active job
    job_exec_info_manager.update_activejob_category(job_string, -1)

    service_logger.debug("[JobPopper] popped out job with priority_number=%s", priority_number)
    # Encode before start_response so an encoding failure cannot occur after the
    # status line has already been committed
    response = job_string.encode('utf-8')
    start_response("200 OK", response_headers)
    return [response]
|
|
|
|
|
|
@HandleRequest(["POST"])
def post_job_batch(environ, start_response):
    """
    Service that queues a whole batch of jobs in one request.
    REQUEST
        -> POST /hydra/job/batch
            -> BODY:
                \n delimited strings of the form <priority_number>:<serialized JobTuple>
                (lines without a ":" are ignored)
    RESPONSE
        -> EMPTY
    """
    service_logger.debug("[JobBatchProcessor] processing batch of content_length=%s", environ["CONTENT_LENGTH"])
    # TODO: slurping could get out of hand fast, we should keep a rolling buffer going, probably with StringIO
    raw_body = environ.get("wsgi.input", None).read(int(environ["CONTENT_LENGTH"]))
    entries = raw_body.decode('utf-8').split("\n")
    service_logger.debug("[JobBatchProcessor] parsed job batch of count=%s", len(entries))

    for entry in entries:
        # Only the first ":" separates the priority from the job string
        priority_number, colon, job_string = entry.partition(":")
        if colon:
            job_queue.put((int(priority_number), job_string))

    # Counting happens only after every job is queued, so workers popping jobs
    # are not held up by it. The resulting active job count is approximate.
    job_exec_info_manager.abstract_activejob_category(entries)

    start_response("200 OK", [("Content-type", "text/plain")])
    return []
|
|
|
|
|
|
def update_expired_job_count_var(increment_by=1):
    '''
    Thread safe function to update the module level expired_job_count
    @param increment_by : increment value, default is one

    @return: nothing
    '''
    # Without this declaration the assignment below would make expired_job_count
    # a local name and the read on its right hand side would raise
    # UnboundLocalError on every call.
    global expired_job_count
    with expired_job_count_lock:
        expired_job_count = expired_job_count + increment_by
|
|
|
|
|
|
@HandleRequest(["POST"])
def update_job_failure(environ, start_response):
    """
    Service through which a failed job is reported. Only atomic jobs are
    supported: a request without the X-HYDRA-ATOMIC-JOB header is answered
    with 400.
    REQUEST
        -> POST /hydra/job/execution/failure
            -> HEADERS:
                X-HYDRA-ATOMIC-JOB: value is the job name
            -> BODY:
                EMPTY
    RESPONSE
        status 200: failure recorded
            -> EMPTY
        status 400: X-HYDRA-ATOMIC-JOB header missing
            -> EMPTY
    """
    response_headers = [("Content-type", "text/plain")]
    failed_job = environ.get("HTTP_X_HYDRA_ATOMIC_JOB", None)

    if failed_job is None:
        start_response("400 Bad Request", response_headers)
        return []

    # handle atomic job failure
    service_logger.info("[UpdateJobFailure] received notice of atomic job failure for job=%s", failed_job)
    job_exec_info_manager.add_failed_atomic_job(failed_job)
    start_response("200 OK", response_headers)
    return []
|
|
|
|
|
|
@HandleRequest(["POST"])
def update_job_execution(environ, start_response):
    """
    Service that folds a reported execution time into the per-category
    aggregate and, for atomic jobs, records the job as completed.
    REQUEST
        -> POST /hydra/job/execution/info
            -> HEADERS:
                X-HYDRA-ATOMIC-JOB: value is the job name, if not passed job is treated as not atomic
            -> BODY:
                string of the form <execution time in sec>:<serialized JobTuple>
    RESPONSE
        -> EMPTY
    """
    service_logger.debug("[UpdateJobExecution] adding execution time")
    raw_body = environ.get("wsgi.input", None).read(int(environ["CONTENT_LENGTH"]))
    content = raw_body.decode('utf-8')

    # handle atomic job completion
    completed_job = environ.get("HTTP_X_HYDRA_ATOMIC_JOB", None)
    if completed_job is not None:
        job_exec_info_manager.add_completed_atomic_job(completed_job)

    # handle job execution time; only the first ":" separates time from job string
    exec_time, colon, job_string = content.partition(":")
    if colon:
        job_exec_info_manager.update_job_execution_dict(job_string, float(exec_time))
    else:
        service_logger.error("[UpdateJobExecution] Failed to process content=%s", content)

    start_response("200 OK", [("Content-type", "text/plain")])
    return []
|
|
|
|
|
|
@HandleRequest(["POST"])
def update_expired_job_count(environ, start_response):
    """
    Service that adds the posted number to the expired job count.

    The response is 200 even when the body is not an integer or the update
    fails; such failures are only logged.
    REQUEST
        -> POST /hydra/job/execution/expired
            -> BODY:
                string holding a valid integer number
    RESPONSE
        -> EMPTY
    """
    service_logger.debug("[JobExpired] adding execution time")
    raw_body = environ.get("wsgi.input", None).read(int(environ["CONTENT_LENGTH"]))
    content = raw_body.decode('utf-8')

    try:
        update_expired_job_count_var(int(content))
        service_logger.debug("successfully updated expired job count")
    except Exception as e:
        service_logger.error("[JobExpired] failed to update expired job count content=%s, exception=%s", content, e)
        service_logger.exception(e)

    start_response("200 OK", [("Content-type", "text/plain")])
    return []
|
|
|
|
|
|
#===============================================================================
|
|
# Test Services
|
|
#===============================================================================
|
|
@HandleRequest(["GET"])
def test_static(environ, start_response):
    '''
    Static test resource: always answers 200 with the same fixed text
    '''
    service_logger.debug("in test static environ=%s", environ)
    start_response('200 OK', [('Content-type', 'text/plain')])
    greeting = '\nHail Hydra!\n'
    return [greeting.encode('utf-8')]
|
|
|
|
|
|
@HandleRequest(["POST"])
def test_echo(environ, start_response):
    '''
    Simple, non-streaming echo server: answers with the request body wrapped
    in a small banner.
    '''
    service_logger.debug("in test echo environ=%s", environ)
    req_in = environ.get("wsgi.input", None)
    content = req_in.read(int(environ["CONTENT_LENGTH"]))
    # wsgi.input yields bytes, so the banner is built from bytes literals too.
    # Concatenating a text literal with the body is a TypeError on Python 3.
    # The body is echoed back exactly as received, without any decoding.
    response = b"\n###########\nECHO SERVER\n###########\n" + content + b"\n\n"
    start_response('200 OK', [('Content-type', 'text/plain')])
    return [response]
|
|
|
|
|
|
#===============================================================================
|
|
# Admin Services
|
|
#===============================================================================
|
|
@HandleRequest(["GET"], enforce_auth=False)
def get_challenge_key(environ, start_response):
    '''
    Answer with the instance challenge key. It exists to detect mismatches
    between the running gateway and splunkd's understanding of the running
    gateway. The challenge key is NOT a valid auth key, which is why this
    endpoint is served without authentication.
    '''
    challenge_token = hydra_gateway_challenge_token
    payload = challenge_token.encode('utf-8')
    service_logger.debug("responding to challenge request with hydra_gateway_challenge_token: %s",
                         challenge_token)
    start_response('200 OK', [('Content-type', 'text/plain')])
    return [payload]
|
|
|
|
|
|
# ------------------------------------------------------------------------------
|
|
|
|
# ===============================================================================
|
|
# Web Service Constructor
|
|
# ===============================================================================
|
|
def recover_cache():
    """
    Attempt to locate a serialized cache on disk and load it into the current cache.

    The file is expected to hold a JSON list of [name, value, seconds_left]
    entries as written by serialize_cache. Any failure while reading or
    loading it is logged and otherwise ignored.
    """

    directory = make_splunkhome_path(["etc", "apps", "SA-Hydra", "local", "run"])
    if not os.path.exists(directory):
        os.makedirs(directory)
    try:
        with open(make_splunkhome_path(["etc", "apps", "SA-Hydra", "local", "run", "hydra_gateway.cache"]), 'r') as f:
            serialized_cache = f.read()
        if serialized_cache:
            service_logger.info("attempting to deserialize cache...")
            old_cache = json.loads(serialized_cache)
            for name, cache, expiry in old_cache:
                # serialize_cache stores the remaining lifetime as a float, but
                # set_cache only registers a special expiration for an int, so
                # the value is coerced to whole seconds or it would be dropped.
                if isinstance(expiry, float):
                    expiry = int(expiry)
                cache_manager.set_cache(name, cache, expiration=expiry)
            service_logger.info("cache deserialized with %s entries", len(old_cache))
    except Exception as e:
        service_logger.error(str(e))
|
|
|
|
|
|
def serialize_cache(*args, **kwargs):
    """
    Serialize the special expiration items in the cache to disk. Note we
    capture and ignore args and kwargs so this can be used as a signal handler.

    Each entry is written as (name, value, seconds left until its special
    expiration). Any failure is logged as a warning and otherwise ignored.
    """
    try:
        service_logger.info("attempting to serialize cache...")
        directory = make_splunkhome_path(["etc", "apps", "SA-Hydra", "local", "run"])
        if not os.path.exists(directory):
            os.makedirs(directory)

        # Build the whole payload BEFORE opening the file for writing: opening
        # with 'w' truncates it, so a failure while collecting entries would
        # otherwise leave an empty cache file behind.
        # NOTE(review): cache_manager.cache_lock is not taken here, same as
        # before; if this runs as a signal handler on a thread that already
        # holds the lock, acquiring it would deadlock - confirm how this is invoked.
        now = time.time()
        serializable_cache = []
        for name, expiration_time in list(cache_manager.special_expirations.items()):
            cache_item = cache_manager.cache.get(name)
            if cache_item is None:
                # No value for this name, nothing worth persisting
                continue
            serializable_cache.append((name, cache_item[1], (expiration_time - now)))
        serialized = json.dumps(serializable_cache)

        service_logger.info("serializing %s cache entries", len(serializable_cache))
        with open(make_splunkhome_path(["etc", "apps", "SA-Hydra", "local", "run", "hydra_gateway.cache"]), 'w') as f:
            f.write(serialized)
        service_logger.info("cache serialized")
    except Exception as e:
        service_logger.warning("unable to serialize cache, message=%s", str(e))
|
|
|
|
def normalize_value(value):
    """
    Turn a string into an integer when it parses as one, otherwise hand it to
    normalizeBoolean. Values that are not strings are returned untouched.
    """
    if isinstance(value, six.string_types):
        # int first; whatever does not parse as an int goes through the boolean normalizer
        try:
            return int(value)
        except ValueError:
            return normalizeBoolean(value)

    return value
|
|
|
|
def _splunk_version_tuple(version):
    """
    Parse a dotted version string such as '8.2.6' into a tuple of ints.

    Only the leading digits of each dot-separated piece are used and parsing
    stops at the first piece without leading digits. The result is padded
    with zeros to at least three components so that '8.0' equals (8, 0, 0).
    """
    numbers = []
    for piece in str(version).split('.'):
        digits = ''
        for char in piece:
            if not char.isdigit():
                break
            digits += char
        if not digits:
            break
        numbers.append(int(digits))
    while len(numbers) < 3:
        numbers.append(0)
    return tuple(numbers)


def _accepted_ssl_versions():
    """
    Build the map of supported web.conf sslVersions values to server settings.

    Protocol constants and option flags are looked up with getattr so that a
    Python/OpenSSL build lacking one of them (for example PROTOCOL_SSLv3)
    simply omits the matching entry instead of raising ImportError.
    """
    accepted = {'all': {'server.ssl_version': ssl.PROTOCOL_SSLv23}}

    for conf_value, attr_name in (('ssl3', 'PROTOCOL_SSLv3'),
                                  ('tls1.0', 'PROTOCOL_TLSv1'),
                                  ('tls1.1', 'PROTOCOL_TLSv1_1'),
                                  ('tls1.2', 'PROTOCOL_TLSv1_2')):
        protocol = getattr(ssl, attr_name, None)
        if protocol is not None:
            accepted[conf_value] = {'server.ssl_version': protocol}

    no_ssl2 = getattr(ssl, 'OP_NO_SSLv2', None)
    no_ssl3 = getattr(ssl, 'OP_NO_SSLv3', None)
    if no_ssl2 is not None and no_ssl3 is not None:
        accepted['ssl3, tls'] = {'server.ssl_version': ssl.PROTOCOL_SSLv23,
                                 'server.ssl_options': no_ssl2}
        accepted['tls'] = {'server.ssl_version': ssl.PROTOCOL_SSLv23,
                           'server.ssl_options': no_ssl2 | no_ssl3}
    return accepted


def bootstrap_web_service(port=8008, service_log_level="INFO", access_log_level="INFO"):
    """
    Start up the hydra web service from conf file definitions

    ARGS:
        port - TCP port the gateway binds to on 0.0.0.0
        service_log_level - level name for service_logger; unrecognized names fall back to INFO
        access_log_level - level name for access_logger; unrecognized names fall back to INFO

    RAISES ValueError if the configured ssl private key or certificate file does not exist.
    Calls sys.exit(1) if something is already accepting connections on the port.

    RETURNS reference to un-started server
    """
    import errno

    global hydra_gateway_auth_token
    global hydra_gateway_challenge_token

    # Compare versions numerically; comparing the raw strings sorts '10.0.0' before '8.0.0'
    splunk_version = _splunk_version_tuple(ver.__version__)
    use_legacy_cherrypy = splunk_version < (8, 0, 0)

    # Establish the route dispatcher
    routes = {'/hydra/cache': control_cache,
              '/hydra/cache/batch': post_cache_batch,
              '/hydra/job/info': get_job_info,
              '/hydra/job/execution/info': update_job_execution,
              '/hydra/job/execution/failure': update_job_failure,
              '/hydra/job/execution/expired': update_expired_job_count,
              '/hydra/job/pop': get_pop_job,
              '/hydra/job/batch': post_job_batch,
              '/test/static': test_static,
              '/test/echo': test_echo,
              '/hydra/admin/challenge': get_challenge_key}

    if use_legacy_cherrypy:
        dispatch = wsgiserver.WSGIPathInfoDispatcher(routes)
    else:
        dispatch = wsgi.PathInfoDispatcher(routes)

    # Set auth and challenge tokens
    hydra_gateway_auth_token = str(uuid.uuid4())
    hydra_gateway_challenge_token = str(uuid.uuid4())

    # Set log levels
    access_log_level = access_log_level.upper()

    if access_log_level not in ["DEBUG", "INFO", "WARN", "WARNING", "ERROR"]:
        access_logger.setLevel(logging.INFO)
        access_logger.warning(
            "unrecognizable configured access log level: %s, resetting log level to INFO", access_log_level)
    else:
        access_logger.setLevel(access_log_level)

    service_log_level = service_log_level.upper()

    if service_log_level not in ["DEBUG", "INFO", "WARN", "WARNING", "ERROR"]:
        service_logger.setLevel(logging.INFO)
        service_logger.warning(
            "unrecognizable configured service logging level: %s, resetting logging level to INFO", service_log_level)
    else:
        service_logger.setLevel(service_log_level)

    # Get basic configuration
    global_cfg = {}
    for k, v in six.iteritems(getConfStanza('web', 'settings')):
        global_cfg[k] = normalize_value(v)

    if "caCertPath" in global_cfg:  # copy deprecated caCertPath to serverCert
        global_cfg["serverCert"] = global_cfg["caCertPath"]
    host_name = getDefault("host")

    # Get SSL configuration
    service_logger.info('parsing SSL config from splunk web.conf...')

    # Added this condition to support the Splunk Ivory release - Jira ticket VMW-4377 and NETAPP-638
    if splunk_version > (6, 4, 9):
        private_key_path = str(global_cfg['privKeyPath']).replace('$SPLUNK_HOME/', '')
        ssl_certificate = str(global_cfg['serverCert']).replace('$SPLUNK_HOME/', '')
    else:
        private_key_path = str(global_cfg['privKeyPath'])
        ssl_certificate = str(global_cfg['caCertPath'])
    if os.path.isabs(private_key_path):
        global_cfg['server.ssl_private_key'] = private_key_path
    else:
        global_cfg['server.ssl_private_key'] = make_splunkhome_path([private_key_path])
    if os.path.isabs(ssl_certificate):
        global_cfg['server.ssl_certificate'] = ssl_certificate
    else:
        global_cfg['server.ssl_certificate'] = make_splunkhome_path([ssl_certificate])

    # Validate Configuration
    if not os.path.exists(global_cfg['server.ssl_private_key']):
        service_logger.error("Failed to bootstrap hydra service due to configured ssl key missing: %s",
                             global_cfg['server.ssl_private_key'])
        raise ValueError("Private Key: '%s' Not Found" % global_cfg['server.ssl_private_key'])
    if not os.path.exists(global_cfg['server.ssl_certificate']):
        service_logger.error("Failed to bootstrap hydra service due to configured ssl cert missing: %s",
                             global_cfg['server.ssl_certificate'])
        raise ValueError("Certificate: '%s' Not Found" % global_cfg['server.ssl_certificate'])

    global_cfg['server.ssl_options'] = 0

    if 'cipherSuite' in global_cfg and global_cfg.get('cipherSuite'):
        global_cfg['server.ssl_ciphers'] = str(global_cfg['cipherSuite'])

    if 'sslPassword' in global_cfg:
        # getConfStanza method returns "REDACTED_ENCRYPTED_PASSWORD" string for encrypted sslPassword.
        # Below logic is to get the encrypted sslPassword stored in Splunk web.conf file and
        # overwrite it in the global_cfg dictionary as mentioned in VMW-6500.
        mergedWebConf = getMergedConf('web')
        global_cfg['sslPassword'] = mergedWebConf['settings']['sslPassword']
        try:
            global_cfg['server.ssl_password'] = decrypt(str(global_cfg['sslPassword']))
        except Exception:
            # Also reached when the optional module-level import of decrypt failed (NameError)
            service_logger.info("Found sslPassword in web.conf but unable to decrypt it")

    if 'sslVersions' in global_cfg:
        accepted_ssl_versions = _accepted_ssl_versions()

        if global_cfg['sslVersions'] in accepted_ssl_versions:
            global_cfg.update(accepted_ssl_versions[global_cfg['sslVersions']])
        else:
            # default case ssl2+
            service_logger.warning("Undefined sslVersion='%s'. Please select from 'all', 'ssl3, tls' or 'tls'.",
                                   global_cfg.get('sslVersions'))
            service_logger.warning("Defaulting sslVersion to 'all'")
            global_cfg['server.ssl_version'] = ssl.PROTOCOL_SSLv23
    else:
        # No 'sslVersions'-- old and possibly POODLE-problematic.
        try:
            from _ssl import PROTOCOL_SSLv23_NO23
            global_cfg['server.ssl_version'] = PROTOCOL_SSLv23_NO23
        except ImportError:
            service_logger.warning("POODLE Vulnerable: Please update to Splunk v6.0.7, v6.1.5, or v6.2.1 or higher for protection.")

    # Validate port availability--since we can't start the server--and then write the key files we need to validate
    # prior to start
    sock = None
    try:
        sock = socket.socket()
        sock.connect((host_name, port))
    except socket.error as e:
        # A refused connection means nothing is listening. POSIX reports errno.ECONNREFUSED,
        # Windows sockets report WSAECONNREFUSED (10061).
        if e.errno in (errno.ECONNREFUSED, 10061):
            service_logger.debug('[%s:%d] port=%d is available.', host_name, port, port)
        else:
            service_logger.warning('[%s:%d] unexpected socket error: %s', host_name, port, e)
    else:
        # The connect succeeded, so something already owns the port
        service_logger.error("[%s:%d] could not bootstrap gateway because port=%d is in use", host_name, port, port)
        sys.exit(1)
    finally:
        # Release the probe socket on every path, including the sys.exit above
        if sock is not None:
            sock.close()

    # Build server
    if use_legacy_cherrypy:
        server = wsgiserver.CherryPyWSGIServer(('0.0.0.0', port), dispatch, server_name=host_name)
        global_cfg['server.ssl_options'] = global_cfg['server.ssl_options'] | ssl.OP_NO_COMPRESSION
        for key in ('ssl_private_key', 'ssl_options', 'ssl_version', 'ssl_certificate', 'ssl_ciphers', 'ssl_password'):
            if 'server.'+key in global_cfg:
                setattr(server, key, global_cfg['server.'+key])
    else:
        server = wsgi.Server(('0.0.0.0', port), dispatch, server_name=host_name)
        global_cfg['server.ssl_options'] = global_cfg['server.ssl_options'] | ssl.OP_NO_COMPRESSION

        # 'server.ssl_version' and 'server.ssl_ciphers' are only set when web.conf provides
        # them, so read both with .get instead of indexing.
        ssl_version = global_cfg.get('server.ssl_version', -1)
        if ssl_version < 0:
            ssl_version = ssl.PROTOCOL_SSLv23
        ssl_ciphers = global_cfg.get('server.ssl_ciphers')

        ctx = ssl.SSLContext(ssl_version)
        ctx.options |= global_cfg['server.ssl_options']
        if global_cfg['server.ssl_certificate']:
            ctx.load_cert_chain(global_cfg['server.ssl_certificate'], global_cfg['server.ssl_private_key'], global_cfg.get('server.ssl_password', None))
        if ssl_ciphers:
            ctx.set_ciphers(ssl_ciphers)

        # The adapter is built without a password argument on exactly 8.0.0
        if splunk_version == (8, 0, 0):
            HTTPServer.ssl_adapter = builtin.BuiltinSSLAdapter(global_cfg['server.ssl_certificate'], global_cfg['server.ssl_private_key'], ciphers=ssl_ciphers)
        else:
            HTTPServer.ssl_adapter = builtin.BuiltinSSLAdapter(global_cfg['server.ssl_certificate'], global_cfg['server.ssl_private_key'], ciphers=ssl_ciphers, password=global_cfg.get('server.ssl_password', None))

        HTTPServer.ssl_adapter.context = ctx

    # Commit tokens to filesystem
    directory = make_splunkhome_path(["etc", "apps", "SA-Hydra", "local", "run"])
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(make_splunkhome_path(["etc", "apps", "SA-Hydra", "local", "run", "hydra_gateway.key"]), 'w') as f:
        f.write("\n".join([hydra_gateway_challenge_token, hydra_gateway_auth_token, "\n"]))

    # Attempt to recover existing cache, if any
    try:
        recover_cache()
    except Exception as e:
        service_logger.warning("unable to recover a serialized cache, message=%s", str(e))

    # Bind a cache serialization to SIGTERM and SIGINT
    def signal_handler(sig, frame):
        """
        Handle signals and terminate the process while preserving state.
        """
        service_logger.info("initiating safe shutdown after receiving signal=%s", sig)
        service_logger.info("stopping cherrypy wsgi server")
        server.stop()
        service_logger.info("cherrypy wsgi server stopped")
        serialize_cache()
        service_logger.info("exiting parent process")
        sys.exit(0)

    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)
    return server
|
|
|
|
|
|
if __name__ == "__main__":
|
|
try:
|
|
server = bootstrap_web_service()
|
|
server.start()
|
|
except ValueError as e:
|
|
service_logger.exception("Failed to bootstrap hydra service due to configuration error: %s", str(e))
|