# NOTE: removed repository web-viewer boilerplate ("You can not select more than
# 25 topics...", line/size counters) that was accidentally captured with this file.
#!/usr/bin/env python
# coding=utf-8
#
# Copyright (C) 2005-2023 Splunk Inc. All Rights Reserved.
# CORE PYTHON IMPORTS
import logging
import sys
import os
import time
import json
try:
from queue import PriorityQueue, Empty
except:
from Queue import PriorityQueue, Empty
import threading
import socket
import signal
# Splunk Python does not bundle UUID, so we've included it in the hydra bin, but it is a core python module
import uuid
from . import six
# CORE SPLUNK IMPORTS
# import splunk
try:
from cheroot import wsgi
from cheroot.ssl import builtin
from cheroot.server import HTTPServer
except:
from cherrypy import wsgiserver
from splunk import getDefault
from splunk.clilib.bundle_paths import make_splunkhome_path
from splunk.clilib.cli_common import getConfStanza, getMergedConf
try:
from splunk.clilib.cli_common import decrypt
except:
pass
from splunk.util import normalizeBoolean
import splunk.version as ver
import ssl
# SA-HYDRA IMPORTS
from hydra import setupLogger
# END IMPORTS
# ------------------------------------------------------------------------------
# ===============================================================================
# Helper Class Definitions
# ===============================================================================
# Thread safe classes
class HydraCacheManager(object):
    """
    A threadsafe cache system with built in pruning for outdated caches.

    Entries are stored as name -> (touch_time, value). An entry is eligible
    for pruning once it has not been touched for ``cache_expiration`` seconds
    AND any special (per-entry) expiration has also elapsed.
    """
    def __init__(self, cache_expiration=3600):
        """
        Build the cache manager initializing internal properties.
        ARGS:
            cache_expiration - time in seconds after which a cache should be
                removed if it has not been touched
        """
        self.cache_expiration = cache_expiration
        # Per-entry absolute expiration times: name -> epoch time after which
        # the entry may be pruned (in addition to the touch-based expiry)
        self.special_expirations = {}
        # The cache proper: name -> (touch_time, value)
        self.cache = {}
        self.cache_lock = threading.Lock()
        # Keep track of the last time we pruned so prune() is cheap when
        # called frequently
        self.last_prune = time.time()

    def prune(self):
        """
        NOT THREADSAFE! execute only inside a lock.
        Prunes the cache of all old entries; does actual work only when
        last_prune plus cache_expiration is in the past, so callers may invoke
        it on every write without cost.
        """
        if (self.last_prune + self.cache_expiration) < time.time():
            service_logger.info("[HydraCacheManager] current cache_length=%s, checking cache for outdated entries...",
                                len(self.cache))
            to_delete = []
            now = time.time()
            # Collect names to prune first; deleting while iterating a dict
            # is not allowed
            for name, cache_item in six.iteritems(self.cache):
                if (now > (cache_item[0] + self.cache_expiration)) and (now > self.special_expirations.get(name, 0)):
                    if name in self.special_expirations:
                        del self.special_expirations[name]
                    to_delete.append(name)
            # Prune the offenders
            if len(to_delete) > 0:
                service_logger.info("[HydraCacheManager] found prune_count=%s entries to remove", len(to_delete))
                for name in to_delete:
                    service_logger.debug("[HydraCacheManager] pruning cache_entry=%s", name)
                    del self.cache[name]
            else:
                service_logger.debug("[HydraCacheManager] found no entries to prune")
            # Reset last prune time
            self.last_prune = now
            service_logger.info(
                "[HydraCacheManager] finished checking cache for outdated entries, final cache_length=%s",
                len(self.cache))

    def set_cache(self, name, value, expiration=None):
        """
        Sets the cache value for a particular item, thread safe.
        ARGS:
            name - key of the cache entry
            value - value to store
            expiration - optional relative expiration in seconds; accepts any
                int or float (the original only honored ints, silently
                dropping float expirations)
        """
        with self.cache_lock:
            now = time.time()
            self.cache[name] = (now, value)
            # Generalized from isinstance(expiration, int) so float
            # expirations are honored too; None still means "no special expiry"
            if isinstance(expiration, (int, float)):
                self.special_expirations[name] = now + expiration
            self.prune()

    def get_cache(self, name):
        """
        Gets the cache value for a particular item, threadsafe.
        Returns None when the entry does not exist; a hit refreshes the
        entry's touch time (LRU-style keep-alive).
        """
        value = None
        with self.cache_lock:
            cache_item = self.cache.get(name, None)
            if cache_item is not None:
                value = cache_item[1]
                # Refresh the touch time so frequently-read entries survive
                # touch-based pruning
                self.cache[name] = (time.time(), value)
        return value
class HydraJobExecutionInfoManager(object):
    '''
    Handle job execution time information by doing aggregation at the
    target|task|metadata_id level.
    Handles atomic job completion and error at the job name level.
    '''
    def __init__(self):
        # Dict which holds avg execution time per target|task|metadata_id key;
        # value is a tuple of (average execution time, total cycles reported)
        self.job_aggregate_execution_info = {}
        self.job_aggregate_execution_info_lock = threading.Lock()
        # Dict which holds active job counts keyed by target|task|metadata_id.
        # Kept in a separate dict with its own lock so updating the active job
        # count does not block users of the execution-info dict.
        self.active_job_category_count = {}
        self.active_job_category_lock = threading.Lock()
        # Lists which hold the completed and error'd out atomic jobs; since
        # appends are safe (atomic in CPython) we only need one lock for the
        # get-and-clear swap in get_atomic_job_info()
        self.failed_atomic_jobs = []
        self.completed_atomic_jobs = []
        self.atomic_jobs_lock = threading.Lock()
    #===============================================================================
    # Atomic Job Information Methods
    #===============================================================================
    def add_completed_atomic_job(self, job_name):
        """
        Thread safe method for appending a completed atomic task's job to the
        completed atomic job list.
        @type job_name: str
        @param job_name: the name/id of the completed job
        @rtype: None
        @return: None
        """
        self.completed_atomic_jobs.append(job_name)
    def add_failed_atomic_job(self, job_name):
        """
        Thread safe method for appending a failed atomic task's job to the
        failed atomic job list.
        @type job_name: str
        @param job_name: the name/id of the failed job
        @rtype: None
        @return: None
        """
        self.failed_atomic_jobs.append(job_name)
    def get_atomic_job_info(self):
        """
        Get and clear the lists of the completed and failed atomic jobs as a
        dict. The swap happens under atomic_jobs_lock so concurrent get calls
        cannot double-report; the returned lists are the previous instances
        and are no longer shared once the method returns.
        @rtype: dict
        @return: dict of the completed and failed atomic job lists { "completed_atomic_jobs": [...], "failed_atomic_jobs": [...] }
        """
        temp_dict = {}
        with self.atomic_jobs_lock:
            temp_dict["completed_atomic_jobs"] = self.completed_atomic_jobs
            temp_dict["failed_atomic_jobs"] = self.failed_atomic_jobs
            # Replace (not clear) the lists so the captured references keep
            # the old contents intact
            self.completed_atomic_jobs = []
            self.failed_atomic_jobs = []
        return temp_dict
    #===============================================================================
    # Job Execution Time Information Methods
    #===============================================================================
    def _update_active_job_dict(self, key, difference=1, put_lock=True):
        '''
        Thread safe function to update active_job_category_count dict
        @param key : Unique key based upon target|task|metadata_id
        @param difference: increment count (may be negative to decrement)
        @param put_lock: put lock while updating dict; pass False only when
            the caller already holds active_job_category_lock
        @return: nothing
        '''
        # Inner closure so the same mutation runs with or without the lock
        def do():
            if key in self.active_job_category_count:
                self.active_job_category_count[key] = self.active_job_category_count[key] + difference
            else:
                self.active_job_category_count[key] = difference
            # if final value is less than zero, then set it to zero
            if self.active_job_category_count[key] < 0:
                self.active_job_category_count[key] = 0
            service_logger.debug("[ReduceJobCount] Reducing job count of category=%s by=%s", key, difference)
        if put_lock:
            with self.active_job_category_lock:
                do()
        else:
            do()
    def _get_key(self, job_string):
        '''
        Extract the target|task|metadata_id aggregation key from job_string.
        Call this function in a try/except block as we are assuming that
        job_string contains at least four '|' separators
        (name|target|task|metadata_id|extra).
        '''
        name, target, task, metadata_id, extra = job_string.split('|', 4)
        key = target + "|" + task + "|" + metadata_id
        return key
    def abstract_activejob_category(self, things):
        '''
        Updating active job count by passing job batch count.
        Each parsable entry increments its category's active count by 1.
        @param things: \n delimited strings of the form <priority_number>:<serialized JobTuple>
        '''
        for thing in things:
            # Entries without a ':' are not job tuples and are skipped
            if thing.find(":") != -1:
                priority_number, job_string = thing.split(":", 1)
                self.update_activejob_category(job_string, 1)
    def update_activejob_category(self, job_string, difference, put_lock=True):
        '''
        Update unclaimed job count for a specific category.
        Any parse/update failure is logged and swallowed so job processing
        is never interrupted by bookkeeping errors.
        @param job_string: job string
        @param difference: count value in which unclaimed job count increases or decreases
        @param put_lock: put lock while updating dict
        '''
        try:
            service_logger.debug("[ReduceJobCount] Reducing job count of job=%s", job_string)
            key = self._get_key(job_string)
            self._update_active_job_dict(key, difference, put_lock)
            service_logger.debug("Updated active job count successfully.")
        except Exception as e:
            service_logger.exception(e)
    def update_job_execution_dict(self, job_string, exec_time, delete=False):
        '''
        Thread safe function to update job_aggregate_execution_info dict.
        Maintains a running average: new_avg = (old_avg * old_count + exec_time) / (old_count + 1).
        @param job_string : job string
        @param exec_time: Job execution time
        @param delete: Delete flag; when true the whole
            job_aggregate_execution_info dict is reset.
            Pass other parameters as None if delete flag is true
        @return: nothing
        '''
        try:
            with self.job_aggregate_execution_info_lock:
                if delete:
                    # Drop and recreate the dict to reset all aggregates
                    del self.job_aggregate_execution_info
                    self.job_aggregate_execution_info = {}
                else:
                    key = self._get_key(job_string)
                    if key in self.job_aggregate_execution_info:
                        # value is tuple of avg execution so far, reported execution count which is needed to calculated the count
                        # Convert this value to float so we can store large number here
                        total_reported_count = self.job_aggregate_execution_info[key][1] + 1
                        # (float)(...) is just a call to float() on the whole expression
                        avg_value = (float)((self.job_aggregate_execution_info[key][0] *
                                             self.job_aggregate_execution_info[key][1] + float(
                                    exec_time)) / total_reported_count)
                        self.job_aggregate_execution_info[key] = (avg_value, total_reported_count)
                    else:
                        self.job_aggregate_execution_info[key] = (float(exec_time), 1)
                    service_logger.debug("[JobTime] Successfully updated, key=%s, execution time=%s", key, exec_time)
        except Exception as e:
            service_logger.error(e)
            service_logger.exception(e)
    def get_aggregate_info(self):
        '''
        Return aggregation information, combining the execution-time averages
        and the active job counts into one dict (smaller response payload).
        The two locks are taken one after the other, never nested.
        @return Dict which hold category key as target|task|metadata_id, value is list of following items
            0 -- Avg execution time
            1 -- Reported times
            2 -- unclaimed job for this category
            otherwise empty dictionary
        '''
        # Aggregate both dict in one so response size would be less
        temp_dict = {}
        # get lock
        with self.job_aggregate_execution_info_lock:
            for key, values in six.iteritems(self.job_aggregate_execution_info):
                # value contains list of aggregate execution time, number of reported for exec time, unclaimed job count
                temp_dict[key] = [values[0], values[1], 0]
        # get another lock
        with self.active_job_category_lock:
            for key, value in six.iteritems(self.active_job_category_count):
                if key in temp_dict:
                    # value contains list of aggregate execution time, number of reported for exec time, unclaimed job count
                    temp_dict[key] = [temp_dict[key][0], temp_dict[key][1], value]
                else:
                    # value contains list of aggregate execution time, number of reported for exec time, unclaimed job count
                    temp_dict[key] = [0, 0, value]
        # Note update_job_execution_dict put the lock so no external lock is required
        # Delete these data as it is being reported to scheduler but self.active_job_category_count need to be present
        self.update_job_execution_dict(None, None, delete=True)
        return temp_dict
#Decorators
class HandleRequest(object):
    """
    Decorator for exception handling, validating and logging requests properly.
    Wraps a WSGI app callable with auth checking, HTTP-method validation,
    access logging, and a 500-on-exception safety net.
    """
    def __init__(self, expected_methods, enforce_auth=True):
        """
        Request validation utility for expected HTTP verbs.
        ARGS:
            expected_methods - array of supported methods
            enforce_auth - when True, requests must carry the
                X-HYDRA-AUTH header matching hydra_gateway_auth_token
        """
        self.expected_methods = expected_methods
        self.enforce_auth = enforce_auth

    def __call__(self, fn):
        def wrapped_fn(environ, start_response):
            start = time.time()

            # Access logging through start_response calls
            def wrapped_start_response(status, response_headers, exc_info=None):
                end = time.time()
                # BUG FIX: duration is logged with an "ms" suffix but was
                # computed with * 100 instead of * 1000
                duration = int((end - start) * 1000)
                access_logger.info("%s %s '%s' - - - %sms", environ["REQUEST_METHOD"], environ.get("SCRIPT_NAME", "/"),
                                   status, duration)
                # Forward exc_info when supplied, per the WSGI spec (PEP 3333)
                if exc_info is not None:
                    return start_response(status, response_headers, exc_info)
                return start_response(status, response_headers)

            global hydra_gateway_auth_token
            # Authentication
            if self.enforce_auth and environ.get("HTTP_X_HYDRA_AUTH", "") != hydra_gateway_auth_token:
                service_logger.error("authentication invalid or missing for path='%s'", environ.get("SCRIPT_NAME", "/"))
                status = "401 Unauthorized"
                response_headers = [('Content-type', 'text/plain')]
                wrapped_start_response(status, response_headers)
                return []
            # Method validation
            elif environ["REQUEST_METHOD"] not in self.expected_methods:
                service_logger.error("bad request for path='%s' got request_method='%s', expected_methods=%s",
                                     environ.get("SCRIPT_NAME", "/"), environ["REQUEST_METHOD"], self.expected_methods)
                status = "400 Bad Request"
                response_headers = [('Content-type', 'text/plain')]
                wrapped_start_response(status, response_headers)
                return []
            # Actual request handling
            else:
                try:
                    return fn(environ, wrapped_start_response)
                except Exception as e:
                    service_logger.exception("Internal Server Error on request='%s %s' specific error: %s",
                                             environ["REQUEST_METHOD"], environ.get("SCRIPT_NAME", "/"), str(e))
                    status = "500 Internal Server Error"
                    response_headers = [('Content-type', 'text/plain')]
                    wrapped_start_response(status, response_headers)
                    return []
        return wrapped_fn
#------------------------------------------------------------------------------
#===============================================================================
# Utilities & Globals
#===============================================================================
# Module-wide loggers: one for per-request access-log lines, one for the
# gateway service itself (both are hydra-style rotating file loggers)
access_logger = setupLogger(
    logger_name="hydra-access",
    log_name="hydra_access.log",
    log_format='%(asctime)s %(levelname)s %(message)s')
service_logger = setupLogger(
    logger_name="hydra-gateway",
    log_name="hydra_gateway.log",
    log_format='%(asctime)s %(levelname)s [HydraWSGI:%(process)d] %(message)s')
# Initialize our globals shared by all request handlers
job_queue = PriorityQueue()
cache_manager = HydraCacheManager()
# Auth/challenge tokens are assigned fresh UUIDs in bootstrap_web_service()
hydra_gateway_auth_token = None
hydra_gateway_challenge_token = None
job_exec_info_manager = HydraJobExecutionInfoManager()
# Expired job count (It is not being used, but put it for future reference)
expired_job_count = 0
# Create a lock for each shared object
expired_job_count_lock = threading.Lock()
# ------------------------------------------------------------------------------
# ===============================================================================
# Hydra Cache Services
# ===============================================================================
@HandleRequest(["GET", "POST"])
def control_cache(environ, start_response):
    """
    Service for shared cache. A GET will pull the current cache value,
    a POST will set the current cache value to the POST body of the
    cache.
    Required header of X-HYDRA-CACHE-NAME will determine which cache to
    set.
    Optional header of X-HYDRA-CACHE-EXPIRY will set a special expiration on
    the cache value.
    REQUEST
        -> GET /hydra/cache
        -> POST /hydra/cache with BODY of the cache value to be set
    RESPONSE
        status 200: cache set or got successfully -> EMPTY OR TEXT <serialized cache>
        status 400: could not find cache name to get or set -> EMPTY
        status 404: could not find cache of specified name -> EMPTY
    """
    headers = [("Content-type", "text/plain")]
    cache_name = environ.get("HTTP_X_HYDRA_CACHE_NAME", None)
    # Guard: the cache-name header is mandatory for both verbs
    if cache_name is None:
        service_logger.error("bad request, no hydra cache name for path='%s'", environ.get("SCRIPT_NAME", "/"))
        start_response("400 Bad Request", headers)
        return []
    if environ["REQUEST_METHOD"] == "GET":
        cached = cache_manager.get_cache(cache_name)
        if cached is None:
            start_response("404 Not Found", headers)
            return []
        start_response("200 OK", headers)
        return [cached.encode('utf-8')]
    # Implicit POST if not GET since we have method filtering
    expiry_header = environ.get("HTTP_X_HYDRA_CACHE_EXPIRY", None)
    expiry = int(expiry_header) if expiry_header is not None else None
    body = environ.get("wsgi.input", None).read(int(environ["CONTENT_LENGTH"]))
    cache_manager.set_cache(cache_name, body.decode('utf-8').strip("\n"), expiration=expiry)
    start_response("200 OK", headers)
    return []
@HandleRequest(["POST"])
def post_cache_batch(environ, start_response):
    """
    Service for shared cache. A POST will allow the body to be processed into
    multiple cache entries. The form of the body of the request must be \n
    delimited cache entries of the form <cache_name>\t<cache_value>.
    Optional header of X-HYDRA-CACHE-EXPIRY will set a special expiration on
    the cache value.
    REQUEST
        -> POST /hydra/cache/batch with BODY of the \n delimited cache entries
    RESPONSE
        status 200: batch processed successfully -> EMPTY
    """
    headers = [("Content-type", "text/plain")]
    expiry_header = environ.get("HTTP_X_HYDRA_CACHE_EXPIRY", None)
    expiry = int(expiry_header) if expiry_header is not None else None
    body = environ.get("wsgi.input", None).read(int(environ["CONTENT_LENGTH"]))
    entries = body.decode('utf-8').strip("\n").split("\n")
    service_logger.info("[CacheBatchProcessor] parsed cache batch of count=%s", len(entries))
    for entry in entries:
        # Entries without a tab separator are silently skipped
        if "\t" in entry:
            name, value = entry.split("\t", 1)
            cache_manager.set_cache(name, value, expiration=expiry)
    start_response("200 OK", headers)
    return []
# ===============================================================================
# Hydra Job Services
# ===============================================================================
@HandleRequest(["GET"])
def get_job_info(environ, start_response):
    """
    Service for determining the current job queue length.
    See the HydraJobExecutionInfoManager methods for detailed descriptions of
    the contents of "dict of key!" used below.
    REQUEST
        -> GET /hydra/job/info
    RESPONSE
        -> JSON { count: <QUEUE LENGTH>,
                  expired_job_count: <EXPIRED_JOB_COUNT>,
                  job_aggregate_execution_info : dict of key!
                  atomic_job_info : dict of key!}
    """
    # Technically you cannot guarantee the exact queue size but this is good enough
    payload = {
        "count": job_queue.qsize(),
        "job_aggregate_execution_info": job_exec_info_manager.get_aggregate_info(),
        "atomic_job_info": job_exec_info_manager.get_atomic_job_info(),
    }
    start_response("200 OK", [("Content-type", "application/json")])
    return [json.dumps(payload).encode('utf-8')]
@HandleRequest(["GET"])
def get_pop_job(environ, start_response):
    """
    Service for popping out the next job to do.
    Optional arg block: if 0 will 404 immediately when it couldn't get a job,
    if 1 will block for up to 3s before 404. Defaults to 1.
    REQUEST
        -> GET /hydra/job/pop
        -> GET ARGS:
            block - 0 or 1 (1 by default)
    RESPONSE
        status 200: job popped successfully
            -> TEXT <serialized JobTuple>
        status 404: could not get a job (empty or blocked)
            -> EMPTY
    """
    try:
        # Parse get args
        block = True
        get_args = environ.get("QUERY_STRING", None)
        if get_args is not None and get_args != '':
            arg_list = get_args.split("&")
            for arg in arg_list:
                kv_pair = arg.split("=")
                if kv_pair[0] == "block" and len(kv_pair) == 2:
                    # Any value other than "0" means "block"
                    if kv_pair[1] == "0":
                        block = False
                    else:
                        block = True
                else:
                    # FIX: logger.warn is a deprecated alias of logger.warning
                    service_logger.warning("[JobPopper] got unexpected get_arg=%s", kv_pair)
        # Pop a job off the queue (timeout only applies when block=True)
        priority_number, response = job_queue.get(block=block, timeout=3)
        # Decrement the active-job count for this job's category
        job_exec_info_manager.update_activejob_category(response, -1)
        service_logger.debug("[JobPopper] popped out job with priority_number=%s", priority_number)
        status = "200 OK"
        response_headers = [("Content-type", "text/plain")]
        start_response(status, response_headers)
        response = response.encode('utf-8')
        return [response]
    except Empty:
        # Queue was empty (or stayed empty for the blocking window)
        status = "404 Not Found"
        response_headers = [("Content-type", "text/plain")]
        start_response(status, response_headers)
        return []
@HandleRequest(["POST"])
def post_job_batch(environ, start_response):
    """
    Service for adding a batch of jobs to the queue
    REQUEST
        -> POST /hydra/job/batch
        -> BODY:
            \n delimited strings of the form <priority_number>:<serialized JobTuple>
    RESPONSE
        -> EMPTY
    """
    headers = [("Content-type", "text/plain")]
    service_logger.debug("[JobBatchProcessor] processing batch of content_length=%s", environ["CONTENT_LENGTH"])
    req_in = environ.get("wsgi.input", None)
    # TODO: slurping could get out of hand fast, we should keep a rolling buffer going, probably with StringIO
    lines = req_in.read(int(environ["CONTENT_LENGTH"])).decode('utf-8').split("\n")
    service_logger.debug("[JobBatchProcessor] parsed job batch of count=%s", len(lines))
    for line in lines:
        # Lines without a ':' separator are not jobs and are skipped
        if ":" in line:
            priority, job_string = line.split(":", 1)
            job_queue.put((int(priority), job_string))
    # Bookkeeping runs after the jobs are queued so workers never wait on it;
    # the resulting active-job count is approximate.
    job_exec_info_manager.abstract_activejob_category(lines)
    start_response("200 OK", headers)
    return []
def update_expired_job_count_var(increment_by=1):
    '''
    Thread safe function to update the module-level expired job count.
    @param increment_by : increment value, default is one
    @return: nothing
    '''
    # BUG FIX: without the global declaration the augmented assignment below
    # made expired_job_count a local and raised UnboundLocalError on every call
    global expired_job_count
    with expired_job_count_lock:
        expired_job_count = expired_job_count + increment_by
@HandleRequest(["POST"])
def update_job_failure(environ, start_response):
    """
    Service for reporting a failed job. Right now only used for atomic jobs
    REQUEST
        -> POST /hydra/job/execution/failure
        -> HEADERS:
            X-HYDRA-ATOMIC-JOB: value is the job name, if not passed job is treated as not atomic (currently a bad request)
        -> BODY: EMPTY
    RESPONSE
        -> EMPTY
    """
    atomic_job = environ.get("HTTP_X_HYDRA_ATOMIC_JOB", None)
    # Missing header means a non-atomic job, which this endpoint rejects
    if atomic_job is None:
        status = "400 Bad Request"
    else:
        service_logger.info("[UpdateJobFailure] received notice of atomic job failure for job=%s", atomic_job)
        job_exec_info_manager.add_failed_atomic_job(atomic_job)
        status = "200 OK"
    start_response(status, [("Content-type", "text/plain")])
    return []
@HandleRequest(["POST"])
def update_job_execution(environ, start_response):
    """
    Service for aggregate execution time for JobTuple
    REQUEST
        -> POST /hydra/job/execution/info
        -> HEADERS:
            X-HYDRA-ATOMIC-JOB: value is the job name, if not passed job is treated as not atomic
        -> BODY:
            strings of the form execution time in sec:<serialized JobTuple>
    RESPONSE
        -> EMPTY
    """
    service_logger.debug("[UpdateJobExecution] adding execution time")
    body = environ.get("wsgi.input", None).read(int(environ["CONTENT_LENGTH"])).decode('utf-8')
    # Record atomic job completion when the header is present
    atomic_job = environ.get("HTTP_X_HYDRA_ATOMIC_JOB", None)
    if atomic_job is not None:
        job_exec_info_manager.add_completed_atomic_job(atomic_job)
    # Record the execution time; the body must be <seconds>:<JobTuple>
    if ":" in body:
        exec_time, job_string = body.split(":", 1)
        job_exec_info_manager.update_job_execution_dict(job_string, float(exec_time))
    else:
        service_logger.error("[UpdateJobExecution] Failed to process content=%s", body)
    start_response("200 OK", [("Content-type", "text/plain")])
    return []
@HandleRequest(["POST"])
def update_expired_job_count(environ, start_response):
    """
    Service to update expired job count
    REQUEST
        -> POST /hydra/job/execution/expired
        -> BODY:
            strings of the valid integer number
    RESPONSE
        -> EMPTY
    """
    headers = [("Content-type", "text/plain")]
    service_logger.debug("[JobExpired] adding execution time")
    body = environ.get("wsgi.input", None).read(int(environ["CONTENT_LENGTH"])).decode('utf-8')
    try:
        update_expired_job_count_var(int(body))
        service_logger.debug("successfully updated expired job count")
    except Exception as e:
        # Best effort: a bad body is logged but still answered with 200
        service_logger.error("[JobExpired] failed to update expired job count content=%s, exception=%s", body, e)
        service_logger.exception(e)
    start_response("200 OK", headers)
    return []
#===============================================================================
# Test Services
#===============================================================================
@HandleRequest(["GET"])
def test_static(environ, start_response):
    '''
    Simple static resource
    '''
    service_logger.debug("in test static environ=%s", environ)
    start_response('200 OK', [('Content-type', 'text/plain')])
    return ['\nHail Hydra!\n'.encode('utf-8')]
@HandleRequest(["POST"])
def test_echo(environ, start_response):
    '''
    Simple, non-streaming echo server
    '''
    service_logger.debug("in test echo environ=%s", environ)
    status = '200 OK'
    req_in = environ.get("wsgi.input", None)
    # BUG FIX: wsgi.input yields bytes; decode before concatenating with str
    # (the original did str + bytes, a TypeError on Python 3)
    content = req_in.read(int(environ["CONTENT_LENGTH"])).decode('utf-8')
    response_headers = [('Content-type', 'text/plain')]
    start_response(status, response_headers)
    response = "\n###########\nECHO SERVER\n###########\n" + content + "\n\n"
    response = response.encode('utf-8')
    return [response]
#===============================================================================
# Admin Services
#===============================================================================
@HandleRequest(["GET"], enforce_auth=False)
def get_challenge_key(environ, start_response):
    '''
    Return the instance challenge key. This is used to detect mismatches between
    the running gateway and splunkd's understanding of the running gateway. This
    challenge key is NOT a valid auth key
    '''
    global hydra_gateway_challenge_token
    service_logger.debug("responding to challenge request with hydra_gateway_challenge_token: %s",
                         hydra_gateway_challenge_token)
    start_response('200 OK', [('Content-type', 'text/plain')])
    return [hydra_gateway_challenge_token.encode('utf-8')]
# ------------------------------------------------------------------------------
# ===============================================================================
# Web Service Constructor
# ===============================================================================
def recover_cache():
    """
    Attempt to locate a serialized cache on disk and load it into the current cache.
    Any failure (missing file, bad JSON) is logged and otherwise ignored.
    """
    run_dir = make_splunkhome_path(["etc", "apps", "SA-Hydra", "local", "run"])
    if not os.path.exists(run_dir):
        os.makedirs(run_dir)
    cache_path = make_splunkhome_path(["etc", "apps", "SA-Hydra", "local", "run", "hydra_gateway.cache"])
    try:
        with open(cache_path, 'r') as f:
            serialized_cache = f.read()
        if serialized_cache:
            service_logger.info("attempting to deserialize cache...")
            old_cache = json.loads(serialized_cache)
            # Entries were serialized as (name, value, relative_expiry) triples
            for name, cache, expiry in old_cache:
                cache_manager.set_cache(name, cache, expiration=expiry)
            service_logger.info("cache deserialized with %s entries", len(old_cache))
    except Exception as e:
        service_logger.error(str(e))
def serialize_cache(*args, **kwargs):
    """
    Serialize the special expiration items in the cache to disk. Note we
    capture and ignore args and kwargs so this can be used as a signal handler.
    """
    # BUG FIX: the docstring promised *args/**kwargs (signal handlers are
    # invoked as handler(signum, frame)), but the signature accepted no
    # arguments, so using it as a signal handler raised TypeError.
    try:
        service_logger.info("attempting to serialize cache...")
        directory = make_splunkhome_path(["etc", "apps", "SA-Hydra", "local", "run"])
        if not os.path.exists(directory):
            os.makedirs(directory)
        with open(make_splunkhome_path(["etc", "apps", "SA-Hydra", "local", "run", "hydra_gateway.cache"]), 'w') as f:
            now = time.time()
            serializable_cache = []
            # Persist only entries with a special expiration, storing the
            # expiry as a relative offset so it survives a restart
            for name, expiration_time in six.iteritems(cache_manager.special_expirations):
                serializable_cache.append((name, cache_manager.cache[name][1], (expiration_time - now)))
            service_logger.info("serializing %s cache entries", len(serializable_cache))
            f.write(json.dumps(serializable_cache))
        service_logger.info("cache serialized")
    except Exception as e:
        # Best effort: failure to persist the cache must not kill shutdown
        service_logger.warning("unable to serialize cache, message=%s", str(e))
def normalize_value(value):
    """
    Convert a string value into an integer or boolean if appropriate.
    Non-string values are returned unchanged.
    """
    if isinstance(value, six.string_types):
        # Integers take precedence; anything non-numeric falls through to
        # Splunk's boolean normalization (which returns the input when it
        # isn't boolean-like either)
        try:
            return int(value)
        except ValueError:
            return normalizeBoolean(value)
    return value
def bootstrap_web_service(port=8008, service_log_level="INFO", access_log_level="INFO"):
"""
Start up the hydra web service from conf file definitions
RETURNS reference to un-started server
"""
# Establish the route dispatcher
routes = {'/hydra/cache': control_cache,
'/hydra/cache/batch': post_cache_batch,
'/hydra/job/info': get_job_info,
'/hydra/job/execution/info': update_job_execution,
'/hydra/job/execution/failure': update_job_failure,
'/hydra/job/execution/expired': update_expired_job_count,
'/hydra/job/pop': get_pop_job,
'/hydra/job/batch': post_job_batch,
'/test/static': test_static,
'/test/echo': test_echo,
'/hydra/admin/challenge': get_challenge_key}
if ver.__version__ < '8.0.0':
dispatch = wsgiserver.WSGIPathInfoDispatcher(routes)
else:
dispatch = wsgi.PathInfoDispatcher(routes)
# Set auth and challenge tokens
new_key = str(uuid.uuid4())
global hydra_gateway_auth_token
hydra_gateway_auth_token = new_key
new_key = str(uuid.uuid4())
global hydra_gateway_challenge_token
hydra_gateway_challenge_token = new_key
# Set log levels
access_log_level = access_log_level.upper()
if access_log_level not in ["DEBUG", "INFO", "WARN", "WARNING", "ERROR"]:
access_logger.setLevel(logging.INFO)
access_logger.warning(
"unrecognizable configured access log level: %s, resetting log level to INFO", access_log_level)
else:
access_logger.setLevel(access_log_level)
service_log_level = service_log_level.upper()
if service_log_level not in ["DEBUG", "INFO", "WARN", "WARNING", "ERROR"]:
service_logger.setLevel(logging.INFO)
service_logger.warning(
"unrecognizable configured service logging level: %s, resetting logging level to INFO", service_log_level)
else:
service_logger.setLevel(service_log_level)
# Get basic configuration
global_cfg = {}
for k, v in six.iteritems(getConfStanza('web', 'settings')):
global_cfg[k] = normalize_value(v)
if "caCertPath" in global_cfg: # copy deprecated caCertPath to serverCert
global_cfg["serverCert"] = global_cfg["caCertPath"]
host_name = getDefault("host")
# Get SSL configuration
service_logger.info('parsing SSL config from splunk web.conf...')
"""Added this condition to support the Splunk Ivory release - Jira ticket VMW-4377 and NETAPP-638 """
if ver.__version__ > '6.4.9':
private_key_path = str(global_cfg['privKeyPath']).replace('$SPLUNK_HOME/', '')
ssl_certificate = str(global_cfg['serverCert']).replace('$SPLUNK_HOME/', '')
else:
private_key_path = str(global_cfg['privKeyPath'])
ssl_certificate = str(global_cfg['caCertPath'])
if os.path.isabs(private_key_path):
global_cfg['server.ssl_private_key'] = private_key_path
else:
global_cfg['server.ssl_private_key'] = make_splunkhome_path([private_key_path])
if os.path.isabs(ssl_certificate):
global_cfg['server.ssl_certificate'] = ssl_certificate
else:
global_cfg['server.ssl_certificate'] = make_splunkhome_path([ssl_certificate])
# Validate Configuration
if not os.path.exists(global_cfg['server.ssl_private_key']):
service_logger.error("Failed to bootstrap hydra service due to configured ssl key missing: %s",
global_cfg['server.ssl_private_key'])
raise ValueError("Private Key: '%s' Not Found" % global_cfg['server.ssl_private_key'])
if not os.path.exists(global_cfg['server.ssl_certificate']):
service_logger.error("Failed to bootstrap hydra service due to configured ssl cert missing: %s",
global_cfg['server.ssl_certificate'])
raise ValueError("Certificate: '%s' Not Found" % global_cfg['server.ssl_certificate'])
global_cfg['server.ssl_options'] = 0
if 'cipherSuite' in global_cfg and global_cfg.get('cipherSuite'):
global_cfg['server.ssl_ciphers'] = str(global_cfg['cipherSuite'])
if 'sslPassword' in global_cfg:
"""getConfStanza method returns "REDACTED_ENCRYPTED_PASSWORD" string for encrypted sslPassword.
Below logic is to get the encrypted sslPassword stored in Splunk web.conf file and
overwrite it in the global_cfg dictionary as mentioned in VMW-6500."""
mergedWebConf = getMergedConf('web')
global_cfg['sslPassword'] = mergedWebConf['settings']['sslPassword']
try:
global_cfg['server.ssl_password'] = decrypt(str(global_cfg['sslPassword']))
except:
service_logger.info("Found sslPassword in web.conf but unable to decrypt it")
# Map the configured sslVersions string onto an ssl protocol constant (and,
# where needed, ssl options that disable the older protocols).
if 'sslVersions' in global_cfg:
    try:
        # Python builds with TLS 1.1/1.2 support expose the full constant set.
        from ssl import PROTOCOL_SSLv3, PROTOCOL_SSLv23, PROTOCOL_TLSv1, PROTOCOL_TLSv1_1, PROTOCOL_TLSv1_2, OP_NO_SSLv2, OP_NO_SSLv3
        acceptedSSLVersions = {
            'all': {'server.ssl_version': PROTOCOL_SSLv23},
            'ssl3': {'server.ssl_version': PROTOCOL_SSLv3},
            'tls1.0': {'server.ssl_version': PROTOCOL_TLSv1},
            'tls1.1': {'server.ssl_version': PROTOCOL_TLSv1_1},
            'tls1.2': {'server.ssl_version': PROTOCOL_TLSv1_2},
            'ssl3, tls': {'server.ssl_version': PROTOCOL_SSLv23,
                          'server.ssl_options': OP_NO_SSLv2},
            'tls': {'server.ssl_version': PROTOCOL_SSLv23,
                    'server.ssl_options': OP_NO_SSLv2 | OP_NO_SSLv3}
        }
    except ImportError:
        # Older Python: only the SSLv3/SSLv23/TLSv1 constants are available.
        from ssl import PROTOCOL_SSLv3, PROTOCOL_SSLv23, PROTOCOL_TLSv1
        acceptedSSLVersions = {
            'all': {'server.ssl_version': PROTOCOL_SSLv23},
            'ssl3': {'server.ssl_version': PROTOCOL_SSLv3},
            'tls1.0': {'server.ssl_version': PROTOCOL_TLSv1}
        }
    if global_cfg['sslVersions'] in acceptedSSLVersions:
        global_cfg.update(acceptedSSLVersions[global_cfg['sslVersions']])
    else:
        #default case ssl2+: fall back to the permissive SSLv23 protocol
        service_logger.warn("Undefined sslVersion='%s'. Please select from 'all', 'ssl3, tls' or 'tls'." % global_cfg.get('sslVersions'))
        service_logger.warn("Defaulting sslVersion to 'all'")
        global_cfg['server.ssl_version'] = PROTOCOL_SSLv23
else:
    # No 'sslVersions'-- old and possibly POODLE-problematic.
    try:
        # Splunk-patched Python exposes this constant to disable SSLv2/v3.
        from _ssl import PROTOCOL_SSLv23_NO23
        global_cfg['server.ssl_version'] = PROTOCOL_SSLv23_NO23
    except ImportError:
        # NOTE(review): 'server.ssl_version' is left unset on this path, and
        # the server-build code below reads it unconditionally — confirm this
        # branch cannot be reached on supported Splunk versions.
        service_logger.warning("POODLE Vulnerable: Please update to Splunk v6.0.7, v6.1.5, or v6.2.1 or higher for protection.")
# Validate port availability--since we can't start the server--and then write the key files we need to validate
# prior to start
try:
    # If connect() succeeds, something is already listening on the port:
    # log the collision and abort the bootstrap.
    sock = socket.socket()
    sock.connect((host_name, port))
    sock.close()
    service_logger.error("[%s:%d] could not bootstrap gateway because port=%d is in use", host_name, port, port)
    sys.exit(1)
except socket.error as e:
    # NOTE(review): 10061 is the Windows WSAECONNREFUSED code; on POSIX a
    # refused connection raises errno 111 (ECONNREFUSED) and falls into the
    # warning branch below — consider errno.ECONNREFUSED for portability.
    if e.errno == 10061:
        service_logger.debug('[%s:%d] port=%d is available.', host_name, port, port)
    else:
        # Unexpected error is logged but not fatal; the server start below
        # will surface a real bind failure.
        service_logger.warning('[%s:%d] unexpected socket error: %s', host_name, port, e)
        pass
# Build server
# NOTE(review): version strings compare lexicographically, so e.g.
# '10.0.0' < '8.0.0' is True — confirm behavior on Splunk 10+.
if ver.__version__ < '8.0.0':
    # Pre-8.0 Splunk bundles CherryPy's wsgiserver; TLS is configured by
    # setting attributes directly on the server object.
    server = wsgiserver.CherryPyWSGIServer(('0.0.0.0', port), dispatch, server_name=host_name)
    # Always disable TLS compression (CRIME mitigation).
    global_cfg['server.ssl_options'] = global_cfg['server.ssl_options'] | ssl.OP_NO_COMPRESSION
    for key in ('ssl_private_key', 'ssl_options', 'ssl_version', 'ssl_certificate', 'ssl_ciphers', 'ssl_password'):
        if 'server.'+key in global_cfg:
            setattr(server, key, global_cfg['server.'+key])
else:
    # Splunk 8.0+ ships cheroot; build an SSLContext and attach it through
    # the builtin SSL adapter on the HTTPServer class.
    server = wsgi.Server(('0.0.0.0', port), dispatch, server_name=host_name)
    global_cfg['server.ssl_options'] = global_cfg['server.ssl_options'] | ssl.OP_NO_COMPRESSION
    # NOTE(review): 'server.ssl_version' and 'server.ssl_ciphers' are read
    # unconditionally here but are only set on some config paths above —
    # confirm a KeyError cannot occur with a minimal web.conf.
    ctx = ssl.SSLContext(global_cfg['server.ssl_version'] if global_cfg['server.ssl_version'] >= 0 else ssl.PROTOCOL_SSLv23)
    ctx.options |= global_cfg['server.ssl_options']
    if global_cfg['server.ssl_certificate']:
        ctx.load_cert_chain(global_cfg['server.ssl_certificate'], global_cfg['server.ssl_private_key'], global_cfg.get('server.ssl_password', None))
    if global_cfg['server.ssl_ciphers']:
        ctx.set_ciphers(global_cfg['server.ssl_ciphers'])
    if ver.__version__ == '8.0.0':
        # The password kwarg is omitted on exactly 8.0.0 — presumably the
        # bundled cheroot did not accept it; verify against that release.
        HTTPServer.ssl_adapter = builtin.BuiltinSSLAdapter(global_cfg['server.ssl_certificate'], global_cfg['server.ssl_private_key'], ciphers=global_cfg['server.ssl_ciphers'])
    else:
        HTTPServer.ssl_adapter = builtin.BuiltinSSLAdapter(global_cfg['server.ssl_certificate'], global_cfg['server.ssl_private_key'], ciphers=global_cfg['server.ssl_ciphers'], password=global_cfg.get('server.ssl_password', None))
    # Replace the adapter's default context with the fully-configured one.
    HTTPServer.ssl_adapter.context = ctx
# Commit tokens to filesystem (hydra_gateway.key) so clients can pick up the
# gateway's challenge/auth tokens.
directory = make_splunkhome_path(["etc", "apps", "SA-Hydra", "local", "run"])
if not os.path.exists(directory):
    os.makedirs(directory)
with open(make_splunkhome_path(["etc", "apps", "SA-Hydra", "local", "run", "hydra_gateway.key"]), 'w') as f:
    # One token per line, followed by a blank line.
    f.write("\n".join([hydra_gateway_challenge_token, hydra_gateway_auth_token, "\n"]))
# Attempt to recover existing cache, if any. Best effort: a failed recovery
# only logs a warning and the gateway starts with an empty cache.
try:
    recover_cache()
except Exception as e:
    service_logger.warning("unable to recover a serialized cache, message=%s", str(e))
# Bind a cache serialization to SIGTERM and SIGINT
def signal_handler(sig, frame):
    """
    Shut down cleanly on a termination signal: stop the WSGI server,
    persist the in-memory cache to disk, then exit with status 0.
    """
    # Hoist the bound method once; every step below logs through it.
    log = service_logger.info
    log("initiating safe shutdown after receiving signal=%s", sig)
    log("stopping cherrypy wsgi server")
    server.stop()
    log("cherrypy wsgi server stopped")
    serialize_cache()
    log("exiting parent process")
    sys.exit(0)
# Install the handler for both SIGTERM and SIGINT so any orderly shutdown
# serializes the cache before the process exits.
signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
return server
if __name__ == "__main__":
try:
server = bootstrap_web_service()
server.start()
except ValueError as e:
service_logger.exception("Failed to bootstrap hydra service due to configuration error: %s", str(e))