Deploiement_Server/deployment-apps/Splunk_ML_Toolkit/bin/sampler.py

#!/usr/bin/env python

import pandas as pd
import numpy as np


class ReservoirSampler(object):
    """Keep a uniform random sample of at most ``limit`` rows from a stream.

    Rows arrive in chunks (DataFrames) through :meth:`append`.  Every row is
    tagged with two bookkeeping columns:

    * ``_gindex`` -- the 0-based position of the row in the whole stream.
    * ``_slot``   -- the reservoir slot the row occupies.  When several rows
      claim the same slot, the most recently appended one wins.

    Chunks are buffered in ``self._dfs`` and only merged into ``self._df``
    once enough rows are pending, so that concatenation is not done on every
    call.  :meth:`get_df` returns the sample in stream order with the
    bookkeeping columns removed.

    A ``limit`` of zero or less disables sampling: every row is kept.
    """

    def __init__(self, limit, random_state=None):
        """Create an empty sampler.

        Args:
            limit: Maximum number of rows to retain; ``<= 0`` keeps all rows.
            random_state: Optional seed.  When given, the *global* NumPy
                random generator is seeded with it (``np.random.seed``).
        """
        self.limit = limit
        self._dfs = []          # pending chunks not yet merged into _df
        self._dfs_count = 0     # total number of rows in the pending chunks
        self._df = pd.DataFrame()
        self.count = 0          # number of rows seen so far

        # Compare against None so that a seed of 0 is honoured rather than
        # being treated as "no seed".
        if random_state is not None:
            np.random.seed(random_state)
        # Snapshot of the generator state; append() restores it before
        # drawing and saves it afterwards so the sampler has its own stream.
        self.random_state = np.random.get_state()

        # Merge pending chunks once they hold more than this many rows.
        self._max_dfs_len = self.limit / 5

    def _concat_df(self, df):
        """Queue ``df`` as a pending chunk and update the merge threshold."""
        self._dfs.append(df)
        self._dfs_count += len(df)

        # Never merge more often than once per chunk of the largest size seen.
        self._max_dfs_len = max(self._max_dfs_len, len(df))

    def _combine_dfs(self):
        """Merge pending chunks into ``self._df``, resolving slot collisions."""
        if not self._dfs:
            return

        # Empty frames carry no rows and are skipped so they cannot influence
        # the dtypes of the concatenated result.
        frames = [frame for frame in [self._df] + self._dfs if len(frame) > 0]
        if frames:
            # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
            merged = pd.concat(frames)
            # Later rows come later in the concatenation, so keep='last'
            # lets the newest claimant of each slot win.
            self._df = merged.drop_duplicates(subset='_slot', keep='last')

        self._dfs = []
        self._dfs_count = 0

    def get_df(self):
        """Return the current sample in stream order.

        Returns:
            A DataFrame without the ``_gindex`` / ``_slot`` helper columns.
            If nothing has been appended yet, the empty internal frame is
            returned as is.
        """
        if self.count == 0 and not self._dfs:
            return self._df

        self._combine_dfs()
        self._df = self._df.reset_index(drop=True)

        return self._df.sort_values('_gindex').drop(['_gindex', '_slot'], axis=1)

    def append(self, new_df, copy=False):
        """Feed the next chunk of the stream into the sampler.

        Args:
            new_df: DataFrame holding the next rows of the stream.
            copy: When False (the default) the helper columns ``_gindex``
                and ``_slot`` are written directly onto ``new_df``; pass
                True to work on a copy and leave the caller's frame intact.
        """
        if len(new_df) == 0:
            return

        if copy:
            new_df = new_df.copy()

        # Tag every row with its position in the overall stream.
        chunk_len = len(new_df)
        seen_before = self.count
        new_df['_gindex'] = np.arange(chunk_len) + seen_before
        self.count += chunk_len

        # Reservoir not full yet (or sampling disabled): keep every row in
        # the slot matching its stream position.
        if self.limit <= 0 or self.count <= self.limit:
            new_df['_slot'] = new_df['_gindex']
            self._concat_df(new_df)
            return

        # This chunk straddles the limit: its head fills the remaining free
        # slots directly, only the tail takes part in the random draw.
        if seen_before < self.limit:
            head_count = self.limit - seen_before
            new_df['_slot'] = new_df['_gindex']
            self._concat_df(new_df[0:head_count])
            new_df = new_df[head_count:].copy()

        # Draw from the sampler's own stream: restore its saved state, draw,
        # then save the advanced state.  NOTE: this goes through the global
        # NumPy generator, whose state is therefore overwritten.
        np.random.set_state(self.random_state)
        rnd = np.random.rand(len(new_df))
        self.random_state = np.random.get_state()

        # Algorithm R: the row at 0-based stream position g picks a slot
        # uniformly from [0, g] and is kept when that slot is < limit, i.e.
        # with probability limit / (g + 1).  Multiplying by g alone would
        # draw from [0, g - 1] and always keep the first row past the limit.
        new_df['_slot'] = np.floor(rnd * (new_df['_gindex'] + 1)).astype(int)
        keepers = new_df[new_df['_slot'] < self.limit]
        new_df = None  # release the tail before a potential merge

        if len(keepers) > 0:
            self._concat_df(keepers)

        if self._dfs_count > self._max_dfs_len:
            self._combine_dfs()

        return