#!/usr/bin/env python
# This work contains confidential material of Splunk Inc.  Its use or disclosure in
# whole or in part without the express written permission of Splunk Inc. is prohibited.

import os
import re
import random
import glob
import urllib
import sys
import splunk.Intersplunk
from builtins import chr, range
from splunk.mining.DateParser import _validateDate, _validateTime
from splunk.clilib.bundle_paths import make_splunkhome_path

# WORD_REGEX drops the separators when splitting; WORD_SPLIT keeps them (capture
# group) so that the original text can be reassembled around replaced tokens.
WORD_REGEX = re.compile(r'[^a-zA-Z0-9]+')
WORD_SPLIT = re.compile(r'([^a-zA-Z0-9]+)')


def _generateReplacement(term, nameterms):
    """Return a random replacement for ``term``.

    If ``term`` looks like a word and ``nameterms`` (a dict mapping a length to
    a list of names of that length) still has a name of the same length, one
    such name is picked at random, removed from the list and returned.
    Otherwise the replacement is built character by character: digits become a
    random digit no larger than the original, letters become a random ASCII
    letter of the same case, and every other character is kept as is.
    """
    if looksLikeWord(term):
        # get list of names with the same length as the term
        names = nameterms.get(len(term))
        if names:
            # the chosen name is removed from the pool so it is not handed out twice
            return names.pop(random.randrange(len(names)))

    pieces = []
    for ch in term:
        if ch.isdigit():
            # Return a new digit that is randomly less than or equal to the given
            # one, so that ip addresses and codes are not higher than the value
            # given; otherwise we would get ip addresses like 554.785.455.545.
            # This assumes that if given a number, a number lower than it will
            # be equally valid.
            try:
                maxVal = int(ch)
            except ValueError:
                # str.isdigit() also accepts characters (e.g. superscripts)
                # that int() cannot convert; leave those untouched
                newch = ch
            else:
                newch = str(random.randint(0, maxVal))
        elif ch.isalpha():
            if ch.islower():
                newch = chr(random.randint(97, 122))  # 'a'..'z'
            else:
                newch = chr(random.randint(65, 90))   # 'A'..'Z'
        else:
            newch = ch
        pieces.append(newch)
    return "".join(pieces)


def lengthLists(terms):
    """Group the keys of ``terms`` by length: returns {length: [keys...]}."""
    result = dict()
    for key in terms:
        addToMapList(result, len(key), key)
    return result


############################# DATEFINDER

def findAllDatesAndTimes(text, timeInfoTuplet):
    """Return the (start, end) spans of all validated date and time matches.

    ``timeInfoTuplet`` is the list built by getTimeInfoTuplet(): index 0 holds
    the compiled time patterns and index 1 the compiled date patterns.  Date
    spans come first in the returned list, followed by time spans.
    """
    timeExpressions = timeInfoTuplet[0]
    dateExpressions = timeInfoTuplet[1]
    matches = getAllMatches(text, dateExpressions, _validateDate)
    matches.extend(getAllMatches(text, timeExpressions, _validateTime))
    return matches


def getAllMatches(text, expressions, validator):
    """Return the spans of every match of ``expressions`` in ``text`` for
    which ``validator(match.groupdict())`` is true."""
    matches = list()
    for expression in expressions:
        for match in expression.finditer(text):
            if validator(match.groupdict()):
                matches.append(match.span())
    return matches


# return true if position is between any start-end in list of regions
def inRegions(position, regions):
    """Return True if ``position`` lies within any (start, end) pair of
    ``regions``; both ends are treated as inclusive."""
    return any(region[0] <= position <= region[1] for region in regions)


def compilePatterns(formats):
    """Compile each pattern string in ``formats`` case-insensitively."""
    return [re.compile(pattern, re.I) for pattern in formats]


def getTimeInfoTuplet(timestampconfilename):
    """Load the timestamp configuration file and compile its patterns.

    The file is executed as Python code and must define ``timePatterns``,
    ``datePatterns``, ``minYear`` and ``maxYear``.

    Returns the list
    ``[compiled time patterns, compiled date patterns, minYear, maxYear]``.

    Raises Exception if the resolved file path is not located under the
    directory returned by make_splunkhome_path(['etc', 'anonymizer']).
    """
    root = os.path.realpath(make_splunkhome_path(['etc', 'anonymizer']))
    if not os.path.isabs(root):
        root = os.path.abspath(root)
    timestampconfilename = os.path.realpath(os.path.normpath(timestampconfilename))
    if not os.path.isabs(timestampconfilename):
        timestampconfilename = os.path.abspath(timestampconfilename)

    # Compare against root plus a trailing separator: a plain character-wise
    # prefix test (os.path.commonprefix) would also accept sibling directories
    # such as ".../anonymizer-other/".
    # NOTE(review): a config file that locate_anonymize_file() found in an app
    # directory is rejected here as well -- confirm that this is intended.
    if not timestampconfilename.startswith(root.rstrip(os.sep) + os.sep):
        msg = '*** File is not inside proper directory %s should be in %s' % (timestampconfilename, root)
        print(msg)
        raise Exception(msg)

    text = readText(timestampconfilename)
    text = text.replace('\\n', '\n').replace('\n\n', '\n')
    results = {}
    # NOTE(review): the config file is executed as code.  Disabling
    # __builtins__ is not a sandbox; the directory check above is the only
    # restriction on what gets executed here.
    exec(text, {"__builtins__": None}, results)
    compiledTimePatterns = compilePatterns(results['timePatterns'])
    compiledDatePatterns = compilePatterns(results['datePatterns'])
    timeInfoTuplet = [compiledTimePatterns, compiledDatePatterns, results['minYear'], results['maxYear']]
    return timeInfoTuplet

############################# DATEFINDER


################################### BEGIN COPIED FROM DCUTILS.PY

def addToMapList(map, key, value):
    """Append ``value`` to the list stored at ``map[key]`` (creating the list
    if needed) unless it is already present; returns that list."""
    values = map.setdefault(key, [])
    safeAppend(values, value)
    return values


def fileWords(filename, lowercase):
    """Return a dict mapping every token found in ``filename`` to its count.

    When ``lowercase`` is true each line is lower-cased before tokenizing.
    """
    terms = dict()
    with open(filename) as f:
        for line in f:
            if lowercase:
                line = line.lower()
            tokenize(line, False, terms)
    return terms


def readText(filename):
    """Return the whole content of ``filename`` as a string."""
    with open(filename) as f:
        return f.read()


MAX_SEGMENT = 1024


def findBreak(start, segSize, text):
    """Return the index of the last character of the segment that begins at
    ``start`` and is at most ``segSize`` characters long.

    The segment preferably ends on a non-alphanumeric character found within
    the last 100 characters, so that tokens are not cut in half.
    """
    segEnd = start + segSize - 1
    # '>=' rather than '>': segEnd == len(text) is already past the last valid
    # index and would raise IndexError in the loop below
    if segEnd >= len(text):
        return len(text) - 1
    for end in range(segEnd, max(start + 1, segEnd - 100), -1):
        if not text[end].isalnum():
            return end
    # failed to find break by going back 100 chars.  give up and break at will.
    return segEnd


# returns maps of terms and phrases to their count
def tokenize(text, wordsOnlyP, vector=None):
    """Count the tokens of ``text`` into ``vector`` and return ``vector``.

    ``text`` is processed in segments of at most MAX_SEGMENT characters and
    split on runs of non-alphanumeric characters.  When ``wordsOnlyP`` is true
    only tokens accepted by looksLikeWord() are counted.  A new dict is created
    when ``vector`` is not supplied.
    """
    if vector is None:
        # a fresh dict per call; a dict() default would be shared between calls
        vector = dict()
    segStart = 0
    # Loop until the text is consumed.  Segments may end early at a token
    # boundary, so a segment count derived from MAX_SEGMENT alone can leave the
    # tail of the text unprocessed.
    while segStart < len(text):
        segEnd = findBreak(segStart, MAX_SEGMENT, text)
        segText = text[segStart:segEnd + 1]
        for token in WORD_REGEX.split(segText):
            if len(token) == 0:
                continue
            if not wordsOnlyP or looksLikeWord(token):
                incCount(vector, token, 1)
        segStart = segEnd + 1
    return vector


def looksLikeWord(token):
    """Return True if ``token`` is purely alphabetic, longer than two
    characters, and is all lower case, all upper case, or contains exactly one
    upper-case letter."""
    upper = lower = 0
    for c in token:
        if not c.isalpha():
            return False
        if c.isupper():
            upper += 1
        else:
            lower += 1
    return len(token) > 2 and (upper == 0 or lower == 0 or upper == 1)


def incCount(map, val, count):
    """Add ``count`` to ``map[val]``, starting from 0 for a new key."""
    map[val] = map.get(val, 0) + count


def safeAppend(list, val):
    """Append ``val`` to ``list`` only if it is not already in it."""
    if val not in list:
        list.append(val)

################################### END COPIED FROM DCUTILS.PY


def isInt(token):
    """Return True if ``token`` starts with a digit and int() can parse it."""
    if len(token) > 0 and token[0].isdigit():
        try:
            int(token)
            return True
        except ValueError:
            pass
    return False


def caseSame(caseSource, textSource):
    """Return ``textSource`` with the upper/lower case pattern of ``caseSource``.

    Characters are paired by position; positions where the ``caseSource``
    character is neither upper nor lower case keep the ``textSource`` character
    unchanged.  Characters beyond the shorter of the two inputs are dropped.
    """
    pieces = []
    for casech, textch in zip(caseSource, textSource):
        if casech.isupper():
            textch = textch.upper()
        elif casech.islower():
            textch = textch.lower()
        pieces.append(textch)
    return "".join(pieces)


def scrubValue(result, val, isRaw, allterms, replacements, publicTerms, privateTerms, nameTerms, timeInfoTuplet):
    """Return ``val`` with its private tokens replaced.

    result         -- the result (dict-like) the value belongs to; tokens that
                      are keys of it are left alone
    val            -- the string to scrub
    isRaw          -- when true, date/time regions are located in ``val`` first
    allterms       -- dict updated with the count of every token seen
    replacements   -- dict {lower-cased token: replacement}; updated in place so
                      the same token gets the same replacement every time
    publicTerms    -- lower-cased terms that need no replacement
    privateTerms   -- lower-cased terms that are replaced even if public
    nameTerms      -- {length: [names]} pool used by _generateReplacement()
    timeInfoTuplet -- value returned by getTimeInfoTuplet()
    """
    regions = []
    if isRaw:
        regions = findAllDatesAndTimes(val, timeInfoTuplet)
    position = 0
    # WORD_SPLIT keeps the separators, so joining the tokens restores the layout
    tokens = re.split(WORD_SPLIT, val)
    newtokens = list()
    for token in tokens:
        lower = token.lower()
        newtoken = token
        incCount(allterms, token, 1)
        inDateRegion = inRegions(position, regions)
        # if term is not the name of an attribute and not in a date region.
        # double check for numbers or public terms because date regions sometimes
        # have extraneous text if the regex match contains a noise term or end
        # of expression match
        if (result.get(lower, None) is None) and not (inDateRegion and (isInt(token) or (lower in publicTerms and lower not in privateTerms))):
            # if we haven't already made a replacement for this term and it's a
            # private term or not a public term
            if lower not in replacements and (lower in privateTerms or lower not in publicTerms):
                # make a replacement term
                replacements[lower] = _generateReplacement(token, nameTerms)
            newtoken = replacements.get(lower, token)
            newtoken = caseSame(token, newtoken)
        position += len(token)
        newtokens.append(newtoken)
    return ''.join(newtokens)


def scrub(results, publictermsfilename, privatefilename, nametermsfilename, dictionaryfilename, timestampconfigfilename):
    """Scrub the values of ``results`` in place.

    The term files are read with fileWords() (lower-cased); the user public
    terms are merged into the dictionary terms.  For every result, each
    attribute is scrubbed unless it starts with '_' (``_raw`` is scrubbed),
    starts with 'date_' or is one of the protected keys.
    """
    replacements = dict()
    privateTerms = fileWords(privatefilename, True)
    publicTerms = fileWords(dictionaryfilename, True)
    userpublicTerms = fileWords(publictermsfilename, True)
    nameTerms = lengthLists(fileWords(nametermsfilename, True))
    # add user public terms to default publicterms
    publicTerms.update(userpublicTerms)

    protectedKeys = set(["eventtype", "linecount", "punct", "sourcetype", "timeendpos", "timestartpos"])
    timeInfoTuplet = getTimeInfoTuplet(timestampconfigfilename)
    allterms = dict()
    # for each result
    for r in results:
        # for each attribute; iterate over a snapshot because values are
        # reassigned inside the loop
        for key, val in list(r.items()):
            # only scrub attributes if doesn't start with '_' (except _raw) and
            # if not a protected attribute and doesn't start with date_
            if (not key.startswith("_") or key == "_raw") and key not in protectedKeys and not key.startswith("date_"):
                r[key] = scrubValue(r, val, key == "_raw", allterms, replacements, publicTerms, privateTerms, nameTerms, timeInfoTuplet)


def locate_anonymize_file(filename, app_dir, err_collection):
    """Return the path of ``filename`` in an 'anonymizer' directory, or None.

    ``<app_dir>/anonymizer`` is checked first when ``app_dir`` is set, then the
    path built by make_splunkhome_path(['etc', 'anonymizer', filename]).  When
    the name contains '/', '\\' or '..', or the file is not found, a message is
    appended to ``err_collection`` and None is returned.
    """
    # paths aren't accepted
    if "/" in filename or "\\" in filename or ".." in filename:
        msg = ("Pathnames are not accepted for any of the filename arguments. " +
               "The file specifier '%s' is not permitted.")
        err_collection.append(msg % filename)
        return None

    anonymize_dir = 'anonymizer'
    if app_dir:
        app_file_path = os.path.join(app_dir, anonymize_dir, filename)
        if os.path.isfile(app_file_path):
            return app_file_path

    global_file_path = make_splunkhome_path(['etc', anonymize_dir, filename])
    if os.path.isfile(global_file_path):
        return global_file_path

    # we couldn't find the file, so..
    msg = "The filename '%s' could not be found in the " % filename
    if app_dir:
        msg += "app or "
    msg += "the global directory. Checked "
    if app_dir:
        msg += "'%s' and " % app_file_path
    msg += "'%s', but did not locate the file." % global_file_path
    err_collection.append(msg)
    return None


if __name__ == '__main__':
    try:
        results, dummyresults, settings = splunk.Intersplunk.getOrganizedResults()

        # if this is nonempty later, we'll write it out as error
        err_results = []

        # DEFAULT CONFIG FILE NAMES
        publictermsfilename = "public-terms.txt"
        privatetermsfilename = "private-terms.txt"
        nametermsfilename = "names.txt"
        dictionaryfilename = "dictionary.txt"
        timestampconfigfilename = 'anonymizer-time.ini'

        # GET ARGS
        keywords, argvals = splunk.Intersplunk.getKeywordsAndOptions()

        # ALLOW ARGS TO OVERRIDE DEFAULTS
        publictermsfilename = argvals.get("public-terms", publictermsfilename)
        privatetermsfilename = argvals.get("private-terms", privatetermsfilename)
        nametermsfilename = argvals.get("name-terms", nametermsfilename)
        dictionaryfilename = argvals.get("dictionary", dictionaryfilename)
        timestampconfigfilename = argvals.get("time-config", timestampconfigfilename)

        # locate the files
        app = argvals.get("namespace")

        # first find the app, if it exists
        app_dir = None
        if app:
            if "/" in app or "\\" in app or ".." in app:
                msg = "Error: namespace name may not include the '/' '\\' or '..' sequences"
                err_results.append(msg)
            else:
                app_dir = make_splunkhome_path(['etc', 'apps', app])
                if not os.path.isdir(app_dir):
                    app_dir = make_splunkhome_path(['etc', 'slave-apps', app])
                if not os.path.isdir(app_dir):
                    msg = "Error: could not find specified app '%s' on disk" % app
                    err_results.append(msg)
                    app_dir = None

        # now find each file in either the app or the global dir
        publicterms_path = locate_anonymize_file(publictermsfilename, app_dir, err_results)
        privateterms_path = locate_anonymize_file(privatetermsfilename, app_dir, err_results)
        nameterms_path = locate_anonymize_file(nametermsfilename, app_dir, err_results)
        dictionary_path = locate_anonymize_file(dictionaryfilename, app_dir, err_results)
        timestampconfig_path = locate_anonymize_file(timestampconfigfilename, app_dir, err_results)

        if not err_results:
            scrub(results, publicterms_path, privateterms_path, nameterms_path, dictionary_path, timestampconfig_path)
    except Exception:
        # report any failure as an error result instead of a raw crash
        import traceback
        stack = traceback.format_exc()
        results = splunk.Intersplunk.generateErrorResults("Error : Traceback: " + str(stack))
        err_results = []

    # pass back explicitly determined errors
    if err_results:
        results = splunk.Intersplunk.generateErrorResults("\n".join(err_results))

    splunk.Intersplunk.outputResults(results)