You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
415 lines
12 KiB
415 lines
12 KiB
#!/usr/bin/env python
|
|
|
|
import base64
|
|
import importlib
|
|
from io import BytesIO
|
|
|
|
import pandas as pd
|
|
|
|
# Registry mapping a (module name, type name) pair to the name of the codec
# class in this module responsible for that type. Several codecs also build
# their ``whitelist`` by filtering this table on their own class name.
# NOTE(review): TreeCodec has no entry here -- confirm whether
# ("sklearn.tree._tree", "Tree") should be registered as well.
CODECS = {
    # Builtins
    ("builtins", "object"): "NoopCodec",
    ("builtins", "slice"): "SliceCodec",
    ("builtins", "set"): "SetCodec",
    ("builtins", "type"): "TypeCodec",
    ("builtins", "bytes"): "BytesCodec",
    # NumPy arrays and scalar types
    ("numpy", "ndarray"): "NDArrayCodec",
    ("numpy", "int8"): "NDArrayWrapperCodec",
    ("numpy", "int16"): "NDArrayWrapperCodec",
    ("numpy", "int32"): "NDArrayWrapperCodec",
    ("numpy", "int64"): "NDArrayWrapperCodec",
    ("numpy", "uint8"): "NDArrayWrapperCodec",
    ("numpy", "uint16"): "NDArrayWrapperCodec",
    ("numpy", "uint32"): "NDArrayWrapperCodec",
    ("numpy", "uint64"): "NDArrayWrapperCodec",
    ("numpy", "float16"): "NDArrayWrapperCodec",
    ("numpy", "float32"): "NDArrayWrapperCodec",
    ("numpy", "float64"): "NDArrayWrapperCodec",
    ("numpy", "float128"): "NDArrayWrapperCodec",
    ("numpy", "complex64"): "NDArrayWrapperCodec",
    ("numpy", "complex128"): "NDArrayWrapperCodec",
    ("numpy", "complex256"): "NDArrayWrapperCodec",
    ("numpy", "dtype"): "DTypeCodec",
    ("numpy.random.mtrand", "RandomState"): "mtrandCodec",
    # SciPy / scikit-learn
    ("scipy.sparse.csr", "csr_matrix"): "SparseMatrixCodec",
    ("sklearn.dummy", "DummyClassifier"): "SimpleObjectCodec",
    ("sklearn.dummy", "DummyRegressor"): "SimpleObjectCodec",
    # pandas
    ("pandas.core.frame", "DataFrame"): "SimpleObjectCodec",
    ("pandas.core.index", "Index"): "IndexCodec",
    ("pandas.core.indexes.base", "Index"): "IndexCodec",
    ("pandas.core.indexes.range", "RangeIndex"): "IndexCodec",
    ("pandas.core.index", "Int64Index"): "IndexCodec",
    ("pandas.core.internals.managers", "BlockManager"): "BlockManagerCodec",
    # SeriesCodec derives its whitelist from this table; without an entry
    # the whitelist is empty and the codec rejects every object.
    ("pandas.core.series", "Series"): "SeriesCodec",
}
|
|
|
|
|
|
class BaseCodec(object):
    """Common interface for the codecs in this module.

    Subclasses are expected to override both classmethods; the versions
    defined here always raise ``NotImplementedError``.
    """

    @classmethod
    def encode(cls, obj):
        """Turn ``obj`` into its serialisable representation.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Encoder not implemented")

    @classmethod
    def decode(cls, obj):
        """Rebuild an object from its serialised representation ``obj``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Decoder not implemented")
|
|
|
|
|
|
class NoopCodec(BaseCodec):
    """Codec that records only an object's type, not any of its state."""

    @classmethod
    def encode(cls, obj):
        """Return a payload holding just the module and type name of ``obj``."""
        obj_type = type(obj)
        return {"__mlspl_type": [obj_type.__module__, obj_type.__name__]}

    @classmethod
    def decode(cls, obj):
        """Create a bare instance of the type named in ``obj["__mlspl_type"]``.

        The instance is produced with ``__new__`` only, so the type's
        ``__init__`` is not run.
        """
        module_name, name = obj["__mlspl_type"]

        # NOTE(review): the module and attribute names come straight from the
        # payload and are not checked against a whitelist here -- confirm the
        # caller only routes trusted/registered types to this codec.
        class_ref = getattr(importlib.import_module(module_name), name)

        return class_ref.__new__(class_ref)
|
|
|
|
|
|
class BytesCodec(BaseCodec):
    """Codec that stores a byte string as UTF-8 decoded text."""

    @classmethod
    def encode(cls, obj):
        """Return a payload with ``obj`` decoded from UTF-8 under ``"str"``.

        For a ``bytes`` input that is not valid UTF-8 the decode step raises
        ``UnicodeDecodeError``.
        """
        obj_type = type(obj)
        return {
            "__mlspl_type": [obj_type.__module__, obj_type.__name__],
            "str": obj.decode("utf8"),
        }

    @classmethod
    def decode(cls, obj):
        """Return the text under ``obj["str"]`` encoded back to UTF-8 bytes."""
        # Same codec name as encode(); the original spelled it "utf", which is
        # an alias for the same encoding.
        return obj["str"].encode("utf8")
|
|
|
|
|
|
class SliceCodec(BaseCodec):
    """Codec for the types that CODECS routes to "SliceCodec"."""

    # (module name, type name) pairs this codec is allowed to handle.
    whitelist = [key for key, codec in CODECS.items() if codec == "SliceCodec"]

    @classmethod
    def _require_whitelisted(cls, module_name, name):
        """Raise ``AssertionError`` unless (module_name, name) is whitelisted.

        An explicit check is used instead of ``assert`` so it is still
        enforced under ``python -O``; the exception type is kept the same.
        """
        if (module_name, name) not in cls.whitelist:
            raise AssertionError(
                "{}.{} is not handled by {}".format(module_name, name, cls.__name__)
            )

    @classmethod
    def encode(cls, obj):
        """Return a payload with the type of ``obj`` and its reduce arguments.

        Raises:
            AssertionError: if the type of ``obj`` is not whitelisted.
        """
        obj_type = type(obj)
        cls._require_whitelisted(obj_type.__module__, obj_type.__name__)

        return {
            "__mlspl_type": [obj_type.__module__, obj_type.__name__],
            # Constructor arguments as reported by the object's __reduce__.
            "slice": obj.__reduce__()[1],
        }

    @classmethod
    def decode(cls, obj):
        """Rebuild the object by calling its type with ``obj["slice"]``.

        Raises:
            AssertionError: if the recorded type is not whitelisted.
        """
        module_name, name = obj["__mlspl_type"]
        # Must run before the import: the names come from the payload.
        cls._require_whitelisted(module_name, name)

        class_ref = getattr(importlib.import_module(module_name), name)

        return class_ref(*obj["slice"])
|
|
|
|
|
|
class SimpleObjectCodec(BaseCodec):
    """Codec that serialises an object as its type plus its ``__dict__``."""

    @classmethod
    def encode(cls, obj):
        """Return a payload with the type of ``obj`` and its ``__dict__``.

        The dictionary is stored by reference, not copied.
        """
        obj_type = type(obj)
        return {
            "__mlspl_type": [obj_type.__module__, obj_type.__name__],
            "dict": obj.__dict__,
        }

    @classmethod
    def decode(cls, obj):
        """Rebuild an instance without calling ``__init__``.

        ``obj["dict"]`` becomes the instance ``__dict__``. Attribute values
        that are lists or ``pandas.Index`` objects are replaced by plain
        lists of their items.
        """
        module_name, name = obj["__mlspl_type"]

        # NOTE(review): module and type names are taken from the payload
        # without a whitelist check -- confirm the caller restricts them.
        class_ref = getattr(importlib.import_module(module_name), name)

        new_obj = class_ref.__new__(class_ref)
        new_obj.__dict__ = obj["dict"]

        state = new_obj.__dict__
        # Only existing keys are reassigned, so iterating while updating is
        # safe (the dictionary never changes size).
        for key, value in state.items():
            if isinstance(value, (list, pd.Index)):
                state[key] = list(value)

        return new_obj
|
|
|
|
|
|
class IndexCodec(BaseCodec):
    """Codec for the index types that CODECS routes to "IndexCodec"."""

    # (module name, type name) pairs this codec is allowed to handle.
    whitelist = [key for key, codec in CODECS.items() if codec == "IndexCodec"]

    @classmethod
    def _require_whitelisted(cls, module_name, name):
        """Raise ``AssertionError`` unless (module_name, name) is whitelisted.

        An explicit check is used instead of ``assert`` so it is still
        enforced under ``python -O``; the exception type is kept the same.
        """
        if (module_name, name) not in cls.whitelist:
            raise AssertionError(
                "{}.{} is not handled by {}".format(module_name, name, cls.__name__)
            )

    @classmethod
    def encode(cls, obj):
        """Return a payload with the type of ``obj`` and its init arguments.

        The arguments are taken from ``obj.__reduce__()[1][1]``; decode()
        passes them as keyword arguments, so a mapping is expected there.

        Raises:
            AssertionError: if the type of ``obj`` is not whitelisted.
        """
        obj_type = type(obj)
        cls._require_whitelisted(obj_type.__module__, obj_type.__name__)

        return {
            "__mlspl_type": [obj_type.__module__, obj_type.__name__],
            "init_args": obj.__reduce__()[1][1],
        }

    @classmethod
    def decode(cls, obj):
        """Rebuild the index by calling its type with ``**obj["init_args"]``.

        Raises:
            AssertionError: if the recorded type is not whitelisted.
        """
        module_name, name = obj["__mlspl_type"]
        # Must run before the import: the names come from the payload.
        cls._require_whitelisted(module_name, name)

        class_ref = getattr(importlib.import_module(module_name), name)

        return class_ref(**obj["init_args"])
|
|
|
|
|
|
class DTypeCodec(BaseCodec):
    """Codec for ``numpy.dtype`` objects."""

    @classmethod
    def encode(cls, obj):
        """Return a payload describing the dtype ``obj``.

        Dtypes with named fields are stored as ``obj.descr``; all others as
        the string ``obj.str``.
        """
        obj_type = type(obj)
        return {
            "__mlspl_type": [obj_type.__module__, obj_type.__name__],
            "descr": obj.descr if obj.names is not None else obj.str,
        }

    @classmethod
    def decode(cls, obj):
        """Rebuild the dtype from ``obj["descr"]``."""
        import numpy as np

        return np.dtype(cls._restore_descr(obj["descr"]))

    @classmethod
    def _restore_descr(cls, descr):
        """Convert a list-of-lists field description back to a list of tuples.

        ``np.dtype`` requires each field to be a tuple, but a description
        that has been through JSON comes back as nested lists. Strings and
        any other non-list value are returned unchanged.
        """
        if not isinstance(descr, list):
            return descr
        return [cls._restore_field(field) for field in descr]

    @classmethod
    def _restore_field(cls, field):
        """Return one field entry as a tuple, fixing up nested parts."""
        if not isinstance(field, (list, tuple)) or len(field) < 2:
            # Not a (name, format[, shape]) entry; let np.dtype judge it.
            return field

        name, fmt = field[0], field[1]
        if isinstance(name, list):
            # A (title, name) pair flattened to a list.
            name = tuple(name)
        # A nested structured format is itself a list of fields.
        fmt = cls._restore_descr(fmt)
        rest = tuple(tuple(part) if isinstance(part, list) else part for part in field[2:])

        return (name, fmt) + rest
|
|
|
|
|
|
class NDArrayWrapperCodec(BaseCodec):
    """Codec that stores a single value wrapped in a one-element array."""

    @classmethod
    def encode(cls, obj):
        """Return a payload with ``obj`` wrapped as ``np.array([obj])``."""
        import numpy as np

        obj_type = type(obj)
        return {
            "__mlspl_type": [obj_type.__module__, obj_type.__name__],
            "ndarray": np.array([obj]),
        }

    @classmethod
    def decode(cls, obj):
        """Return the first element of ``obj["ndarray"]``."""
        return obj["ndarray"][0]
|
|
|
|
|
|
class NDArrayCodec(BaseCodec):
    """Codec that stores a ``numpy.ndarray`` as base64-encoded ``.npy`` data."""

    @classmethod
    def encode(cls, obj):
        """Return a payload with ``obj`` saved in ``.npy`` format as base64 text.

        Arrays whose dtype contains objects are first converted to a unicode
        string dtype, because the array is saved with pickling disabled.

        Raises:
            ValueError: if an object array cannot be converted to strings.
        """
        import numpy as np

        # Exact type match on purpose: ndarray subclasses are not accepted.
        assert type(obj) is np.ndarray

        if obj.dtype.hasobject:
            try:
                obj = obj.astype("U")
            except Exception as err:
                # Chain the cause so the failing conversion is not lost.
                raise ValueError("Cannot encode numpy.ndarray with objects") from err

        bio = BytesIO()
        np.save(bio, obj, allow_pickle=False)

        return {
            "__mlspl_type": [type(obj).__module__, type(obj).__name__],
            "npy": base64.b64encode(bio.getvalue()).decode("utf8"),
        }

    @classmethod
    def decode(cls, obj):
        """Load the array from the base64 text under ``obj["npy"]``.

        Loading is done with pickling disabled.
        """
        import numpy as np

        raw = base64.b64decode(obj["npy"].encode("utf8"))
        return np.load(BytesIO(raw), allow_pickle=False)
|
|
|
|
|
|
class TreeCodec(BaseCodec):
    """Codec for ``sklearn.tree._tree.Tree`` objects."""

    @classmethod
    def encode(cls, obj):
        """Return a payload with the tree's constructor arguments and state.

        The constructor arguments come from ``obj.__reduce__()[1]`` and the
        state from ``obj.__getstate__()``.
        """
        import sklearn.tree

        # Exact type match on purpose: subclasses are not accepted.
        assert type(obj) is sklearn.tree._tree.Tree

        init_args = obj.__reduce__()[1]
        state = obj.__getstate__()

        return {
            "__mlspl_type": [type(obj).__module__, type(obj).__name__],
            "init_args": init_args,
            "state": state,
        }

    @classmethod
    def decode(cls, obj):
        """Rebuild the tree from ``obj["init_args"]`` and ``obj["state"]``.

        ``obj["state"]`` is updated in place with a default ``max_depth``
        when it has none.
        """
        import sklearn.tree

        init_args = obj["init_args"]
        state = obj["state"]

        # Add max_depth for backwards compatibility with PSC 1.2.
        # Previous versions did not set max_depth in the state when calling
        # __getstate__:
        # https://github.com/scikit-learn/scikit-learn/blob/51a765acfa4c5d1ec05fc4b406968ad233c75162/sklearn/tree/_tree.pyx#L615
        #
        # It was added in sklearn 0.18 and is used by both __getstate__ and
        # __setstate__:
        # https://github.com/scikit-learn/scikit-learn/blob/ef5cb84a805efbe4bb06516670a9b8c690992bd7/sklearn/tree/_tree.pyx#L649
        #
        # Older models do not have max_depth in their stored state, so a
        # KeyError would be raised. max_depth is only used in the decision
        # path method, which we don't currently use, where it initialises an
        # np array of zeros in version 0.18:
        # https://github.com/scikit-learn/scikit-learn/blob/ef5cb84a805efbe4bb06516670a9b8c690992bd7/sklearn/tree/_tree.pyx#L926
        # https://github.com/scikit-learn/scikit-learn/blob/ef5cb84a805efbe4bb06516670a9b8c690992bd7/sklearn/tree/_tree.pyx#L991
        state["max_depth"] = state.get("max_depth", 0)

        tree = sklearn.tree._tree.Tree(*init_args)
        tree.__setstate__(state)

        return tree
|
|
|
|
|
|
class BlockManagerCodec(BaseCodec):
    """Codec for pandas ``BlockManager`` and ``SingleBlockManager`` objects."""

    @classmethod
    def encode(cls, obj):
        """Return a payload with the manager's type and ``__getstate__()``."""
        from pandas.core.internals import BlockManager, SingleBlockManager

        # Exact type match on purpose: subclasses are not accepted.
        assert type(obj) in (BlockManager, SingleBlockManager)

        state = obj.__getstate__()

        return {"__mlspl_type": [type(obj).__module__, type(obj).__name__], "state": state}

    @classmethod
    def decode(cls, obj):
        """Rebuild the manager named in ``obj["__mlspl_type"]`` from its state.

        The instance is created with ``__new__`` and populated through
        ``__setstate__``; a state given as a list is converted to a tuple
        first.

        Raises:
            ValueError: if the recorded type name is neither "BlockManager"
                nor "SingleBlockManager".
        """
        from pandas.core.internals import BlockManager, SingleBlockManager

        state = obj["state"]

        managers = {
            "BlockManager": BlockManager,
            "SingleBlockManager": SingleBlockManager,
        }
        type_name = obj["__mlspl_type"][1]
        if type_name not in managers:
            # Previously this fell through and failed later with an
            # UnboundLocalError on the never-assigned instance.
            raise ValueError("Unsupported block manager type: {!r}".format(type_name))

        manager_cls = managers[type_name]
        manager = manager_cls.__new__(manager_cls)

        if isinstance(state, list):
            state = tuple(state)
        manager.__setstate__(state)

        return manager
|
|
|
|
|
|
class SetCodec(BaseCodec):
    """Codec for the types that CODECS routes to "SetCodec"."""

    # (module name, type name) pairs this codec is allowed to handle.
    whitelist = [key for key, codec in CODECS.items() if codec == "SetCodec"]

    @classmethod
    def _require_whitelisted(cls, module_name, name):
        """Raise ``AssertionError`` unless (module_name, name) is whitelisted.

        An explicit check is used instead of ``assert`` so it is still
        enforced under ``python -O``; the exception type is kept the same.
        """
        if (module_name, name) not in cls.whitelist:
            raise AssertionError(
                "{}.{} is not handled by {}".format(module_name, name, cls.__name__)
            )

    @classmethod
    def encode(cls, obj):
        """Return a payload with the items of ``obj`` as a list under ``"set"``.

        Raises:
            AssertionError: if the type of ``obj`` is not whitelisted.
        """
        obj_type = type(obj)
        cls._require_whitelisted(obj_type.__module__, obj_type.__name__)

        return {"__mlspl_type": [obj_type.__module__, obj_type.__name__], "set": list(obj)}

    @classmethod
    def decode(cls, obj):
        """Return a ``set`` built from the items under ``obj["set"]``.

        Raises:
            AssertionError: if the recorded type is not whitelisted.
        """
        module_name, name = obj["__mlspl_type"]
        cls._require_whitelisted(module_name, name)

        return set(obj["set"])
|
|
|
|
|
|
class TypeCodec(BaseCodec):
    """Codec for type objects (classes) themselves."""

    # (module name, type name) pairs this codec is allowed to handle.
    whitelist = [key for key, codec in CODECS.items() if codec == "TypeCodec"]

    @classmethod
    def _require_whitelisted(cls, module_name, name):
        """Raise ``AssertionError`` unless (module_name, name) is whitelisted.

        An explicit check is used instead of ``assert`` so it is still
        enforced under ``python -O``; the exception type is kept the same.
        """
        if (module_name, name) not in cls.whitelist:
            raise AssertionError(
                "{}.{} is not handled by {}".format(module_name, name, cls.__name__)
            )

    @classmethod
    def encode(cls, obj):
        """Return a payload naming the type ``obj`` by module and name.

        Raises:
            AssertionError: if ``type(obj)`` is not whitelisted.
        """
        obj_type = type(obj)
        cls._require_whitelisted(obj_type.__module__, obj_type.__name__)

        return {
            "__mlspl_type": [obj_type.__module__, obj_type.__name__],
            "type": [obj.__module__, obj.__name__],
        }

    @classmethod
    def decode(cls, obj):
        """Import and return the type named under ``obj["type"]``.

        Raises:
            AssertionError: if the recorded codec type is not whitelisted, or
                if the target type is not a key of CODECS.
        """
        module_name, name = obj["__mlspl_type"]
        cls._require_whitelisted(module_name, name)

        target_module, target_name = obj["type"][0], obj["type"][1]
        # Only types registered in CODECS may be imported. This gate must not
        # be an ``assert``: under ``python -O`` it would disappear and any
        # module named in the payload would be imported.
        if (target_module, target_name) not in CODECS:
            raise AssertionError(
                "{}.{} is not a registered type".format(target_module, target_name)
            )

        module = importlib.import_module(target_module)

        return getattr(module, target_name)
|
|
|
|
|
|
class mtrandCodec(BaseCodec):
    """Codec for ``numpy.random.mtrand.RandomState`` objects."""

    @classmethod
    def encode(cls, obj):
        """Return a payload with the generator's reduce arguments and state.

        The arguments come from ``obj.__reduce__()[1]`` and the state from
        ``obj.__getstate__()``.
        """
        import numpy as np

        # Exact type match on purpose: subclasses are not accepted.
        assert type(obj) is np.random.mtrand.RandomState

        init_args = obj.__reduce__()[1]
        state = obj.__getstate__()

        return {
            "__mlspl_type": [type(obj).__module__, type(obj).__name__],
            "init_args": init_args,
            "state": state,
        }

    @classmethod
    def decode(cls, obj):
        """Return a new ``RandomState`` restored from ``obj["state"]``.

        Only the state is used; ``obj["init_args"]`` is not read here.
        """
        from numpy.random.mtrand import RandomState

        state = obj["state"]

        random_state = RandomState()
        random_state.__setstate__(state)

        return random_state
|
|
|
|
|
|
class SparseMatrixCodec(BaseCodec):
    """Codec that stores a SciPy CSR matrix as base64-encoded ``.npz`` data."""

    @classmethod
    def encode(cls, obj):
        """Return a payload with the matrix components saved via ``np.savez``.

        The ``data``, ``indices``, ``indptr`` and ``shape`` of ``obj`` are
        written to one archive. The base64 value under ``"sparse_npy"`` is
        left as ``bytes`` (not decoded to text).
        """
        import numpy as np
        from scipy import sparse

        # ``sparse.csr_matrix`` is the public name of the class; the
        # ``scipy.sparse.csr`` submodule is a deprecated private namespace.
        # Exact type match on purpose: subclasses are not accepted.
        assert type(obj) is sparse.csr_matrix

        bio = BytesIO()
        np.savez(bio, data=obj.data, indices=obj.indices, indptr=obj.indptr, shape=obj.shape)
        return {
            "__mlspl_type": [type(obj).__module__, type(obj).__name__],
            "sparse_npy": base64.b64encode(bio.getvalue()),
        }

    @classmethod
    def decode(cls, obj):
        """Rebuild the CSR matrix from the archive under ``obj["sparse_npy"]``."""
        import numpy as np
        from scipy.sparse import csr_matrix

        bio = BytesIO(base64.b64decode(obj["sparse_npy"]))
        # Pickling is disabled explicitly so the payload can never trigger
        # unpickling, whatever the installed NumPy's default is. The archive
        # is closed once its arrays have been read.
        with np.load(bio, allow_pickle=False) as loader:
            data = loader["data"]
            indices = loader["indices"]
            indptr = loader["indptr"]
            shape = tuple(int(dim) for dim in loader["shape"])

        return csr_matrix((data, indices, indptr), shape=shape)
|
|
|
|
|
|
class SeriesCodec(BaseCodec):
    """Codec for the types that CODECS routes to "SeriesCodec"."""

    # (module name, type name) pairs this codec is allowed to handle. The
    # list is empty unless CODECS maps at least one type to "SeriesCodec".
    whitelist = [key for key, codec in CODECS.items() if codec == 'SeriesCodec']

    @classmethod
    def _require_whitelisted(cls, module_name, name):
        """Raise ``AssertionError`` unless (module_name, name) is whitelisted.

        An explicit check is used instead of ``assert`` so it is still
        enforced under ``python -O``; the exception type is kept the same.
        """
        if (module_name, name) not in cls.whitelist:
            raise AssertionError(
                "{}.{} is not handled by {}".format(module_name, name, cls.__name__)
            )

    @classmethod
    def encode(cls, obj):
        """Return a payload with the type of ``obj`` and its ``__getstate__()``.

        Raises:
            AssertionError: if the type of ``obj`` is not whitelisted.
        """
        obj_type = type(obj)
        cls._require_whitelisted(obj_type.__module__, obj_type.__name__)

        state = obj.__getstate__()
        return {'__mlspl_type': [obj_type.__module__, obj_type.__name__], 'state': state}

    @classmethod
    def decode(cls, obj):
        """Return a ``pandas`` Series restored from ``obj['state']``.

        The instance is created with ``__new__`` and populated through
        ``__setstate__``.

        Raises:
            AssertionError: if the recorded type is not whitelisted.
        """
        import pandas.core.series

        module_name, name = obj['__mlspl_type']
        cls._require_whitelisted(module_name, name)

        state = obj['state']
        series_cls = pandas.core.series.Series
        series = series_cls.__new__(series_cls)
        series.__setstate__(state)
        return series
|