You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

415 lines
12 KiB

#!/usr/bin/env python
import base64
import importlib
from io import BytesIO
import pandas as pd
# Registry mapping a (module name, type name) pair to the name of the codec
# class in this module that handles that type.  Entry order is preserved; the
# ``whitelist`` attributes of several codec classes are derived from it.
CODECS = {
    # Python builtins
    ("builtins", "object"): "NoopCodec",
    ("builtins", "slice"): "SliceCodec",
    ("builtins", "set"): "SetCodec",
    ("builtins", "type"): "TypeCodec",
    ("builtins", "bytes"): "BytesCodec",
    # numpy arrays, scalar types and dtypes
    ("numpy", "ndarray"): "NDArrayCodec",
    **{
        ("numpy", scalar_name): "NDArrayWrapperCodec"
        for scalar_name in (
            "int8",
            "int16",
            "int32",
            "int64",
            "uint8",
            "uint16",
            "uint32",
            "uint64",
            "float16",
            "float32",
            "float64",
            "float128",
            "complex64",
            "complex128",
            "complex256",
        )
    },
    ("numpy", "dtype"): "DTypeCodec",
    ("numpy.random.mtrand", "RandomState"): "mtrandCodec",
    # scipy
    ("scipy.sparse.csr", "csr_matrix"): "SparseMatrixCodec",
    # scikit-learn
    ("sklearn.dummy", "DummyClassifier"): "SimpleObjectCodec",
    ("sklearn.dummy", "DummyRegressor"): "SimpleObjectCodec",
    # pandas
    ("pandas.core.frame", "DataFrame"): "SimpleObjectCodec",
    ("pandas.core.index", "Index"): "IndexCodec",
    ("pandas.core.indexes.base", "Index"): "IndexCodec",
    ("pandas.core.indexes.range", "RangeIndex"): "IndexCodec",
    ("pandas.core.index", "Int64Index"): "IndexCodec",
    ("pandas.core.internals.managers", "BlockManager"): "BlockManagerCodec",
}
class BaseCodec(object):
    """Interface shared by the codecs in this module.

    A codec exposes two class-level hooks, ``encode`` and ``decode``.  The
    versions defined here only raise ``NotImplementedError``.
    """

    @classmethod
    def encode(cls, obj):
        """Encode *obj*; not implemented on the base class.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Encoder not implemented")

    @classmethod
    def decode(cls, obj):
        """Decode *obj*; not implemented on the base class.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("Decoder not implemented")
class NoopCodec(BaseCodec):
    """Codec that stores only an object's type and restores a bare instance."""

    @classmethod
    def encode(cls, obj):
        """Return a payload holding just the module and class name of *obj*."""
        obj_type = type(obj)
        return {"__mlspl_type": [obj_type.__module__, obj_type.__name__]}

    @classmethod
    def decode(cls, obj):
        """Import the recorded class and return a new, uninitialised instance.

        The instance is created through ``__new__`` only; ``__init__`` is not
        called and no attributes are restored.
        """
        # NOTE(review): the module and class names come straight from the
        # payload and are not checked in this method - confirm that callers
        # only hand it trusted data.
        module_name, class_name = obj["__mlspl_type"]
        target_cls = getattr(importlib.import_module(module_name), class_name)
        return target_cls.__new__(target_cls)
class BytesCodec(BaseCodec):
    """Codec that stores a bytes-like value as UTF-8 decoded text."""

    @classmethod
    def encode(cls, obj):
        """Return a payload with the type of *obj* and its UTF-8 decoded text.

        Raises:
            UnicodeDecodeError: if *obj* is not valid UTF-8.
        """
        obj_type = type(obj)
        type_tag = [obj_type.__module__, obj_type.__name__]
        text = obj.decode("utf8")
        return {"__mlspl_type": type_tag, "str": text}

    @classmethod
    def decode(cls, obj):
        """Return the payload's ``"str"`` entry encoded back to bytes."""
        text = obj["str"]
        # "utf" is one of the aliases Python accepts for the UTF-8 codec.
        return text.encode("utf")
class SliceCodec(BaseCodec):
    """Codec for the types registered under ``"SliceCodec"`` in ``CODECS``.

    The object is stored as the argument tuple reported by ``__reduce__`` and
    rebuilt by calling the recorded class with those arguments.
    """

    # (module, name) pairs this codec is allowed to handle.
    whitelist = [key for key, codec in CODECS.items() if codec == "SliceCodec"]

    @classmethod
    def encode(cls, obj):
        """Return a payload with the type of *obj* and its constructor arguments.

        Raises:
            AssertionError: if the type of *obj* is not in ``whitelist``.
        """
        module, name = type(obj).__module__, type(obj).__name__
        # Raised explicitly rather than via ``assert`` so that the check is
        # still enforced when Python runs with -O.
        if (module, name) not in cls.whitelist:
            raise AssertionError("SliceCodec cannot encode %s.%s" % (module, name))
        return {
            "__mlspl_type": [module, name],
            "slice": obj.__reduce__()[1],
        }

    @classmethod
    def decode(cls, obj):
        """Rebuild the object by calling the recorded class with ``obj["slice"]``.

        Raises:
            AssertionError: if the recorded type is not in ``whitelist``.
        """
        module_name, name = obj["__mlspl_type"]
        # This check gates a dynamic import driven by payload contents, so it
        # must not disappear under -O the way an ``assert`` statement would.
        if (module_name, name) not in cls.whitelist:
            raise AssertionError("SliceCodec cannot decode %s.%s" % (module_name, name))
        class_ref = getattr(importlib.import_module(module_name), name)
        return class_ref(*obj["slice"])
class SimpleObjectCodec(BaseCodec):
    """Codec that stores an object as its type plus its instance ``__dict__``."""

    @classmethod
    def encode(cls, obj):
        """Return a payload with the type of *obj* and its ``__dict__``.

        The ``__dict__`` is referenced, not copied.
        """
        name, module = type(obj).__name__, type(obj).__module__
        return {"__mlspl_type": [module, name], "dict": obj.__dict__}

    @classmethod
    def decode(cls, obj):
        """Create a bare instance of the recorded class and restore its attributes.

        The instance is created with ``__new__`` (``__init__`` is not run) and
        ``obj["dict"]`` itself becomes its ``__dict__``.  Attribute values that
        are lists or pandas ``Index`` objects are replaced by plain lists of
        their items; every other value is left untouched.
        """
        # NOTE(review): the module and class names come from the payload and
        # are not checked here - confirm that callers only pass trusted data.
        module_name, name = obj["__mlspl_type"]
        class_ref = getattr(importlib.import_module(module_name), name)
        new_obj = class_ref.__new__(class_ref)
        new_obj.__dict__ = obj["dict"]
        attrs = new_obj.__dict__
        # Only values of existing keys are reassigned, so the dict keeps its
        # size while it is being iterated.
        for key, value in attrs.items():
            if isinstance(value, (list, pd.Index)):
                attrs[key] = list(value)
        return new_obj
class IndexCodec(BaseCodec):
    """Codec for the types registered under ``"IndexCodec"`` in ``CODECS``."""

    # (module, name) pairs this codec is allowed to handle.
    whitelist = [key for key, codec in CODECS.items() if codec == "IndexCodec"]

    @classmethod
    def encode(cls, obj):
        """Return a payload with the type of *obj* and its reconstruction arguments.

        Raises:
            AssertionError: if the type of *obj* is not in ``whitelist``.
        """
        module, name = type(obj).__module__, type(obj).__name__
        # Raised explicitly rather than via ``assert`` so that the check is
        # still enforced when Python runs with -O.
        if (module, name) not in cls.whitelist:
            raise AssertionError("IndexCodec cannot encode %s.%s" % (module, name))
        return {
            "__mlspl_type": [module, name],
            # Second element of the argument tuple returned by __reduce__;
            # presumably the keyword dict used to rebuild the index, since
            # decode() passes it as keyword arguments - TODO confirm.
            "init_args": obj.__reduce__()[1][1],
        }

    @classmethod
    def decode(cls, obj):
        """Rebuild the index by calling the recorded class with ``**obj["init_args"]``.

        Raises:
            AssertionError: if the recorded type is not in ``whitelist``.
        """
        module_name, name = obj["__mlspl_type"]
        # This check gates a dynamic import driven by payload contents, so it
        # must not disappear under -O the way an ``assert`` statement would.
        if (module_name, name) not in cls.whitelist:
            raise AssertionError("IndexCodec cannot decode %s.%s" % (module_name, name))
        class_ref = getattr(importlib.import_module(module_name), name)
        # For a plain Index this amounts to pandas.core.index.Index(**init_args).
        return class_ref(**obj["init_args"])
class DTypeCodec(BaseCodec):
    """Codec that stores a dtype through its ``descr`` or ``str`` description."""

    @classmethod
    def encode(cls, obj):
        """Return a payload with the type of *obj* and a description of the dtype.

        ``obj.descr`` is stored when ``obj.names`` is not ``None``; otherwise
        ``obj.str`` is stored.
        """
        type_tag = [type(obj).__module__, type(obj).__name__]
        if obj.names is not None:
            description = obj.descr
        else:
            description = obj.str
        return {"__mlspl_type": type_tag, "descr": description}

    @classmethod
    def decode(cls, obj):
        """Return ``numpy.dtype`` built from the payload's ``"descr"`` entry."""
        import numpy as np

        description = obj["descr"]
        return np.dtype(description)
class NDArrayWrapperCodec(BaseCodec):
    """Codec that stores a single value wrapped in a one-element numpy array."""

    @classmethod
    def encode(cls, obj):
        """Return a payload with the type of *obj* and ``np.array([obj])``."""
        import numpy as np

        value_type = type(obj)
        wrapped = np.array([obj])
        return {
            "__mlspl_type": [value_type.__module__, value_type.__name__],
            "ndarray": wrapped,
        }

    @classmethod
    def decode(cls, obj):
        """Return the first element of the payload's ``"ndarray"`` entry."""
        wrapped = obj["ndarray"]
        return wrapped[0]
class NDArrayCodec(BaseCodec):
    """Codec that stores a numpy array as base64 text of its ``.npy`` bytes."""

    @classmethod
    def encode(cls, obj):
        """Return a payload with the array type and its base64-encoded ``.npy`` data.

        Arrays whose dtype contains objects are first converted with
        ``astype("U")``; the data is written with ``allow_pickle=False``.

        Raises:
            AssertionError: if *obj* is not exactly a ``numpy.ndarray``.
            ValueError: if an object array cannot be converted with ``astype("U")``.
        """
        import numpy as np

        # Exact type match: subclasses of ndarray are rejected.
        assert type(obj) is np.ndarray
        if obj.dtype.hasobject:
            try:
                obj = obj.astype("U")
            except Exception:
                raise ValueError("Cannot encode numpy.ndarray with objects")
        buffer = BytesIO()
        np.save(buffer, obj, allow_pickle=False)
        npy_text = base64.b64encode(buffer.getvalue()).decode("utf8")
        return {
            "__mlspl_type": [type(obj).__module__, type(obj).__name__],
            "npy": npy_text,
        }

    @classmethod
    def decode(cls, obj):
        """Load and return the array stored under the payload's ``"npy"`` entry."""
        import numpy as np

        npy_bytes = base64.b64decode(obj["npy"].encode("utf8"))
        return np.load(BytesIO(npy_bytes), allow_pickle=False)
class TreeCodec(BaseCodec):
    """Codec for ``sklearn.tree._tree.Tree`` objects."""

    @classmethod
    def encode(cls, obj):
        """Return a payload with the tree's type, constructor arguments and state.

        Raises:
            AssertionError: if *obj* is not exactly a ``sklearn.tree._tree.Tree``.
        """
        import sklearn.tree

        assert type(obj) is sklearn.tree._tree.Tree
        ctor_args = obj.__reduce__()[1]
        tree_state = obj.__getstate__()
        tree_type = type(obj)
        return {
            "__mlspl_type": [tree_type.__module__, tree_type.__name__],
            "init_args": ctor_args,
            "state": tree_state,
        }

    @classmethod
    def decode(cls, obj):
        """Rebuild a ``Tree`` from the payload's ``"init_args"`` and ``"state"``.

        The payload's state mapping is updated in place when it has no
        ``"max_depth"`` entry.
        """
        import sklearn.tree

        init_args = obj["init_args"]
        state = obj["state"]
        # Backwards compatibility with PSC 1.2: fill in max_depth when missing.
        # Earlier versions did not put max_depth into the state returned by
        # __getstate__:
        # https://github.com/scikit-learn/scikit-learn/blob/51a765acfa4c5d1ec05fc4b406968ad233c75162/sklearn/tree/_tree.pyx#L615
        # sklearn 0.18 added it, and uses it in both __getstate__ and __setstate__:
        # https://github.com/scikit-learn/scikit-learn/blob/ef5cb84a805efbe4bb06516670a9b8c690992bd7/sklearn/tree/_tree.pyx#L649
        # Models stored by the older version therefore lack the key, which
        # would raise a KeyError.  max_depth is only used by the decision path
        # method, which we don't currently use; in version 0.18 it is used to
        # initialise an np array of zeros:
        # https://github.com/scikit-learn/scikit-learn/blob/ef5cb84a805efbe4bb06516670a9b8c690992bd7/sklearn/tree/_tree.pyx#L926
        # https://github.com/scikit-learn/scikit-learn/blob/ef5cb84a805efbe4bb06516670a9b8c690992bd7/sklearn/tree/_tree.pyx#L991
        state.setdefault("max_depth", 0)
        tree = sklearn.tree._tree.Tree(*init_args)
        tree.__setstate__(state)
        return tree
class BlockManagerCodec(BaseCodec):
    """Codec for pandas ``BlockManager`` and ``SingleBlockManager`` objects."""

    @classmethod
    def encode(cls, obj):
        """Return a payload with the manager's type and its ``__getstate__()`` result.

        Raises:
            AssertionError: if *obj* is not exactly a ``BlockManager`` or a
                ``SingleBlockManager``.
        """
        from pandas.core.internals import BlockManager, SingleBlockManager

        assert type(obj) in (BlockManager, SingleBlockManager)
        state = obj.__getstate__()
        return {"__mlspl_type": [type(obj).__module__, type(obj).__name__], "state": state}

    @classmethod
    def decode(cls, obj):
        """Rebuild the manager named in the payload and restore its state.

        The manager is created with ``__new__`` and filled through
        ``__setstate__``.

        Raises:
            ValueError: if the recorded type name is neither ``"BlockManager"``
                nor ``"SingleBlockManager"``.
        """
        from pandas.core.internals import BlockManager, SingleBlockManager

        state = obj["state"]
        type_name = obj["__mlspl_type"][1]
        if type_name == "BlockManager":
            manager = BlockManager.__new__(BlockManager)
        elif type_name == "SingleBlockManager":
            manager = SingleBlockManager.__new__(SingleBlockManager)
        else:
            # Without this branch an unknown type name fell through and failed
            # later with an UnboundLocalError on the manager variable.
            raise ValueError("BlockManagerCodec cannot decode type %r" % (type_name,))
        # A list state is converted to a tuple before being handed to
        # __setstate__ (presumably serialisation turned the original tuple
        # into a list - TODO confirm).
        if isinstance(state, list):
            state = tuple(state)
        manager.__setstate__(state)
        return manager
class SetCodec(BaseCodec):
    """Codec for the types registered under ``"SetCodec"`` in ``CODECS``."""

    # (module, name) pairs this codec is allowed to handle.
    whitelist = [key for key, codec in CODECS.items() if codec == "SetCodec"]

    @classmethod
    def encode(cls, obj):
        """Return a payload with the type of *obj* and its items as a list.

        Raises:
            AssertionError: if the type of *obj* is not in ``whitelist``.
        """
        module, name = type(obj).__module__, type(obj).__name__
        # Raised explicitly rather than via ``assert`` so that the check is
        # still enforced when Python runs with -O.
        if (module, name) not in cls.whitelist:
            raise AssertionError("SetCodec cannot encode %s.%s" % (module, name))
        return {"__mlspl_type": [module, name], "set": list(obj)}

    @classmethod
    def decode(cls, obj):
        """Return a ``set`` built from the payload's ``"set"`` entry.

        Raises:
            AssertionError: if the recorded type is not in ``whitelist``.
        """
        module_name, name = obj["__mlspl_type"]
        if (module_name, name) not in cls.whitelist:
            raise AssertionError("SetCodec cannot decode %s.%s" % (module_name, name))
        return set(obj["set"])
class TypeCodec(BaseCodec):
    """Codec for class objects, stored as a (module, name) reference."""

    # (module, name) pairs this codec is allowed to handle.
    whitelist = [key for key, codec in CODECS.items() if codec == "TypeCodec"]

    @classmethod
    def encode(cls, obj):
        """Return a payload with the type of *obj* and the module/name of *obj* itself.

        Raises:
            AssertionError: if the type of *obj* is not in ``whitelist``.
        """
        module, name = type(obj).__module__, type(obj).__name__
        # Raised explicitly rather than via ``assert`` so that the check is
        # still enforced when Python runs with -O.
        if (module, name) not in cls.whitelist:
            raise AssertionError("TypeCodec cannot encode %s.%s" % (module, name))
        return {
            "__mlspl_type": [module, name],
            "type": [obj.__module__, obj.__name__],
        }

    @classmethod
    def decode(cls, obj):
        """Import and return the class referenced by the payload's ``"type"`` entry.

        Raises:
            AssertionError: if the recorded codec type is not in ``whitelist``,
                or if the referenced class is not a key of ``CODECS``.
        """
        module_name, name = obj["__mlspl_type"]
        if (module_name, name) not in cls.whitelist:
            raise AssertionError("TypeCodec cannot decode %s.%s" % (module_name, name))
        target_module, target_name = obj["type"][0], obj["type"][1]
        # Only classes registered in CODECS may be imported.  This check gates
        # a dynamic import driven by payload contents, so it must not
        # disappear under -O the way an ``assert`` statement would.
        if (target_module, target_name) not in CODECS:
            raise AssertionError(
                "TypeCodec cannot decode a reference to %s.%s" % (target_module, target_name)
            )
        module = importlib.import_module(target_module)
        return getattr(module, target_name)
class mtrandCodec(BaseCodec):
    """Codec for ``numpy.random.mtrand.RandomState`` objects."""

    @classmethod
    def encode(cls, obj):
        """Return a payload with the generator's type, constructor arguments and state.

        Raises:
            AssertionError: if *obj* is not exactly a ``RandomState``.
        """
        import numpy as np

        assert type(obj) is np.random.mtrand.RandomState
        ctor_args = obj.__reduce__()[1]
        rng_state = obj.__getstate__()
        rng_type = type(obj)
        return {
            "__mlspl_type": [rng_type.__module__, rng_type.__name__],
            "init_args": ctor_args,
            "state": rng_state,
        }

    @classmethod
    def decode(cls, obj):
        """Return a fresh ``RandomState`` with the payload's ``"state"`` restored.

        Only ``"state"`` is read; the stored ``"init_args"`` are not used here.
        """
        from numpy.random.mtrand import RandomState

        saved_state = obj["state"]
        restored = RandomState()
        restored.__setstate__(saved_state)
        return restored
class SparseMatrixCodec(BaseCodec):
    """Codec that stores a scipy ``csr_matrix`` as base64 of an ``.npz`` archive."""

    @classmethod
    def encode(cls, obj):
        """Return a payload with the matrix type and its base64-encoded components.

        The ``data``, ``indices``, ``indptr`` and ``shape`` of the matrix are
        written with ``numpy.savez``.  The ``"sparse_npy"`` value is the bytes
        object returned by ``base64.b64encode``.

        Raises:
            AssertionError: if *obj* is not exactly a ``csr_matrix``.
        """
        import numpy as np
        from scipy import sparse

        # Use the public ``scipy.sparse.csr_matrix`` name; the
        # ``scipy.sparse.csr`` submodule is a deprecated private namespace.
        assert type(obj) is sparse.csr_matrix
        bio = BytesIO()
        np.savez(bio, data=obj.data, indices=obj.indices, indptr=obj.indptr, shape=obj.shape)
        return {
            "__mlspl_type": [type(obj).__module__, type(obj).__name__],
            "sparse_npy": base64.b64encode(bio.getvalue()),
        }

    @classmethod
    def decode(cls, obj):
        """Rebuild the ``csr_matrix`` stored under the payload's ``"sparse_npy"`` entry."""
        import numpy as np
        from scipy.sparse import csr_matrix

        bio = BytesIO(base64.b64decode(obj["sparse_npy"]))
        # encode() only writes plain numeric arrays, so pickled content is
        # never needed; refuse it explicitly, as NDArrayCodec.decode does.
        with np.load(bio, allow_pickle=False) as loader:
            return csr_matrix(
                (loader["data"], loader["indices"], loader["indptr"]), shape=loader["shape"]
            )
class SeriesCodec(BaseCodec):
    """Codec for pandas ``Series`` objects, stored through ``__getstate__``."""

    # (module, name) pairs this codec is allowed to handle.
    # NOTE(review): no CODECS entry visible in this file maps to 'SeriesCodec',
    # so this list is empty and both methods reject every input - confirm
    # whether a ("pandas.core.series", "Series") entry is meant to be registered.
    whitelist = [key for key, codec in CODECS.items() if codec == 'SeriesCodec']

    @classmethod
    def encode(cls, obj):
        """Return a payload with the type of *obj* and its ``__getstate__()`` result.

        Raises:
            AssertionError: if the type of *obj* is not in ``whitelist``.
        """
        module, name = type(obj).__module__, type(obj).__name__
        # Raised explicitly rather than via ``assert`` so that the check is
        # still enforced when Python runs with -O.
        if (module, name) not in cls.whitelist:
            raise AssertionError("SeriesCodec cannot encode %s.%s" % (module, name))
        state = obj.__getstate__()
        return {'__mlspl_type': [module, name], 'state': state}

    @classmethod
    def decode(cls, obj):
        """Create a bare ``Series`` and restore the payload's ``'state'`` into it.

        Raises:
            AssertionError: if the recorded type is not in ``whitelist``.
        """
        import pandas.core.series

        module_name, name = obj['__mlspl_type']
        if (module_name, name) not in cls.whitelist:
            raise AssertionError("SeriesCodec cannot decode %s.%s" % (module_name, name))
        state = obj['state']
        series_cls = pandas.core.series.Series
        # Built with __new__ so that __init__ is skipped; __setstate__ fills it.
        series = series_cls.__new__(series_cls)
        series.__setstate__(state)
        return series