Using codecs
The Splunk Machine Learning Toolkit (MLTK) uses codecs to serialize (save or encode) and deserialize (load or decode) algorithm models. A codec handles the core part of serializing a Python object in memory to a file, and of deserializing it back from that file.
MLTK does not use pickles to serialize objects in Python. Instead, it uses a string representation of __dict__
or uses __getstate__
and __setstate__
to save and recreate objects. Python objects are converted to JSON objects, then saved into CSV files, and used as lookups within Splunk Enterprise.
To save the model of the algorithm, the algorithm must implement the register_codecs()
method. This method is invoked when algorithm.save_model()
is called. When algorithm.save_model()
is called, the following image shows the process that occurs to find the right codec for your algorithm class.
Built-in codecs
MLTK ships with built-in codecs. The following shows examples of how to use built-in codecs to implement the register_codecs()
method in your custom algorithm.
Pre-registered classes
The following classes are always loaded into the codec manager, so there is no need to explicitly define objects of these classes in register_codecs()
.
__builtin__.object
__builtin__.slice
__builtin__.set
__builtin__.type
numpy.ndarray
numpy.int8
numpy.int16
numpy.int32
numpy.int64
numpy.uint8
numpy.uint16
numpy.uint32
numpy.uint64
numpy.float16
numpy.float32
numpy.float64
numpy.float128
numpy.complex64
numpy.complex128
numpy.complex256
numpy.dtype
pandas.core.frame.DataFrame
pandas.core.index.Index
pandas.core.index.Int64Index
pandas.core.internals.BlockManager
The list of pre-registered codecs can be found in $SPLUNK_HOME/etc/apps/Splunk_ML_Toolkit/bin/codec/codecs.py
.
SimpleObjectCodec
You can use the SimpleObjectCodec
for any object that can be represented as a dictionary or a list.
For an example of this codec in action, see Support Vector Regressor example.
In the following custom algorithm, the codecs have already been configured:
@staticmethod
def register_codecs():
    """Register the codecs needed to save and load the SVR model.

    Both the MLTK wrapper class (``algos.SVR.SVR``) and the wrapped
    scikit-learn estimator (``sklearn.svm.classes.SVR``) are registered
    with ``SimpleObjectCodec``.
    """
    from codec.codecs import SimpleObjectCodec
    from codec import codecs_manager

    # Wrapper class defined in the MLTK algos package.
    codecs_manager.add_codec('algos.SVR', 'SVR', SimpleObjectCodec)
    # Underlying scikit-learn estimator held by the wrapper.
    codecs_manager.add_codec('sklearn.svm.classes', 'SVR', SimpleObjectCodec)
You need codecs for both algos.SVR.SVR
and sklearn.svm.classes.SVR
. In most situations, you can use SimpleObjectCodec for the wrapper class (algos.SVR.SVR
).
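To check whether SimpleObjectCodec is also sufficient for the wrapped estimator, inspect the __dict__
of a fitted instance. For this example, you can run the following in a Python terminal: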
>>> from sklearn.svm import SVR
>>> classifier = SVR()
>>> X = [[1,2],[3,4]]
>>> y = [55, 66]
>>> classifier.fit(X, y)
>>> classifier.__dict__
That action returns the following result:
{'C': 1.0,
'_dual_coef_': array([[-1., 1.]]),
'_gamma': 0.5,
'_impl': 'epsilon_svr',
'_intercept_': array([ 60.5]),
'_sparse': False,
'cache_size': 200,
'class_weight': None,
'class_weight_': array([], dtype=float64),
'coef0': 0.0,
'degree': 3,
'dual_coef_': array([[-1., 1.]]),
'epsilon': 0.1,
'fit_status_': 0,
'gamma': 'auto',
'intercept_': array([ 60.5]),
'kernel': 'rbf',
'max_iter': -1,
'n_support_': array([ 0, 1073741824], dtype=int32),
'nu': 0.0,
'probA_': array([], dtype=float64),
'probB_': array([], dtype=float64),
'probability': False,
'random_state': None,
'shape_fit_': (2, 2),
'shrinking': True,
'support_': array([0, 1], dtype=int32),
'support_vectors_': array([[ 1., 2.],
[ 3., 4.]]),
'tol': 0.001,
'verbose': False}
The returned __dict__
object contains objects and values that are either supported by json.JSONEncoder
or are instances of one of the classes listed under Pre-registered classes.
If one or more objects in __dict__
do not have built-in codec support, you can write a custom codec for them.
Write a custom codec
If the SimpleObjectCodec does not suffice, you can use the following example to learn how to write a custom codec for the KNeighborsClassifier
algorithm.
KNClassifier.py
#!/usr/bin/env python
from sklearn.neighbors import KNeighborsClassifier
from codec import codecs_manager
from base import BaseAlgo, ClassifierMixin
from util.param_util import convert_params
class KNClassifier(ClassifierMixin, BaseAlgo):
    """Custom MLTK algorithm wrapping scikit-learn's KNeighborsClassifier."""

    def __init__(self, options):
        """Create the underlying estimator from the given options.

        Args:
            options: Mapping of algorithm options. Its optional ``'params'``
                entry is passed through ``convert_params``, with ``k``
                converted as an int and aliased to ``n_neighbors``.
        """
        self.handle_options(options)

        params = options.get('params', {})
        out_params = convert_params(
            params,
            ints=['k'],
            aliases={'k': 'n_neighbors'},
        )

        self.estimator = KNeighborsClassifier(**out_params)

    @staticmethod
    def register_codecs():
        """Register SimpleObjectCodec for the wrapper and the estimator."""
        from codec.codecs import SimpleObjectCodec

        codecs_manager.add_codec('algos.KNClassifier', 'KNClassifier', SimpleObjectCodec)
        codecs_manager.add_codec('sklearn.neighbors.classification', 'KNeighborsClassifier', SimpleObjectCodec)
Investigate an object for a custom codec
In the event that SimpleObjectCodec
is not sufficient, when you run ... | fit KNClassifier into my_model
you see the following error message:
The error message indicates that part of the model, sklearn.neighbors.kd_tree.KDTree,
is not serializable. You can investigate the object in Python terminal:
>>> from sklearn.datasets import load_iris
>>> from sklearn.neighbors import KNeighborsClassifier
>>> iris = load_iris()
>>> X = iris.data
>>> y = iris.target
>>> classifier = KNeighborsClassifier()
>>> classifier.fit(X, y)
>>> classifier.__dict__
which gives us back:
{'_fit_X': array([[ 5.1, 3.5, 1.4, 0.2],
...
[ 5.9, 3. , 5.1, 1.8]]),
'_fit_method': 'kd_tree',
'_tree': <sklearn.neighbors.kd_tree.KDTree at 0x7ffe07902500>,
'_y': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
...
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]),
'algorithm': 'auto',
'classes_': array([0, 1, 2]),
'effective_metric_': 'euclidean',
'effective_metric_params_': {},
'leaf_size': 30,
'metric': 'minkowski',
'metric_params': None,
'n_jobs': 1,
'n_neighbors': 5,
'outputs_2d_': False,
'p': 2,
'radius': None,
'weights': 'uniform'}
In this case, '_tree': <sklearn.neighbors.kd_tree.KDTree at 0x7ffe07902500>
is not an object SimpleObjectCodec
can encode or decode.
You have the following two options to move forward:
Option 1: Avoid writing the codec by limiting the algorithm choice
A simple and quick solution, and a way to avoid writing a custom codec, is to add a parameter to the estimator to avoid using a KDTree:
self.estimator = KNeighborsClassifier(algorithm='brute', **out_params)
Option 2: Write a Custom Codec
If you must use a codec, you can save the KDTree state and reconstruct it using a custom codec. In a Python terminal, run the following:
>>> kdtree_in_memory = classifier.__dict__['_tree']
>>> kdtree_in_memory.__getstate__()
This action prints the state of "_tree" in classifier
:
(array([[ 5.1, 3.5, 1.4, 0.2],
...
[ 5.9, 3. , 5.1, 1.8]]),
array([ 2, 13, 14, 16, 22, 35, 36, 38, 40, 41, 42, 49, 12,
...
143, 144, 145, 107, 120, 102, 122]),
array([(0, 150, 0, 10.29635857961444), (0, 75, 0, 3.5263295365010903),
(75, 150, 0, 4.506106967216822), (0, 37, 1, 0.8774964387392121),
(37, 75, 1, 3.0364452901377956), (75, 112, 1, 3.0401480227120525),
(112, 150, 1, 2.874456470360963)],
dtype=[('idx_start', '<i8'), ('idx_end', '<i8'), ('is_leaf', '<i8'), ('radius', '<f8')]),
array([[[ 4.3, 2. , 1. , 0.1],
...
[ 7.9, 3.8, 6.9, 2.5]]]),
30,
3,
7,
0,
0,
0,
0,
<sklearn.neighbors.dist_metrics.EuclideanDistance at 0x10d94d320>)
Most of the objects are numbers and arrays, which are covered by Python built-in and pre-registered codecs. At the end of the printed state, there is a second embedded object that is not supported by Python built-in or pre-registered codecs:
<sklearn.neighbors.dist_metrics.EuclideanDistance at 0x10d94d320>
You can investigate the state of the embedded object in Python terminal:
>>> dist_metric = kdtree_in_memory.__getstate__()[-1]
>>> dist_metric.__getstate__()
The following is returned:
(2.0, array([ 0.]), array([ 0.]))
Custom codec implementation
All of the codecs must inherit from BaseCodec
in bin/codec/codecs.py
.
A custom codec based on BaseCodec
must define two class methods: encode()
and decode(). For example:
class KDTreeCodec(BaseCodec):
    """Codec that saves and restores sklearn.neighbors.kd_tree.KDTree objects."""

    @classmethod
    def encode(cls, obj):
        """Return a dictionary holding the KDTree's type and state.

        Args:
            obj: The KDTree instance to encode.

        Returns:
            A dict with ``'__mlspl_type'`` (module name and class name of
            ``obj``) and ``'state'`` (the result of ``obj.__getstate__()``).
        """
        # Let's ensure the object is the one we think it is
        import sklearn.neighbors
        assert type(obj) is sklearn.neighbors.kd_tree.KDTree

        # Let's retrieve our state from our previous exploration
        state = obj.__getstate__()

        # Return a dictionary
        return {
            '__mlspl_type': [type(obj).__module__, type(obj).__name__],
            'state': state,
        }

    @classmethod
    def decode(cls, obj):
        """Rebuild a KDTree from a dictionary produced by ``encode``.

        Args:
            obj: Dict whose ``'state'`` entry holds the saved KDTree state.

        Returns:
            A new KDTree with the saved state applied.
        """
        # Import the class we want to initialize
        from sklearn.neighbors.kd_tree import KDTree

        # Get our state from our saved obj
        state = obj['state']

        # Here is where we create the new object
        # doing whatever is required for this particular class;
        # __new__ allocates the instance without running __init__.
        t = KDTree.__new__(KDTree)

        # Set the state
        t.__setstate__(state)

        # And we're done!
        return t
Next, write a codec for sklearn.neighbors.dist_metrics.EuclideanDistance:
class EuclideanDistanceCodec(BaseCodec):
    """Codec for sklearn.neighbors.dist_metrics.EuclideanDistance objects."""

    @classmethod
    def encode(cls, obj):
        """Return a dictionary holding the metric's type and state.

        Args:
            obj: The EuclideanDistance instance to encode.

        Returns:
            A dict with ``'__mlspl_type'`` (module name and class name of
            ``obj``) and ``'state'`` (the result of ``obj.__getstate__()``).
        """
        import sklearn.neighbors.dist_metrics
        # Sanity check that this codec was handed the type it is written for.
        assert type(obj) is sklearn.neighbors.dist_metrics.EuclideanDistance

        state = obj.__getstate__()
        return {
            '__mlspl_type': [type(obj).__module__, type(obj).__name__],
            'state': state,
        }

    @classmethod
    def decode(cls, obj):
        """Rebuild an EuclideanDistance from a dictionary produced by ``encode``.

        Args:
            obj: Dict whose ``'state'`` entry holds the saved metric state.

        Returns:
            A new EuclideanDistance with the saved state applied.
        """
        import sklearn.neighbors.dist_metrics

        state = obj['state']

        # Construct a default instance, then overwrite it with the saved state.
        d = sklearn.neighbors.dist_metrics.EuclideanDistance()
        d.__setstate__(state)
        return d
The last step is to make sure that all of the necessary codecs are registered in the register_codecs()
method of the algorithm:
@staticmethod
def register_codecs():
    """Register every codec needed to save and load the KNClassifier model.

    The wrapper class and the scikit-learn estimator use
    ``SimpleObjectCodec``; the embedded KDTree and EuclideanDistance
    objects use their custom codecs.
    """
    from codec.codecs import SimpleObjectCodec

    codecs_manager.add_codec('algos.KNClassifier', 'KNClassifier', SimpleObjectCodec)
    codecs_manager.add_codec('sklearn.neighbors.classification', 'KNeighborsClassifier', SimpleObjectCodec)
    codecs_manager.add_codec('sklearn.neighbors.kd_tree', 'KDTree', KDTreeCodec)
    codecs_manager.add_codec('sklearn.neighbors.dist_metrics', 'EuclideanDistance', EuclideanDistanceCodec)
Complete example
KNClassifier.py
#!/usr/bin/env python
from sklearn.neighbors import KNeighborsClassifier
from codec import codecs_manager
from codec.codecs import BaseCodec
from base import BaseAlgo, ClassifierMixin
from util.param_util import convert_params
class KNClassifier(ClassifierMixin, BaseAlgo):
    """Custom MLTK algorithm wrapping scikit-learn's KNeighborsClassifier."""

    def __init__(self, options):
        """Create the underlying estimator from the given options.

        Args:
            options: Mapping of algorithm options. Its optional ``'params'``
                entry is passed through ``convert_params``, with ``k``
                converted as an int and aliased to ``n_neighbors``, and
                ``algorithm`` converted as a string.

        Raises:
            RuntimeError: If ``algorithm`` is given and is neither
                ``'brute'`` nor ``'kd_tree'`` (``'KDTree'`` is accepted as
                an alias of ``'kd_tree'``).
        """
        self.handle_options(options)

        params = options.get('params', {})
        out_params = convert_params(
            params,
            ints=['k'],
            strs=['algorithm'],
            aliases={'k': 'n_neighbors'},
        )

        if 'algorithm' in out_params:
            # scikit-learn spells this option 'kd_tree'; passing 'KDTree'
            # through unchanged is rejected by KNeighborsClassifier. Keep
            # accepting the old spelling, but translate it.
            if out_params['algorithm'] == 'KDTree':
                out_params['algorithm'] = 'kd_tree'
            if out_params['algorithm'] not in ('brute', 'kd_tree'):
                raise RuntimeError("algorithm must be either 'brute' or 'kd_tree'")

        self.estimator = KNeighborsClassifier(**out_params)

    @staticmethod
    def register_codecs():
        """Register every codec needed to save and load this model."""
        from codec.codecs import SimpleObjectCodec

        codecs_manager.add_codec('algos.KNClassifier', 'KNClassifier', SimpleObjectCodec)
        codecs_manager.add_codec('sklearn.neighbors.classification', 'KNeighborsClassifier', SimpleObjectCodec)
        codecs_manager.add_codec('sklearn.neighbors.kd_tree', 'KDTree', KDTreeCodec)
        codecs_manager.add_codec('sklearn.neighbors.dist_metrics', 'EuclideanDistance', EuclideanDistanceCodec)
class KDTreeCodec(BaseCodec):
    """Codec that saves and restores sklearn.neighbors.kd_tree.KDTree objects."""

    @classmethod
    def encode(cls, obj):
        """Return a dictionary holding the KDTree's type and state.

        Args:
            obj: The KDTree instance to encode.

        Returns:
            A dict with ``'__mlspl_type'`` (module name and class name of
            ``obj``) and ``'state'`` (the result of ``obj.__getstate__()``).
        """
        import sklearn.neighbors
        # Sanity check that this codec was handed the type it is written for.
        assert type(obj) is sklearn.neighbors.kd_tree.KDTree

        state = obj.__getstate__()
        return {
            '__mlspl_type': [type(obj).__module__, type(obj).__name__],
            'state': state,
        }

    @classmethod
    def decode(cls, obj):
        """Rebuild a KDTree from a dictionary produced by ``encode``.

        Args:
            obj: Dict whose ``'state'`` entry holds the saved KDTree state.

        Returns:
            A new KDTree with the saved state applied.
        """
        from sklearn.neighbors.kd_tree import KDTree

        state = obj['state']

        # __new__ allocates the instance without running __init__; the saved
        # state is then applied through __setstate__.
        t = KDTree.__new__(KDTree)
        t.__setstate__(state)
        return t
class EuclideanDistanceCodec(BaseCodec):
    """Codec for sklearn.neighbors.dist_metrics.EuclideanDistance objects."""

    @classmethod
    def encode(cls, obj):
        """Return a dictionary holding the metric's type and state.

        Args:
            obj: The EuclideanDistance instance to encode.

        Returns:
            A dict with ``'__mlspl_type'`` (module name and class name of
            ``obj``) and ``'state'`` (the result of ``obj.__getstate__()``).
        """
        import sklearn.neighbors.dist_metrics
        # Sanity check that this codec was handed the type it is written for.
        assert type(obj) is sklearn.neighbors.dist_metrics.EuclideanDistance

        state = obj.__getstate__()
        return {
            '__mlspl_type': [type(obj).__module__, type(obj).__name__],
            'state': state,
        }

    @classmethod
    def decode(cls, obj):
        """Rebuild an EuclideanDistance from a dictionary produced by ``encode``.

        Args:
            obj: Dict whose ``'state'`` entry holds the saved metric state.

        Returns:
            A new EuclideanDistance with the saved state applied.
        """
        import sklearn.neighbors.dist_metrics

        state = obj['state']

        # Construct a default instance, then overwrite it with the saved state.
        d = sklearn.neighbors.dist_metrics.EuclideanDistance()
        d.__setstate__(state)
        return d